20895 lines
609 KiB
C++
20895 lines
609 KiB
C++
//-*- coding: utf-8 -*-
|
|
|
|
#include "Proxy.h"
|
|
|
|
class Address *g_address; // for debug
|
|
|
|
#define CRID_ANY 0
|
|
#define CRID_US 226
|
|
|
|
//
|
|
// if you have "in <city/adm1 name>" in same sentence as street then
|
|
// require that that item be a city/adm1 in any address you try to do.
|
|
// i would set "int64_t inPrepPhrase" to be the city/adm1 place hash.
|
|
// so if it is not zero, check for it. but add it with addProperPlaces()
|
|
// first to see if it added anything!! then we can
|
|
//
|
|
|
|
//and fix it so "1914" years and older years are pub dates!
|
|
//and include days of the week in pub dates like "sunday, april 11, 2004"
|
|
//too!!
|
|
//do not allow lower case 'or' in place name!
|
|
//do not allow place names starting with "arrangements by" or "sponsored by"
|
|
|
|
|
|
// test on http://alibi.com/index.php?scn=cal
|
|
// test on http://www.burtstikilounge.com/burts/
|
|
|
|
// TODO: FOR ADDRESS overlap detection, just hash every word index for
|
|
// every Place which can not be shared. then store the score and
|
|
// Address ptr as the data value, so we can do a quick compare!
|
|
|
|
// TODO: also add conflicting addresses with the same score as winners.
|
|
// if we can't resolve a winner then we should just eliminate both/all
|
|
// to be on the safe side. like the alibi.com page has both albuquerque
|
|
// and santa fe in the <title> tag so it is really just lucky that we
|
|
// pick albuquerque most of the time... we might be able to bring in
|
|
// street name to city map to help us fix this one. if both cities have
|
|
// the same street name, then nuke both! any other ideas?
|
|
|
|
// TODO: for the abqjournal.com page we need to determine the most popular
|
|
// city/adm1 pair over the whole page and use that as another default
|
|
// option. also consider if we should have several and score them...
|
|
|
|
// TODO: for all the phrases in "small" sections and all phrases following
|
|
// "at" or "at the" look those phrases up in placedb as place names
|
|
// to get their addresses. also confirm the place names we extract
|
|
// that are immediately before street names. also get all the possible
|
|
// city/adm1/ctry tuples that each place name might have. if these
|
|
// are not right next to it then i guess you need to get them from
|
|
// the title and tagdb. that way the placedb lookup can integrate
|
|
// the tuples into the key and greatly narrow the list. we may have
|
|
// to then do multiple lookups for the same place name in placedb,
|
|
// so another reason we should distribute them and keep them in memory
|
|
// or at least on an SSD. use *namedb* to index place names just like
|
|
// indexdb. then we can conduct a search for a place name on namedb
|
|
// and get the corresponding keys of the place records in placedb.
|
|
// namedb will need to be mostly in memory then!
|
|
|
|
// TODO: verify street addresses we do extract by looking up each one in
|
|
// placedb by the street. each street may have multiple city/adm1/ctry
|
|
// tuples, so this lookup should narrow it down!
|
|
|
|
// test zipcode hyphen fix on abqjournal.com/contact.html
|
|
|
|
#include "gb-include.h"
|
|
#include "Address.h"
|
|
#include "Sections.h"
|
|
//#include "DateParse2.h"
|
|
#include "Abbreviations.h"
|
|
#include "Phrases.h"
|
|
//#include "Weights.h"
|
|
#include "XmlDoc.h" // hashWords()
|
|
#include "Hostdb.h"
|
|
#include "Placedb.h"
|
|
#include "sort.h"
|
|
#include "HttpServer.h"
|
|
|
|
//#define CF_UNIQUE (((uint64_t)1LL)<<63)
|
|
|
|
bool getBestLatLon ( RdbList *list ,
|
|
double *bestLat ,
|
|
double *bestLon ,
|
|
int32_t *numVotes ,
|
|
int32_t niceness ,
|
|
int32_t winnerSnh ) ;
|
|
char *getLatLonPtrFromStr ( char *data ) ;
|
|
void getLatLonFromStr ( char *data , double *lat , double *lon);
|
|
char *getStateAbbr ( uint64_t bit ) ;
|
|
int64_t getWordXorHash ( char *s ) ;
|
|
int64_t getWordXorHash2 ( char *s ) ;
|
|
int32_t getStateOffset ( int64_t *h ) ;
|
|
class StateDesc *getStateDescFromBits ( uint64_t bit ) ;
|
|
// returns 0 if not a state:
|
|
uint64_t getStateBitFromHash ( int64_t *h ) ;
|
|
static bool setHashes ( class Place *p , Words *ww , int32_t niceness ) ;
|
|
|
|
static bool addIndicator ( char *s , char bit , float boost );
|
|
static bool addIndicator ( int64_t h , char bit , float boost );
|
|
//static void printAddress ( class Address *A , class SafeBuf *pbuf , int32_t i);
|
|
static void printPlaces ( PlaceMem *pm , SafeBuf *pbuf ,
|
|
class Sections *sections ,
|
|
class Address *base ) ;
|
|
static bool getZipLatLon ( char *zip ,
|
|
int32_t zipLen ,
|
|
float *zipLat ,
|
|
float *zipLon ) ;
|
|
|
|
//
|
|
// new stuff
|
|
//
|
|
static bool generatePlacesFile ( ) ;
|
|
static bool loadPlaces ( ) ;
|
|
class PlaceDesc *getState_new ( uint64_t pd64 , uint8_t crid , int32_t niceness );
|
|
PlaceDesc *getState2_new ( char *state , uint8_t crid , int32_t niceness ) ;
|
|
class PlaceDesc *getCity_new ( uint64_t ch64 ,
|
|
char *stateAbbr ,
|
|
uint8_t crid ,
|
|
int32_t niceness ) ;
|
|
class PlaceDesc *getCity2_new ( char *city ,
|
|
char *stateAbbr ,
|
|
uint8_t crid ,
|
|
int32_t niceness ) ;
|
|
PlaceDesc *getCity3_new ( uint64_t ch64 ,
|
|
uint64_t stateHash64,
|
|
uint8_t crid ,
|
|
int32_t niceness ) ;
|
|
bool getLongestPlaceName_new ( int32_t i,
|
|
int32_t alnumPos,
|
|
Words *w,
|
|
// must match! PDF_CITY|STATE|COUNTRY
|
|
uint8_t placeType,
|
|
uint8_t crid, // can be CRID_ANY
|
|
char *stateAbbr, // can be NULL
|
|
uint64_t *placeHash64,
|
|
int32_t *placeAlnumA,
|
|
int32_t *placeAlnumB,
|
|
int32_t *placeA,
|
|
int32_t *placeB ,
|
|
// set to most popular match
|
|
PlaceDesc **pdp ) ;
|
|
bool getZip_new ( int32_t a ,
|
|
int32_t alnumPos ,
|
|
Words *words ,
|
|
uint64_t *zipHash64 ,
|
|
uint64_t *zipCityHash64 ,
|
|
uint64_t *zipStateHash64 ,
|
|
int32_t *zipAlnumA,
|
|
int32_t *zipAlnumB,
|
|
int32_t *zipA,
|
|
int32_t *zipB ,
|
|
float *zipLat,
|
|
float *zipLon) ;
|
|
|
|
PlaceDesc *getMostPopularPlace_new ( int64_t cityHash64,
|
|
uint8_t crid ,
|
|
uint8_t placeType,
|
|
int32_t niceness );
|
|
|
|
char *g_pbuf = NULL;
|
|
int32_t g_pbufSize = 0;
|
|
HashTableX g_nameTable;
|
|
|
|
char *PlaceDesc::getOfficialName ( ) {
|
|
return g_pbuf + m_officialNameOffset;
|
|
}
|
|
|
|
char *PlaceDesc::getStateName ( ) {
|
|
// get our state abbr
|
|
char buf[3];
|
|
buf[0] = m_adm1[0];
|
|
buf[1] = m_adm1[1];
|
|
buf[2] = '\0';
|
|
// does this convert to lowercase? yes... it should
|
|
uint64_t placeHash64 = getWordXorHash ( buf );
|
|
// look up the place desc for the state
|
|
PlaceDesc *sd = getPlaceDesc ( placeHash64 ,
|
|
PDF_STATE,
|
|
m_crid,
|
|
buf, // state abbr
|
|
0 ); // niceness
|
|
if ( ! sd ) return NULL;
|
|
return sd->getOfficialName();
|
|
}
|
|
|
|
const char *PlaceDesc::getCountryName ( ) {
|
|
return g_countryCode.getName ( m_crid );
|
|
}
|
|
|
|
HashTableX g_indicators;
|
|
static HashTableX g_timeZones;
|
|
static HashTableX g_cities;
|
|
static HashTableX g_states;
|
|
static HashTableX g_aliases;
|
|
static HashTableX g_zips;
|
|
|
|
char *g_cityBuf = NULL;
|
|
int32_t g_cityBufSize = 0;
|
|
|
|
// . NOW each slot in the g_cities has a ptr to a CityDesc in SafeBuf g_cityBuf
|
|
// . so now we can put all the alternate names and aliases into the same table
|
|
// Descriptor for one city name.
// NOTE(review): these records are pointed to from g_cities and presumably
// live packed inside g_cityBuf, so member order and types should not be
// changed without checking whatever builds that buffer -- confirm.
class CityDesc {
 public:
	// one bit per state that contains a city of this name
	uint64_t m_adm1Bits;
	// index into s_states[] of the most popular state for this city,
	// e.g. 13 for chicago because s_states[13] is illinois
	char m_mostPopularState;
	// variable-length trailing text of the form
	// "us.nm,us.ny,es.a1,...|en-nl-fi=cincinnati,es-de=cincinnatus,..."
	char m_data[];
};
|
|
|
|
//bool setFromStr(Address *a,char *s,pbits_t flags ,
|
|
// Place *places , int32_t *np , int32_t maxPlaces, int32_t niceness );
|
|
|
|
static uint64_t getAddressHash ( Place *street ,
|
|
Place *city ,
|
|
Place *adm1 ,
|
|
Place *zip ) ;
|
|
|
|
static void verifiedWrapper ( void *state ) ;
|
|
static void gotMsg2cReplyWrapper ( void *state , void *state2 ) ;
|
|
static void gotList2c ( void *state , RdbList *xxx , Msg5 *yyy ) ;
|
|
static void sendBackAddress ( class State2c *st ) ;
|
|
|
|
Place *g_pa = NULL;
|
|
|
|
#define MIN_POP_COUNT 500
|
|
|
|
//#define MAX_STREETS 300
|
|
//#define MAX_PLACES 3500
|
|
// i raised from 15 to 25 since "Virginia Beach" city was not being picked up
|
|
// on socialmediabeach.com
|
|
#define MAX_CITIES 25
|
|
#define MAX_ADM1 80 // 1500
|
|
#define MAX_ZIPS 5
|
|
|
|
// stock g_zips with these zip code descriptors
|
|
// Descriptor for one zip code; g_zips is stocked with these.
class ZipDesc {
 public:
	// . two-character adm1 (state/province) code
	// . only unique within its country
	// . taken from /gb/geo/geonames/admin1Codes.txt with the leading
	//   "CC." country prefix removed, e.g. "NL.09 Utrecht\n" gives "09"
	char m_adm1[2];
	// (a one-byte country id, m_crid, used to be kept here but is
	//  disabled now that this is US-only)
	// hash of the city this zip code is in
	int64_t m_cityHash;
	// offset of the city name within g_cityBuf
	int32_t m_cityOffset;
	// state bits, used now that this is US-only
	uint64_t m_adm1Bits;
	// centroid lat/lon, for sorting by distance when the user's zip
	// is known
	float m_latitude;
	float m_longitude;

	// clears ONLY the adm1 code and the adm1 bits; the city hash, city
	// offset and lat/lon are left exactly as they were
	void reset() {
		m_adm1[0]  = 0;
		m_adm1[1]  = 0;
		m_adm1Bits = 0;
	}
};
|
|
|
|
|
|
// NULL-terminated list of day-of-week words: the seven singular names,
// sunday first, followed by the seven plural names in the same order.
// getDayOfWeek() builds its lookup table from this list.
static char *s_days[] = {
	// singular
	"sunday"    , "monday"     , "tuesday"   , "wednesday"  ,
	"thursday"  , "friday"     , "saturday"  ,
	// plural
	"sundays"   , "mondays"    , "tuesdays"  , "wednesdays" ,
	"thursdays" , "fridays"    , "saturdays" ,
	// end of list
	NULL
};
|
|
|
|
|
|
// One row per US state plus DC, in this fixed order. Each row holds, in
// order: the two-letter postal abbreviation, the full lowercase name, and an
// alternate abbreviation (NULL when there is none).
// Do not reorder: positions are referenced by index elsewhere in this file
// (e.g. s_states[13] is illinois).
static StateDesc s_states[] = {
	{"al" , "alabama"              , "ala"   },
	{"ak" , "alaska"               , "alas"  },
	{"az" , "arizona"              , "ariz"  },
	{"ar" , "arkansas"             , "ark"   },
	{"ca" , "california"           , "calif" },
	{"co" , "colorado"             , "colo"  },
	{"ct" , "connecticut"          , "conn"  },
	{"de" , "delaware"             , "del"   },
	{"dc" , "district of columbia" , "d.c."  },
	{"fl" , "florida"              , "fla"   },
	{"ga" , "georgia"              , NULL    },
	{"hi" , "hawaii"               , "h.i."  },
	{"id" , "idaho"                , "ida"   },
	{"il" , "illinois"             , "ill"   },
	{"in" , "indiana"              , "ind"   },
	{"ia" , "iowa"                 , NULL    },
	{"ks" , "kansas"               , "kan"   },
	{"ky" , "kentucky"             , NULL    },
	{"la" , "louisiana"            , NULL    },
	{"me" , "maine"                , NULL    },
	{"md" , "maryland"             , NULL    },
	{"ma" , "massachusetts"        , "mass"  },
	{"mi" , "michigan"             , "mich"  },
	{"mn" , "minnesota"            , "minn"  },
	{"ms" , "mississippi"          , "miss"  },
	{"mo" , "missouri"             , NULL    },
	{"mt" , "montana"              , "mont"  },
	{"ne" , "nebraska"             , "nebr"  },
	{"nv" , "nevada"               , "nev"   },
	{"nh" , "new hampshire"        , "n.h."  },
	{"nj" , "new jersey"           , "n.j."  },
	{"nm" , "new mexico"           , "n.m."  },
	{"ny" , "new york"             , "n.y."  },
	{"nc" , "north carolina"       , "n.c."  },
	{"nd" , "north dakota"         , "n.d."  },
	{"oh" , "ohio"                 , NULL    },
	{"ok" , "oklahoma"             , "okla"  },
	{"or" , "oregon"               , "ore"   },
	{"pa" , "pennsylvania"         , "penn"  },
	{"ri" , "rhode island"         , "r.i."  },
	{"sc" , "south carolina"       , "s.c."  },
	{"sd" , "south dakota"         , "s.d."  },
	{"tn" , "tennessee"            , "tenn"  },
	{"tx" , "texas"                , "tex"   },
	{"ut" , "utah"                 , NULL    },
	{"vt" , "vermont"              , NULL    },
	{"va" , "virginia"             , "virg"  },
	{"wa" , "washington"           , "wash"  },
	{"wv" , "west virginia"        , "w.v."  },
	{"wi" , "wisconsin"            , "wis"   },
	{"wy" , "wyoming"              , "wyo"   }
};
|
|
|
|
#include "StopWords.h"
|
|
static HashTableX s_doyTable;
|
|
static bool s_doyInit = false;
|
|
int32_t getDayOfWeek ( int64_t h ) {
|
|
if ( ! s_doyInit ) {
|
|
s_doyInit = initWordTable(&s_doyTable, s_days ,
|
|
//sizeof(s_days),
|
|
"doytbl");
|
|
if ( ! s_doyInit ) return -1;
|
|
}
|
|
// . get from table
|
|
// . score should be 1 for sunday i guess
|
|
int32_t score = s_doyTable.getScore ( &h );
|
|
// make it 0-6
|
|
score = (score-1) % 7;
|
|
// that's it
|
|
return score;
|
|
}
|
|
|
|
// http://www.dailylobo.com/calendar/
|
|
// http://www.abqthemag.com/events.html
|
|
// http://www.abqjournal.com/calendar/default.php
|
|
// http://www.abqjournal.com/calendar/month.htm (243k! do not truncate!!)
|
|
// http://www.kasa.com/subindex/entertainment/events_calendar
|
|
// http://www.trumba.com/calendars/KRQE_Calendar.rss (rss)
|
|
// http://www.koat.com/calendar/index.html
|
|
// http://www.trumba.com/calendars/albuquerque-area-events-calendar.rss.
|
|
// http://www.google.com/calendar/embed?mode=AGENDA&height=700&wkst=1&bgcolor=%23FFFFFF&src=vn90mq4n30kodohqjv8cdn5cfg%40group.calendar.google.com&color=%237A367A
|
|
// http://www.krqe.com/subindex/features/events_calendar
|
|
// http://www.alibi.com/index.php?scn=cal
|
|
// http://www.publicbroadcasting.net/kunm/events.eventsmain
|
|
// http://www.publicbroadcasting.net/kunm/events.eventsmain?action=showCategoryListing&newSearch=true&categorySearch=4025
|
|
// http://www.770kob.com/article.asp?id=521586
|
|
// http://events.kgoradio.com/
|
|
// http://www.livenation.com/venue/journal-pavilion-tickets (journal pavilion)
|
|
// http://www.livenation.com/venue/kiva-auditorium-tickets
|
|
// http://events.kqed.org/events/
|
|
// http://www.sfbg.com/entry.php?entry_id=8401&catid=85&l=1
|
|
// http://events.sfgate.com/ (zvents.com)
|
|
// http://events.sfgate.com/search?cat=1
|
|
// http://entertainment.signonsandiego.com/search/?type=event
|
|
// http://www.sdcitybeat.com/cms/event/search/?menu=Events
|
|
// ** http://www.sandiegometro.com/calendar/arts.php
|
|
|
|
// address parsing test cases:
|
|
// http://yellowpages.superpages.com/listings.jsp?CS=L&MCBP=true&search=Find+It&SRC=&C=bicycles&STYPE=S&L=Albuquerque+NM+&x=0&y=0
|
|
|
|
// address examples:
|
|
|
|
// BRAZIL:
|
|
// Marina Costa e Silva
|
|
// Rua Afonso Canargo, 805
|
|
// Santana
|
|
// 85070-200 Guarapuava - PR
|
|
|
|
// University of New Mexico
|
|
// Department of Physics and Astronomy
|
|
// MSC07 4220
|
|
// 800 Yale Blvd NE
|
|
// Albuquerque, New Mexico 87131-0001 USA
|
|
|
|
// US-380
|
|
// Lincoln, NM
|
|
// Saturday, August 8, 2009
|
|
|
|
|
|
static bool s_init = false;
|
|
|
|
// Puts a freshly constructed Addresses into the same empty state that
// reset() leaves behind. Nothing is allocated here, and reset() is not
// called.
Addresses::Addresses ( ) {
	// main buffer: none yet
	m_buf                = NULL;
	m_bufSize            = 0;
	// sorted buffer: none yet. m_sortedSize is zeroed as well so the
	// pointer/size pair that reset() hands to mfree() always agrees
	m_sorted             = NULL;
	m_sortedSize         = 0;
	m_sortedValid        = false;
	// no outstanding Msg2c and no owning document
	m_msg2c              = NULL;
	m_xd                 = NULL;
	// status flags and counters
	m_calledGeocoder     = false;
	m_breached           = false;
	m_numValid           = 0;
	// these two used to be left uninitialized until the first reset();
	// give them the same values reset() does
	m_firstBreach        = true;
	m_uniqueStreetHashes = 0;
}
|
|
|
|
// Destructor: all cleanup is delegated to reset().
Addresses::~Addresses ( ) {
	this->reset ( );
}
|
|
|
|
void Addresses::reset ( ) {
|
|
if ( m_buf && m_bufSize )
|
|
mfree ( m_buf , m_bufSize , "adata");
|
|
m_buf = NULL;
|
|
m_bufSize = 0;
|
|
m_sb.purge();
|
|
//m_ptValid = false;
|
|
//m_msg2c.m_requests = 0;
|
|
//m_msg2c.m_replies = 0;
|
|
m_firstBreach = true;
|
|
m_breached = false;
|
|
m_numValid = 0;
|
|
m_calledGeocoder = false;
|
|
if ( m_msg2c ) {
|
|
mdelete ( m_msg2c , sizeof(Msg2c),"aamsg2c");
|
|
delete (m_msg2c);
|
|
m_msg2c = NULL;
|
|
}
|
|
// free buf
|
|
if ( m_sorted )
|
|
mfree ( m_sorted , m_sortedSize , "asortbuf");
|
|
m_sorted = NULL;
|
|
m_sortedValid = false;
|
|
m_uniqueStreetHashes = 0;
|
|
}
|
|
|
|
static int64_t h_court;
|
|
static int64_t h_i;
|
|
static int64_t h_interstate;
|
|
static int64_t h_page ;
|
|
static int64_t h_corner ;
|
|
static int64_t h_between ;
|
|
static int64_t h_btwn ;
|
|
static int64_t h_bet ;
|
|
static int64_t h_streets ;
|
|
static int64_t h_sts ;
|
|
static int64_t h_at ;
|
|
static int64_t h_come ;
|
|
static int64_t h_is ;
|
|
static int64_t h_located ;
|
|
static int64_t h_intersection;
|
|
static int64_t h_law ;
|
|
static int64_t h_address ;
|
|
static int64_t h_added ;
|
|
static int64_t h_copy ;
|
|
static int64_t h_search ;
|
|
static int64_t h_find ;
|
|
static int64_t h_go ;
|
|
static int64_t h_town ;
|
|
static int64_t h_city ;
|
|
static int64_t h_street ;
|
|
static int64_t h_telephone;
|
|
static int64_t h_tel ;
|
|
static int64_t h_ph ;
|
|
static int64_t h_fax ;
|
|
static int64_t h_where ;
|
|
static int64_t h_location;
|
|
static int64_t h_venue ;
|
|
static int64_t h_map ;
|
|
static int64_t h_office ;
|
|
static int64_t h_center ;
|
|
static int64_t h_mailing ;
|
|
static int64_t h_mail ;
|
|
static int64_t h_snail ;
|
|
static int64_t h_edit ;
|
|
static int64_t h_email ;
|
|
static int64_t h_phone ;
|
|
static int64_t h_inc ;
|
|
static int64_t h_llc ;
|
|
static int64_t h_review ;
|
|
static int64_t h_reviews ;
|
|
static int64_t h_write ;
|
|
static int64_t h_add ;
|
|
static int64_t h_view ;
|
|
static int64_t h_favorites ;
|
|
static int64_t h_more ;
|
|
static int64_t h_info ;
|
|
static int64_t h_information ;
|
|
static int64_t h_the ;
|
|
static int64_t h_in ;
|
|
static int64_t h_a ;
|
|
static int64_t h_paseo ;
|
|
static int64_t h_de ;
|
|
static int64_t h_del ;
|
|
static int64_t h_all ;
|
|
static int64_t h_rights ;
|
|
static int64_t h_reserved ;
|
|
static int64_t h_contact ;
|
|
static int64_t h_us ;
|
|
static int64_t h_by ;
|
|
static int64_t h_of ;
|
|
static int64_t h_for ;
|
|
static int64_t h_arrangements ;
|
|
static int64_t h_arranged ;
|
|
static int64_t h_sponsored ;
|
|
static int64_t h_to ;
|
|
static int64_t h_every ;
|
|
static int64_t h_p ;
|
|
static int64_t h_b ;
|
|
static int64_t h_hwy ;
|
|
static int64_t h_state ;
|
|
static int64_t h_county ;
|
|
static int64_t h_cnty ;
|
|
static int64_t h_cty ;
|
|
static int64_t h_road ;
|
|
static int64_t h_route ;
|
|
static int64_t h_rte ;
|
|
static int64_t h_rt ;
|
|
static int64_t h_highway ;
|
|
static int64_t h_hiway ;
|
|
static int64_t h_cr ;
|
|
static int64_t h_o ;
|
|
static int64_t h_po ;
|
|
static int64_t h_post ;
|
|
static int64_t h_box ;
|
|
static int64_t h_top ;
|
|
static int64_t h_one ;
|
|
static int64_t h_noon ;
|
|
static int64_t h_midnight ;
|
|
static int64_t h_daily ;
|
|
static int64_t h_st ;
|
|
static int64_t h_nd ;
|
|
static int64_t h_rd ;
|
|
static int64_t h_th ;
|
|
static int64_t h_away ;
|
|
static int64_t h_results ;
|
|
static int64_t h_days ;
|
|
static int64_t h_blocks ;
|
|
static int64_t h_block ;
|
|
static int64_t h_miles ;
|
|
static int64_t h_mile ;
|
|
static int64_t h_year ;
|
|
static int64_t h_years ;
|
|
static int64_t h_yr ;
|
|
static int64_t h_yrs ;
|
|
static int64_t h_hours ;
|
|
static int64_t h_hrs ;
|
|
static int64_t h_hour ;
|
|
static int64_t h_hr ;
|
|
static int64_t h_mi ;
|
|
static int64_t h_kilometers;
|
|
static int64_t h_km ;
|
|
static int64_t h_copyright ;
|
|
static int64_t h_and ;
|
|
static int64_t h_or ;
|
|
static int64_t h_suite ;
|
|
static int64_t h_ste ;
|
|
static int64_t h_bldg ;
|
|
static int64_t h_bld ;
|
|
static int64_t h_building ;
|
|
static int64_t h_unit ;
|
|
static int64_t h_room ;
|
|
static int64_t h_pier ;
|
|
static int64_t h_rm ;
|
|
static int64_t h_run ;
|
|
static int64_t h_ne ;
|
|
static int64_t h_nw ;
|
|
static int64_t h_se ;
|
|
static int64_t h_sw ;
|
|
static int64_t h_n ;
|
|
static int64_t h_s ;
|
|
static int64_t h_e ;
|
|
static int64_t h_w ;
|
|
static int64_t h_north;
|
|
static int64_t h_northeast;
|
|
static int64_t h_northwest;
|
|
static int64_t h_east;
|
|
static int64_t h_west;
|
|
static int64_t h_south;
|
|
static int64_t h_southeast;
|
|
static int64_t h_southwest;
|
|
static int64_t h_heart ;
|
|
static int64_t h_core ;
|
|
static int64_t h_least ;
|
|
static int64_t h_most ;
|
|
static int64_t h_this ;
|
|
static int64_t h_appeared ;
|
|
static int64_t h_role ;
|
|
static int64_t h_studied;
|
|
static int64_t h_prize;
|
|
static int64_t h_finish;
|
|
static int64_t h_door;
|
|
static int64_t h_entrance;
|
|
static int64_t h_area;
|
|
static int64_t h_left ;
|
|
static int64_t h_right ;
|
|
static int64_t h_stare ;
|
|
static int64_t h_sea ;
|
|
static int64_t h_discount ;
|
|
static int64_t h_discounted ;
|
|
static int64_t h_www;
|
|
static int64_t h_gaze ;
|
|
static int64_t h_look ;
|
|
static int64_t h_looking;
|
|
static int64_t h_be ;
|
|
static int64_t h_determined ;
|
|
static int64_t h_call ;
|
|
static int64_t h_details;
|
|
static int64_t h_tba;
|
|
static int64_t h_avenue;
|
|
static int64_t h_ave;
|
|
static int64_t h_register;
|
|
static int64_t h_sign;
|
|
static int64_t h_up;
|
|
static int64_t h_signup;
|
|
static int64_t h_tickets;
|
|
static int64_t h_purchase;
|
|
static int64_t h_get;
|
|
static int64_t h_enroll;
|
|
static int64_t h_buy;
|
|
static int64_t h_presale ;
|
|
static int64_t h_pre ;
|
|
static int64_t h_sale ;
|
|
static int64_t h_on ;
|
|
static int64_t h_sales ;
|
|
static int64_t h_end ;
|
|
static int64_t h_begin ;
|
|
static int64_t h_start ;
|
|
static int64_t h_am;
|
|
static int64_t h_fm;
|
|
|
|
// . first identifies all the "Places" using the rules above
|
|
// . then clusters the "Places" together into an "Address"
|
|
// . we use the address at the top of the page, and the site contact info,
|
|
// etc. to be defaults, so we can inherit, city, state, etc. from those
|
|
// . returns false if blocked, true otherwise. sets g_errno on error.
|
|
bool Addresses::set ( Sections *sections ,
|
|
Words *words ,
|
|
Bits *bits ,
|
|
TagRec *gr ,
|
|
Url *url ,
|
|
int64_t docId ,
|
|
//char *coll ,
|
|
collnum_t collnum ,
|
|
int32_t domHash32 ,
|
|
int32_t ip ,
|
|
//int32_t tagPairHash ,
|
|
int32_t niceness ,
|
|
SafeBuf *pbuf ,
|
|
void *state ,
|
|
void (*callback) (void *state) ,
|
|
uint8_t contentType ,
|
|
// from XmlDoc::ptr_addressReply in a title rec
|
|
//char *addressReply ,
|
|
//int32_t addressReplySize ,
|
|
//bool addressReplyValid ,
|
|
char *siteTitleBuf ,
|
|
int32_t siteTitleBufSize ,
|
|
XmlDoc *xd ) {
|
|
|
|
reset();
|
|
|
|
// save stuff
|
|
m_xd = xd;
|
|
m_sections = sections;
|
|
m_words = words;
|
|
m_wptrs = words->m_words;
|
|
m_wlens = words->m_wordLens;
|
|
m_nw = words->m_numWords;
|
|
m_wids = words->getWordIds();
|
|
m_tids = words->getTagIds();
|
|
m_bits = bits;
|
|
m_gr = gr;
|
|
m_url = url;
|
|
m_docId = docId;
|
|
m_collnum = collnum;
|
|
m_domHash32 = domHash32;
|
|
m_ip = ip;
|
|
//m_tagPairHash = tagPairHash;
|
|
m_niceness = niceness;
|
|
m_pbuf = pbuf;
|
|
m_state = state;
|
|
m_callback = callback;
|
|
m_contentType = contentType;
|
|
|
|
//m_addressReply = addressReply;
|
|
//m_addressReplySize = addressReplySize;
|
|
//m_addressReplyValid = addressReplyValid;
|
|
|
|
m_siteTitleBuf = siteTitleBuf;
|
|
m_siteTitleBufSize = siteTitleBufSize;
|
|
|
|
m_sb.purge();
|
|
|
|
static bool s_setHashes = false;
|
|
if ( ! s_setHashes ) {
|
|
// flag it
|
|
s_setHashes = true;
|
|
// shortcuts
|
|
h_i = hash64n ("i");
|
|
h_court = hash64n ("court");
|
|
h_interstate = hash64n ("interstate");
|
|
h_page = hash64n ("page");
|
|
h_corner = hash64n ("corner");
|
|
h_between = hash64n ( "between");
|
|
h_btwn = hash64n ( "btwn");
|
|
h_bet = hash64n ( "bet");
|
|
h_streets = hash64n ( "streets");
|
|
h_sts = hash64n ( "sts");
|
|
h_at = hash64n ( "at" );
|
|
h_come = hash64n ("come");
|
|
h_is = hash64n ( "is" );
|
|
h_located = hash64n ( "located" );
|
|
h_intersection = hash64n("intersection");
|
|
h_law = hash64 ( "law" ,3);
|
|
h_address = hash64 ( "address",7);
|
|
h_added = hash64 ( "added",5);
|
|
h_copy = hash64 ( "copy",4);
|
|
h_search = hash64 ( "search",6);
|
|
h_find = hash64 ( "find",4);
|
|
h_go = hash64 ( "go",2);
|
|
h_town = hash64n ( "town");
|
|
h_city = hash64n ( "city");
|
|
h_street = hash64 ( "street",6);
|
|
h_telephone = hash64 ( "telephone",9);
|
|
h_tel = hash64 ( "tel",3);
|
|
h_ph = hash64 ( "ph",2);
|
|
h_fax = hash64 ( "fax",3);
|
|
h_where = hash64 ( "where",5);
|
|
h_location= hash64 ( "location",8);
|
|
h_venue = hash64n("venue");
|
|
h_map = hash64 ( "map" ,3);
|
|
h_office = hash64 ( "office" ,6);
|
|
h_center = hash64n ("center");
|
|
h_mailing = hash64 ( "mailing" ,7);
|
|
h_mail = hash64 ( "mail" ,4);
|
|
h_snail = hash64 ( "snail" ,5);
|
|
h_edit = hash64 ( "edit" ,4);
|
|
h_email = hash64 ( "email" ,5);
|
|
h_phone = hash64 ( "phone" ,5);
|
|
h_inc = hash64 ( "inc" ,3);
|
|
h_llc = hash64 ( "llc" ,3);
|
|
h_review = hash64 ( "review" ,6);
|
|
h_reviews = hash64 ( "reviews" ,7);
|
|
h_write = hash64 ( "write", 5);
|
|
h_add = hash64 ( "add",3 );
|
|
h_view = hash64 ( "view", 4);
|
|
h_favorites = hash64 ( "favorites", 9);
|
|
h_more = hash64 ( "more",4 );
|
|
h_info = hash64 ( "info",4 );
|
|
h_information = hash64 ( "information", 11);
|
|
h_the = hash64 ( "the" ,3);
|
|
h_in = hash64 ( "in" ,2);
|
|
h_a = hash64 ( "a" ,1);
|
|
h_paseo = hash64n ( "paseo");
|
|
h_de = hash64n ( "de");
|
|
h_del = hash64n ( "del");
|
|
h_all = hash64 ( "all" ,3);
|
|
h_rights = hash64 ( "rights" ,6);
|
|
h_reserved = hash64 ( "reserved" ,8);
|
|
h_contact = hash64 ( "contact" , 7);
|
|
h_us = hash64 ( "us" , 2);
|
|
h_by = hash64 ( "by" ,2);
|
|
h_of = hash64 ( "of" ,2);
|
|
h_for = hash64 ( "for" ,3);
|
|
h_arrangements = hash64("arrangements",12);
|
|
h_arranged = hash64("arranged",8);
|
|
h_sponsored = hash64("sponsored",9);
|
|
h_to = hash64 ( "to" ,2);
|
|
h_every = hash64 ( "every",5);
|
|
h_p = hash64 ( "p" ,1);
|
|
h_b = hash64n ( "b" );
|
|
h_hwy = hash64 ( "hwy" ,3);
|
|
h_state = hash64 ( "state" ,5);
|
|
h_county = hash64 ( "county" , 6 );
|
|
h_cnty = hash64 ( "cnty" , 4 );
|
|
h_cty = hash64 ( "cty" , 3 );
|
|
h_road = hash64 ( "road" ,4);
|
|
h_route = hash64 ( "route" ,5);
|
|
h_rte = hash64 ( "rte" ,3);
|
|
h_rt = hash64 ( "rt" ,2);
|
|
h_highway = hash64 ( "highway" ,7);
|
|
h_hiway = hash64 ( "hiway" ,5);
|
|
h_cr = hash64 ( "cr" ,2);
|
|
h_o = hash64 ( "o" ,1);
|
|
h_po = hash64 ( "po" ,2);
|
|
h_post = hash64 ( "post" ,4);
|
|
h_box = hash64 ( "box" ,3);
|
|
h_top = hash64n ( "top" );
|
|
h_one = hash64 ( "one" ,3);
|
|
h_noon = hash64n ( "noon" );
|
|
h_midnight = hash64n ( "midnight" );
|
|
h_daily = hash64n ( "daily" );
|
|
h_st = hash64 ( "st" ,2);
|
|
h_nd = hash64 ( "nd" ,2);
|
|
h_rd = hash64 ( "rd" ,2);
|
|
h_th = hash64 ( "th" ,2);
|
|
h_away = hash64 ( "away" ,4);
|
|
h_results = hash64 ( "results" , 7 );
|
|
h_days = hash64 ( "days", 4 );
|
|
h_blocks = hash64 ( "blocks",6);
|
|
h_block = hash64 ( "block",5);
|
|
h_miles = hash64 ( "miles",5);
|
|
h_mile = hash64n ( "mile");
|
|
h_year = hash64n("year");
|
|
h_years = hash64n("years");
|
|
h_yr = hash64n("yr");
|
|
h_yrs = hash64n("yrs");
|
|
h_hours = hash64 ( "hours",5);
|
|
h_hrs = hash64 ( "hrs",3);
|
|
h_hour = hash64n ( "hour");
|
|
h_hr = hash64n ( "hr");
|
|
h_mi = hash64 ( "mi",2);
|
|
h_kilometers= hash64 ( "kilometers",10);
|
|
h_km = hash64 ( "km",2);
|
|
h_copyright = hash64 ( "copyright",9);
|
|
h_and = hash64 ( "and" , 3 );
|
|
h_or = hash64 ( "or" , 2 );
|
|
h_suite = hash64 ( "suite",5);
|
|
h_ste = hash64 ( "ste",3);
|
|
h_bldg = hash64 ( "bldg",4);
|
|
h_bld = hash64n ( "bld");
|
|
h_building = hash64 ( "building",8);
|
|
h_unit = hash64 ( "unit",4);
|
|
h_room = hash64 ( "room",4);
|
|
h_pier = hash64 ( "pier",4);
|
|
h_rm = hash64 ( "rm",2);
|
|
h_run = hash64n ("run");
|
|
h_ne = hash64 ( "ne" ,2);
|
|
h_nw = hash64 ( "nw" ,2);
|
|
h_se = hash64 ( "se" ,2);
|
|
h_sw = hash64 ( "sw" ,2);
|
|
h_n = hash64 ( "n" ,1);
|
|
h_s = hash64 ( "s" ,1);
|
|
h_e = hash64 ( "e" ,1);
|
|
h_w = hash64 ( "w" ,1);
|
|
h_north = hash64n("north");
|
|
h_south = hash64n("south");
|
|
h_east = hash64n("east");
|
|
h_west = hash64n("west");
|
|
h_northeast = hash64n("northeast");
|
|
h_northwest = hash64n("northwest");
|
|
h_southeast = hash64n("southeast");
|
|
h_southwest = hash64n("southwest");
|
|
h_heart = hash64n ( "heart" );
|
|
h_core = hash64n ( "core" );
|
|
h_least = hash64n ( "least" );
|
|
h_most = hash64n ( "most" );
|
|
h_this = hash64n ( "this" );
|
|
h_north = hash64n ( "north" );
|
|
h_south = hash64n ( "south" );
|
|
h_east = hash64n ( "east" );
|
|
h_west = hash64n ( "west" );
|
|
h_appeared = hash64n ( "appeared" );
|
|
h_role = hash64n ( "role" );
|
|
h_studied = hash64n ( "studied" );
|
|
h_prize = hash64n ( "prize" );
|
|
h_finish = hash64n("finish");
|
|
h_door = hash64n("door");
|
|
h_entrance = hash64n("entrance");
|
|
h_area = hash64n("area");
|
|
h_left = hash64n ( "left" );
|
|
h_right = hash64n ( "right" );
|
|
h_stare = hash64n ( "stare" );
|
|
h_sea = hash64n ( "sea" );
|
|
h_discount = hash64n("discount");
|
|
h_discounted = hash64n("discounted");
|
|
h_www = hash64n("www");
|
|
h_gaze = hash64n ( "gaze" );
|
|
h_look = hash64n ( "look" );
|
|
h_looking = hash64n ( "looking" );
|
|
h_be = hash64n("be");
|
|
h_determined = hash64n("determined");
|
|
h_call = hash64n("call");
|
|
h_details = hash64n("details");
|
|
h_tba = hash64n("tba");
|
|
h_avenue = hash64n("avenue");
|
|
h_ave = hash64n("ave");
|
|
|
|
h_register = hash64n("register");
|
|
h_sign = hash64n("sign");
|
|
h_up = hash64n("up");
|
|
h_signup = hash64n("signup");
|
|
h_tickets = hash64n("tickets");
|
|
h_purchase = hash64n("purchase");
|
|
h_get = hash64n("get");
|
|
h_enroll = hash64n("enroll");
|
|
h_buy = hash64n("buy");
|
|
h_presale = hash64n("presale");
|
|
h_pre = hash64n("pre");
|
|
h_sale = hash64n("sale");
|
|
h_on = hash64n("on");
|
|
h_sales = hash64n("sales");
|
|
h_end = hash64n("end");
|
|
h_begin = hash64n("begin");
|
|
h_start = hash64n("start");
|
|
h_am = hash64n("am");
|
|
h_fm = hash64n("fm");
|
|
}
|
|
|
|
//m_msg2c.m_mcast.reset();
|
|
// sanity check -- did set2() corrupt our junk?
|
|
//if ( m_msg2c.m_mcast.m_ownMsg && m_msg2c.m_mcast.m_msgSize > 5000 ){
|
|
// char *xx=NULL;*xx=0; }
|
|
// returns false and sets g_errno on error
|
|
bool status = set2 ( );
|
|
// sanity check -- did set2() corrupt our junk?
|
|
//if ( m_msg2c.m_mcast.m_ownMsg && m_msg2c.m_mcast.m_msgSize > 5000 ){
|
|
// char *xx=NULL;*xx=0; }
|
|
// sanity check
|
|
if ( ! status && ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
// return true on error now
|
|
if ( ! status ) return true;
|
|
|
|
// . ok, go no further if from msg13
|
|
// . it will have to check m_good or something, not m_valid
|
|
if ( ! m_sections ) return true;
|
|
|
|
// if valid and empty, we are done
|
|
//if ( m_addressReplyValid && ! m_addressReply ) return true;
|
|
|
|
/*
|
|
-- mdw took this out because it had too many false positives. often
|
|
the place name 1 and/or 2 was wrong and was calling nonsense a
|
|
place! for many urls... and now that i removed the
|
|
SEC_CONTENDED_ADDRESS algo all the events on a page even if
|
|
different tag hashes, can share the same address. to replace
|
|
that algo i am ignoring events with SEC_TITLE_OUTLINKED if the
|
|
event title is an outlink to another page, and also i am trying
|
|
to identify all place names in events. this outlinked bit should
|
|
fix the http://www.zvents.com/albuquerque-nm/events/show/88543421-the-love-song-of-j-robert-oppenheimer-by-carson-kreitzer url, since it has a
|
|
little section that has "You may Also Like..." for events at
|
|
different venues, mentioned by name.
|
|
//
|
|
// . SELF-VERIFICATION LOOPS
|
|
//
|
|
// . now use the addresses that were inlined to verify those
|
|
// that were not inlined, assuming the place name matches
|
|
// . this will allow "The Filling Station" to be verified in
|
|
// http://www.zvents.com/albuquerque-nm/events/show/
|
|
// 88543421-the-love-song-of-j-robert-oppenheimer-by-carson-kreitzer
|
|
// . first scan the addresses for inlined ones
|
|
// . logic taken basically from hashForPlacedb()
|
|
//
|
|
// init the table
|
|
HashTableX pt;
|
|
// returns true with g_errno set on error
|
|
if ( ! pt.set ( 8,4,256,NULL,0,false,m_niceness) ) return true;
|
|
|
|
for ( int32_t i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
|
|
// get it
|
|
Address *a = &m_addresses[i];
|
|
// must be inlined
|
|
if ( ! ( a->m_flags & AF_INLINED ) ) continue;
|
|
// sometimes a street can exist in two cities or states
|
|
if ( a->m_flags & AF_AMBIGUOUS ) continue;
|
|
// must not have a place name in place of the street name
|
|
if ( a->m_street.m_flags2 & PLF2_IS_NAME ) continue;
|
|
// hash into table only if valid
|
|
int64_t h1 = a->m_name1.m_hash;
|
|
// adjust it since setHashes() xors in 0x123456 for street
|
|
// names that are actually place names in disguise
|
|
h1 ^= 0x123456;
|
|
// incorporate the adm1 and city and ctry
|
|
h1 = hash64 ( a->m_city.m_hash , h1 );
|
|
h1 = hash64 ( a->m_adm1.m_hash , h1 );
|
|
h1 = hash64 ( a->m_ctry.m_hash , h1 );
|
|
// put it in
|
|
if ( a->m_name1.m_strlen && ! pt.addKey ( (char *)&h1, &a ) )
|
|
return true;
|
|
// same for second place name
|
|
int64_t h2 = a->m_name2.m_hash;
|
|
// adjust it since setHashes() xors in 0x123456 for street
|
|
// names that are actually place names in disguise
|
|
h2 ^= 0x123456;
|
|
// incorporate the adm1 and city and ctry
|
|
h2 = hash64 ( a->m_city.m_hash , h2 );
|
|
h2 = hash64 ( a->m_adm1.m_hash , h2 );
|
|
h2 = hash64 ( a->m_ctry.m_hash , h2 );
|
|
// hash into table only if valid
|
|
if ( a->m_name2.m_strlen && ! pt.addKey ( (char *)&h2, &a ) )
|
|
return true;
|
|
}
|
|
|
|
// now scan our addresses that have a place name in place of
|
|
// the street name and see if we can get a match
|
|
for ( int32_t i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
|
|
// get it
|
|
Address *a = &m_addresses[i];
|
|
// we want a place name in place of the street name now
|
|
if ( ! ( a->m_street.m_flags2 & PLF2_IS_NAME ) ) continue;
|
|
// . USE the STREET here, not the name
|
|
// . it should already have had the 0x123456 xor'ed in
|
|
// in the logic below because PLF2_IS_NAME is set.
|
|
int64_t h1 = a->m_street.m_hash;
|
|
// incorporate the adm1 and city and ctry
|
|
h1 = hash64 ( a->m_city.m_hash , h1 );
|
|
h1 = hash64 ( a->m_adm1.m_hash , h1 );
|
|
h1 = hash64 ( a->m_ctry.m_hash , h1 );
|
|
// note it
|
|
//logf(LOG_DEBUG,"add: lookuphash=%"XINT64"",a->m_street.m_hash);
|
|
// test that
|
|
//if ( a->m_street.m_hash == 0x14a2446f2d5a2647LL ) {
|
|
// setHashes ( &a->m_street );
|
|
// logf(LOG_DEBUG,"Add: had=%"XINT64"",a->m_street.m_hash);
|
|
//}
|
|
// get hash of street, i.e. hash of name
|
|
// see if we have that in the table
|
|
int32_t slot = pt.getSlot ( &h1 );
|
|
// skip if not there
|
|
if ( slot < 0 ) continue;
|
|
// kewl, we got a match, get the matching address
|
|
Address *ma = *(Address **)pt.getValueFromSlot ( slot );
|
|
//
|
|
// . now use it, i.e. replace ourselves with its info
|
|
// . this logic is from above.
|
|
//
|
|
// shortcuts
|
|
Place *name1 = &a->m_name1;
|
|
Place *street = &a->m_street;
|
|
// street name was name1
|
|
gbmemcpy ( name1 , street , sizeof(Place) );
|
|
// and set the street to what it should be
|
|
street->m_str = ma->m_street.m_str;
|
|
street->m_strlen = ma->m_street.m_strlen;
|
|
// let it fly
|
|
a->m_flags |= AF_VERIFIED_STREET;
|
|
a->m_flags |= AF_VERIFIED_STREET_NUM;
|
|
// do not verify place name though!
|
|
a->m_flags |= AF_VERIFIED_PLACE_NAME_1;
|
|
// so set hashes makes its own words class
|
|
street->m_a = -1;
|
|
street->m_b = -1;
|
|
// clear these, since PLF2_IS_NAME should be clear for us!!
|
|
// otherwise it causes setHashes() function below to set
|
|
// our hash as if we were a place name!!!
|
|
street->m_flags2 = 0;
|
|
// compute the street hash
|
|
// Events.cpp relies on this to make substitutions to places
|
|
// that have verified place names
|
|
setHashes(street);
|
|
// and in case hashForPlacedb() is called on us we
|
|
// have to tell it to not hash us!! so put flag back!!
|
|
street->m_flags2 |= PLF2_IS_NAME;
|
|
}
|
|
// free mem
|
|
pt.reset();
|
|
//
|
|
// END SELF-VERIFICATION LOOPS
|
|
//
|
|
*/
|
|
|
|
// update status
|
|
if ( m_xd ) // && ! m_addressReplyValid )
|
|
m_xd->setStatus ( "consulting placedb" );
|
|
|
|
// make a msg2c first
|
|
try { m_msg2c = new (Msg2c); }
|
|
catch ( ... ) {
|
|
g_errno = ENOMEM;
|
|
log("addr: msg2c: new(%"INT32"): %s", (int32_t)sizeof(Msg2c),
|
|
mstrerror(g_errno));
|
|
// return true on error with g_errno set
|
|
return true;
|
|
}
|
|
mnew ( m_msg2c , sizeof(Msg2c) , "aamsg2c" );
|
|
|
|
// use niceness 0 if we are a turk injecting
|
|
/*
|
|
int32_t niceness2 = m_niceness;
|
|
if ( m_xd->m_oldsrValid &&
|
|
m_xd->m_oldsr.m_isInjecting &&
|
|
m_xd->m_oldsr.m_isPageInject )
|
|
niceness2 = 0;
|
|
|
|
if ( m_xd->m_oldsrValid &&
|
|
m_xd->m_oldsr.m_isInjecting &&
|
|
m_xd->m_oldsr.m_isPageReindex )
|
|
niceness2 = 0;
|
|
*/
|
|
|
|
// rather than look up stuff in placedb, if we have m_addressReply
|
|
// provided, then that data represents placedb when we first
|
|
// indexed this titleRec and we need to use that to ensure
|
|
// parsing consistency
|
|
if ( //! m_addressReplyValid &&
|
|
! m_msg2c->verifyAddresses ( this ,
|
|
m_collnum ,
|
|
m_domHash32 ,
|
|
m_ip ,
|
|
m_niceness ,
|
|
this ,
|
|
verifiedWrapper ) )
|
|
return false;
|
|
|
|
// . update addresses from the table
|
|
// . returns false and sets g_errno on error
|
|
updateAddresses ( );
|
|
// all done
|
|
return true;
|
|
}
|
|
|
|
void verifiedWrapper ( void *state ) {
|
|
// get us
|
|
Addresses *THIS = (Addresses *)state;
|
|
// update addresses from replies
|
|
if ( ! g_errno ) THIS->updateAddresses();
|
|
// try this now. return if it blocked
|
|
//if ( ! g_errno && ! THIS->getGeocoderLatLon() ) return;
|
|
// call callback
|
|
THIS->m_callback ( THIS->m_state );
|
|
}
|
|
|
|
// NOTE(review): file-scope Address pointer, initialized to NULL. it is not
// referenced anywhere in the visible portion of this file -- presumably a
// debugging aid like g_address; confirm before relying on it.
Address *g_aa = NULL;
|
|
|
|
// . return false with g_errno set on error
|
|
// . take the msg2c replies we got in m_sb.m_buf or in m_addressReply,
|
|
// which is a save of m_sb.m_buf in the titleRec (XmlDoc), and use
|
|
// those replies to set Address::m_flags bits.
|
|
// . also use those replies to update the place names in your addresses
|
|
// to verified place names
|
|
bool Addresses::updateAddresses ( ) {
|
|
// bail on error
|
|
if ( g_errno ) return false;
|
|
|
|
// sanity check - i think
|
|
//if
|
|
|
|
// loop over replies in the replyBuf
|
|
char *p = m_sb.getBufStart();
|
|
char *pend = p + m_sb.length();
|
|
|
|
// . but use this buffer from title rec if valid though
|
|
// . this will ensure parsing consistency
|
|
//if ( m_addressReplyValid ) {
|
|
// p = m_addressReply;
|
|
// pend = p + m_addressReplySize;
|
|
//}
|
|
|
|
// loop over the msg2c replies
|
|
for ( ; p < pend ; ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// parse this reply
|
|
int32_t addrNum = *(int32_t *)p; p += 4;
|
|
int32_t replySize = *(int32_t *)p; p += 4;
|
|
char *reply = p; p += replySize;
|
|
// sanity check
|
|
if ( addrNum >= m_am.getNumPtrs() ) { char *xx=NULL;*xx=0;}
|
|
if ( addrNum < 0 ) { char *xx=NULL;*xx=0;}
|
|
// skip if none!
|
|
if ( replySize == 0 ) continue;
|
|
// sanity check... why was this here? it was coring for
|
|
// a bunch of suites in 500 marquette ave.
|
|
//if ( replySize > 3000 ) { char *xx=NULL;*xx=0; }
|
|
if ( replySize > 5000 )
|
|
logf(LOG_DEBUG,"addr: got large addr reply of %"INT32" "
|
|
"bytes",replySize);
|
|
// sanity check
|
|
if ( replySize < 0 ) { char *xx=NULL;*xx=0; }
|
|
// sanity check
|
|
if ( p > pend ) { char *xx=NULL;*xx=0; }
|
|
// shortcut
|
|
Address *a = (Address *)m_am.getPtr(addrNum);
|
|
// make sure never got a reply for this
|
|
if ( a->m_flags & AF_GOT_REPLY ) { char *xx=NULL;*xx=0; }
|
|
// mark it
|
|
a->m_flags |= AF_GOT_REPLY;
|
|
|
|
// . parse it up
|
|
// . both reply types now have this same header
|
|
char *p = reply; // + 1;
|
|
// # of voters for the following lat/lon
|
|
int32_t numVotes = *(int32_t *)p; p += 4;
|
|
// then the lat lon
|
|
double lat = *(double *)p; p += sizeof(double);
|
|
double lon = *(double *)p; p += sizeof(double);
|
|
// sanity check
|
|
if ( p > reply + replySize ) { char *xx=NULL;*xx=0; }
|
|
// do not confuse with a->m_latitude/m_longitude
|
|
// because we do not want to re-serialize these back
|
|
// into the placedb record voting framework that
|
|
// would create some kind of feedback loop
|
|
a->m_importedLatitude = lat;
|
|
a->m_importedLongitude = lon;
|
|
a->m_importedVotes = numVotes;
|
|
|
|
// is the street really a place name (Tingley Coliseum)
|
|
char isName = ( a->m_street->m_flags2 & PLF2_IS_NAME );
|
|
// deal with normal case
|
|
if ( ! isName ) {
|
|
// must be one byte
|
|
//if ( replySize != 1 ) { char *xx=NULL;*xx=0; }
|
|
// or in the flags
|
|
a->m_flags |= *p; p++; // *reply;
|
|
// then the alternate placedb names
|
|
char *str = p;
|
|
// set end
|
|
char *replyEnd = reply + replySize;
|
|
// and now we have a list of score/names separated
|
|
// by \0's
|
|
a->m_placedbNames = str;
|
|
a->m_placedbNamesEnd = replyEnd;
|
|
// assume no best
|
|
a->m_bestPlacedbName = NULL;
|
|
// max score
|
|
int32_t max = 0;
|
|
// set the best one
|
|
for ( ; ; str += gbstrlen(str) + 1 ) {
|
|
// stop if that was it
|
|
if ( str >= replyEnd ) break;
|
|
// get score
|
|
int32_t vote = *(int32_t *)str;
|
|
// skip vote
|
|
str += 4;
|
|
// skip if not max
|
|
if ( vote <= max ) continue;
|
|
// set max
|
|
max = vote;
|
|
// got new max
|
|
a->m_bestPlacedbName = str;
|
|
}
|
|
// if no, best, make this null too
|
|
if ( ! a->m_bestPlacedbName ) a->m_placedbNames = NULL;
|
|
// all done integrating this reply
|
|
continue;
|
|
}
|
|
// if the address parser changes a lot of times the addrNum
|
|
// is incorrect, so really we should do it by the unique
|
|
// hash of the entire string
|
|
//if ( replySize == 1 ) {
|
|
// log("addr: addr num out of sync with addr data. "
|
|
// "addr parser change and was not versioned.");
|
|
// continue;
|
|
//}
|
|
//if ( replySize == 1 ) { char *xx=NULL;*xx=0; }
|
|
// parse out street from reply (name1;name2;suite;street;...)
|
|
char *sp = p; // reply;
|
|
// reset count
|
|
int32_t scount = 0;
|
|
char *replyEnd = reply+replySize;
|
|
// advance
|
|
for ( ; sp < replyEnd && scount < 3 ; sp++ )
|
|
if ( *sp == ';' ) scount++;
|
|
// crazy! must be the street
|
|
if ( ! *sp ) {
|
|
// print it out
|
|
log("addr: no street for %s",p);
|
|
//char *xx=NULL;*xx=0; }
|
|
g_errno = EBADENGINEER;
|
|
return false;
|
|
}
|
|
// get end
|
|
char *spend = sp;
|
|
// advance to next ;
|
|
for ( ; *spend && *spend != ';' ; spend++ );
|
|
// sanity check
|
|
if ( ! *spend ) {
|
|
// print it out
|
|
log("addr: no street end for %s",p);
|
|
//char *xx=NULL;*xx=0; }
|
|
g_errno = EBADENGINEER;
|
|
return false;
|
|
}
|
|
|
|
// shortcuts
|
|
//Place *name1 = a->m_name1;
|
|
//Place *street = a->m_street;
|
|
// now we just ptr swap
|
|
a->m_name1 = a->m_street;
|
|
// make that street reference this address then
|
|
// i guess we are supplanting the Place::m_address setting
|
|
// logic below here
|
|
a->m_name1->m_address = a;
|
|
// but we need a new street place
|
|
//if ( m_np >= MAX_PLACES ) { char *xx=NULL;*xx=0; }
|
|
Place *street = (Place *)m_pm.getMem(sizeof(Place));
|
|
if ( ! street ) return false;
|
|
a->m_street = street;
|
|
// street name was name1
|
|
//gbmemcpy ( name1 , street , sizeof(Place) );
|
|
// and set the street to what it should be
|
|
street->m_str = sp;
|
|
street->m_strlen = spend - sp;
|
|
// this means from placedb i guess... HACK!
|
|
street->m_bits |= PLF_FROMTAG;//|PLF_FROMTITLE;
|
|
// let it fly
|
|
a->m_flags |= AF_VERIFIED_STREET;
|
|
a->m_flags |= AF_VERIFIED_STREET_NUM;
|
|
a->m_flags |= AF_VERIFIED_PLACE_NAME_1;
|
|
// so set hashes makes its own words class
|
|
street->m_a = -1;
|
|
street->m_b = -1;
|
|
// clear these, since PLF2_IS_NAME should be clear for us!!
|
|
// otherwise it causes setHashes() function below to set
|
|
// our hash as if we were a place name!!!
|
|
street->m_flags2 = 0;
|
|
// fix this before doing hash, otherwise setHashes() is wrong
|
|
street->m_type = PT_STREET;
|
|
// compute the street hash
|
|
// Events.cpp relies on this to make substitutions to places
|
|
// that have verified place names
|
|
setHashes(street, m_words, m_niceness );
|
|
// and in case hashForPlacedb() is called on us we
|
|
// have to tell it to not hash us!! so put flag back!!
|
|
street->m_flags2 |= PLF2_IS_NAME;
|
|
// . what is this then??
|
|
// . we use this for setting the lat/lon, etc.
|
|
a->m_hash = getAddressHash ( a->m_street,
|
|
a->m_city,
|
|
a->m_adm1,
|
|
a->m_zip );
|
|
//if ( m_np < MAX_PLACES ) continue;
|
|
//log("addr: hit np limit");
|
|
//break;
|
|
}
|
|
|
|
Section **sp = m_sections->m_sectionPtrs;
|
|
//
|
|
// . auto verify place names if in <eventVenue> tag
|
|
// . supports injection of our xml format
|
|
//
|
|
for ( int32_t i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get address
|
|
Address *aa = (Address *)m_am.getPtr(i);
|
|
// get place name
|
|
Place *name1 = aa->m_name1;
|
|
// skip if none
|
|
if ( ! name1 ) continue;
|
|
// now we always set this so we can make it a turk
|
|
// venue candidate
|
|
name1->m_unverifiedAddress = aa;
|
|
// set this too!
|
|
if ( aa->m_name2 ) aa->m_name2->m_unverifiedAddress = aa;
|
|
// get word pos
|
|
int32_t a = name1->m_a;
|
|
// skip if not in doc
|
|
if ( a < 0 ) continue;
|
|
// get section its in
|
|
Section *ns = sp[a];
|
|
// go up if sentence or implied
|
|
for ( ; ns ; ns = ns->m_parent ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// need a tag
|
|
if ( m_tids[ns->m_a] ) break;
|
|
}
|
|
// stop if not in tag at all
|
|
if ( ! ns ) continue;
|
|
// get tag word then
|
|
a = ns->m_a;
|
|
// get tagid, must be xml
|
|
if ( m_tids[a] != TAG_XMLTAG ) continue;
|
|
// get tag name
|
|
if ( ! strncasecmp(m_wptrs[a],"<eventVenue",11) )
|
|
// it's a match!
|
|
aa->m_flags |= AF_VERIFIED_PLACE_NAME_1;
|
|
}
|
|
|
|
|
|
/*
|
|
// loop over all addresses
|
|
for ( int32_t i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
|
|
// get address
|
|
Address *a = &m_addresses[i];
|
|
// get the reply byte
|
|
char *replyFlags = (char *)m_avt.getValue(&a->m_avtKey);
|
|
// skip if not there
|
|
if ( ! replyFlags ) continue;
|
|
// grab em
|
|
a->m_flags |= *replyFlags;
|
|
// skip if not ambiguous
|
|
//if ( ! ( a->m_flags & AF_AMBIGUOUS ) ) continue;
|
|
// needs to have verified at least the street/city/ctry
|
|
//if ( ! ( a->m_flags & AF_VERIFIED_STREET ) ) continue;
|
|
// ok, remove the ambiguous flag
|
|
//a->m_flags &= ~AF_AMBIGUOUS;
|
|
}
|
|
*/
|
|
|
|
|
|
// . now re-set the AF_AMBIGUOUS flags
|
|
// . we do this again now that we have set a lot of Address::m_flags
|
|
// like AF_VERIFIED_PLACE_NAME_1 etc from the msg2c replies
|
|
// (or msg2c replies saved in the titleRec/XmlDoc)
|
|
setAmbiguousFlags();
|
|
|
|
// keep count of unique street hashes
|
|
int32_t count = 0;
|
|
// keep a table
|
|
char tmp[5000];
|
|
HashTableX ds; ds.set(8,0,300,tmp,5000,false,m_niceness,"addr-strhsh");
|
|
// count how many distinct street hashes we have
|
|
for ( int32_t i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get address
|
|
Address *a = (Address *)m_am.getPtr(i);//&m_addresses[i];
|
|
// get street hash
|
|
int64_t sh = a->m_street->m_hash;
|
|
// skip if already got
|
|
if ( ds.isInTable ( &sh ) ) continue;
|
|
// add it. return false if the add fails
|
|
if ( ! ds.addKey ( &sh ) ) return false;
|
|
// count it
|
|
count++;
|
|
}
|
|
// set it
|
|
m_uniqueStreetHashes = count;
|
|
|
|
// shortcuts
|
|
int32_t x , y;
|
|
wbit_t *bits = m_bits->m_bits;
|
|
unsigned char vflags = 0;
|
|
vflags |= AF_VERIFIED_STREET;
|
|
vflags |= AF_VERIFIED_PLACE_NAME_1;
|
|
vflags |= AF_VERIFIED_PLACE_NAME_2;
|
|
vflags |= AF_INLINED;
|
|
// now that we have verified the addresses, set the D_IS_IN_ADDRESS
|
|
// bit for those words in verified addresses... but only for
|
|
// words in verified portions or any portion of an inlined address
|
|
for ( int32_t i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get address
|
|
Address *a = (Address *)m_am.getPtr(i);//&m_addresses[i];
|
|
// must have something verified or be inlined
|
|
if ( ! ( a->m_flags & vflags ) ) continue;
|
|
// is it inlined
|
|
bool inlined = (a->m_flags & AF_INLINED);
|
|
// . even if inlined, if its a "fake" street it
|
|
// needs to be verified
|
|
// . fixes "RAFFLE ... Rio Rancho NM" for trumba.com which
|
|
// thought that "RAFFLE" was a "street" and we ended up
|
|
// setting D_IS_IN_ADDRESS for it, and then in Events.cpp
|
|
// it got demoted for being a title even though it was
|
|
// part of the actual event title!
|
|
if ( inlined && (a->m_street->m_flags2 & PLF2_IS_NAME) )
|
|
inlined = false;
|
|
// get flags
|
|
if ( inlined || (a->m_flags & AF_VERIFIED_STREET) ) {
|
|
// loop over words in street
|
|
x = a->m_street->m_a;
|
|
y = a->m_street->m_b;
|
|
if ( y > m_nw ) { char *xx=NULL;*xx=0; }
|
|
for ( ; x >= 0 && x < y ; x++ )
|
|
bits[x] |= D_IS_IN_ADDRESS;
|
|
}
|
|
// now all place names must be verified only to avoid
|
|
// false positives in the event title scoring algo
|
|
if ( a->m_name1 ){//(a->m_flags & AF_VERIFIED_PLACE_NAME_1) ) {
|
|
// loop over words in place name 1
|
|
x = a->m_name1->m_a;
|
|
y = a->m_name1->m_b;
|
|
// verified or not?
|
|
wbit_t af ;
|
|
if ( a->m_flags & AF_VERIFIED_PLACE_NAME_1 )
|
|
af = D_IS_IN_VERIFIED_ADDRESS_NAME;
|
|
else
|
|
af = D_IS_IN_UNVERIFIED_ADDRESS_NAME;
|
|
if ( y > m_nw ) { char *xx=NULL;*xx=0; }
|
|
if ( ! a->m_name1->m_str ) { x = 0; y = 0; }
|
|
for ( ; x >= 0 && x < y ; x++ )
|
|
bits[x] |= af;//D_IS_IN_VERIFIED_ADDRESS_NAME;
|
|
}
|
|
if ( (a->m_flags & AF_VERIFIED_PLACE_NAME_2) ) {
|
|
// loop over words in place name 2
|
|
x = a->m_name2->m_a;
|
|
y = a->m_name2->m_b;
|
|
if ( y > m_nw ) { char *xx=NULL;*xx=0; }
|
|
if ( ! a->m_name2->m_str ) { x = 0; y = 0; }
|
|
for ( ; x >= 0 && x < y ; x++ )
|
|
bits[x] |= D_IS_IN_VERIFIED_ADDRESS_NAME;
|
|
}
|
|
// suite
|
|
if ( a->m_suite ) {
|
|
x = a->m_suite->m_a;
|
|
y = a->m_suite->m_b;
|
|
if ( y > m_nw ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! a->m_suite->m_str ) { x = 0; y = 0; }
|
|
for ( ; x >= 0 && x < y ; x++ )
|
|
bits[x] |= D_IS_IN_ADDRESS;
|
|
}
|
|
|
|
// verified if anything was
|
|
if ( a->m_city ) {
|
|
x = a->m_city->m_a;
|
|
y = a->m_city->m_b;
|
|
if ( y > m_nw ) { char *xx=NULL;*xx=0; }
|
|
for ( ; x>= 0 && x < y ; x++ )
|
|
bits[x] |= D_IS_IN_ADDRESS;
|
|
}
|
|
|
|
if ( a->m_adm1 ) {
|
|
x = a->m_adm1->m_a;
|
|
y = a->m_adm1->m_b;
|
|
if ( y > m_nw ) { char *xx=NULL;*xx=0; }
|
|
for ( ; x >= 0 && x < y ; x++ )
|
|
bits[x] |= D_IS_IN_ADDRESS;
|
|
}
|
|
|
|
// zip
|
|
if ( a->m_zip ) {
|
|
x = a->m_zip->m_a;
|
|
y = a->m_zip->m_b;
|
|
if ( y > m_nw ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! a->m_zip->m_str ) { x = 0; y = 0; }
|
|
for ( ; x >= 0 && x < y ; x++ )
|
|
bits[x] |= D_IS_IN_ADDRESS;
|
|
}
|
|
}
|
|
|
|
/////////////////////////////////////
|
|
//
|
|
// hash the words in such address names into this hash table, name tble
|
|
//
|
|
/////////////////////////////////////
|
|
HashTableX nt1;
|
|
//HashTableX nt2;
|
|
HashTableX nt3;
|
|
char ntbuf1[5000];
|
|
//char ntbuf2[5000];
|
|
char ntbuf3[5000];
|
|
nt1.set ( 8,8,256,ntbuf1,5000,true,m_niceness,"addr-nt1");
|
|
//nt2.set ( 8,4,256,ntbuf2,5000,true,m_niceness);
|
|
nt3.set ( 8,4,256,ntbuf3,5000,true,m_niceness,"addr-nt3");
|
|
int32_t goodCount = 0;
|
|
// hash words of the addresses
|
|
for ( int32_t i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get address
|
|
Address *ad = (Address *)m_am.getPtr(i);//&m_addresses[i];
|
|
// is it inlined
|
|
bool inlined = (ad->m_flags & AF_INLINED);
|
|
// is its name verified?
|
|
bool vn1 = ( ad->m_flags & AF_VERIFIED_PLACE_NAME_1) ;
|
|
bool vn2 = ( ad->m_flags & AF_VERIFIED_PLACE_NAME_2) ;
|
|
bool vs = ( ad->m_flags & AF_VERIFIED_STREET);
|
|
|
|
// must be inlined or verified or after "at"
|
|
|
|
// add place name even if not verified, because if we match
|
|
// an unverified place name the alias must have its
|
|
// PLF2_AFTER_AT flag set, meaning it was after the word "at"
|
|
// so it is a lot less likely to be a false positive.
|
|
// this fixes the solstice seed swap url:
|
|
// events.sfgate.com/san-francisco-ca/events/show/
|
|
// 88884664-solstice-seed-swap because it was not allowing
|
|
// "exploratorium" to be an alias with the exploratorium
|
|
// inlined address because its place name was not verified.
|
|
// so down below we make sure to only allow such aliasing if
|
|
// the place name alias is "after an at"... so it is clearly
|
|
// a place name and not just menu cruft.
|
|
if ( ! inlined && ! vn1 && ! vn2 && ! vs ) continue;
|
|
|
|
// . i don't want aliases to a po box
|
|
// . fixes adobetheater.org which aliases
|
|
// "at the adobe theater" to the po box address at the
|
|
// bottom of the page because it is a better match than
|
|
// the placedbName "adobe theater" that we have as an
|
|
// alternative name for the non-pobox address...
|
|
if ( ad->m_street->m_flags2 & PLF2_IS_POBOX ) continue;
|
|
|
|
// do not add if ambiguous and known to be BAD city/state
|
|
if ( ad->m_flags3 & AF2_BADCITYSTATE ) continue;
|
|
|
|
// sometimes a street can exist in two cities or states
|
|
//if ( ad->m_flags & AF_AMBIGUOUS ) continue;
|
|
// count
|
|
goodCount++;
|
|
|
|
uint64_t v = ((uint64_t)((uint32_t)(PTRTYPE)ad)); // WRONG!MDW
|
|
|
|
// . hash place name 1
|
|
// . use "0" for the name number
|
|
if ( ad->m_name1 &&
|
|
! hashPlaceName (&nt1,
|
|
m_words,
|
|
ad->m_name1->m_a,
|
|
ad->m_name1->m_b,
|
|
v))
|
|
return false;
|
|
|
|
// use "1" for the name number
|
|
if ( ad->m_name2 &&
|
|
! hashPlaceName (&nt1,
|
|
m_words,
|
|
ad->m_name2->m_a,
|
|
ad->m_name2->m_b,
|
|
v| (1LL<<32) ) )
|
|
return false;
|
|
|
|
// hash the verified alternative names
|
|
char *s = ad->m_placedbNames;
|
|
char *send = ad->m_placedbNamesEnd;
|
|
uint64_t count = 2;
|
|
// scan them
|
|
for ( ; s && s < send ; count++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip score
|
|
s += 4;
|
|
// empty? strange...
|
|
if ( ! *s ) { char *xx=NULL;*xx=0; }
|
|
// hash that
|
|
Words tmp;
|
|
if ( ! tmp.set9 ( s, m_niceness ) ) return false;
|
|
int32_t nw = tmp.m_numWords;
|
|
if ( ! hashPlaceName (&nt1,&tmp,0,nw,v|(count<<32)) )
|
|
return false;
|
|
// skip that and the \0
|
|
s += gbstrlen(s) + 1;
|
|
}
|
|
|
|
// hash their street hash and street num hash
|
|
int64_t ch = 0;
|
|
ch ^= ad->m_street->m_hash;
|
|
ch ^= ad->m_street->m_streetNumHash;
|
|
ch ^= ad->m_street->m_streetIndHash;
|
|
if ( ! nt3.addKey ( &ch , &ad ) ) return false;
|
|
// hash the street as a name!
|
|
if ( ! nt3.addKey(&ad->m_street->m_wordHash64,&ad))
|
|
return false;
|
|
// . and exact name too for placedb verified names
|
|
// . it includes a xor'ed 0x123456 in its hash to distinguish
|
|
// from street names that are the same name
|
|
if ( vn1 && ! nt3.addKey ( &ad->m_name1->m_hash , &ad ) )
|
|
return false;
|
|
if ( vn2 && ! nt3.addKey ( &ad->m_name2->m_hash , &ad ) )
|
|
return false;
|
|
}
|
|
// . if we had no inlined or verified addresses, bail at this point
|
|
// . no, might be able to add some lat/lon only addresses below!
|
|
//if ( goodCount == 0 ) {
|
|
// // validate this
|
|
// m_numSorted = 0;
|
|
// m_sortedValid = true;
|
|
// return true;
|
|
//}
|
|
|
|
|
|
/////////////////////////////////////
|
|
//
|
|
// Lastly, set Street/Place::m_alias and m_address
|
|
//
|
|
// So now streets point to the inlined/verified address that uses them.
|
|
//
|
|
/////////////////////////////////////
|
|
|
|
// make the match table
|
|
char mtbuf[5000];
|
|
HashTableX mt;
|
|
mt.set ( 8,4,32,mtbuf,5000,true,m_niceness,"plmtchtbl");
|
|
|
|
//Section **sp = m_sections->m_sectionPtrs;
|
|
|
|
//
|
|
// no! scan the streets since maybe alias did not pair up with
|
|
// a city/adm1 and make it into the m_addresses[] array
|
|
//
|
|
for ( int32_t i = 0 ; i < m_sm.getNumPtrs() ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get it
|
|
Place *street = (Place *)m_sm.getPtr(i);
|
|
// skip if already has an address set from above in this func
|
|
if ( street->m_address ) continue;
|
|
// is it a name?
|
|
bool isName = street->m_flags2 & PLF2_IS_NAME ;
|
|
// if we are a street like "111 Maple SE" for panjea.org
|
|
// because it is listed twice! one time is inlined and the
|
|
// other is not!
|
|
if ( ! isName ) {
|
|
// make special hash
|
|
int64_t ch = 0;
|
|
ch ^= street->m_hash;
|
|
ch ^= street->m_streetNumHash;
|
|
ch ^= street->m_streetIndHash;
|
|
Address **pad = (Address **) nt3.getValue ( &ch );
|
|
if ( ! pad ) continue;
|
|
if ( (*pad)->m_street->m_a == street->m_a )
|
|
street->m_address = *pad;
|
|
else
|
|
street->m_alias = *pad;
|
|
continue;
|
|
}
|
|
// need a place name
|
|
//if ( ! isName ) continue;
|
|
|
|
// match name to name of address that was verified in placedb
|
|
Address **pad = (Address **) nt3.getValue ( &street->m_hash );
|
|
// sometimes what is really the street has isName set to
|
|
// true. we do not know its a street name in this context
|
|
// because it does not end in an indicator. but the address
|
|
// we are trying to alias to it does end in an indicator
|
|
// or in a city/state. like santafe.org for
|
|
// "1160 Camino Cruz Blanca". it is used twice on the page.
|
|
// the first time it is clearly a street, the 2nd time is
|
|
// why we are doing this! Same for "705 Camino Lejo" on
|
|
// that page as well!
|
|
if ( ! pad ) {
|
|
pad =(Address **) nt3.getValue (&street->m_wordHash64);
|
|
// are we a street address ourself?
|
|
if ( pad ) {
|
|
street->m_alias = *pad;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
|
|
if ( pad &&
|
|
(*pad)->m_name1 &&
|
|
(*pad)->m_name1->m_a == street->m_a ) {
|
|
street->m_address = *pad;
|
|
continue;
|
|
}
|
|
if ( pad &&
|
|
(*pad)->m_name2 &&
|
|
(*pad)->m_name2->m_a == street->m_a ) {
|
|
street->m_address = *pad;
|
|
continue;
|
|
}
|
|
|
|
|
|
// . and make it after at i guess
|
|
// . no we need "Explora" as an alias too!
|
|
// . no! for "santa fe playhouse" it is not preceded by an at
|
|
// ... so i hope commenting this out is ok
|
|
//if ( ! afterAt ) continue;
|
|
// grabs its name
|
|
int32_t a = street->m_a;
|
|
int32_t b = street->m_b;
|
|
// . are we after at?
|
|
// . this also includes being after "location: " and some
|
|
// other strong place indicators
|
|
bool afterAt = street->m_flags2 & PLF2_AFTER_AT ;
|
|
// reset mt
|
|
mt.clear();
|
|
// count its words
|
|
int32_t need = 0;
|
|
// scan its words
|
|
for ( int32_t k = a ; k < b ; k++ ) {
|
|
// skip if not word
|
|
if ( ! m_wids[k] ) continue;
|
|
// . we do not need to match an initial the
|
|
// . fix for aliasing "The Adobe Theater" to
|
|
// "Adobe Theater" for adobetheater.org
|
|
if ( need == 0 && m_wids[k] == h_the ) continue;
|
|
// count it
|
|
need++;
|
|
// get possible candidates
|
|
int32_t slot1 = nt1.getSlot ( &m_wids[k] );
|
|
// if no match, forget it! we need to match
|
|
// all our words
|
|
//if ( slot1 < 0 ) break;
|
|
// loop
|
|
for(;slot1>=0;slot1=nt1.getNextSlot(slot1,&m_wids[k])){
|
|
// get the value
|
|
uint64_t val =
|
|
*(uint64_t *)nt1.getValueFromSlot(slot1);
|
|
// lower 32 bits is the address ptr
|
|
Address *cand = (Address *)(val & 0xffffffff);
|
|
// upper 32 bits is the name number
|
|
int32_t nn = (val >> 32);
|
|
// sanity check
|
|
if ( nn < 0 ) { char *xx=NULL;*xx=0; }
|
|
if ( nn > 10000 ) { char *xx=NULL;*xx=0; }
|
|
// get street flags
|
|
pflags_t sf = cand->m_street->m_flags2;
|
|
// if name number is 0, then place name 1 must
|
|
// be verified or at least "after at"
|
|
if ( nn==0 &&
|
|
!(sf&AF_VERIFIED_PLACE_NAME_1)&&
|
|
!afterAt )
|
|
continue;
|
|
// same goes for place name 2
|
|
if ( nn==1 &&
|
|
!(sf&AF_VERIFIED_PLACE_NAME_2)&&
|
|
!afterAt )
|
|
continue;
|
|
|
|
// other nn's are place names with 2+ votes
|
|
// from placedb in Address::m_placedbNames
|
|
// so let them ride.
|
|
|
|
// store in match table, add one point
|
|
if(!mt.addTerm((int64_t *)&val))return false;
|
|
}
|
|
}
|
|
|
|
// scan match table for best matches
|
|
int32_t dups = 0;
|
|
Address *best = NULL;
|
|
int32_t bestScore = 0;
|
|
Section *bestContainer = NULL;
|
|
int32_t bestnn = -1;
|
|
// shortcut
|
|
char vmask1 = 0;
|
|
vmask1 |= AF_VERIFIED_PLACE_NAME_1;
|
|
vmask1 |= AF_VERIFIED_PLACE_NAME_2;
|
|
vmask1 |= AF_VERIFIED_STREET;
|
|
for ( int32_t y = 0 ; y < mt.m_numSlots ; y++ ) {
|
|
// skip if empty bucket/slot
|
|
if ( ! mt.m_flags[y] ) continue;
|
|
// get score
|
|
int32_t score = mt.getScoreFromSlot ( y );
|
|
// need to match all of our words
|
|
if ( score < need ) continue;
|
|
// skip if not max
|
|
//if ( score < max ) continue;
|
|
// get the address ptr that has this score
|
|
//Address *matcher = *( Address **)mt.getKey ( y );
|
|
uint64_t v = *(uint64_t *)mt.getKey ( y );
|
|
// get name number
|
|
int32_t nn = v>>32;
|
|
// sanity check
|
|
if ( nn < 0 || nn > 10000 ) { char *xx=NULL;*xx=0; }
|
|
// get matching address
|
|
Address *matcher = (Address *)(v & 0xffffffff);
|
|
|
|
// get our alias section
|
|
Section *ads = sp[street->m_a];//ad->m_section;
|
|
|
|
// . telescope our alias up
|
|
// . see which address it hits first, "best" or
|
|
// "matcher"
|
|
// . if it hits both at the same time then it is
|
|
// ambiguous and we can't make a decision
|
|
// . keep telescoping out matcher until it contains
|
|
// the alias
|
|
Section *sm = matcher->m_section;
|
|
for ( ; sm ; sm = sm->m_parent )
|
|
if ( sm->contains ( ads ) ) break;
|
|
|
|
// we got one, or tied for max
|
|
if ( ! best ) {
|
|
bestScore = score;
|
|
best = matcher;
|
|
bestContainer = sm;
|
|
bestnn = nn;
|
|
continue;
|
|
}
|
|
|
|
// if our container is smaller we win!
|
|
if ( bestContainer->contains ( sm ) ) {
|
|
bestScore = score;
|
|
best = matcher;
|
|
bestContainer = sm;
|
|
dups = 0;
|
|
bestnn = nn;
|
|
continue;
|
|
}
|
|
|
|
// if we contain him, he stays winning
|
|
if ( sm->contains ( bestContainer ) )
|
|
continue;
|
|
|
|
//
|
|
// otherwise we are brothers or in the same section
|
|
//
|
|
|
|
// if it is a dup of the best just ignore it
|
|
if ( matcher->m_street->m_hash ==
|
|
best->m_street->m_hash &&
|
|
matcher->m_street->m_streetNumHash ==
|
|
best->m_street->m_streetNumHash &&
|
|
matcher->m_street->m_streetIndHash ==
|
|
best->m_street->m_streetIndHash )
|
|
continue;
|
|
|
|
// ok, it is a tie! we won't be able to alias him!
|
|
dups++;
|
|
}
|
|
// if winner is ambiguous, this address, "ad", has no alias
|
|
if ( dups ) continue;
|
|
// or if no winner
|
|
if ( ! best ) continue;
|
|
|
|
// . trumba.com had an address like
|
|
// "Aztec, NM<br />398 S Light Plant Rd, Aztec, NM 87410-1826"
|
|
// and then referred to NM below, and we thought it was
|
|
// an alias for that address!
|
|
// . BUT it turns out that when i fixed the bug above for
|
|
// incorrectly checking to make sure that matching places
|
|
// had verified place name 1 or 2, then that fixed this bug,
|
|
// but if the place name had the word "NM" or "Aztec" in it
|
|
// AND was verified, i would expect us to need this code
|
|
// so let's make sure we are "after at" if only doing a
|
|
// partial alias
|
|
if ( ! afterAt ) {
|
|
// get alnum words in best
|
|
//int32_t aw1 = 0;
|
|
//int32_t aw2 = 0;
|
|
Place *n1 = best->m_name1;
|
|
Place *n2 = best->m_name2;
|
|
//if ( n1 ) aw1 = n1->m_alnumB - n1->m_alnumA;
|
|
//if ( n2 ) aw2 = n2->m_alnumB - n2->m_alnumA;
|
|
// crap, what if we matched a str in m_placedbName,
|
|
// we don't know which one we matched! yes we do,
|
|
// its # "nn-2" in the string
|
|
char *ps = NULL;
|
|
int32_t pslen;
|
|
if ( bestnn == 0 ) {ps=n1->m_str; pslen=n1->m_strlen;}
|
|
if ( bestnn == 1 ) {ps=n2->m_str; pslen=n2->m_strlen;}
|
|
// subtract
|
|
bestnn -= 2;
|
|
// otherwise, gotta cycle
|
|
char *s = best->m_placedbNames;
|
|
char *send = best->m_placedbNamesEnd;
|
|
// scan them and set "aw"
|
|
for ( ; bestnn>= 0 && s && s < send ; bestnn-- ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip score
|
|
s += 4;
|
|
// point to it
|
|
char *wp = s;
|
|
// get this
|
|
int32_t slen = gbstrlen(s);
|
|
// skip that and the \0
|
|
s += slen + 1 ;
|
|
// skip if not 0
|
|
if ( bestnn > 0 ) continue;
|
|
// set the process string
|
|
ps = wp;
|
|
pslen = slen;
|
|
// and break for processing
|
|
break;
|
|
}
|
|
|
|
// make into word array
|
|
Words tmp;
|
|
if ( ! tmp.setx (ps,pslen,m_niceness)) return false;
|
|
// count the alnumwords, but ignore "the"
|
|
int32_t aw = 0;
|
|
for (int32_t x=0;x<tmp.m_numWords;x++) {
|
|
if ( ! tmp.m_wordIds[x] ) continue;
|
|
if ( tmp.m_wordIds[x] == h_the) continue;
|
|
aw++;
|
|
}
|
|
|
|
bool fullMatch = false;
|
|
if ( aw == need ) fullMatch = true;
|
|
if ( ! fullMatch ) continue;
|
|
}
|
|
|
|
// shortcut
|
|
char vmask2 = 0;
|
|
vmask2 |= AF_VERIFIED_PLACE_NAME_1;
|
|
vmask2 |= AF_VERIFIED_PLACE_NAME_2;
|
|
vmask2 |= AF_VERIFIED_STREET;
|
|
Address *ak = NULL;
|
|
// might not be ordered by position
|
|
int32_t k = 0;
|
|
// get the min position right above us
|
|
int32_t abovePos = -1;
|
|
Address *above = NULL;
|
|
int32_t belowPos = -1;
|
|
Address *below = NULL;
|
|
// now the winner must also be the first verified address
|
|
// above or below us!!!
|
|
for ( k = 0 ; k < m_am.getNumPtrs() ; k++ ) {
|
|
// get it
|
|
ak = (Address *)m_am.getPtr(k);//&m_addresses[k];
|
|
// ignore if a place name
|
|
if ( ak->m_street->m_flags2 & PLF2_IS_NAME )
|
|
continue;
|
|
// skip if not inlined or verified
|
|
bool inlined = (ak->m_flags & AF_INLINED);
|
|
// is its name verified?
|
|
bool verified = ( ak->m_flags & vmask2);
|
|
// skip if not either!
|
|
if ( ! inlined && ! verified ) continue;
|
|
// ignore if after us, must be ABOVE us since we
|
|
// are referencing it as an alias
|
|
if ( ak->m_street->m_a < a ) {
|
|
// skip if doesn't beat the current "above" one
|
|
if ( ak->m_street->m_a <= abovePos ) continue;
|
|
// set it
|
|
above = ak;
|
|
abovePos = ak->m_street->m_a;
|
|
continue;
|
|
}
|
|
// ok, below winner?
|
|
// skip if doesn't beat the current "below" one
|
|
if ( belowPos >= 0 &&
|
|
ak->m_street->m_a >= belowPos ) continue;
|
|
// set it
|
|
below = ak;
|
|
belowPos = ak->m_street->m_a;
|
|
|
|
}
|
|
// skip if not one before us
|
|
if ( ! above && ! below ) continue;
|
|
|
|
// try "above"
|
|
if ( above ) {
|
|
// skip if not a match with the winner, "best"
|
|
if ( best ->m_street->m_hash !=
|
|
above->m_street->m_hash )
|
|
above = NULL;
|
|
if ( above &&
|
|
best ->m_street->m_streetNumHash !=
|
|
above->m_street->m_streetNumHash )
|
|
above = NULL;
|
|
if ( above &&
|
|
best ->m_street->m_streetIndHash !=
|
|
above->m_street->m_streetIndHash )
|
|
above = NULL;
|
|
}
|
|
|
|
// try "below"
|
|
if ( below ) {
|
|
// skip if not a match with the winner, "best"
|
|
if ( best ->m_street->m_hash !=
|
|
below->m_street->m_hash )
|
|
below = NULL;
|
|
if ( below &&
|
|
best ->m_street->m_streetNumHash !=
|
|
below->m_street->m_streetNumHash )
|
|
below = NULL;
|
|
if ( below &&
|
|
best ->m_street->m_streetIndHash !=
|
|
below->m_street->m_streetIndHash )
|
|
below = NULL;
|
|
}
|
|
|
|
// pick the non null one
|
|
if ( ! above && ! below ) continue;
|
|
|
|
// ok, use him as our alias
|
|
if ( above ) street->m_alias = above;
|
|
else if ( below ) street->m_alias = below;
|
|
}
|
|
|
|
Place *prev = NULL;
|
|
////////////////////////////////
|
|
//
|
|
// set m_alias for intersections
|
|
//
|
|
////////////////////////////////
|
|
for ( int32_t i = 0 ; i < m_sm.getNumPtrs() ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get it
|
|
Place *street = (Place *)m_sm.getPtr(i);
|
|
// if intersection, check if alias of prev street
|
|
if ( ! ( street->m_flags2 & PLF2_INTERSECTION ) ) {
|
|
// update this so its a real street always
|
|
prev = street;
|
|
continue;
|
|
}
|
|
// if we are actually in an address like
|
|
// "CORNER OF HWY 64& HWY 38\0 EAGLE NEST, NM 87718"
|
|
// then skip it as well!
|
|
if ( street->m_address ) continue;
|
|
// try next street
|
|
Place *next = NULL;
|
|
//Place *prev = NULL;
|
|
// if we can get it, get it
|
|
//if ( i - 1 >= 0 ) prev = &m_streets[i-1];
|
|
if ( i + 1 < m_sm.getNumPtrs() )
|
|
next = (Place *)m_sm.getPtr(i+1);
|
|
// ignore if also intersection
|
|
if ( prev && (prev->m_flags2 & PLF2_INTERSECTION)) prev = NULL;
|
|
if ( next && (next->m_flags2 & PLF2_INTERSECTION)) next = NULL;
|
|
// try prev first
|
|
Place *first = prev;
|
|
// declare up here
|
|
int32_t a;
|
|
int32_t b;
|
|
bool good;
|
|
int64_t commonIds[32];
|
|
int32_t nc;
|
|
// loop over both
|
|
subloop:
|
|
// need a street above us to be alias of
|
|
if ( ! first ) goto done;
|
|
// must be an address
|
|
if ( !first->m_address && !first->m_alias ) goto done;
|
|
// must match up
|
|
a = first ->m_b;
|
|
b = street->m_a;
|
|
// swap em
|
|
|
|
// forget it if too big
|
|
if ( b - a > 200 ) continue;
|
|
// scan to make sure only good words in between
|
|
int32_t j; for ( j = a ; j < b ; j++ ) {
|
|
// skip if not wid
|
|
if ( ! m_wids[j] ) continue;
|
|
// must be special word
|
|
if ( m_wids[j] == h_of ) continue;
|
|
if ( m_wids[j] == h_at ) continue;
|
|
if ( m_wids[j] == h_intersection ) continue;
|
|
if ( m_wids[j] == h_corner ) continue;
|
|
if ( m_wids[j] == h_sw ) continue;
|
|
if ( m_wids[j] == h_ne ) continue;
|
|
if ( m_wids[j] == h_nw ) continue;
|
|
if ( m_wids[j] == h_se ) continue;
|
|
break;
|
|
}
|
|
// set if good - if only words we permit in between
|
|
good = (j >= b);
|
|
|
|
// if that failed we could still succeed by containing
|
|
// a street name in common!
|
|
if ( ! good ) {
|
|
nc = getCommonWordIds ( street->m_a ,
|
|
street->m_b ,
|
|
first->m_a ,
|
|
first->m_b ,
|
|
m_wids ,
|
|
commonIds ,
|
|
32 ,
|
|
m_niceness );
|
|
for ( int32_t k = 0 ; k < nc ; k++ ) {
|
|
// get it
|
|
int64_t cid = commonIds[k];
|
|
// skip if indicator, must be non-indicator
|
|
IndDesc *id;
|
|
id = (IndDesc *)g_indicators.getValue(&cid);
|
|
if ( id ) continue;
|
|
// that is good enough!
|
|
good = true;
|
|
break;
|
|
}
|
|
}
|
|
// if it was not an alias, go on to next place
|
|
if ( ! good ) goto done;
|
|
// assign our m_alias
|
|
if ( first->m_address )
|
|
street->m_alias = first->m_address;
|
|
else if ( first->m_alias )
|
|
street->m_alias = first->m_alias;
|
|
continue;
|
|
done:
|
|
// give up if really done
|
|
if ( first == next ) continue;
|
|
// try next now
|
|
first = next;
|
|
goto subloop;
|
|
}
|
|
|
|
////////////////////////////////
|
|
//
|
|
// set m_alias for intersections more loosely
|
|
//
|
|
////////////////////////////////
|
|
//
|
|
// fixes "14th and Curtis, Denver CO" on denver.org
|
|
// which is a proper address and has the full address next to it
|
|
//
|
|
for ( int32_t i = 0 ; i < m_sm.getNumPtrs() ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get it
|
|
Place *street = (Place *)m_sm.getPtr(i);
|
|
// if intersection, check if alias of prev street
|
|
if ( ! ( street->m_flags2 & PLF2_INTERSECTION ) ) {
|
|
// update this so its a real street always
|
|
prev = street;
|
|
continue;
|
|
}
|
|
// must be full address for this algo
|
|
if ( ! street->m_address ) continue;
|
|
// try next street
|
|
Place *next = NULL;
|
|
// if we can get it, get it
|
|
if ( i + 1 < m_sm.getNumPtrs() )
|
|
next = (Place *)m_sm.getPtr(i+1);
|
|
// ignore if also intersection
|
|
if ( prev && (prev->m_flags2 & PLF2_INTERSECTION)) prev = NULL;
|
|
if ( next && (next->m_flags2 & PLF2_INTERSECTION)) next = NULL;
|
|
// try prev first
|
|
Place *first = prev;
|
|
if ( ! first ) first = next;
|
|
if ( ! first ) continue;
|
|
|
|
subloop2:
|
|
|
|
char cmpbuf[1024];
|
|
HashTableX cmp;
|
|
cmp.set(8,0,32,cmpbuf,1024,false,m_niceness,"strtcmp");
|
|
// see if matches one non-indicator in street
|
|
for ( int32_t j = first->m_a ; j < first->m_b ; j++ ) {
|
|
// get it
|
|
int64_t h = m_wids[j];
|
|
// skip punct
|
|
if ( ! h ) continue;
|
|
// skip if indicator
|
|
if ( g_indicators.isInTable(&h) ) continue;
|
|
// hash it otherwise
|
|
if ( ! cmp.addKey(&h) ) return false;
|
|
}
|
|
// assume intersection does not match any words
|
|
bool matched = false;
|
|
// now compare to our intersection streets
|
|
for ( int32_t j = street->m_a ; j < street->m_b ; j++ ) {
|
|
// get it
|
|
int64_t h = m_wids[j];
|
|
// skip punct
|
|
if ( ! h ) continue;
|
|
// skip if indicator
|
|
if ( g_indicators.isInTable(&h) ) continue;
|
|
// hash it otherwise
|
|
if ( ! cmp.isInTable(&h) ) continue;
|
|
// got a match!
|
|
matched = true;
|
|
// all done
|
|
break;
|
|
}
|
|
// if no match, forget the alias
|
|
if ( ! matched ) {
|
|
// give up if really done
|
|
if ( first == next ) continue;
|
|
// or if next is NULL
|
|
if ( ! next ) continue;
|
|
// try next now
|
|
first = next;
|
|
goto subloop2;
|
|
}
|
|
// it matched!
|
|
if ( first->m_address )
|
|
street->m_alias = first->m_address;
|
|
else if ( first->m_alias )
|
|
street->m_alias = first->m_alias;
|
|
}
|
|
|
|
|
|
////////////////////////////////
|
|
//
|
|
// set D_IS_IN_ADDRESS[_NAME] for places that alias an address
|
|
//
|
|
////////////////////////////////
|
|
|
|
// . now scan the places. if not in an address, but aliases one then
|
|
// we need to set D_IS_IN_ADDRESS[_NAME] for it...
|
|
// . this fixes the aliased streets and names in ceder.net from
|
|
// being event titles...
|
|
for ( int32_t i = 0 ; i < m_sm.getNumPtrs() ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get it
|
|
Place *street = (Place *)m_sm.getPtr(i);//&m_streets[i];
|
|
// skip if no alias
|
|
Address *alias = street->m_alias;
|
|
if ( ! alias ) continue;
|
|
// is it a name?
|
|
bool isName = street->m_flags2 & PLF2_IS_NAME ;
|
|
// if a street, set this
|
|
wbit_t flag;
|
|
if ( isName ) flag = D_IS_IN_VERIFIED_ADDRESS_NAME;
|
|
else flag = D_IS_IN_ADDRESS;
|
|
// set bits for alias
|
|
int32_t x = street->m_a;
|
|
int32_t y = street->m_b;
|
|
if ( y > m_nw ) { char *xx=NULL;*xx=0; }
|
|
for ( ; x >= 0 && x < m_nw && x < y ; x++ )
|
|
bits[x] |= flag;
|
|
}
|
|
|
|
|
|
////////////////////////////////
|
|
//
|
|
// set m_numNonDupAddresses
|
|
//
|
|
////////////////////////////////
|
|
m_numNonDupAddresses = 0;
|
|
for ( int32_t i = 0 ; i < m_am.getNumPtrs() - 1 ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get it
|
|
Address *aa = (Address *)m_am.getPtr(i);//&m_addresses[i];
|
|
// get street position
|
|
int32_t a = aa->m_street->m_a;
|
|
// sanity check
|
|
if ( a < 0 ) continue;
|
|
// get section
|
|
Section *ss = sp[a];
|
|
// skip if dup
|
|
//if ( ss->m_flags & SEC_DUP ) continue;
|
|
if ( ss->m_votesForDup > 0 ) continue;
|
|
// count it otherwise
|
|
m_numNonDupAddresses++;
|
|
}
|
|
|
|
///////////////////////////////
|
|
//
|
|
// set Address::m_flags AF_VENUE_DEFAULT bit
|
|
//
|
|
///////////////////////////////
|
|
|
|
m_numVenues = 0;
|
|
// what are the addresses of this website? (assuming this website
|
|
// is essentially the website of a venue or physical place)
|
|
for ( int32_t i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get address
|
|
Address *ad = (Address *)m_am.getPtr(i);//&m_addresses[i];
|
|
// is its name verified?
|
|
bool vn1 = (ad->m_flags & AF_VERIFIED_PLACE_NAME_1) ;
|
|
bool vn2 = (ad->m_flags & AF_VERIFIED_PLACE_NAME_2) ;
|
|
// we might have some alternative verified names too!
|
|
bool vn3 = (bool) ad->m_bestPlacedbName;
|
|
// must be inlined or verified
|
|
if ( ! vn1 && ! vn2 && ! vn3 ) continue;
|
|
// if address used the dc[] array that consist of elements
|
|
// from the venue tag in tagdb, then do not add it back
|
|
// to tagdb
|
|
//bool add = true;
|
|
//if ( ad->m_street->m_a < 0 ) add = false;
|
|
// see if its place name 1 is in the siteTitleBuf
|
|
char *p1 = NULL;
|
|
char *p2 = NULL;
|
|
if ( vn1 && ad->m_name1 ) p1 = ad->m_name1->m_str;
|
|
if ( vn2 && ad->m_name2 ) p2 = ad->m_name2->m_str;
|
|
// temp null term
|
|
char c1;
|
|
char c2;
|
|
int32_t plen1;
|
|
int32_t plen2;
|
|
if ( p1 ) plen1 = ad->m_name1->m_strlen;
|
|
if ( p2 ) plen2 = ad->m_name2->m_strlen;
|
|
char *saved1 = NULL;
|
|
char *saved2 = NULL;
|
|
if ( p1 ) saved1 = &p1[plen1];
|
|
if ( p2 ) saved2 = &p2[plen2];
|
|
if ( p1 ) { c1 = *saved1; *saved1 = 0; }
|
|
if ( p2 ) { c2 = *saved2; *saved2 = 0; }
|
|
// . skip "the"
|
|
// . fixes "the adobe theater" in title and "adobe theater"
|
|
// being the verified place name for adobetheater.org
|
|
if ( p1 && strncasecmp(p1,"the ",4) == 0 ) p1 += 4;
|
|
if ( p2 && strncasecmp(p2,"the ",4) == 0 ) p2 += 4;
|
|
// scan m_siteTitleBuf for either p1 or p2
|
|
char *d = m_siteTitleBuf;
|
|
char *dend = m_siteTitleBuf + m_siteTitleBufSize;
|
|
// loop over the \0 delimited list of titles
|
|
for ( ; d < dend ; d += gbstrlen(d) + 1 ) {
|
|
// skip "the"
|
|
if ( strncasecmp(d,"the ",4) == 0 ) d += 4;
|
|
// compare
|
|
bool match = false;
|
|
if ( p1 && gb_strcasestr ( d , p1 ) ) match = true;
|
|
if ( p2 && gb_strcasestr ( d , p2 ) ) match = true;
|
|
// loop over all possible alternative placedb names
|
|
// that have 2 or more votes as well
|
|
char *s = ad->m_placedbNames;
|
|
for ( ; s && s<ad->m_placedbNamesEnd;s+=gbstrlen(s)+1){
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip score of 4 bytes
|
|
s += 4;
|
|
// skip "the"
|
|
if ( strncasecmp(s,"the ",4)==0) s += 4;
|
|
// compare
|
|
if ( ! gb_strcasestr(d,s) ) continue;
|
|
// got a match
|
|
match = true;
|
|
// stop
|
|
break;
|
|
}
|
|
// go to next title if no match
|
|
if ( ! match ) continue;
|
|
// we got a match!
|
|
ad->m_flags |= AF_VENUE_DEFAULT;
|
|
// count it
|
|
m_numVenues++;
|
|
// done
|
|
break;
|
|
}
|
|
if ( saved1 ) *saved1 = c1;
|
|
if ( saved2 ) *saved2 = c2;
|
|
}
|
|
|
|
//int32_t imax = m_nw;
|
|
// skip if no streets... no might add a lat/lon "street" below
|
|
//if ( m_sm.getNumPtrs() <= 0 ) imax = 0;
|
|
|
|
/////////
|
|
//
|
|
// we gotta call this twice. once here and once below
|
|
//
|
|
/////////
|
|
if ( ! setFirstPlaceNums() ) return false;
|
|
|
|
/////////////////////////////
|
|
//
|
|
// scan for lat/lon coordinates
|
|
//
|
|
/////////////////////////////
|
|
|
|
// US lat from 24.450000 to 47.4666666
|
|
// US lon from -71.083333 to -114.1333333
|
|
|
|
// yellowpages.com:
|
|
// <span class="latitude" id="map-latitude">35.146292</span>
|
|
// <span class="longitude" id="map-longitude">-90.0148638</span>
|
|
|
|
// citysearch.com
|
|
// <span class="latitude">37.793126</span>
|
|
// <span class="longitude">-122.42289</span>
|
|
|
|
// yellowpages.aol.com
|
|
// <div style="display:none" class="result_json">{"lat":"35.084278",
|
|
// "lon":"-106.649467","cb":false,"photo":""}</div>
|
|
|
|
// www.superpages.com
|
|
// <a href="http://clicks.superpages.com/ct/clickThrough?SRC=portals
|
|
// ... &POI1lat=039396979&POI1lng=-076564398&POI1name=Baynesville+..
|
|
|
|
// www.yellowbook.com
|
|
// /listing-map.png?lat=35.0981&long=-106.6694
|
|
|
|
// yelp.com
|
|
// use "center=" cgi parm on maps.google.com
|
|
|
|
// google maps link
|
|
// http://www.moma.org/visit/plan/gettinghere
|
|
// src="http://maps.google.com/maps/ms?ie=UTF8&hl=en&msa=0&msid=104870349047867594566.0004626e9d41225400a1c&ll=40.761325,-73.977642&sp...
|
|
|
|
char *bufEnd = m_words->getContentEnd();
|
|
char *bufStart = m_words->getContent ();
|
|
|
|
// now we do a generic scan for any numbers that look like lat/lon
|
|
p = m_words->getContent();
|
|
|
|
// must be latitude then longitude, in that order
|
|
int32_t lastScore = -1;
|
|
double lastVal ;
|
|
char *lastPos = NULL;
|
|
char lastType;
|
|
char *lastAddedPos = NULL;
|
|
int32_t lastAddedWord = -1;
|
|
int32_t lastAddedWordDist;
|
|
int32_t lastAddedCharDist;
|
|
bool addedSomething = false;
|
|
|
|
if ( ! p ) p = "\0";
|
|
|
|
for ( ; *p ; p++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if not digit
|
|
if ( ! is_digit(*p) ) continue;
|
|
// set start
|
|
char *start = p;
|
|
// avoid %3D from url encodings
|
|
if ( p > bufStart && p[-1] == '%' &&
|
|
p[0] == '3' &&
|
|
to_lower_a(p[1]) == 'd' ) {
|
|
// skip over that encoded equal sign
|
|
p += 2;
|
|
start += 2;
|
|
// skip over negative sign
|
|
if ( *p == '-' ) { p++; start++; }
|
|
// forget it if got a negative sign or a non-digit
|
|
if ( ! is_digit(*p) ) continue;
|
|
}
|
|
|
|
// negative sign?
|
|
if ( p>bufStart && p[-1] == '-' ) start--;
|
|
// reset counts
|
|
int32_t digitCount = 0;
|
|
int32_t decimalCount = 0;
|
|
// do not scan so far
|
|
char *pmax = p + 20;
|
|
if ( pmax > bufEnd ) pmax = bufEnd;
|
|
// scan until no digit or period
|
|
for ( ; *p && p < pmax ; p++ ) {
|
|
// count the digits
|
|
if ( is_digit(*p) ) {
|
|
digitCount++;
|
|
continue;
|
|
}
|
|
// decimal point is ok
|
|
if ( *p == '.' ) {
|
|
decimalCount++;
|
|
continue;
|
|
}
|
|
// stop on other crap
|
|
break;
|
|
}
|
|
// give up if end of doc
|
|
if ( ! *p ) break;
|
|
// give up if less than 6 digits encountered
|
|
if ( digitCount < 6 ) continue;
|
|
// some pages have no period in it
|
|
// and we just have to assume the first
|
|
// 3 digits are before the period. like for
|
|
// switchboard.com urls
|
|
if ( decimalCount >= 2 ) continue;
|
|
// convert
|
|
double dval = atod2(start,p-start);
|
|
// fix switchboard.com stuff which has no decimal pt
|
|
if ( decimalCount == 0 ) {
|
|
// how many digits to left of decimal
|
|
int32_t left = 3;
|
|
// make a divisor
|
|
double ddd = 1;
|
|
for ( int32_t vv = 0 ; vv<digitCount-left; vv++)
|
|
ddd *= 10;
|
|
// fix it
|
|
dval /= ddd;
|
|
}
|
|
// bail if bad
|
|
if ( dval < -180.0 || dval > 180.0 ) continue;
|
|
// the continental US ranges from
|
|
// latitude : 24 27/60 (http://en.wikipedia.org/wiki/Florida)
|
|
// latitude : 49 (http://en.wikipedia.org/wiki/Washington
|
|
// longitude: 71 5/60 (http://en.wikipedia.org/wiki/Maine)
|
|
// longitude: 114 8/60 (http://en.wikipedia.org/wiki/California
|
|
|
|
// which is lat from 24.450000 to 47.4666666
|
|
// which is lon from 71.083333 to 114.1333333
|
|
// in the usual decimal it is
|
|
// lat from 24.450000 to 47.4666666
|
|
// lon from -71.083333 to -114.1333333
|
|
char type = 0;
|
|
if ( dval >= 24.45 && dval <= 50.0 ) type = 1; // lat
|
|
if ( dval >= -125.0 && dval <= -66.1 ) type = 2; // lon
|
|
|
|
// this overrides though
|
|
char *r = start -1;
|
|
char *rend = start - 10;
|
|
if ( rend < bufStart + 5 ) rend = bufStart + 5;
|
|
for ( ; r >= rend ; r-- ) {
|
|
if ( ! is_alpha_a(*r) ) continue;
|
|
// <latitude> facebook/brazil
|
|
if ( to_lower_a(r[ 0]) == 'e' &&
|
|
to_lower_a(r[-1]) == 'd' &&
|
|
to_lower_a(r[-2]) == 'u' &&
|
|
to_lower_a(r[-3]) == 't' &&
|
|
to_lower_a(r[-4]) == 'i' &&
|
|
to_lower_a(r[-5]) == 't' ) {
|
|
type = 1;
|
|
break;
|
|
}
|
|
// <longitude> facebook/brazil
|
|
if ( to_lower_a(r[ 0]) == 'e' &&
|
|
to_lower_a(r[-1]) == 'd' &&
|
|
to_lower_a(r[-2]) == 'u' &&
|
|
to_lower_a(r[-3]) == 't' &&
|
|
to_lower_a(r[-4]) == 'i' &&
|
|
to_lower_a(r[-5]) == 'g' ) {
|
|
type = 2;
|
|
break;
|
|
}
|
|
}
|
|
|
|
// bail if unknown lat or lon
|
|
if ( type == 0 ) continue;
|
|
|
|
|
|
// . need a latitude before longitude can be accepted
|
|
// . fixes www.happycow.net/gmaps/get-map-direct.php?vid=5447
|
|
// which had a bogus large number (no decimal) after the
|
|
// first legit lat/lon pair in the filename of a url i think
|
|
//if ( needLat && type == 2 ) continue;
|
|
|
|
// get word position for this function
|
|
int32_t wn2 = m_words->getWordAt ( start );
|
|
// sanity check
|
|
if ( wn2 < 0 ) { char *xx=NULL;*xx=0; }
|
|
// find nearest place. the associated place must be a verified
|
|
// place name or a true street.
|
|
Place *ap2 = getAssociatedPlace ( wn2 );
|
|
// get the address that contains the place
|
|
Address *aa = NULL;
|
|
// try address
|
|
if ( ! aa && ap2 ) aa = ap2->m_address;
|
|
// try alias
|
|
if ( ! aa && ap2 ) aa = ap2->m_alias;
|
|
// if this lat/lon had an associated place but the associated
|
|
// place had no address because it is like "at Effex"
|
|
// (after at) then allow it through. we should add the lat/lon
|
|
// as its own address and alias the simple place, ap2, to
|
|
// that. i.e. ap2->m_alias = newlatlonaddress
|
|
//if ( ! aa && ap2 ) continue;
|
|
// assign it
|
|
double *ptr = NULL;
|
|
if ( type == 1 && aa ) ptr = &aa->m_latitude;
|
|
if ( type == 2 && aa ) ptr = &aa->m_longitude;
|
|
|
|
// are we from google maps url?
|
|
// src="http://maps.google.com/maps/ms?ie=UTF8&hl=en&msa=0&msid=104870349047867594566.0004626e9d41225400a1c&ll=40.761325,-73.977642&sp...
|
|
// compute the score of the lat/lon pair
|
|
int32_t score = -1;
|
|
bool inFormat = false;
|
|
// . ll=lat,lon
|
|
// . this is the center of the map and almost always not
|
|
// exactly the exact place of the business which tends to be
|
|
// a little lower down below the center of the map, however
|
|
// if a query is specified then google highlights all
|
|
// locations on the map that match that query
|
|
if ( start - 10 >= bufStart &&
|
|
start[-1] == '=' &&
|
|
start[-2] == 'l' &&
|
|
start[-3] == 'l' &&
|
|
(start[-4] == ';'||start[-4]=='&') ) {
|
|
// this is the correct one
|
|
score = 100;
|
|
inFormat = true;
|
|
}
|
|
// cbll=lat,lon
|
|
else if ( start - 15 >= bufStart &&
|
|
start[-1] == '=' &&
|
|
start[-2] == 'l' &&
|
|
start[-3] == 'l' &&
|
|
start[-4] == 'b' &&
|
|
start[-5] == 'c' &&
|
|
(is_punct_a(start[-6])) ) {
|
|
// this is street view coords
|
|
score = 50;
|
|
inFormat = true;
|
|
}
|
|
// sll=lat,lon (this is not good!?!?!)
|
|
else if ( start - 15 >= bufStart &&
|
|
start[-1] == '=' &&
|
|
start[-2] == 'l' &&
|
|
start[-3] == 'l' &&
|
|
start[-4] == 's' ) {
|
|
// business search thingy? MAKE IT NEGATIVE SCORE!
|
|
score = -20;
|
|
inFormat = true;
|
|
}
|
|
// geocode=0,lat,lon
|
|
else if ( start - 20 >= bufStart &&
|
|
start[-1] == ',' &&
|
|
start[-2] == '0' &&
|
|
start[-3] == '=' &&
|
|
start[-4] == 'e' &&
|
|
start[-5] == 'd' &&
|
|
start[-6] == 'o' &&
|
|
start[-7] == 'c' &&
|
|
start[-8] == 'o' &&
|
|
start[-9] == 'e' &&
|
|
start[-10] == 'g' &&
|
|
(is_punct_a(start[-11])) ) {
|
|
// related to directions somehow
|
|
score = 30;
|
|
inFormat = true;
|
|
}
|
|
else
|
|
score = 10;
|
|
|
|
|
|
// save that
|
|
char *savePos = lastPos;
|
|
int32_t saveScore = lastScore;
|
|
char saveType = lastType;
|
|
double saveVal = lastVal;
|
|
|
|
// then update
|
|
lastPos = start;
|
|
lastScore = score;
|
|
lastType = type;
|
|
lastVal = dval;
|
|
|
|
// if first number, skip
|
|
if ( ! savePos ) continue;
|
|
|
|
// if too far apart, forget it! most likely not a lat/lon pair
|
|
//if ( start - savePos > 100 ) continue;
|
|
|
|
// skip if both are lats or both are lons
|
|
if ( saveType == type ) continue;
|
|
|
|
// if it is a google url thing then we need to wait for
|
|
// the longitude right after the latitude
|
|
if ( inFormat && type == 1 ) continue;
|
|
|
|
// a negative score curses the longitude that follows
|
|
if ( saveScore < 0 ) continue;
|
|
|
|
// get word # and associated place of previous lat/lon #
|
|
int32_t wn1 = m_words->getWordAt ( savePos );//start );
|
|
if ( wn1 < 0 ) { char *xx=NULL;*xx=0; }
|
|
// find nearest place. the associated place must be a verified
|
|
// place name or a true street.
|
|
Place *ap1 = getAssociatedPlace ( wn1 );
|
|
if ( ap1 != ap2 ) continue;
|
|
|
|
// super crazy? try to fix graffiti.org which pairs together
|
|
// two bogus numbers that are really far apart
|
|
int32_t wordDist = wn2 - wn1;
|
|
if ( wordDist > 30 )
|
|
continue;
|
|
|
|
// better distance counting. should fix
|
|
// santafe.org/perl/page.cgi?p=maps;gid=2415 which
|
|
// has multiple lat/lon pairs all that had a different #
|
|
// of chars between them, but this will make their distances
|
|
// equal where they should be now
|
|
int32_t dist = 0;
|
|
bool inalnum = false;
|
|
bool inpunct = false;
|
|
for ( char *d = savePos ; d < start ; d++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if space
|
|
if ( is_wspace_a(*d) ) {
|
|
inalnum = false;
|
|
inpunct = false;
|
|
continue;
|
|
}
|
|
// count words
|
|
if ( is_alnum_a(*d) ) {
|
|
if ( inalnum ) continue;
|
|
inalnum = true;
|
|
inpunct = false;
|
|
dist++;
|
|
continue;
|
|
}
|
|
// punctuation
|
|
if ( inpunct ) continue;
|
|
inpunct = true;
|
|
inalnum = false;
|
|
dist++;
|
|
continue;
|
|
}
|
|
|
|
// this is likewise bad as well...
|
|
if ( dist > 30 )
|
|
continue;
|
|
|
|
|
|
bool addLatLonAddress = false;
|
|
if ( ! ap1 ) addLatLonAddress = true;
|
|
if ( ap1 && ! ap1->m_alias && ! ap1->m_address )
|
|
addLatLonAddress = true;
|
|
|
|
/////////////
|
|
//
|
|
// if neither lat nor lon has associated place then add addr
|
|
//
|
|
/////////////
|
|
if ( addLatLonAddress ) {
|
|
// if last address we added used the number at
|
|
// savePos then we can't both be right. so compare
|
|
if ( lastAddedPos == savePos &&
|
|
lastAddedWordDist == 0 &&
|
|
wordDist >= 2 )
|
|
continue;
|
|
if ( lastAddedPos == savePos &&
|
|
lastAddedCharDist > 1 &&
|
|
lastAddedCharDist < dist/2 &&
|
|
dist > 10 )
|
|
continue;
|
|
if ( lastAddedWord == wn1 &&
|
|
lastAddedWord == wn2 ) {
|
|
// nuke what we had added just before
|
|
if ( addedSomething ) {
|
|
addedSomething = false;
|
|
m_am.rewind(1);
|
|
m_sm.rewind(1);
|
|
}
|
|
continue;
|
|
}
|
|
|
|
addedSomething = true;
|
|
// note what we add
|
|
if ( wn1 == wn2 ) lastAddedWord = wn1;
|
|
else lastAddedWord = -1;
|
|
lastAddedPos = start;
|
|
lastAddedWordDist = wordDist;
|
|
lastAddedCharDist = dist;
|
|
// set this to the added address
|
|
Address *retAddr = NULL;
|
|
// . now try to add place vec to our array of addresses
|
|
// . we now supply the containing section, "sec"
|
|
// so we can vote on which tag hash supplied the best
|
|
// addresses
|
|
if ( ! addAddress ( NULL,//name1 ,
|
|
NULL,//name2 ,
|
|
NULL,//suite ,
|
|
NULL,//street ,
|
|
NULL,//city ,
|
|
NULL,//adm1 ,
|
|
NULL,//zip ,
|
|
NULL , // ctry ,
|
|
NULL ,
|
|
-1, // startAlnum ,
|
|
AF2_LATLON ,
|
|
&retAddr ) ) return false;
|
|
// set lat/lon
|
|
if ( type == 2 ) {
|
|
retAddr->m_latitude = saveVal;
|
|
retAddr->m_longitude = dval;
|
|
}
|
|
else {
|
|
retAddr->m_latitude = dval;
|
|
retAddr->m_longitude = saveVal;
|
|
}
|
|
// add the lat or lon as a simple place
|
|
Place *pp = (Place *)m_sm.getMem(sizeof(Place));
|
|
if ( ! pp ) return false;
|
|
pp->m_address = retAddr;
|
|
// this seems good to do
|
|
retAddr->m_street = pp;
|
|
pp->m_str = savePos;//start;
|
|
pp->m_strlen = p - savePos;//start;
|
|
int64_t h1 = *(int64_t *)&retAddr->m_latitude;
|
|
int64_t h2 = *(int64_t *)&retAddr->m_longitude;
|
|
pp->m_hash = hash64h ( h1 , h2 );
|
|
pp->m_bits = 0; // |= PLF_FROMTAG;//|PLF_FROMTITLE;
|
|
pp->m_a = wn1;
|
|
pp->m_b = wn2+1;
|
|
pp->m_flags2 = 0;
|
|
pp->m_type = PT_LATLON;
|
|
pp->m_flags2 = 0; // PLF2_IS_NAME;
|
|
// address hash is usually set by calling
|
|
// getAddressHash() but just use the hash of the
|
|
// lat/lon from "street" we already computed
|
|
retAddr->m_hash = pp->m_hash;
|
|
//a->m_street = street;
|
|
Section *as = NULL;
|
|
if ( m_sections ) {
|
|
as = m_sections->m_sectionPtrs[pp->m_a];
|
|
retAddr->m_section = as;
|
|
}
|
|
// add the nearest city to that lat/lon so
|
|
// that Address::getTimeZone() works
|
|
float distInMilesSquared = 100.0;
|
|
uint32_t cid32 = getNearestCityId(retAddr->m_latitude ,
|
|
retAddr->m_longitude,
|
|
m_niceness ,
|
|
&distInMilesSquared);
|
|
// only set this if nearby...
|
|
if ( distInMilesSquared < 1000)
|
|
retAddr->m_cityId32 = cid32;
|
|
else
|
|
retAddr->m_cityId32 = 0;
|
|
// if we had "at Effex" then alias "Effex" to
|
|
// this lat/lon address
|
|
if ( ap1 ) ap1->m_alias = retAddr;
|
|
continue;
|
|
}
|
|
|
|
// if we had matching associated places but the associated
|
|
// place is not part of a good address, skip it
|
|
if ( ! aa ) continue;
|
|
|
|
// pick the highest score between us and the last guy,
|
|
// AS LONG AS WE ARE A LONGITUDE since google maps always
|
|
// has latitude then longitude
|
|
if ( saveScore > score && type == 2 )
|
|
score = saveScore;
|
|
|
|
// get our distance
|
|
//int32_t dist = start - savePos;
|
|
|
|
// if we are known to be right, and it wasn't, we can override
|
|
// it without triggering the ambiguous flag
|
|
if ( score > aa->m_latLonScore ||
|
|
// if score is tied but distance is less than, we can
|
|
// win on that too!
|
|
( score == aa->m_latLonScore && dist<aa->m_latLonDist) ) {
|
|
if ( type == 2 ) {
|
|
aa->m_latitude = saveVal;
|
|
aa->m_longitude = dval;
|
|
}
|
|
else {
|
|
aa->m_latitude = dval;
|
|
aa->m_longitude = saveVal;
|
|
}
|
|
aa->m_latLonScore = score;
|
|
aa->m_latLonDist = dist;
|
|
continue;
|
|
}
|
|
|
|
// if we lost, bail
|
|
if ( score < aa->m_latLonScore || dist > aa->m_latLonDist)
|
|
continue;
|
|
|
|
// . if already has one set flag
|
|
// . but only mark it as ambiguous if the conflicting location
|
|
// is more than .010 of a degree off. this fixes abqcsl.org
|
|
// which has a few different &ll=x,y values in its goog url
|
|
// . don't worry about it now since we have a geocoder
|
|
// . this was causing a core because it was resetting the
|
|
// lat/lon of lat/lon only address for discovertherockies.com
|
|
// and was coring in Dates::getIntervals2() because the
|
|
// timezone was like "66" because the lat/lon was reset
|
|
// here to 888 or 999 or whatever
|
|
// . but we need this in case there is ambiguity as to
|
|
// which lat/lon pair is the real deal when there are
|
|
// multiple ones in the same vicinity...
|
|
// . so we have to nuke the address somehow if its lat/lon
|
|
// only
|
|
if ( *ptr != dval && fabs(*ptr - dval) > .010 ) {
|
|
*ptr = AMBIG_LATITUDE;
|
|
*ptr = AMBIG_LONGITUDE;
|
|
}
|
|
}
|
|
|
|
////////////////////////////////
|
|
//
|
|
// blank out the lat/lon if we do not have both for an address
|
|
//
|
|
////////////////////////////////
|
|
for ( int32_t i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get address
|
|
Address *ad = (Address *)m_am.getPtr(i);//&m_addresses[i];
|
|
// skip address if no lat/lon
|
|
bool haveBoth = true;
|
|
if ( ad->m_latitude == NO_LATITUDE ) haveBoth = false;
|
|
if ( ad->m_latitude == AMBIG_LATITUDE ) haveBoth = false;
|
|
if ( ad->m_longitude == NO_LONGITUDE ) haveBoth = false;
|
|
if ( ad->m_longitude == AMBIG_LONGITUDE ) haveBoth = false;
|
|
if ( haveBoth ) continue;
|
|
// blank out both otherwise
|
|
ad->m_latitude = NO_LATITUDE;
|
|
ad->m_longitude = NO_LONGITUDE;
|
|
}
|
|
|
|
////////////////////////////////
|
|
//
|
|
// blank out all lat/lon if two are identical
|
|
//
|
|
// if two different addresses have the same lat/lon then disregard
|
|
// all on that page
|
|
//
|
|
////////////////////////////////
|
|
// A latitude/longitude pair held as two doubles. Instances are passed by
// address as the key of the "latlontbl" hash table and are stored in the
// nukeList array in the code that follows.
struct Coordinate {
	double lat; // latitude value copied from an Address
	double lon; // longitude value copied from an Address
};
|
|
HashTableX dat;
|
|
char datbuf[2000];
|
|
dat.set ( 16 , 8 , 32 , datbuf , 2000 , false ,m_niceness,"latlontbl");
|
|
Coordinate nukeList[5000];
|
|
int32_t nc = 0;
|
|
// scan the addresses and hash the lat/lon of each one
|
|
for ( int32_t i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get address
|
|
Address *ad = (Address *)m_am.getPtr(i);//&m_addresses[i];
|
|
// skip address if no lat/lon
|
|
if ( ad->m_latitude == NO_LATITUDE ) continue;
|
|
// skip if its a plain lat/lon address
|
|
if ( ad->m_flags3 & AF2_LATLON ) continue;
|
|
// make the coordinate
|
|
Coordinate cc;
|
|
cc.lat = ad->m_latitude;
|
|
cc.lon = ad->m_longitude;
|
|
// get it as a hash
|
|
//int64_t h1 = *(int64_t *)((double *)&ad->m_latitude);
|
|
//int64_t h2 = *(int64_t *)((double *)&ad->m_latitude);
|
|
//int64_t h = hash64 ( h1 , h2 );
|
|
//double pr = ad->m_latitude*ad->m_longitude;
|
|
//int64_t h = *(int64_t *) &pr;
|
|
// mix it up some more
|
|
//h = hash64 ( h , h1 );
|
|
//h = hash64 ( h , h2 );
|
|
// if another entry that has this same lat/lon exists but
|
|
// different address hash, then nuke them all!
|
|
uint64_t *addrHash = (uint64_t *) dat.getValue ( &cc );
|
|
// check if there
|
|
if ( addrHash && *addrHash != ad->m_hash ) {
|
|
//nuke = true;
|
|
// now just add to the nuke list
|
|
if ( nc < 5000 ) nukeList[nc++] = cc;
|
|
break;
|
|
}
|
|
// hash it in "Dup Address Table"
|
|
if ( ! dat.addKey ( &cc , &ad->m_hash ) ) return false;
|
|
}
|
|
for ( int32_t i = 0 ; nc > 0 && i < m_am.getNumPtrs() ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get address
|
|
Address *ad = (Address *)m_am.getPtr(i);//&m_addresses[i];
|
|
// skip if its a plain lat/lon address
|
|
if ( ad->m_flags3 & AF2_LATLON ) continue;
|
|
// see if in nuke list
|
|
for ( int32_t j = 0 ; j < nc ; j++ ) {
|
|
QUICKPOLL(m_niceness);
|
|
if ( nukeList[j].lat != ad->m_latitude ) continue;
|
|
if ( nukeList[j].lon != ad->m_longitude ) continue;
|
|
// blank it out
|
|
ad->m_latitude = NO_LATITUDE;
|
|
ad->m_longitude = NO_LONGITUDE;
|
|
break;
|
|
}
|
|
}
|
|
|
|
////////////////////////////////
|
|
//
|
|
// set m_latitude and m_longitude for the same address
|
|
//
|
|
////////////////////////////////
|
|
HashTableX nt4;
|
|
HashTableX nt5;
|
|
char ntbuf4[5000];
|
|
char ntbuf5[5000];
|
|
nt4.set ( 8,4,256,ntbuf4,5000,false,m_niceness,"nt4addr");
|
|
nt5.set ( 8,4,256,ntbuf5,5000,false,m_niceness,"nt5addr");
|
|
// hash words of the addresses
|
|
for ( int32_t i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get address
|
|
Address *ad = (Address *)m_am.getPtr(i);//&m_addresses[i];
|
|
// skip address if no lat/lon
|
|
if ( ad->m_latitude != NO_LATITUDE &&
|
|
// do not add if already in there
|
|
! nt4.isInTable(&ad->m_hash) )
|
|
// return false if error adding
|
|
if ( ! nt4.addKey(&ad->m_hash,&ad) ) return false;
|
|
// deal with imported lat/lon too
|
|
if ( ad->m_importedLatitude != NO_LATITUDE &&
|
|
// do not add if already in there
|
|
! nt5.isInTable(&ad->m_hash) )
|
|
// return false if error adding
|
|
if ( ! nt5.addKey(&ad->m_hash,&ad) ) return false;
|
|
}
|
|
for ( int32_t i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get it
|
|
Address *ad = (Address *)m_am.getPtr(i);//&m_addresses[i];
|
|
// see if other same address but with lat/lon exists
|
|
Address **pad = (Address **) nt4.getValue ( &ad->m_hash );
|
|
// inherit otherwise
|
|
if ( pad && ad->m_latitude == NO_LATITUDE ) {
|
|
ad->m_latitude = (*pad)->m_latitude;
|
|
ad->m_longitude = (*pad)->m_longitude;
|
|
}
|
|
// see if other same address but with lat/lon exists
|
|
Address **pad2 = (Address **) nt5.getValue ( &ad->m_hash );
|
|
// inherit otherwise
|
|
if ( pad2 && ad->m_importedLatitude == NO_LATITUDE ) {
|
|
ad->m_importedLatitude = (*pad2)->m_importedLatitude;
|
|
ad->m_importedLongitude = (*pad2)->m_importedLongitude;
|
|
ad->m_importedVotes = (*pad2)->m_importedVotes;
|
|
}
|
|
}
|
|
|
|
///////////////////////////////
|
|
//
|
|
// . set AF2_LATLONDUP for dup lat/lons like stubhub has
|
|
//
|
|
///////////////////////////////
|
|
for ( int32_t i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get address
|
|
Address *ad = (Address *)m_am.getPtr(i);
|
|
// skip if its a plain lat/lon address
|
|
if ( !(ad->m_flags3 & AF2_LATLON) ) continue;
|
|
// see if in matches another
|
|
for ( int32_t j = i+1 ; j < m_am.getNumPtrs() ; j++ ) {
|
|
QUICKPOLL(m_niceness);
|
|
Address *aj = (Address *)m_am.getPtr(j);
|
|
// must also be lat/lon
|
|
if ( !(aj->m_flags3 & AF2_LATLON) ) continue;
|
|
// compute distance
|
|
float d1 = ad->m_latitude - aj->m_latitude;
|
|
float d2 = ad->m_longitude - aj->m_longitude;
|
|
if ( d1 > .01 ) continue;
|
|
if ( d2 > .01 ) continue;
|
|
if ( d1 < -.01 ) continue;
|
|
if ( d2 < -.01 ) continue;
|
|
// . ok, they are the same i guess
|
|
// . prefer the one with the longest digits as the orig
|
|
// and the other as the alias
|
|
if ( ad->m_street->m_strlen > aj->m_street->m_strlen){
|
|
//aj->m_street->m_alias = ad;
|
|
ad->m_street->m_flags3 |= PLF3_LATLONDUP;
|
|
}
|
|
else {
|
|
//ad->m_street->m_alias = aj;
|
|
aj->m_street->m_flags3 |= PLF3_LATLONDUP;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
////////////////////////////////
|
|
//
|
|
// . SET AF3_SUPPLANTED
|
|
// . fixes stubhub.com xml feed
|
|
// . supplant afterAt and other lat/lon addresses with a single
|
|
// winning lat/lon address
|
|
// . the problem with the getAssociatedPlace() logic above is
|
|
// that it only aliases out true street names or verified street
|
|
// names that are afterat... so we have to fix afterat streets
|
|
// that are not verified here.
|
|
// . fixes "blah blah at STUBHUB. <lat=yyy>><lon=xxx>" so that
|
|
// STUBHUB gets AF3_SUPPLANTED set so that Events.cpp ignores it
|
|
// as a competing address.
|
|
//
|
|
///////////////////////////////
|
|
for ( int32_t i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get address
|
|
Address *ad = (Address *)m_am.getPtr(i);
|
|
// skip if not a lat/lon ADDRESS (unassociated with street)
|
|
// i.e. an independent lat/lon because getAssociatedPlace()
|
|
// above was returning NULL for this lat/lon..
|
|
if ( !(ad->m_flags3 & AF2_LATLON) ) continue;
|
|
// skip if dup lat/lon though
|
|
if ( ad->m_street &&
|
|
(ad->m_street->m_flags3 & PLF3_LATLONDUP) )
|
|
continue;
|
|
// get its section and blow it up until right before we
|
|
// hit a verified fake street name or we hit a street name
|
|
// or we hit a latlon that is not a latlondup.
|
|
// use Section::m_firstPlaceNum. we set that above, but
|
|
// we also set it right below in a second call to
|
|
// setFirstPlaceNums().
|
|
Section *sk = sp[ad->m_street->m_a];
|
|
// telescope section up around this lat/lon address
|
|
for ( ; sk ; sk = sk->m_parent ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get it
|
|
int32_t pi = sk->m_firstPlaceNum;
|
|
bool hitRealStreet = false;
|
|
// . scan places in this section
|
|
// . just like Events.cpp address assigning algo does
|
|
for ( ; pi >= 0 && pi < m_numSorted ; pi++ ) {
|
|
// get it
|
|
Place *sr = m_sorted[pi];
|
|
// stop if section breach
|
|
if ( sr->m_a >= sk->m_b ) break;
|
|
// sanity
|
|
if ( sr->m_a < 0 ) { char *xx=NULL;*xx=0; }
|
|
// skip us
|
|
if ( sr == ad->m_street ) continue;
|
|
// ignore if POBOX
|
|
if ( sr->m_flags2 & PLF2_IS_POBOX ) continue;
|
|
// skip if dup latlon
|
|
if ( sr->m_flags3 & PLF3_LATLONDUP ) continue;
|
|
// is the street name really a place name?
|
|
bool isName = ( sr->m_flags2 & PLF2_IS_NAME );
|
|
// skip if fake name
|
|
if ( isName ) continue;
|
|
// stop on real street (not-fake name)
|
|
hitRealStreet = true;
|
|
break;
|
|
}
|
|
// stop if we hit real street!
|
|
if ( hitRealStreet )
|
|
break;
|
|
// ok, supplant all if no real street name to go
|
|
// with our lat/lon
|
|
pi = sk->m_firstPlaceNum;
|
|
// do the scan again
|
|
for ( ; pi >=0 && pi < m_numSorted ; pi++ ) {
|
|
// get it
|
|
Place *sr = m_sorted[pi];
|
|
// stop if section breach
|
|
if ( sr->m_a >= sk->m_b ) break;
|
|
// sanity
|
|
if ( sr->m_a < 0 ) { char *xx=NULL;*xx=0; }
|
|
// skip us
|
|
if ( sr == ad->m_street ) continue;
|
|
// flag it
|
|
sr->m_flags3 |= PLF3_SUPPLANTED;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
////////////////////////////////
|
|
//
|
|
// normalize m_latitude and m_longitude to be from 0 to 360
|
|
// no! - just do in Events::hash() now
|
|
//
|
|
////////////////////////////////
|
|
/*
|
|
for ( int32_t i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get address
|
|
Address *ad = &m_addresses[i];
|
|
// skip address if no lat/lon
|
|
if ( ad->m_latitude == NO_LATITUDE ) continue;
|
|
if ( ad->m_latitude == AMBIG_LATITUDE ) continue;
|
|
ad->m_latitude += 180.0;
|
|
ad->m_longitude += 180.0;
|
|
}
|
|
*/
|
|
|
|
|
|
////////////////////
|
|
//
|
|
// set Address::m_timeZoneOffset (from GMT)
|
|
//
|
|
////////////////////
|
|
/*
|
|
for ( int32_t i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get it
|
|
Address *aa = &m_addresses[i];
|
|
Place *city = aa->m_city;
|
|
Place *zip = aa->m_zip;
|
|
Place *adm1 = aa->m_adm1;
|
|
// and city hash
|
|
uint64_t cityHash = 0;
|
|
if ( city ) cityHash = city->m_hash;
|
|
else if ( zip ) cityHash = zip->m_cityHash;
|
|
if ( ! cityHash ) { char *xx=NULL;*xx=0; }
|
|
// need this
|
|
char *adm1Str = NULL;
|
|
if ( adm1 ) adm1Str = adm1->m_adm1;
|
|
else if ( zip ) adm1Str = zip->m_adm1;
|
|
else if ( city && city->m_adm1[0] ) adm1Str = city->m_adm1;
|
|
else { char *xx=NULL;*xx=0; }
|
|
// sanity check
|
|
if ( is_upper_a(adm1Str[0]) ) { char *xx=NULL;*xx=0; }
|
|
if ( is_upper_a(adm1Str[1]) ) { char *xx=NULL;*xx=0; }
|
|
uint32_t adm1Hash32 = (uint32_t)*((uint16_t *)adm1Str);
|
|
uint32_t cityHash32 = (uint32_t)cityHash;
|
|
// combine the two hashes
|
|
uint32_t cityStateHash = hash32h(cityHash32,adm1Hash32);
|
|
// get timezone
|
|
int32_t slot = g_timeZones.getSlot ( &cityStateHash );
|
|
// call it 0 if not good
|
|
aa->m_timeZoneOffset = 0;
|
|
// otherwise, set m_timeZoneOffset appropriately
|
|
if ( slot >= 0 )
|
|
aa->m_timeZoneOffset = *(char *)g_timeZones.
|
|
getValueFromSlot(slot);
|
|
}
|
|
*/
|
|
|
|
|
|
//////////////////////////
|
|
//
|
|
// set Section::m_firstPlaceNum
|
|
//
|
|
// . so we can quickly scan the places contained by a section
|
|
//
|
|
//////////////////////////
|
|
if ( ! setFirstPlaceNums() ) return false;
|
|
|
|
|
|
|
|
////////////////////
|
|
//
|
|
// count # of valid/inlined addresses we have
|
|
//
|
|
////////////////////
|
|
m_numValid = 0;
|
|
for ( int32_t i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get it
|
|
Address *aa = (Address *)m_am.getPtr(i);//&m_addresses[i];
|
|
// is inlined or verified?
|
|
bool valid = false;
|
|
if ( aa->m_flags & AF_INLINED ) valid = true;
|
|
// but unverified streetisname is not good
|
|
if ( aa->m_street && (aa->m_street->m_flags2 & PLF2_IS_NAME) )
|
|
valid = false;
|
|
if ( aa->m_flags & AF_VERIFIED_PLACE_NAME_1 ) valid = true;
|
|
if ( aa->m_flags & AF_VERIFIED_PLACE_NAME_2 ) valid = true;
|
|
if ( aa->m_flags & AF_VERIFIED_STREET ) valid = true;
|
|
if ( ! valid ) continue;
|
|
m_numValid++;
|
|
aa->m_flags3 |= AF2_VALID;
|
|
}
|
|
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
static void gotGeocoderReply ( void *state , TcpSocket *s ) {
|
|
// get us
|
|
Addresses *THIS = (Addresses *)state;
|
|
// process it
|
|
THIS->processGeocoderReply ( s );
|
|
// call callback
|
|
THIS->m_callback ( THIS->m_state );
|
|
}
|
|
|
|
// . set m_geocoderLat/m_geocoderLon
|
|
// . returns false if blocks
|
|
// . returns true with g_errno set on error
|
|
// . only call from Events.cpp if we have 1+ valid event that will be
|
|
// indexed...
|
|
bool Addresses::setGeocoderLatLons ( void *state,
|
|
void (*callback) (void *state) ) {
|
|
|
|
// only call this once unless we get reset()
|
|
if ( m_calledGeocoder ) return true;
|
|
m_calledGeocoder = true;
|
|
|
|
m_callback = callback;
|
|
m_state = state;
|
|
|
|
// store candidates to select from here
|
|
int32_t cands[MAX_GEOCODERS];
|
|
int32_t nc = 0;
|
|
// select a geocoder by IP
|
|
for ( int32_t i = 0 ; i < MAX_GEOCODERS ; i++ ) {
|
|
// check ip
|
|
if ( ! g_conf.m_geocoderIps[i] ) continue;
|
|
// add to candidates
|
|
cands[nc++] = g_conf.m_geocoderIps[i];
|
|
}
|
|
// if none, bail, we do not do this
|
|
if ( nc <= 0 ) return true;
|
|
|
|
int32_t need = 0;
|
|
// loop over each valid address we and add to request size
|
|
for ( int32_t i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get it
|
|
Address *aa = (Address *)m_am.getPtr(i);
|
|
// reset
|
|
aa->m_geocoderLat = 999;
|
|
aa->m_geocoderLon = 999;
|
|
// is inlined or verified?
|
|
if ( ! ( aa->m_flags3 & AF2_VALID ) ) continue;
|
|
// only do it if used in event now
|
|
if ( ! ( aa->m_flags3 & AF2_USEDINEVENT ) ) continue;
|
|
// skip if lat/lon address
|
|
if ( aa->m_flags3 & AF2_LATLON ) {
|
|
// just inherit that
|
|
aa->m_geocoderLat = aa->m_latitude;
|
|
aa->m_geocoderLon = aa->m_longitude;
|
|
continue;
|
|
}
|
|
// check the cache first!!! used by Repair.cpp to speed up!!
|
|
int64_t key64 = aa->m_hash;
|
|
double *recs;
|
|
int32_t recSize;
|
|
bool inCache = m_latLonCache.getRecord ( (collnum_t) 0,
|
|
(char *)&key64 ,
|
|
(char **)&recs ,
|
|
&recSize ,
|
|
false ,
|
|
3600 ,
|
|
false );
|
|
if ( inCache && recs && recs[0] != 999 ) {
|
|
aa->m_geocoderLat = recs[0];
|
|
aa->m_geocoderLon = recs[1];
|
|
continue;
|
|
}
|
|
|
|
// request needs street,state,city (and zip if there)
|
|
need += aa->m_street->m_strlen + 1;
|
|
// get city length
|
|
if ( aa->m_city ) need += aa->m_city->m_strlen;
|
|
else if ( aa->m_zip ) need += strlen(aa->m_zip->m_cityStr);
|
|
else if ( aa->m_flags3 & AF2_LATLON );
|
|
else { char *xx=NULL;*xx=0; }
|
|
if ( aa->m_zip ) need += 2 + aa->m_zip->m_strlen;
|
|
//need += aa->m_adm1->m_strlen + 1;
|
|
need += 2; // use state abbr
|
|
need += 20; // addrXXX=...&
|
|
}
|
|
|
|
// if none valid, vail
|
|
if ( need == 0 ) return true;
|
|
|
|
// need url cruft "http://..../"
|
|
need += 100;
|
|
|
|
char sbuf[5024];
|
|
char *requestBuf = NULL;
|
|
if ( need < 5024 ) requestBuf = sbuf;
|
|
if ( ! requestBuf ) requestBuf = (char *)mmalloc(need,"geocode");
|
|
if ( ! requestBuf ) return true;
|
|
|
|
// make the url
|
|
char *p = requestBuf;
|
|
// select a geocoder randomly
|
|
int32_t r = rand() % nc;
|
|
// to request manually:
|
|
// http://10.5.66.11:5678/json/+2935-D+Louisiana+NE,+Albuquerque,+NM
|
|
// http://10.5.66.11:5678/txt/+2935-D+Louisiana+NE,+Albuquerque,+NM
|
|
// make the request
|
|
p += sprintf(p,"POST /xml? HTTP/1.0\r\n"
|
|
"Accept: */*\r\n"
|
|
"Host: %s:5678\r\n"
|
|
"Content-Length: xxxxxx\r\n"
|
|
"\r\n",
|
|
iptoa(cands[r]));
|
|
|
|
int32_t num = 1;
|
|
char *contentStart = p;
|
|
// loop over each valid address we and add to request size
|
|
for ( int32_t i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get it
|
|
Address *aa = (Address *)m_am.getPtr(i);
|
|
// is inlined or verified?
|
|
if ( ! ( aa->m_flags3 & AF2_VALID ) ) continue;
|
|
// only do it if used in event now
|
|
if ( ! ( aa->m_flags3 & AF2_USEDINEVENT ) ) continue;
|
|
// skip if we got it already in the cache above
|
|
if ( aa->m_geocoderLat != 999 ) continue;
|
|
// for debugging
|
|
//char *start = p;
|
|
// request needs street,state,city (and zip if there)
|
|
p += sprintf(p,"addr%"INT32"=",num++);
|
|
gbmemcpy(p,aa->m_street->m_str,aa->m_street->m_strlen);
|
|
p += aa->m_street->m_strlen;
|
|
*p++ = ',';
|
|
*p++ = ' ';
|
|
if ( aa->m_city ) {
|
|
gbmemcpy(p,aa->m_city->m_str,aa->m_city->m_strlen);
|
|
p += aa->m_city->m_strlen;
|
|
}
|
|
else if ( aa->m_zip ) {
|
|
int32_t clen = strlen(aa->m_zip->m_cityStr);
|
|
gbmemcpy(p,aa->m_zip->m_cityStr,clen);
|
|
p += clen;
|
|
}
|
|
else if ( aa->m_flags3 & AF2_LATLON );
|
|
else { char *xx=NULL; *xx=0; }
|
|
*p++ = ' ';
|
|
// get state abbr
|
|
if ( aa->m_adm1 ) {
|
|
gbmemcpy(p,aa->m_adm1->m_adm1,2);
|
|
}
|
|
else if ( aa->m_zip ) {
|
|
gbmemcpy(p,aa->m_zip->m_adm1,2);
|
|
}
|
|
else if ( aa->m_flags3 & AF2_LATLON );
|
|
else { char *xx=NULL;*xx=0; }
|
|
p += 2;
|
|
// zip if we got it, seems to help geocoder sometimes
|
|
if ( aa->m_zip ) {
|
|
*p++ = ' ';
|
|
int32_t zlen = aa->m_zip->m_strlen;
|
|
gbmemcpy(p,aa->m_zip->m_str,zlen);
|
|
p += zlen;
|
|
}
|
|
*p++ = '&';
|
|
// log debug
|
|
//log("addr: GET %s",start);
|
|
}
|
|
// null term
|
|
*p = '\0';
|
|
|
|
// fix content-length
|
|
char *qq = strstr(requestBuf,"xxxxxx");
|
|
if ( ! qq ) { char *xx=NULL;*xx=0; }
|
|
if ( p-contentStart > 999999 ) { char *xx=NULL;*xx=0; }
|
|
sprintf(qq,"%06"INT32"",(int32_t)(p-contentStart));
|
|
qq[6]='\r'; // sprintf might have written a \0, so put \r back
|
|
|
|
// finish it
|
|
//p += sprintf(p," HTTP/1.0\r\n\r\n");
|
|
// size of it
|
|
int32_t reqLen = p - requestBuf;
|
|
// sanity
|
|
if ( reqLen >= need ) { char *xx=NULL;*xx=0; }
|
|
// send it off to get back xml reply
|
|
bool status = g_httpServer.getDoc( cands[r] , // ip
|
|
5678 , // port
|
|
requestBuf ,
|
|
reqLen ,
|
|
this ,
|
|
gotGeocoderReply ,
|
|
60*1000 , // timeout 60s
|
|
-1 , // no max
|
|
-1 );// no max
|
|
// free the request since it mdups it
|
|
if ( requestBuf != sbuf ) mfree ( requestBuf , need , "geocode" );
|
|
// return false if it blocked
|
|
if ( ! status ) return false;
|
|
// error? ENOMEM?
|
|
if ( g_errno ) {
|
|
log("addr: get geocoder lat lon: %s",mstrerror(g_errno));
|
|
return true;
|
|
}
|
|
// otherwise, should always block!
|
|
char *xx=NULL;*xx=0;
|
|
return true;
|
|
}
|
|
|
|
// process it
|
|
bool Addresses::processGeocoderReply ( TcpSocket *s ) {
|
|
|
|
if ( g_errno ) {
|
|
log("addr: geocoder reply: %s",mstrerror(g_errno));
|
|
g_errno = EBADGEOCODERREPLY;
|
|
return true;
|
|
}
|
|
// get reply
|
|
char *reply = s->m_readBuf;
|
|
//int32_t replyAlloc = s->m_readBufSize;
|
|
//int32_t replySize = s->m_readOffset;
|
|
|
|
// same for an empty reply
|
|
if ( ! reply || s->m_readBufSize == 0 ) {
|
|
g_errno = EBADGEOCODERREPLY;
|
|
log("addr: geocoder returned empty reply: %s",
|
|
mstrerror(g_errno));
|
|
return true;
|
|
}
|
|
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
|
|
int32_t num = 0;
|
|
// loop over each valid address we and add to request size
|
|
for ( int32_t i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get it
|
|
Address *aa = (Address *)m_am.getPtr(i);
|
|
// is inlined or verified?
|
|
if ( ! ( aa->m_flags3 & AF2_VALID ) ) continue;
|
|
// only do it if used in event now
|
|
if ( ! ( aa->m_flags3 & AF2_USEDINEVENT ) ) continue;
|
|
// skip if we got it already in the cache above
|
|
if ( aa->m_geocoderLat != 999 ) continue;
|
|
// inc it
|
|
num++;
|
|
// make the tag name
|
|
char tagName[32];
|
|
sprintf(tagName,"<addr%"INT32">",num);
|
|
// ok now get that reply
|
|
char *p = strstr(reply,tagName);
|
|
// not found?
|
|
if ( ! p ) {
|
|
log("addr: missing geocoder reply for addr #%"INT32"",num);
|
|
continue;
|
|
}
|
|
// get end tag of it
|
|
char endTagName[32];
|
|
sprintf(endTagName,"</addr%"INT32">",num);
|
|
char *end = strstr(p,endTagName);
|
|
// strange!
|
|
if ( ! end ) {
|
|
log("addr: missing geocoder endtag for addr #%"INT32"",num);
|
|
continue;
|
|
}
|
|
// tmp shutoff
|
|
char c = *end;
|
|
*end = '\0';
|
|
|
|
// set official latitude, this
|
|
double lastLat = NO_LATITUDE;
|
|
// ok, got it, grab all possible lat/lons for it
|
|
for ( char *s = strstr(p,"<lat>"); s ; s=strstr(s+1,"<lat>")){
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get that
|
|
double lat = atof(s+5);
|
|
// had a last? if so, and they do not match, then
|
|
// give up because i'm not sure which is right
|
|
if ( lastLat != NO_LATITUDE && lat != lastLat ) {
|
|
lastLat = NO_LATITUDE;
|
|
break;
|
|
}
|
|
// mark this
|
|
lastLat = lat;
|
|
}
|
|
|
|
// same for longitude
|
|
double lastLon = NO_LONGITUDE;
|
|
// ok, got it, grab all possible lon/lons for it
|
|
for ( char *s = strstr(p,"<lon>"); s ; s=strstr(s+1,"<lon>")){
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get that
|
|
double lon = atof(s+5);
|
|
// had a last? if so, and they do not match, then
|
|
// give up because i'm not sure which is right
|
|
if ( lastLon != NO_LONGITUDE && lon != lastLon ) {
|
|
lastLon = NO_LONGITUDE;
|
|
break;
|
|
}
|
|
// mark this
|
|
lastLon = lon;
|
|
}
|
|
|
|
// put back for next address's reply
|
|
*end = c;
|
|
|
|
// skip if not good
|
|
if ( lastLat == NO_LATITUDE || lastLon == NO_LONGITUDE ) {
|
|
// log it now
|
|
SafeBuf sb;
|
|
sb.safeMemcpy(aa->m_street->m_str,
|
|
aa->m_street->m_strlen);
|
|
if ( aa->m_city ) {
|
|
sb.pushChar(',');
|
|
sb.safeMemcpy(aa->m_city->m_str,
|
|
aa->m_city->m_strlen);
|
|
}
|
|
if ( aa->m_adm1 ) {
|
|
sb.pushChar(',');
|
|
sb.safeMemcpy(aa->m_adm1->m_str,
|
|
aa->m_adm1->m_strlen);
|
|
}
|
|
if ( aa->m_zip && aa->m_zip->m_strlen ) {
|
|
sb.pushChar(',');
|
|
sb.safeMemcpy(aa->m_zip->m_str,
|
|
aa->m_zip->m_strlen);
|
|
}
|
|
log("addr: geocoder failed on %s",sb.getBufStart());
|
|
continue;
|
|
}
|
|
// otherwise, set it!
|
|
aa->m_geocoderLat = lastLat;
|
|
aa->m_geocoderLon = lastLon;
|
|
}
|
|
|
|
// free when done
|
|
//mfree ( reply , replyAlloc , "geocodrp");
|
|
return true;
|
|
}
|
|
|
|
void Address::getLatLon( double *lat, double *lon ) {
|
|
// use geocoder if valid
|
|
if ( m_geocoderLat != NO_LATITUDE && m_geocoderLon != NO_LONGITUDE ) {
|
|
*lat = (double)m_geocoderLat;
|
|
*lon = (double)m_geocoderLon;
|
|
return;
|
|
}
|
|
// use other guy otherwise
|
|
if ( m_latitude != NO_LATITUDE && m_longitude != NO_LONGITUDE ) {
|
|
*lat = (double)m_latitude;
|
|
*lon = (double)m_longitude;
|
|
return;
|
|
}
|
|
// otherwise, no go
|
|
*lat = NO_LATITUDE;
|
|
*lon = NO_LONGITUDE;
|
|
}
|
|
|
|
// . add the word id of every word in [a,b) of "words" to table "nt1" as
//   a key, each with "v" as its data value
// . entries whose word id is zero are not words and are skipped
// . returns false as soon as an addKey() call fails, true otherwise
bool hashPlaceName ( HashTableX *nt1,
		     Words      *words,
		     int32_t     a ,
		     int32_t     b ,
		     uint64_t    v ) {

	int64_t *ids = words->m_wordIds;
	int32_t  pos = a;
	while ( pos < b ) {
		// grab this slot and advance
		int64_t *wid = &ids[pos++];
		// skip if not word
		if ( *wid == 0 ) continue;
		// add it, bail on error
		if ( ! nt1->addKey ( wid , &v ) ) return false;
	}
	return true;
}
|
|
|
|
|
|
|
|
// returns -1 and sets g_errno on error
|
|
int32_t getCommonWordIds ( int32_t a1 , int32_t b1 ,
|
|
int32_t a2 , int32_t b2 ,
|
|
int64_t *wids ,
|
|
int64_t *commonIds ,
|
|
int32_t max ,
|
|
int32_t niceness ) {
|
|
int32_t nc = 0;
|
|
HashTableX ht;
|
|
char sbuf[640];
|
|
ht.set ( 8,0,64,sbuf,640,false,niceness,"cmmnwrds");
|
|
// hash first round
|
|
for ( int32_t i = a1 ; i < b1 ; i++ ) {
|
|
// skip if not word
|
|
if ( ! wids[i] ) continue;
|
|
// add it otherwise
|
|
if ( ! ht.addKey ( &wids[i] ) ) return -1;
|
|
}
|
|
// now check the other guy
|
|
for ( int32_t i = a2 ; i < b2 ; i++ ) {
|
|
// skip if not word
|
|
if ( ! wids[i] ) continue;
|
|
// add it otherwise
|
|
if ( ! ht.isInTable ( &wids[i] ) ) continue;
|
|
// add him to our common list
|
|
commonIds[nc++] = wids[i];
|
|
// stop if no room left
|
|
if ( nc >= max ) break;
|
|
}
|
|
// return that
|
|
return nc;
|
|
}
|
|
|
|
|
|
// . find the place that word #i should be associated with
// . telescopes out from the smallest section containing word #i until it
//   hits a section that has places (m_firstPlaceNum >= 0). only that one
//   section is examined.
// . the places in m_sorted[] are streets or are verified place names
// . returns the last place scanned in that section, provided all the
//   places scanned hash to the same street
// . returns NULL if the section's places disagree, since we can not be
//   sure which address we are associated with, or if no containing
//   section has any places
Place *Addresses::getAssociatedPlace ( int32_t i ) {
	// start at the smallest section containing word #i and go up
	Section *sec = m_sections->m_sectionPtrs[i];
	for ( ; sec ; sec = sec->m_parent ) {
		// index into m_sorted[] of the first place in this section
		int32_t num = sec->m_firstPlaceNum;
		// telescope if none
		if ( num < 0 ) continue;
		// hash of the previous place and the winner so far
		int64_t  prevHash = 0LL;
		Place   *winner   = NULL;
		// scan the places in section "sec"
		for ( ; num < m_numSorted ; num++ ) {
			// breathe
			QUICKPOLL(m_niceness);
			Place *cand = m_sorted[num];
			// stop if it starts past the end of the section
			if ( cand->m_a >= sec->m_b ) break;
			// use address or alias
			Address *addr = cand->m_address;
			if ( ! addr ) addr = cand->m_alias;
			// . compare by the street hash of the address, if any
			// . fix www.reverbnation.com/venue/448772
			//   which has "Low Spirits" as a place which aliases
			//   to an address whose street is 2823 2nd St NW. as
			//   are all the places around this url's only pair of
			//   valid lat/lon coordinates.
			int64_t hash;
			if ( addr ) hash = addr->m_street->m_hash;
			else        hash = cand->m_hash;
			// a different place than the previous one? then it
			// is ambiguous. (a previous hash of 0 is treated
			// like "no previous place".)
			if ( prevHash && hash != prevHash ) {
				winner = NULL;
				break;
			}
			// remember for the next guy
			prevHash = hash;
			winner   = cand;
		}
		// this is NULL if we had multiple possible addresses
		return winner;
	}
	// no containing section had any places
	return NULL;
}
|
|
|
|
// . NULL-terminated array for setting s_lc hashtable
// . these are words that can be lower case in a place name
// . fixes "Santa Maria de la Paz Catholic Church" not being a place name
// . each word is listed exactly once
static char *s_lcWords[] = {
	"de",   // spanish "of" "Casa de las Chimeneas"
	"la",

	"at",
	"be",
	"by",
	"of",
	"on",
	"or",
	"in",

	"re",   // you're
	"to",
	"vs",
	"the",
	"and",
	"are",
	"for",

	"s",    // Slim's

	"y",    // spanish "and" "Pupuseria y Restaurant Salvado"
	"del",  // spanish "of the" "Bosque del Apache National Wildlife Refuge"
	"las",  // spanish "the"

	"not",
	"from",
	"ll",   // they'll this'll that'll you'll
	"ve",   // would've should've
	NULL
};
|
|
|
|
|
|
// returns false with g_errno set on error
|
|
bool setHashes ( Place *p , Words *ww , int32_t niceness ) {
|
|
|
|
//Words *ww = m_words;
|
|
int32_t a = p->m_a;
|
|
int32_t b = p->m_b;
|
|
|
|
// adm1 hash is just hash of the two letters
|
|
if ( p->m_type == PT_STATE ) {
|
|
// must be there
|
|
// do not core here anymore since we coule be a foreign
|
|
// latlon only place in which case this will be zero.
|
|
// happens when such a place is in the contactinfo tag
|
|
//if ( ! p->m_adm1Bits ) { char *xx=NULL;*xx=0;}
|
|
//p->m_hash = hash64Lower_utf8 ( p->m_adm1 , 2);
|
|
// will this work?
|
|
p->m_hash = p->m_adm1Bits;
|
|
return true;
|
|
}
|
|
|
|
// if place name was taken from a tag or placedb then we have
|
|
// to set the words class ourself
|
|
Words tmp;
|
|
if ( p->m_a < 0 ) {
|
|
// return false with g_errno set on error
|
|
if ( ! tmp.set ( p->m_str ,
|
|
p->m_strlen ,
|
|
TITLEREC_CURRENT_VERSION ,
|
|
true ,
|
|
niceness ) ) return false;
|
|
// set it up
|
|
ww = &tmp;
|
|
a = 0;
|
|
b = ww->m_numWords;
|
|
}
|
|
|
|
int64_t *wids = ww->m_wordIds;
|
|
int32_t *wlens = ww->m_wordLens;
|
|
char **wptrs = ww->m_words;
|
|
int32_t nw = ww->m_numWords;
|
|
|
|
// the straight up hash
|
|
int64_t h = 0LL;
|
|
// hash of the non indicator alpha words in street name
|
|
int64_t h1 = 0;
|
|
// . includes hash of directional indicators
|
|
// . we only use this if street name is a directional indicator
|
|
int64_t h2 = 0;
|
|
int64_t h2b = 0;
|
|
int64_t h3 = 0;
|
|
int64_t h4 = 0;
|
|
// word id of previous word
|
|
int64_t pi = 0LL;
|
|
|
|
int32_t alphaCount = 0;
|
|
int64_t prevIndId = 0LL;
|
|
|
|
// to fix the street that is "25 School" we cannot map "school"
|
|
// to h_zero
|
|
bool isStreet = ( p->m_type == PT_STREET );
|
|
|
|
// sanity check -- no, suites start with punct!
|
|
//if ( ! wids[a] ) { char *xx=NULL;*xx=0; }
|
|
p->m_simpleHash32 = 0;
|
|
|
|
// loop over words
|
|
for ( int32_t i = a ; i < b ; i++ ) {
|
|
// skip if not alnum word
|
|
if ( ! wids[i] ) continue;
|
|
// make a simple hash so setting the EV_STORE_HOURS flag
|
|
// in Events.cpp works, since we compare it to the simple
|
|
// hash of the event title
|
|
p->m_simpleHash32 ^= (uint32_t)wids[i];
|
|
// this logic taken from Sections.cpp where it is setting
|
|
// Section::m_sentenceContentHash
|
|
if ( p->m_simpleHash32 == 0 )
|
|
p->m_simpleHash32 = 123456;
|
|
// get synonym of word id
|
|
int64_t *swid = getSynonymWord ( &wids[i] , &pi , isStreet );
|
|
// word id of previous word
|
|
pi = wids[i];
|
|
|
|
// mix it up
|
|
h <<= 1LL;
|
|
// xor it in
|
|
h ^= *swid;
|
|
|
|
// done if not street
|
|
if ( p->m_type != PT_STREET ) continue;
|
|
|
|
// is street a place name in disguise? if so, continue
|
|
if ( p->m_flags2 & PLF2_IS_NAME ) continue;
|
|
|
|
// int16_tcut
|
|
bool isNum = ww->isNum2(i);
|
|
// count it
|
|
if ( ! isNum ) alphaCount++;
|
|
|
|
// the street num hash, hash of the first number
|
|
if ( isNum && h3 == 0 ) h3 = wids[i];
|
|
|
|
// is this word like "st" or "ave" or "blvd"
|
|
IndDesc *id=(IndDesc *)g_indicators.getValue(swid);
|
|
|
|
// hash of last "indicator"
|
|
if ( id ) {
|
|
// map them
|
|
h4 = *swid;
|
|
// map "N.E." to "NE"
|
|
if ( prevIndId == h_north && *swid == h_east )
|
|
h4 = h_northeast;
|
|
if ( prevIndId == h_north && *swid == h_west )
|
|
h4 = h_northwest;
|
|
if ( prevIndId == h_south && *swid == h_east )
|
|
h4 = h_southeast;
|
|
if ( prevIndId == h_south && *swid == h_west )
|
|
h4 = h_southwest;
|
|
// save that
|
|
prevIndId = *swid;
|
|
}
|
|
// prevIndId only means for the previous word, so reset it
|
|
else
|
|
prevIndId = 0LL;
|
|
|
|
// set some flags based on indFlags
|
|
bool isStreetInd = ( id && (id->m_bit & IND_STREET) );
|
|
bool isDir = ( id && (id->m_bit & IND_DIR ) );
|
|
|
|
// cancel the 'S' indicator if potential
|
|
// apostrophe! "aug 17 burt's lounge"
|
|
// we do not want "17 burt's"
|
|
if ( isDir &&
|
|
wlens[i] == 1 &&
|
|
(wptrs[i][0]=='s' || wptrs[i][0]=='S') &&
|
|
i > 1 &&
|
|
wptrs[i][-1] != ' ' )
|
|
isDir = false;
|
|
|
|
// . update this.
|
|
// . exclude numbers from this!
|
|
// . allow other numbers if no alpha word before them!
|
|
// . exclude directional indicators from this
|
|
// . MDW: for PLF2_INTERSECTION "streets" we need to allow
|
|
// when i == a! because we do not have numeric addresses
|
|
// for intersections, so made it from i>a to i>=a
|
|
if ( i >= a &&
|
|
// but allow directional indicators if right after
|
|
// the street number though, like "123 west street"
|
|
( ! isDir || i == a + 2 ) &&
|
|
// commenting this out hurts "100 3/4 road"
|
|
// but it helps "2001 1/2 montgomery blvd"
|
|
//( ! isNum || alphaCount == 0 ) &&
|
|
! isNum &&
|
|
! isStreetInd ) {
|
|
// mix it up
|
|
h1 <<= 1;
|
|
// xor it
|
|
h1 ^= *swid;//wids[j];
|
|
}
|
|
|
|
// fix "2804 hwy 250" from excluding the "250"
|
|
if ( isNum && alphaCount > 0 ) {
|
|
// mix it up
|
|
h1 <<= 1;
|
|
// xor it
|
|
h1 ^= *swid;//wids[j];
|
|
}
|
|
|
|
// set back up hash in case the others are 0
|
|
if ( isStreetInd ) {
|
|
h2b <<= 1;
|
|
h2b ^= wids[i];
|
|
}
|
|
|
|
if ( isDir ) {
|
|
// mix it up
|
|
h2 <<= 1;
|
|
// include it in this
|
|
h2 ^= wids[i];
|
|
}
|
|
|
|
}
|
|
|
|
// set hash
|
|
p->m_hash = h;
|
|
|
|
// keep this as it is
|
|
p->m_wordHash64 = h;
|
|
|
|
// . if we are a city look up in g_places and see if we are an
|
|
// alias for a different city name
|
|
// . fix "abq" so it maps to albuquerque
|
|
// . we now fixed getAddressHash() so this logic is not needed
|
|
//if ( p->m_type == PT_CITY ) { // && (p->m_flags & PF_IS_ALIAS) ) {
|
|
// // convert hash to alias hash
|
|
// int64_t *newh = (int64_t *)g_aliases.getValue ( &h );
|
|
// // set that to h now
|
|
// if ( newh ) p->m_hash = *newh;
|
|
// // could not find this city in the table... strange
|
|
// return true;
|
|
//}
|
|
|
|
|
|
// done if not street
|
|
if ( p->m_type != PT_STREET ) return true;
|
|
|
|
// only use the purer hash if it is non-zero
|
|
if ( h1 ) p->m_hash = h1;
|
|
else if ( h2 ) p->m_hash = h2;
|
|
else p->m_hash = h2b;
|
|
|
|
// sanity check
|
|
//if ( p->m_hash == 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
p->m_streetNumHash = h3;
|
|
p->m_streetIndHash = h4;
|
|
|
|
// if we are a "fake" street
|
|
if ( p->m_flags2 & PLF2_IS_NAME )
|
|
// PROBLEM: the street "6201 San Antonio Dr NE" is matching the
|
|
// place name "San Antonio" so let's mix up "h" a little when
|
|
// we are using "place names" in place of the street
|
|
// ALSO, lets revert it back to "h" not "h1", since "h1" is
|
|
// probably zero since i added that extra "continue" above.
|
|
p->m_hash = h ^ 0x123456;
|
|
|
|
// . sanity check
|
|
// . no! the word "The" has a hash of 0, and we don't add it
|
|
// from the caller's point
|
|
//if ( p->m_hash == 0LL ) { char *xx=NULL;*xx=0; }
|
|
|
|
// done if a fake street
|
|
if ( p->m_flags2 & PLF2_IS_NAME ) return true;
|
|
|
|
// done if street was not a "pobox street"
|
|
if ( to_lower_a(wptrs[a][0])!='p' ) return true;
|
|
|
|
// assume none
|
|
int32_t k = -1;
|
|
|
|
// "p o box 123"
|
|
if ( a + 6 < nw &&
|
|
wids[a ] == h_p &&
|
|
wids[a+2] == h_o &&
|
|
wids[a+4] == h_box &&
|
|
is_digit(wptrs[a+6][0]) )
|
|
k = a + 6;
|
|
// "p o box 123"
|
|
if ( a + 6 < nw &&
|
|
wids[a ] == h_post &&
|
|
wids[a+2] == h_office &&
|
|
wids[a+4] == h_box &&
|
|
is_digit(wptrs[a+6][0]) )
|
|
k = a + 6;
|
|
// "po box 123"
|
|
if ( a + 4 < nw &&
|
|
wids[a ] == h_po &&
|
|
wids[a+2] == h_box &&
|
|
is_digit(wptrs[a+4][0]) )
|
|
k = a + 4;
|
|
// "p.o. 81255"
|
|
if ( a + 4 < nw &&
|
|
wids[a ] == h_p &&
|
|
wids[a+2] == h_o &&
|
|
is_digit(wptrs[a+4][0]) )
|
|
k = a + 4;
|
|
// "p o b 81255"
|
|
if ( a + 6 < nw &&
|
|
wids[a ] == h_p &&
|
|
wids[a+2] == h_o &&
|
|
wids[a+4] == h_b &&
|
|
is_digit(wptrs[a+6][0]) )
|
|
k = a + 6;
|
|
|
|
// not a po box i guess
|
|
if ( k == -1 ) return true;
|
|
|
|
// xor it in along with h_po
|
|
p->m_hash = h_po ^ wids[k];
|
|
|
|
return true;
|
|
}
|
|
|
|
// File-scope (static) hash tables.
// s_lc is probed by word id in Addresses::set2() via
// s_lc.isInTable(&wids[j]); a hit there makes the word count as a stop word.
// NOTE(review): where and how s_lc gets populated is not visible in this
// part of the file -- confirm before relying on its contents.
static HashTableX s_lc;
|
|
//static char s_lcbuf[2000];
|
|
// NOTE(review): no use of s_jobTable is visible in this part of the file;
// confirm what it holds and who reads it before changing it.
static HashTableX s_jobTable;
|
|
|
|
// NOTE(review): presumably an upper bound on the number of alphanumeric
// words allowed in a place name -- no use is visible in this part of the
// file, so confirm at its use sites.
#define MAX_ALNUMS_IN_NAME 16
|
|
|
|
// . called from above
|
|
// . returns false and sets g_errno on error
|
|
bool Addresses::set2 ( ) {
|
|
// sanity check
|
|
if ( ! s_init ) { char *xx=NULL; *xx=0; }
|
|
|
|
bool printed = false;
|
|
|
|
// int16_tcuts
|
|
int32_t nw = m_words->getNumWords();
|
|
// msg13 provides a NULL sections ptr. it can't set them for speed!
|
|
// it is the spider compression proxy...
|
|
Section **sp = NULL;
|
|
if ( m_sections ) sp = m_sections->m_sectionPtrs;
|
|
// int16_tcut
|
|
//Sections *ss = m_sections;
|
|
// reset # of addresses we got
|
|
//m_na = 0;
|
|
// and streets
|
|
//m_ns = 0;
|
|
// and cities, states, zips
|
|
//m_np = 0;
|
|
|
|
// place mem and street mem and address mem
|
|
m_pm.reset();
|
|
m_sm.reset();
|
|
m_am.reset();
|
|
|
|
// init them. poolSize=5000.initnumpoolptrs=300.initnumplaceptrs=3000
|
|
m_pm.init(15000,300,3000,NULL,0,m_niceness);
|
|
m_sm.init(15000,300,3000,NULL,0,m_niceness);
|
|
m_am.init(15000,300,3000,NULL,0,m_niceness);
|
|
|
|
// . inherit from contact info page ONLY IF NO OTHERS
|
|
// . tag format = "city=x;adm1=*;adm2=*;country=*"
|
|
// . get up to 10 addresses from the contact info
|
|
Address da[10];
|
|
// init
|
|
int32_t dc = 0;
|
|
// first address is the empty one
|
|
memset ( &da[0] , 0 , sizeof(Address) );
|
|
// skip it
|
|
dc++;
|
|
|
|
// get contact info addresses, use their city/state for our addresses
|
|
int32_t tt = getTagTypeFromStr ( "contactaddress" );
|
|
Tag *tag = NULL;
|
|
// . taken from TagRec::getTag() function
|
|
// . Msg13.cpp does not have tag..
|
|
if ( m_gr ) tag = m_gr->getFirstTag();
|
|
// loop over all contact info addresses in the TagRec
|
|
for ( ; tag && dc < 10 ; tag = m_gr->getNextTag(tag) ){
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// . skip if not a "address" tag (ci=contactInfo)
|
|
// . no, now these are venue default addresses
|
|
if ( tag->m_type != tt ) continue;
|
|
// get str
|
|
char *str = tag->getTagData();
|
|
// reserve mem for it
|
|
|
|
// . set address, da[dc], from tag "tag"
|
|
// . flags to OR into Place::m_bits
|
|
if(!setFromStr(&da[dc],str,PLF_FROMTAG,&m_pm,m_niceness))
|
|
return false;
|
|
// if it was a latlon only address, just skip it for now
|
|
// because i'm not sure what the effects will be. plus its
|
|
// m_adm1 and m_city are typically NULL!!
|
|
if ( da[dc].m_flags3 & AF2_LATLON )
|
|
continue;
|
|
// check it out
|
|
// . this just means it was an AF2_LATLON but we were not
|
|
// able to set that because it has the foreign state
|
|
// and city and country set.
|
|
//if ( ! da[dc].m_adm1->m_hash ) { char *xx=NULL;*xx=0; }
|
|
if ( ! da[dc].m_adm1->m_hash ) continue;
|
|
// advance
|
|
dc++;
|
|
}
|
|
|
|
/*
|
|
// . inherit from what abyznewslinks.com says about our place
|
|
// . tag format = "city=x;adm1=*;adm2=*;country=*"
|
|
if ( ( tag = m_gr->getTag("abyznewslinks.address") ) &&
|
|
// skip if not a "address" tag (ci=contactInfo)
|
|
tag->m_type == tt ) {
|
|
// get str
|
|
char *str = tag->m_data;
|
|
// . set address, da[dc], from tag "tag"
|
|
// . flags to OR into Place::m_bits
|
|
if ( ! setFromStr ( &da[dc] , str,PLF_FROMTAG,m_niceness))
|
|
return false;
|
|
// advance
|
|
dc++;
|
|
}
|
|
*/
|
|
|
|
// now use the default venue address, should be more accurate?
|
|
tt = getTagTypeFromStr ( "venueaddress" );
|
|
// taken from TagRec::getTag() function
|
|
if ( m_gr ) tag = m_gr->getFirstTag();
|
|
// loop over all contact info addresses in the TagRec
|
|
for ( ; tag && dc < 10 ; tag = m_gr->getNextTag(tag) ){
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// . skip if not a "address" tag (ci=contactInfo)
|
|
// . no, now these are venue default addresses
|
|
if ( tag->m_type != tt ) continue;
|
|
// get str
|
|
char *str = tag->getTagData();
|
|
// . set address, da[dc], from tag "tag"
|
|
// . flags to OR into Place::m_bits
|
|
if(!setFromStr(&da[dc],str,PLF_FROMTAG,&m_pm,m_niceness))
|
|
return false;
|
|
// if it was a latlon only address, just skip it for now
|
|
// because i'm not sure what the effects will be. plus its
|
|
// m_adm1 and m_city are typically NULL!!
|
|
if ( da[dc].m_flags3 & AF2_LATLON )
|
|
continue;
|
|
// check it out
|
|
// . this just means it was an AF2_LATLON but we were not
|
|
// able to set that because it has the foreign state
|
|
// and city and country set.
|
|
//if ( ! da[dc].m_adm1->m_hash ) { char *xx=NULL;*xx=0; }
|
|
if ( ! da[dc].m_adm1->m_hash ) continue;
|
|
// advance
|
|
dc++;
|
|
// stop it
|
|
break;
|
|
}
|
|
|
|
// let's use the meta description as well.
|
|
// should get jonson gallery on collectorsguide.com
|
|
//char *md = m_xd->getMetaDescription();
|
|
|
|
|
|
|
|
// . if section flag is one of these, ignore the words in it
|
|
// . google seems to index marquee, so i took SEC_MARQUEE out
|
|
// . SEC_HIDDEN applies to text and tags in style=display:none tags.
|
|
int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_HIDDEN|
|
|
SEC_NOSCRIPT;
|
|
|
|
//
|
|
//
|
|
// BEGIN STREET NAME IDENTIFICATION
|
|
//
|
|
//
|
|
|
|
// fill this array
|
|
//Place streets[MAX_STREETS];
|
|
//Place *streets = m_streets;
|
|
//int32_t qx = 0;
|
|
|
|
// the copyright symbol in utf8 (see Entities.cpp for the code)
|
|
char copy[3];
|
|
copy[0] = 0xc2;
|
|
copy[1] = 0xa9;
|
|
copy[2] = 0x00;
|
|
|
|
// int16_tcuts
|
|
Words *ww = m_words;
|
|
int64_t *wids = ww->getWordIds();
|
|
char **wptrs = ww->getWordPtrs();
|
|
int32_t *wlens = ww->getWordLens();
|
|
nodeid_t *tids = ww->getTagIds();
|
|
// . if section flag is one of these, ignore the words in it
|
|
// . google seems to index marquee, so i took SEC_MARQUEE out
|
|
// . SEC_HIDDEN applies to text and tags in style=display:none tags.
|
|
//int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_HIDDEN|
|
|
// SEC_NOSCRIPT;
|
|
// int16_tcut
|
|
wbit_t *bits = NULL;
|
|
if ( m_bits ) bits = m_bits->m_bits;
|
|
|
|
// does the word "at" preceed the potential address?
|
|
//bool atPreceeds = false;
|
|
// reset this position
|
|
int32_t alnumPos = -1;
|
|
// "b" of last street added
|
|
int32_t lastb = -1;
|
|
// previous word id
|
|
int64_t savedPrevWid = 0LL;
|
|
// scan the entire document
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// debug
|
|
//if ( wptrs[i][0]=='1' &&
|
|
// wptrs[i][1]=='3' &&
|
|
// wptrs[i][2]=='1' ) {
|
|
// char *xx=NULL;*xx=0; }
|
|
// skip if not an alnum word
|
|
if ( ! wids[i] ) {
|
|
if ( wlens[i] == 1 ) continue;
|
|
if ( wlens[i] > 5 ) continue;
|
|
if ( ! m_words->hasChar(i,'&') ) continue;
|
|
}
|
|
// skip if in a script section
|
|
if ( sp && sp[i] && (sp[i]->m_flags & badFlags) ) continue;
|
|
// stop if streets are maxed
|
|
//if ( m_ns >= MAX_STREETS ) break;
|
|
// record
|
|
int64_t prevWid = savedPrevWid;
|
|
// and update
|
|
savedPrevWid = wids[i];
|
|
// it's an alnum OR has " & " (see above)
|
|
if ( wids[i] ) alnumPos++;
|
|
// . if we are not outside the scope of previous street then
|
|
// keep going!
|
|
// . fixes "1025 1/2 Lomas Blvd" from picking up the substreet
|
|
// of "2 Lomas Blvd" which was causing an AF_AMBIGUOUS
|
|
if ( i < lastb ) continue;
|
|
|
|
// make this the end point
|
|
// quickly add po boxes
|
|
if ( to_lower_a(wptrs[i][0])=='p' ||
|
|
// sometimes they just have "box 27693" like on
|
|
// http://www.unm.edu/~willow/homeless/services.html
|
|
to_lower_a(wptrs[i][0])=='b' ) {
|
|
// assume none
|
|
int32_t j = -1;
|
|
// the hash
|
|
//int64_t poh = 0LL;
|
|
// "box 123"
|
|
if ( i + 2 < nw &&
|
|
wids[i ] == h_box &&
|
|
is_digit(wptrs[i+2][0]) ) {
|
|
j = i + 2;
|
|
}
|
|
// "p o box 123"
|
|
if ( i + 6 < nw &&
|
|
wids[i ] == h_p &&
|
|
wids[i+2] == h_o &&
|
|
wids[i+4] == h_box &&
|
|
is_digit(wptrs[i+6][0]) ) {
|
|
j = i + 6;
|
|
//poh = h_po ^ wids[j];
|
|
}
|
|
// "p o box 123"
|
|
if ( i + 6 < nw &&
|
|
wids[i ] == h_post &&
|
|
wids[i+2] == h_office &&
|
|
wids[i+4] == h_box &&
|
|
is_digit(wptrs[i+6][0]) ) {
|
|
j = i + 6;
|
|
//poh = h_po ^ wids[j];
|
|
}
|
|
// p o b 123
|
|
if ( i + 6 < nw &&
|
|
wids[i ] == h_p &&
|
|
wids[i+2] == h_o &&
|
|
wids[i+4] == h_b &&
|
|
is_digit(wptrs[i+6][0]) ) {
|
|
j = i + 6;
|
|
//poh = h_po ^ wids[j];
|
|
}
|
|
// "po box 123"
|
|
if ( i + 4 < nw &&
|
|
wids[i ] == h_po &&
|
|
wids[i+2] == h_box &&
|
|
is_digit(wptrs[i+4][0]) ) {
|
|
j = i + 4;
|
|
//poh = h_po ^ wids[j];
|
|
}
|
|
// "p.o. 81255"
|
|
if ( i + 4 < nw &&
|
|
wids[i ] == h_p &&
|
|
wids[i+2] == h_o &&
|
|
is_digit(wptrs[i+4][0]) ) {
|
|
j = i + 4;
|
|
//poh = h_po ^ wids[j];
|
|
}
|
|
// skip if no good
|
|
if ( j < 0 ) continue;
|
|
// int16_tcuts
|
|
int32_t a = i;
|
|
int32_t b = j+1;
|
|
// add the street
|
|
Place *street = (Place *)m_sm.getMem(sizeof(Place));
|
|
if ( ! street ) return false;
|
|
street->m_a = a;
|
|
street->m_b = b;
|
|
street->m_alnumA = alnumPos;
|
|
street->m_alnumB = alnumPos+(j-i+2)/2;
|
|
street->m_type = PT_STREET;
|
|
street->m_str = wptrs[i];
|
|
street->m_strlen = wptrs[j]+wlens[j]-wptrs[i];
|
|
//street->m_adm1[0] = 0;
|
|
//street->m_adm1[1] = 0;
|
|
street->m_adm1Bits= 0LL;
|
|
//street->m_crid = 0;
|
|
street->m_flags2 = 0;
|
|
street->m_bits = 0;
|
|
street->m_address = NULL;
|
|
street->m_alias = NULL;
|
|
//street->m_hash = poh;
|
|
street->m_streetNumHash = wids[j];
|
|
street->m_streetIndHash = h_po;
|
|
// prevent overlap with next street
|
|
lastb = street->m_b;
|
|
// . need to know this for getting place name
|
|
// . place name must also be in upper case if po box is
|
|
if ( is_upper_a(wptrs[i][0]) )
|
|
street->m_bits |= PLF_HAS_UPPER;
|
|
// and note that it is a po box so Events.cpp can
|
|
// exclude it as an event location
|
|
street->m_flags2 |= PLF2_IS_POBOX;
|
|
// set its m_hash member
|
|
setHashes ( street , m_words , m_niceness );
|
|
// set some bits
|
|
for ( int32_t k = a ; bits && k < b ; k++ )
|
|
bits[k] |= D_IS_IN_STREET;
|
|
// advance
|
|
//m_ns++;
|
|
// stop if overflowing
|
|
//if ( m_ns >= MAX_STREETS ) break;
|
|
// advance, no! this fux up alnumPos... use lastb
|
|
//i = j;
|
|
// to next
|
|
//continue;
|
|
}
|
|
|
|
//
|
|
// we might be a street intersection!
|
|
//
|
|
bool hasAmp = m_words->hasChar(i,'&') ;
|
|
if ( wids[i] == h_and || hasAmp ) {
|
|
//if ( m_words->hasChar(i,'&') ) {
|
|
// save it
|
|
int32_t old = m_sm.getNumPtrs();
|
|
// use this
|
|
int32_t alnumPosArg = alnumPos;
|
|
// modify alnumPos if we are amp so it doesn't double
|
|
// count the word before the ampersand!
|
|
if ( hasAmp ) alnumPosArg++;
|
|
//m_ns = m_ns;
|
|
if ( ! addIntersection(i,alnumPosArg) )
|
|
return false;
|
|
/*
|
|
// show it
|
|
int32_t a = i - 8;
|
|
int32_t b = i + 8;
|
|
if ( a < 0 ) a = 0;
|
|
if ( m_ns != old ) {
|
|
a = m_streets[m_ns-1].m_a;
|
|
b = m_streets[m_ns-1].m_b;
|
|
}
|
|
char *str = m_wptrs[a];
|
|
int32_t ss = m_words->getStringSize ( a , b );
|
|
SafeBuf pp;
|
|
char c = str[ss];
|
|
str[ss] = 0;
|
|
char *gs = "bad";
|
|
if ( m_ns != old ) gs = "GOOD";
|
|
log("intersect: %s \"%s\"", gs,str);
|
|
str[ss] = c;
|
|
*/
|
|
//m_ns = m_ns;
|
|
int32_t ns = m_sm.getNumPtrs();
|
|
// if no intersection added, keep on going
|
|
if ( ns == old ) continue;
|
|
// keep going if not a street before it either
|
|
if ( ns <= 1 ) continue;
|
|
// get it and street before it
|
|
Place *s1 = (Place *)m_sm.getPtr(ns-1);
|
|
Place *s2 = (Place *)m_sm.getPtr(ns-2);
|
|
// get prev two streets
|
|
if ( s2->m_a > s1->m_a ) {
|
|
// i saw this for
|
|
// "Corner of 1551 State Route 232 and
|
|
// State Route 52". the street at m_ns-2
|
|
// was "1551 State Route 232" and the
|
|
// intersection street started at the word
|
|
// "Corner", so its m_a was less than...
|
|
// so in this case, let's simply disregard
|
|
// this intersection and not core.
|
|
// CAUTION. some m_bits are still set to
|
|
// D_IS_IN_STREET though...
|
|
// url was www.visitclermontohio.com/events.htm
|
|
//m_ns = old;
|
|
m_sm.setNumPtrs ( old );
|
|
//char *xx=NULL;*xx=0; }
|
|
continue;
|
|
}
|
|
// do not overlap streets!
|
|
//i = streets[m_ns-1].m_b - 1;
|
|
lastb = s1->m_b;
|
|
}
|
|
|
|
|
|
// we must now start with a number since we are just doing
|
|
// addresses in the usa, BUT i am now allowing "PO Box 1234"
|
|
// to be a valid street address
|
|
if ( ! is_digit(wptrs[i][0]) && wids[i] != h_one ) continue;
|
|
// if we are h_one we must be capitalized!
|
|
if ( wids[i] == h_one && wptrs[i][0] != 'O' ) continue;
|
|
// must not be in a date!
|
|
if ( bits &&
|
|
(bits[i] & D_IS_IN_DATE) &&
|
|
// noon street?
|
|
wids[i] != h_daily &&
|
|
wids[i] != h_noon &&
|
|
wids[i] != h_midnight )
|
|
continue;
|
|
// a '#' sign can not preceed us
|
|
// "KELLY S #7 JUAN TABO 1418 JUAN TABO NE, ..."
|
|
// . no! messes up "#3515 Berkeley Place NE"
|
|
//if ( i-1 >= 0 && wptrs[i ][-1]=='#' ) continue;
|
|
//if ( i-1 >= 0 && wptrs[i-1][ 0]=='#' ) continue;
|
|
// do not split hyphens
|
|
if ( i-2 >= 0 &&wptrs[i-1][0]=='-'&&wlens[i-1]==1&&wids[i-2])
|
|
continue;
|
|
// do not split periods like '1."5 miles west"'
|
|
if ( i-1 >= 0 && wptrs[i-1][0]=='.'&&wlens[i-1]==1 )
|
|
continue;
|
|
// fix "top X", that is not a street name!
|
|
if ( i-2 >= 0 && wids[i-2] == h_top )
|
|
continue;
|
|
// fix "route 66 casino" (highway 32 hotdogs) etc.
|
|
if ( i-2 >= 0 && wids[i-2] == h_route )
|
|
continue;
|
|
if ( i-2 >= 0 && wids[i-2] == h_rte )
|
|
continue;
|
|
// . fix 'highway "14 on the sandia crest road"'
|
|
// . yeah, the "14" is not a street address
|
|
if ( i-2 >= 0 && wids[i-2]==h_highway )
|
|
continue;
|
|
// fix 'hwy "14 on the sandia crest road"'
|
|
if ( i-2 >= 0 && wids[i-2]==h_hwy )
|
|
continue;
|
|
// fix 'hwy "14 on the sandia crest road"'
|
|
if ( i-2 >= 0 && wids[i-2]==h_hiway )
|
|
continue;
|
|
// fix "8600 West Bryn Mawr Avenue, Suite 920-N, Chicago, IL"
|
|
if ( prevWid == h_suite )
|
|
continue;
|
|
// and "county road" i guess
|
|
if ( i-2 >= 0 && wids[i-2]==h_cr )
|
|
continue;
|
|
// and "state road/route 14" too i guess
|
|
if ( i-4 >= 0 &&
|
|
(wids[i-4]==h_state ||
|
|
wids[i-4]==h_cnty ||
|
|
wids[i-4]==h_cty ||
|
|
wids[i-4]==h_county ) &&
|
|
(wids[i-2]==h_road ||
|
|
wids[i-2]==h_rd ||
|
|
wids[i-2]==h_rt ||
|
|
wids[i-2]==h_rte ||
|
|
wids[i-2]==h_route ) )
|
|
continue;
|
|
// . skip if an an "open" section
|
|
// . cored on http://www.abqtango.org/current.html
|
|
// . 'continue' was causing us to miss 4915 hawkins street
|
|
// for that url, so i commented out
|
|
//if ( sp[i]->m_wordEnd == -1 ) {
|
|
// char *xx=NULL;*xx=0;
|
|
// continue;
|
|
//}
|
|
// sanity check. make sure its the right section
|
|
//if ( i >= sp[i]->m_wordEnd ) {char*xx=NULL;*xx=0;}
|
|
// sanity check
|
|
if ( sp && i < sp[i]->m_a ) {char*xx=NULL;*xx=0;}
|
|
// are we a stop word?
|
|
//bool isStop = wlens[i] <=1 || ww->isQueryStopWord(i);
|
|
// are we cap?
|
|
//bool isCap = ww->isCapitalized(i);
|
|
// do not start with uncapitalized stop word
|
|
//if ( isStop && ! isCap ) continue;
|
|
// never start with "At"
|
|
//if ( wids[i] == h_at ) { atPreceeds = true; continue; }
|
|
// count the number of numbers
|
|
int32_t nums = 0;
|
|
// are we delimited on the left end?
|
|
//bool leftEnd = false;
|
|
// keep an accumulative hash of all the wids in the phrase
|
|
bool firstWasDir = false; // 1st word is a direction?
|
|
bool hadCornerDir = false;
|
|
char uc = -1; // are we capitalized?
|
|
int32_t alphaCount = 0;
|
|
int32_t indCountStreet = 0;
|
|
int32_t indCountDir = 0;
|
|
int32_t stopCount = 0;
|
|
int32_t numCount = 0;
|
|
bool firstWordIsNum = false;
|
|
bool lastWasNum = false;
|
|
bool lastWasDir = false;
|
|
int32_t commaCount = 0;
|
|
int32_t alnumsInPhrase = 0;
|
|
int64_t lastIndStreetHash = 0LL;
|
|
// hash of the non indicator alpha words in street name
|
|
//int64_t h1 = 0;
|
|
// . includes hash of directional indicators
|
|
// . we only use this if street name is a directional indicator
|
|
//int64_t h2 = 0;
|
|
//int64_t h2b = 0;
|
|
//int64_t h3 = 0;
|
|
//int64_t h4 = 0;
|
|
// word id of previous word
|
|
//int64_t pi = 0LL;
|
|
// punct right before us is a left bookend
|
|
//if ( i-1 >= 0 && wlens[i-1] >= 2 ) leftEnd = true;
|
|
//if ( i-1 >= 0 && wptrs[i-1][0] != ' ' &&
|
|
// getUtf8CharSize(wptrs[i-1])==1) leftEnd = true;
|
|
// if we are a number that is good too
|
|
//if ( is_digit(wptrs[i][0]) ) leftEnd = true;
|
|
// or a number is before us
|
|
//if ( i-1 >= 0 && is_digit(wptrs[i-1][0]) ) leftEnd = true;
|
|
// or tag is before us, no alnumword in between us and the tag
|
|
//if ( i-1 >= 0 && tids[i-1] ) leftEnd = true;
|
|
//if ( i-2 >= 0 && tids[i-2] ) leftEnd = true;
|
|
// if we are cap'd and word before us is not let that be a
|
|
// delimiter as well
|
|
//if (i-2>= 0 && isCap && wids[i-2] &&!ww->isCapitalized(i-2))
|
|
// leftEnd = true;
|
|
// need a delimiter on the left
|
|
//if ( ! leftEnd ) { atPreceeds = false; continue; }
|
|
// save it
|
|
int32_t ns_stack = m_sm.getNumPtrs();//m_ns;
|
|
// a flag for "1025 1/2 Lomas Blvd NE..."
|
|
int32_t fractionj = -1;
|
|
// "620-624 Central Ave SW." (El Rey) ?
|
|
bool hasRange = false;
|
|
// fix for "4909-15 Hawkins NE" for ceder.net
|
|
bool hasHyphenAddress = false;
|
|
// reset this
|
|
int32_t lastSpecialj = -1;
|
|
// loop over it
|
|
for ( int32_t j = i ; j < nw ; j++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// we can never contain a tag
|
|
if ( tids[j] ) {
|
|
// skip if <sup>
|
|
if ( tids[j] == TAG_SUP ) continue;
|
|
if ( tids[j] == (TAG_SUP|BACKBIT) ) continue;
|
|
|
|
// fix "1024 4th st sw <span>edit</span>" for
|
|
// mapquest.com url, but carefule, i think
|
|
// a trumba url or something uses spans
|
|
// within its addresses
|
|
if ( (indCountDir || indCountStreet) &&
|
|
tids[j] == TAG_SPAN )
|
|
break;
|
|
|
|
// skip if non-breaking tag
|
|
if ( ! isBreakingTagId(tids[j]) )
|
|
continue;
|
|
// . allow br tags since microsoft front page
|
|
// . no! this is causing the zip code from
|
|
// a previous address to be used as the
|
|
// street address for the name of a business
|
|
// for www.enewsbuilder.net
|
|
// . well then at least allow it for
|
|
// "14 s.<br>2nd street"???? dunno...
|
|
//if ( tids[j] == TAG_BR )
|
|
// continue;
|
|
// allow xml tags
|
|
// . NO! this may help gwair.org because they
|
|
// have stupid xml tags in between addresses
|
|
// but it hurts trumba.com:
|
|
// "86454011</guid>\r\n\t\t\t
|
|
// <xCal:summary>9th Annual Thanksgiving..."
|
|
// because most people do not do this!
|
|
//if ( tids[j] == TAG_XMLTAG )
|
|
// continue;
|
|
//if ( tids[j] == (TAG_XMLTAG|BACKBIT) )
|
|
// continue;
|
|
// otherwise, stop it
|
|
break;
|
|
}
|
|
// are we punctuation?
|
|
if ( ! wids[j] ) {
|
|
// single space is ok
|
|
if (wptrs[j][0]==' '&&wlens[j]==1) continue;
|
|
// double space is ok
|
|
if (wptrs[j][0]==' '&&wptrs[j][1]==' '&&
|
|
wlens[j]==2) continue;
|
|
// period only after abbreviation
|
|
if ( wptrs[j][0] == '.' && j > 0 &&
|
|
isAbbr(wids[j-1])&&
|
|
// watch out for "4477 9TH AVE. SE"
|
|
// from boe.sandovalcountynm.gov
|
|
m_words->isSpaces2(j,1) )
|
|
//wptrs[j][1] == ' ' && wlens[j]==2 )
|
|
continue;
|
|
// . period after a single letter as well
|
|
// . N. M.
|
|
if ( wptrs[j][0] == '.' && j > 0 &&
|
|
wlens[j-1]==1 &&
|
|
// fix "8. wall street"
|
|
!is_digit(wptrs[j-1][0]) &&
|
|
wptrs[j][1] == ' ' &&
|
|
wlens[j]==2 ) continue;
|
|
// N.M.
|
|
if ( wptrs[j][0] == '.' && j > 0 &&
|
|
// fix 1."5 miles west"
|
|
!is_digit(wptrs[j-1][0]) &&
|
|
wlens[j-1]==1 && wlens[j]==1 ) continue;
|
|
// quote: The Noyes House 2525 "N" Avenue
|
|
// National
|
|
if (wptrs[j][0]=='\"'&&wptrs[j][1]==' ' &&
|
|
wlens[j]==2&&
|
|
// 'closer to 37"' is not a street name!
|
|
!is_digit(wptrs[j-1][0]))
|
|
continue;
|
|
if (wptrs[j][0]==' ' &&wptrs[j][1]=='\"'&&
|
|
wlens[j]==2) continue;
|
|
// punct mark: st. michael's drive
|
|
if (wptrs[j][0]=='\''&&wlens[j]==1) continue;
|
|
// mosby's run: utf8 apostrophe
|
|
if (wlens[j]==3&&
|
|
wptrs[j][0]==-30 &&
|
|
wptrs[j][1]==-128 &&
|
|
wptrs[j][2]==-103 )
|
|
continue;
|
|
// village of los ranchos growers' market
|
|
if (wptrs[j][0]=='\''&&wptrs[j][1]==' '&&
|
|
wlens[j]==2) continue;
|
|
// hyphens usually bad, but x-y is ok.
|
|
if(wptrs[j][0]=='-'&&wlens[j]==1&&j>0&&j+1<nw&&
|
|
ww->isAlpha(j-1)&&ww->isAlpha(j+1))continue;
|
|
// fix "3650-A Hwy 528..."
|
|
if(wptrs[j][0]=='-'&&wlens[j]==1&&j==i+1&&
|
|
j+1<nw&&wlens[j+1]==1&&
|
|
is_alpha_a(wptrs[j+1][0])) continue;
|
|
// "620-624 Central Ave SW." (El Rey)
|
|
if ( hasRange &&j==i+1 ) continue;
|
|
// fix for 4909-15 Hawkins NE" for ceder.net
|
|
if(j+1<nw&&
|
|
wlens[j+1]==2&&is_digit(wptrs[j+1][0])&&
|
|
wlens[j-1]>=4&&is_digit(wptrs[j-1][0]) ) {
|
|
hasHyphenAddress = true;
|
|
continue;
|
|
}
|
|
// sequence of whitespace is ok
|
|
int32_t k; for(k=0;k<wlens[j];k++)
|
|
if(!is_wspace_a(wptrs[j][k])) break;
|
|
if(k==wlens[j]) continue;
|
|
// '/' is ok if part of a fraction!
|
|
if( j == fractionj ) continue;
|
|
// . allow commas in foreign street addresses
|
|
// . brazil street address:
|
|
// "Rua Afonso Canargo, 805"
|
|
//if ( wptrs[j][0]==',' && wptrs[j][1]==' ' &&
|
|
// is_digit(wptrs[j][2]) &&
|
|
// j>0 && !is_digit(wptrs[j][-1]) ) {
|
|
// commaCount++;
|
|
// continue;
|
|
//}
|
|
//if ( wptrs[j][0]==' ' && wptrs[j][1]==',' &&
|
|
// is_digit(wptrs[j][2]) &&
|
|
// j>0 && !is_digit(wptrs[j][-1]) ) {
|
|
// commaCount++;
|
|
// continue;
|
|
//}
|
|
// . comma allowed only b4 directional indicatr
|
|
// . "131 Monroe St, NE"
|
|
// . no because we got a false positive:
|
|
// "1024 4th street, sw corner..."
|
|
// . ok, this is back again now! BUT... need
|
|
// to make sure a tag or city name follows it
|
|
// . crap, now we got
|
|
// "5305 Gibson, S.E. <b>Albuquerque ..."
|
|
// . shoot, also need to watch out for
|
|
// "Wisconsin Ave., NW"
|
|
if ( j+3 >= nw ) break;
|
|
bool commaAfter = false;
|
|
if ( wptrs[j][0]==',' )
|
|
commaAfter = true;
|
|
if ( wptrs[j][0]=='.' && wptrs[j][1]==',')
|
|
commaAfter = true;
|
|
if ( wptrs[j][0]==' ' && wptrs[j][1]==',')
|
|
commaAfter = true;
|
|
if ( ! commaAfter ) break;
|
|
char gotDir = 0;
|
|
if ( wids[j+1] == h_ne ) gotDir = 2;
|
|
if ( wids[j+1] == h_nw ) gotDir = 2;
|
|
if ( wids[j+1] == h_se ) gotDir = 2;
|
|
if ( wids[j+1] == h_sw ) gotDir = 2;
|
|
if ( wids[j+1] == h_n&&wids[j+3]==h_e)gotDir=4;
|
|
if ( wids[j+1] == h_n&&wids[j+3]==h_w)gotDir=4;
|
|
if ( wids[j+1] == h_s&&wids[j+3]==h_e)gotDir=4;
|
|
if ( wids[j+1] == h_s&&wids[j+3]==h_w)gotDir=4;
|
|
if ( ! gotDir ) break;
|
|
// do not breach
|
|
if ( j+gotDir >= nw ) continue;
|
|
// its great if tag follows the dir indicator
|
|
if ( tids[j+gotDir] ) continue;
|
|
// do not breach
|
|
if ( j+gotDir+1 >= nw ) continue;
|
|
// or a punct then a tag
|
|
if ( tids[j+gotDir+1] ) continue;
|
|
// fix for "700 Louisiana, SE 87108" for
|
|
// unm.edu url
|
|
if(is_digit(m_wptrs[j+gotDir+1][0]))continue;
|
|
// ok, a cap word must follow
|
|
if ( ! is_upper_utf8 (wptrs[j+gotDir+1]))break;
|
|
// we are good
|
|
continue;
|
|
// otherwise, stop, we hit bad punct that
|
|
// can not be included in a street address
|
|
//break;
|
|
}
|
|
// . otherwise we are alphanumeric
|
|
// . more than 10 is too many for a street
|
|
if ( alnumsInPhrase++ >= 10 ) break;
|
|
// one common is enough for a street address
|
|
if ( commaCount >= 2 ) break;
|
|
// . forbidden words
|
|
// . fixes "less than ; 1 mile away ; abq nm"
|
|
if ( wids[j] == h_away ) break;
|
|
// showing "39 results near" Albuquerque, NM
|
|
if ( wids[j] == h_results ) break;
|
|
// "3 Ave, E 144 To E 145 Sts"
|
|
if ( j==i+2 && wids[j] == h_to ) break;
|
|
// "11 Ave" implies "11th avenue"
|
|
if ( j==i+2 && wids[j] == h_ave ) break;
|
|
if ( j==i+2 && wids[j] == h_avenue ) break;
|
|
// "24 st to crescent st"
|
|
// www.nycgovparks.org/facilities/playgrounds has
|
|
// a ton of street formations describing park
|
|
// boundaries. so fix those:
|
|
if ( j==i+2 &&
|
|
j+2<nw &&
|
|
(wids[j] == h_st ||
|
|
wids[j] == h_sts ||
|
|
wids[j] == h_street ||
|
|
wids[j] == h_streets ||
|
|
wids[j] == h_ave ||
|
|
wids[j] == h_avenue ||
|
|
wids[j] == h_road ||
|
|
wids[j] == h_rd ) &&
|
|
(wids[j+2] == h_bet ||
|
|
wids[j+2] == h_between ||
|
|
wids[j+2] == h_btwn ||
|
|
wids[j+2] == h_to ||
|
|
wids[j+2] == h_at ) )
|
|
break;
|
|
// "90 And E"
|
|
if ( j==i+2 && wids[j] == h_and ) break;
|
|
// 124 st btwn 5 ave"
|
|
if ( wids[j] == h_btwn ) break;
|
|
// are we a stop word?
|
|
//bool isStopWord=wlens[j]<=1 ||ww->isQueryStopWord(j);
|
|
bool isStopWord=wlens[j]<=1 ||s_lc.isInTable(&wids[j]);
|
|
// treat this as a stop word, fixes
|
|
// "2001 E 7<sup>th</sup>"
|
|
if ( lastWasNum ) {
|
|
if ( wids[j] == h_th ) isStopWord = true;
|
|
if ( wids[j] == h_st ) isStopWord = true;
|
|
if ( wids[j] == h_nd ) isStopWord = true;
|
|
if ( wids[j] == h_rd ) isStopWord = true;
|
|
}
|
|
// are we upper or not?
|
|
bool upper = is_upper_utf8(wptrs[j]);
|
|
// do we have an upper or lower case word?
|
|
if ( uc == -1 && ! is_digit(wptrs[j][0]) ) {
|
|
if ( upper ) uc = 1;
|
|
else if ( ! isStopWord ) uc = 0;
|
|
}
|
|
// mixed case? if so stop!
|
|
if ( ! isStopWord &&
|
|
! is_digit(wptrs[j][0])&&
|
|
upper != uc ) {
|
|
// . fix "123 Wyoming ave."
|
|
// . fix "123 Wyoming ne"
|
|
IndDesc *id;
|
|
id=(IndDesc *)g_indicators.getValue(&wids[j]);
|
|
// set some flags based on indFlags
|
|
if ( ! id ) break;
|
|
// must be "avenue" or "ne" etc.
|
|
if ( ! (id->m_bit & IND_STREET) &&
|
|
! (id->m_bit & IND_DIR) )
|
|
break;
|
|
}
|
|
// if lower case stop word of two letters or more
|
|
// leads then do not allow that
|
|
// "1950 in New York, NY"
|
|
if ( isStopWord && wlens[j]>=2 && !upper && j==i+2 )
|
|
break;
|
|
// "7 days a week"
|
|
if ( wids[j]==h_days && j==i+2 )
|
|
break;
|
|
// "2 blocks north"
|
|
if ( wids[j]==h_blocks && j==i+2 )
|
|
break;
|
|
// "1 block north"
|
|
if ( wids[j]==h_block && j==i+2 )
|
|
break;
|
|
// "90 miles north"
|
|
if ( wids[j]==h_miles && j==i+2 )
|
|
break;
|
|
// "1 hour ago"
|
|
if ( wids[j]==h_hour && j==i+2 )
|
|
break;
|
|
if ( wids[j]==h_hr && j==i+2 )
|
|
break;
|
|
// "8 hours ..."
|
|
if ( wids[j]==h_hours && j==i+2 )
|
|
break;
|
|
if ( wids[j]==h_hrs && j==i+2 )
|
|
break;
|
|
// "2 mi north"
|
|
if ( wids[j]==h_mi && j==i+2 )
|
|
break;
|
|
// "cross 8 mile road"
|
|
if ( wids[j]==h_mile && j==i+2 )
|
|
break;
|
|
// "90 kilometers north"
|
|
if ( wids[j]==h_kilometers && j==i+2 )
|
|
break;
|
|
// "90 km north"
|
|
if ( wids[j]==h_km && j==i+2 )
|
|
break;
|
|
// "5 reviews"
|
|
if ( wids[j]==h_reviews && j==i+2)
|
|
break;
|
|
// 18 year(s) old
|
|
if ( (wids[j] == h_year ||
|
|
wids[j] == h_years ||
|
|
wids[j] == h_yr ||
|
|
wids[j] == h_yrs ) && j==i+2 )
|
|
break;
|
|
// this is not a street:
|
|
// "[copyright] 2008 The E.W. Scripps Co."
|
|
if ( j==i && i-1>0 && !tids[i-1] && !wids[i-1] &&
|
|
gb_strncasestr(wptrs[i-1],wlens[i-1],copy) )
|
|
break;
|
|
// this is not a street:
|
|
// "[copyright] 1997 - 2009 Albuquerque Journal"
|
|
if ( j==i && i-4>0 && is_digit(wptrs[i-2][0]) &&
|
|
gb_strncasestr(wptrs[i-1],wlens[i-1],copy) )
|
|
break;
|
|
|
|
// assume not
|
|
bool isDir = false;
|
|
bool isStreetInd = false;
|
|
// int16_tcut
|
|
bool isNum = ww->isNum2(j);
|
|
// set "lastWasNum"
|
|
if ( isNum ) lastWasNum = true;
|
|
else lastWasNum = false;
|
|
// treat this as a number too!
|
|
if ( wids[j] == h_one ) isNum = true;
|
|
// are we a number? (might also be "13a")
|
|
if ( isNum ) {
|
|
// . only one number per phrase?
|
|
// . NO! "2860 state highway 14 N.". needs 2!
|
|
if ( ++nums >= 3 ) break;
|
|
// if a $ preceeds, that is bad!
|
|
if ( j-1>=0 && wptrs[j][-1]=='$' ) break;
|
|
// . or break in front
|
|
// . was messing up "Elk Lodge #929\n
|
|
// 1720 N Montana Ave" so i added the tids
|
|
// check
|
|
// . i took this out because of
|
|
// "Albertsons #903 4300 ridge crest..."
|
|
// for http://www.estrelladelnortevineyard.
|
|
// com/SFV_retloc.php
|
|
//if(j-2>=0&&ww->isNum(j-2)&&!tids[j-1]&&
|
|
// !ww->hasChar(j-1,','))
|
|
// break;
|
|
// . filter "23,000 years ago"
|
|
// . filter "ages 8-16"
|
|
// . filter "ages 8 - 16"
|
|
// . filter "june 3-31"
|
|
// . filter "june 3 - 31"
|
|
// . filter "tuesday 3 - 5"
|
|
// . get first number, make it word #f
|
|
if ( wlens[j]==3 && j-2>=0 &&
|
|
is_digit(wptrs[j-2][0])&&
|
|
wlens[j-2]<=3 &&
|
|
(wptrs[j-1][0]=='-'||wptrs[j-1][0]==','||
|
|
wptrs[j-1][1]=='-') ) {
|
|
// "620-624 Central Ave SW." (El Rey)
|
|
// if word was not a number before us
|
|
if ( ! hasRange ) break;
|
|
if ( j != i+2 ) break;
|
|
}
|
|
if ( wlens[j]<=3 && j+2<nw &&
|
|
is_digit(wptrs[j+2][0]) &&
|
|
wlens[j+2]==3 &&
|
|
wlens[j+1]==1 &&
|
|
(wptrs[j+1][0]=='-'||wptrs[j+1][0]==','||
|
|
wptrs[j+1][1]=='-') ) {
|
|
// "620-624 Central Ave SW." (El Rey)
|
|
// if word was not a number before us
|
|
if ( j != i ) break;
|
|
if ( wptrs[j+1][0]==',') break;
|
|
int32_t a = ww->getAsLong(j);
|
|
int32_t b = ww->getAsLong(j+2);
|
|
if ( a >= b ) break;
|
|
if ( b - a > 10 ) break;
|
|
// i guess it is ok now
|
|
hasRange = true;
|
|
}
|
|
// no years.
|
|
int32_t n = ww->getAsLong(j);
|
|
// possible possessive year?
|
|
if ( n>=1980 && n<=2030 &&
|
|
j+1<nw && wptrs[j+1][0]=='\'')
|
|
break;
|
|
// year ending in s (1960s)
|
|
if(n>=1980&&n<=2030&&wptrs[j][wlens[j]-1]=='s')
|
|
break;
|
|
// count it
|
|
numCount++;
|
|
// and if we are first
|
|
if ( i == j ) firstWordIsNum = true;
|
|
// use for street num hash
|
|
//if ( nums == 1 ) h3 = wids[j];
|
|
}
|
|
// inc this count if not a number
|
|
else alphaCount++;
|
|
// time indicator?
|
|
//if ( wids[j] == h_am ) break;
|
|
//if ( wids[j] == h_pm ) break;
|
|
//if ( wids[j] == h_a && j+2<nw &&wids[j+2]==h_m)break;
|
|
//if ( wids[j] == h_p && j+2<nw &&wids[j+2]==h_m)break;
|
|
|
|
// break if we hit a suite indicator
|
|
if ( wids[j] == h_suite ) break;
|
|
if ( wids[j] == h_ste ) break;
|
|
|
|
// does a single letter or number follow "room"?
|
|
bool numFollows = false;
|
|
if ( j+2<nw && is_digit(wptrs[j+2][0]))numFollows=true;
|
|
// a single letter counts as a number too!
|
|
if (j+2<nw&&wids[j+2] && wlens[j+2]==1)numFollows=true;
|
|
// or ends in a number (like "A1")
|
|
if ( j+3<nw &&is_digit(wptrs[j+3][-1]))numFollows=true;
|
|
|
|
// these are like suites but need a number or
|
|
// single letter after them
|
|
if ( ( wids[j] == h_unit ||
|
|
wids[j] == h_bldg ||
|
|
wids[j] == h_bld ||
|
|
wids[j] == h_building ||
|
|
wids[j] == h_room ||
|
|
wids[j] == h_pier ||
|
|
wids[j] == h_rm ) && numFollows )
|
|
break;
|
|
|
|
// does this number start a fraction?
|
|
// 1025 1/2 Lomas Boulevard North West, Albuquerque, NM
|
|
if ( isNum && numCount == 2 && j+2<nw &&
|
|
wlens[j] == 1 && wptrs[j+1][0]=='/' &&
|
|
wlens[j+1]==1 && ww->isNum(j+2) ) {
|
|
// ignore it kinda
|
|
numCount -= 2;
|
|
nums -= 2;
|
|
// allow the / to pass
|
|
fractionj = j+1;
|
|
}
|
|
|
|
// no back to back numbers allowed in street address
|
|
else if ( isNum && j+3<nw && ww->isNum(j+2) &&
|
|
// exception for "1025 1/2 Lomas Blvd..."
|
|
( wptrs[j+3][0]!='/' || wlens[j+3]!=1) &&
|
|
// exception for "4909-15 hawkins NE"
|
|
// for www.ceder.net
|
|
(j>1&&wptrs[j-1][0]=='-'&&wlens[j-1]==1&&
|
|
wlens[j]<=2&&wlens[j-2]>=4) &&
|
|
! hasRange )
|
|
break;
|
|
|
|
// street has 2 or less numbers though!
|
|
if ( numCount >= 3 ) break;
|
|
|
|
// . if we are the 2nd number in the street name
|
|
// we must follow a "highway" or "state route" or
|
|
// "state road" or such abbreviation...
|
|
// . if we are "3rd" that should not be considered a
|
|
// num so isNum should be false for that,
|
|
// but we might have 3<sup>rd</sup>
|
|
// . this screws ups "Corrales Office Plaza,
|
|
// 3611 NM 528 NW, Ste. B, ABQ 87114" and makes us
|
|
// think the road is "528 NW" and "3611 NM" is
|
|
// part of the place name
|
|
/*
|
|
if ( isNum && numCount == 2 ) {
|
|
// assume not ok!
|
|
bool ok = false;
|
|
// are we ok?
|
|
if ( i-2>=0 && wids[i-2]==h_hwy )
|
|
ok = true;
|
|
if ( i-2>=0 && wids[i-2]==h_highway )
|
|
ok = true;
|
|
if ( i-4>=0 &&
|
|
wids[i-4]==h_state &&
|
|
wids[i-2]==h_road )
|
|
ok = true;
|
|
if ( i-4>=0 &&
|
|
wids[i-4]==h_state &&
|
|
wids[i-2]==h_route )
|
|
ok = true;
|
|
// get next alnum word, should be
|
|
// the "th" in "4 th street" for example
|
|
int32_t nn = i + 2;
|
|
if ( nn<nw && tids[nn] ) nn++;
|
|
if ( nn<nw && !wids[nn] ) nn++;
|
|
if ( nn<nw && wids[nn]==h_st ) ok = true;
|
|
if ( nn<nw && wids[nn]==h_nd ) ok = true;
|
|
if ( nn<nw && wids[nn]==h_rd ) ok = true;
|
|
if ( nn<nw && wids[nn]==h_th ) ok = true;
|
|
if ( ! ok )
|
|
break;
|
|
}
|
|
*/
|
|
|
|
|
|
// . fix "4701 wyoming blvd. NE abq nm 87111"
|
|
// . watch out for "501 elizabeth st. S.E."
|
|
// . after dir pretty much stop
|
|
// . "204 bryn mawr drive north east" --> 5 --> 6
|
|
if ( indCountDir>0 && alphaCount >= 6 ) break;
|
|
// containing an indicator qualifies us.
|
|
IndDesc *id=(IndDesc *)g_indicators.getValue(&wids[j]);
|
|
// set some flags based on indFlags
|
|
if ( id && (id->m_bit & IND_STREET) ) {
|
|
// invalidate it if it is "8k run"
|
|
if ( wids[j] == h_run &&
|
|
j-2>0 &&
|
|
is_digit(wptrs[j-2][0]) &&
|
|
to_lower_a(wptrs[j-1][-1])=='k' )
|
|
break;
|
|
// otherwise count it
|
|
indCountStreet++;
|
|
isStreetInd = true;
|
|
// save it
|
|
lastIndStreetHash = wids[j];
|
|
// back up hash
|
|
//h2b <<= 1;
|
|
//h2b ^= wids[j];
|
|
}
|
|
if ( id && (id->m_bit & IND_DIR ) ) {
|
|
// cancel the 'S' indicator if potential
|
|
// apostrophe! "aug 17 burt's lounge"
|
|
// we do not want "17 burt's"
|
|
if ( wlens[j]==1&&
|
|
(wptrs[j][0]=='s' ||
|
|
wptrs[j][0]=='S' ) &&
|
|
j>1 && wptrs[j][-1]!=' ' )
|
|
id = NULL;
|
|
else {
|
|
// mix it up
|
|
//h2 <<= 1;
|
|
// include it in this
|
|
//h2 ^= wids[j];
|
|
}
|
|
}
|
|
// assume not
|
|
lastWasDir = false;
|
|
if ( id && (id->m_bit & IND_DIR ) ) {
|
|
indCountDir++;
|
|
isDir = true;
|
|
if ( alphaCount == 1 ) firstWasDir = true;
|
|
// se? ne? nw? sw?
|
|
if ( wlens[j] == 2 ) hadCornerDir = true;
|
|
// northeast? etc.
|
|
if ( wlens[j] >= 9 ) hadCornerDir = true;
|
|
lastWasDir = true;
|
|
}
|
|
|
|
// . fix "1024 4th st sw <span>edit</span>" for
|
|
// mapquest.com url
|
|
// . this caught "330 Tijeras Ave NW Ofc Albuquerque,"
|
|
// . and "1664 Bridge Boulevard Southwest Rea" but i
|
|
// don't know what ofc and rea mean??
|
|
// . crap we lost "10000 NW Coors Blvd" which is a
|
|
// type-o
|
|
//if ( hadCornerDir && ! id && alphaCount >= 2 )
|
|
// break;
|
|
|
|
// stop "KELLY S #7 JUAN TABO 1418 JUAN TABO NE"
|
|
// from giving "7 JUAN TABO 1418 JUAN TABO NE" street
|
|
// basically, do not allow a part of the street name
|
|
// to be after this 2nd number...
|
|
if ( numCount == 2 &&
|
|
! isNum &&
|
|
! isDir &&
|
|
! isStreetInd &&
|
|
! hasRange &&
|
|
! hasHyphenAddress &&
|
|
wids[j] != h_st &&
|
|
wids[j] != h_nd &&
|
|
wids[j] != h_rd &&
|
|
wids[j] != h_th )
|
|
break;
|
|
|
|
// get synonym of word id
|
|
//int64_t *swid = getSynonymWord ( &wids[j] , &pi );
|
|
// word id of previous word
|
|
//pi = wids[j];
|
|
// this too
|
|
//if ( id ) h4 = *swid;//wids[j];
|
|
// . update this.
|
|
// . exclude numbers from this!
|
|
// . allow other numbers if no alpha word before them!
|
|
// . exclude directional indicators from this
|
|
// . but allow directional indicators if right after
|
|
// the street number though
|
|
//if ( j > i &&
|
|
// ( ! isDir || j == i + 2 ) &&
|
|
// // commenting this out hurts "100 3/4 road"
|
|
// // but it helps "2001 1/2 montgomery blvd"
|
|
// //( ! isNum || alphaCount == 0 ) &&
|
|
// ! isNum &&
|
|
// ! isStreetInd ) {
|
|
// // mix it up
|
|
// h1 <<= 1;
|
|
// // xor it
|
|
// h1 ^= *swid;//wids[j];
|
|
//}
|
|
// fix "2804 hwy 250" from excluding the "250"
|
|
//if ( isNum && alphaCount > 0 ) {
|
|
// // mix it up
|
|
// h1 <<= 1;
|
|
// // xor it
|
|
// h1 ^= *swid;//wids[j];
|
|
//}
|
|
// count stop words
|
|
//if ( ! id && ww->isStopWord(j) ) stopCount++;
|
|
if ( ! id && s_lc.isInTable(&wids[j]) ) stopCount++;
|
|
|
|
// need at least one number to be a street address
|
|
if ( numCount == 0 ) continue;
|
|
// . first or last word must be num
|
|
// . now i am deciding to limit to america only so
|
|
// we need the first word to be a number
|
|
//if ( ! firstWordIsNum && ! isNum ) continue;
|
|
if ( ! firstWordIsNum ) continue;
|
|
// need at least one alpha word
|
|
if ( alphaCount <= 0 ) continue;
|
|
|
|
// if first was number and we are stop word,
|
|
// no stop word right after the number!
|
|
// "2009 at the arts alliance gallery,1100 san mateo.."
|
|
// what about "488 E. hwy 66" ! E is a stop word!
|
|
//if ( numCount == 1 && stopCount == 1 &&
|
|
// alnumsInPhrase == 2 )
|
|
// break;
|
|
// can't have just stop words
|
|
if ( alphaCount == stopCount ) continue;
|
|
// or if a single char word, skip!
|
|
if ( j == i && wlens[i] == 1 ) continue;
|
|
// do not split hyphens
|
|
if ( j+2 <nw && wlens[j+1]==1 && wptrs[j+1][0]=='-'&&
|
|
wids[j+2]&&
|
|
// if both are digits, it is ok!
|
|
(!is_digit(wptrs[j][0])||!is_digit(wptrs[j+2][0])) )
|
|
continue;
|
|
// ok, now we are name, street or suite
|
|
bool goodStreet = ( indCountStreet >= 1 );
|
|
|
|
// if we are not an indicator but "Paseo de" precedes
|
|
// us like in "Paseo de Peralta" then consider us to
|
|
// be good!
|
|
bool isPaseoDe = false;
|
|
if ( ! isStreetInd && j-4 > i &&
|
|
(wids[j-2]==h_de ||
|
|
// "407 paseo del canon" for guidebookamerica.com
|
|
wids[j-2]==h_del ) &&
|
|
wids[j-4]==h_paseo ) {
|
|
isPaseoDe = true;
|
|
goodStreet = true;
|
|
}
|
|
|
|
// . can't end on a lower case word if we have upper
|
|
// . "311 Main Street is in" was a street name!!
|
|
if ( uc==1 && ! upper && !is_digit(wptrs[j][0]))
|
|
goodStreet = false;
|
|
// direction is ok too
|
|
if ( firstWasDir ) goodStreet = true;
|
|
if ( isDir ) goodStreet = true;
|
|
// if just one alpha word and one indicator,that is bad
|
|
if ( alphaCount == 1 && indCountStreet==1 )
|
|
goodStreet = false;
|
|
if ( alphaCount == 1 && indCountDir ==1 )
|
|
goodStreet = false;
|
|
// if we are not good but an indicator follows, wait
|
|
if ( ! goodStreet && j+2<nw ) {
|
|
IndDesc *id=(IndDesc *)
|
|
g_indicators.getValue(&wids[j+2]);
|
|
if ( id && (id->m_bit & IND_STREET) ) continue;
|
|
if ( id && (id->m_bit & IND_DIR ) ) continue;
|
|
if ( is_digit(wptrs[j+2][0] ) ) continue;
|
|
}
|
|
// did we have a highway? (or state route)
|
|
bool isHighwayNum = false;
|
|
if ( isNum && j-2>=0 && wids[j-2] == h_highway )
|
|
isHighwayNum = true;
|
|
if ( isNum && j-2>=0 && wids[j-2] == h_hwy )
|
|
isHighwayNum = true;
|
|
if ( isNum && j-2>=0 && wids[j-2] == h_hiway )
|
|
isHighwayNum = true;
|
|
if ( isNum && j-2>=0 && wids[j-2] == h_cr )
|
|
isHighwayNum = true;
|
|
if ( isNum && j-4>=0 &&
|
|
(wids[j-4] == h_state ||
|
|
wids[j-4] == h_county ||
|
|
wids[j-4] == h_cnty ||
|
|
wids[j-4] == h_cty ) &&
|
|
( wids[j-2] == h_rd ||
|
|
wids[j-2] == h_road ) )
|
|
isHighwayNum = true;
|
|
// 1501 Route 66 (no state or county before it req'd)
|
|
if ( wids[j-2] == h_route ||
|
|
wids[j-2] == h_rte ||
|
|
wids[j-2] == h_rt )
|
|
isHighwayNum = true;
|
|
// ok if we are like "1300 state route 12" that is good
|
|
if ( isHighwayNum )
|
|
goodStreet = true;
|
|
// two or more street indicators can signify
|
|
// a combo of two streets. crap but we have
|
|
// "750 North St. Francis Drive" !
|
|
// "1300 st. hway 14"
|
|
//if ( indCountStreet >= 2 ) goodStreet = false;
|
|
// we must end on an indicator (or be like hwy 13)
|
|
if ( ! isDir && ! isStreetInd && ! isHighwayNum &&
|
|
! isPaseoDe )
|
|
goodStreet = false;
|
|
|
|
// . check this only if we need to
|
|
// . fixes "328 galisteo<br>santa fe. NM 87501"
|
|
// . should fix estrellanortevineyard.com's
|
|
// "T & D Market 485 Parker, Santa Rosa, NM..."
|
|
if ( ! goodStreet &&
|
|
alphaCount >= 1 &&
|
|
! isNum && j+2<nw &&
|
|
// for "77kkob am abq nm" (radio station fix)
|
|
wids[j] != h_am &&
|
|
wids[j] != h_fm ) {
|
|
int32_t follows = cityAdm1Follows(j+2);
|
|
// good then
|
|
if ( follows ) goodStreet = true;
|
|
// error? this can never happen...
|
|
//if ( follows == -1 ) return false;
|
|
// fix for "6th Ave. New York, NY" which
|
|
// thinks that the city is "York!" for
|
|
// local.botw.org
|
|
if ( follows ) {
|
|
int32_t f2 = cityAdm1Follows(j);
|
|
// this can never happen... comment out
|
|
//if ( f2 == -1 ) return false;
|
|
if ( f2 ) goodStreet = false;
|
|
}
|
|
}
|
|
// if suite follows that is good too:
|
|
// "One Hallidie Plaza, Suite 404,..."
|
|
// from http://pipl.com/contact/
|
|
if ( ! goodStreet && alphaCount >= 1 &&
|
|
! isNum && j+2<nw &&
|
|
( wids[j+2]==h_suite ||
|
|
wids[j+2]==h_ste ) ) {
|
|
// set it good
|
|
goodStreet = true;
|
|
}
|
|
// does a single letter or number follow "room"?
|
|
bool numFollows2 = false;
|
|
if( j+4<nw && is_digit(wptrs[j+4][0]))numFollows2=true;
|
|
// a single letter counts as a number too!
|
|
if(j+4<nw&&wids[j+4] && wlens[j+4]==1)numFollows2=true;
|
|
// or ends in a number (like "A1")
|
|
if( j+5<nw &&is_digit(wptrs[j+5][-1]))numFollows2=true;
|
|
// room <num> is likewise a good stopping point
|
|
if ( ! goodStreet &&
|
|
alphaCount >= 1 &&
|
|
! isNum &&
|
|
numFollows2 &&
|
|
( wids[j+2]==h_building ||
|
|
wids[j+2]==h_bldg ||
|
|
wids[j+2]==h_bld ||
|
|
wids[j+2]==h_unit ||
|
|
wids[j+2] == h_pier ||
|
|
wids[j+2] == h_room ||
|
|
wids[j+2] == h_rm ) )
|
|
goodStreet = true;
|
|
// if we end on "hwy" and a number follows, incl #
|
|
if ( (wids[j] == h_hwy ||
|
|
wids[j] == h_highway ||
|
|
wids[j] == h_hiway ||
|
|
wids[j] == h_cr ) &&
|
|
j + 2 < nw && ww->isNum(j+2) && wlens[j+1]<=3 &&
|
|
! tids[j+1] &&
|
|
// fix "86 Old Las Vegas Hwy., 983-2700."
|
|
! ww->hasChar(i,',') &&
|
|
(j+3>=nw||wptrs[j+3][0]!='-') )
|
|
goodStreet = false;
|
|
// same goes for state routes/roads
|
|
if ( (wids[j] == h_route ||
|
|
wids[j] == h_road ||
|
|
wids[j] == h_rd ||
|
|
wids[j] == h_rte ||
|
|
wids[j] == h_route ) &&
|
|
j - 2 >= 0 &&
|
|
( wids[j-2] == h_state ||
|
|
wids[j-2] == h_cty ||
|
|
wids[j-2] == h_cnty ||
|
|
wids[j-2] == h_county )&&
|
|
j + 2 < nw && ww->isNum(j+2) && wlens[j+1]<=3 &&
|
|
! tids[j+1] &&
|
|
// anticipate similar problem to
|
|
// "86 Old Las Vegas Hwy., 983-2700."
|
|
! ww->hasChar(i,',') &&
|
|
(j+3>=nw||wptrs[j+3][0]!='-') )
|
|
goodStreet = false;
|
|
|
|
|
|
// must not end on a lower case stop word of 2+ letters
|
|
if ( wids[j] == h_and || wids[j] == h_or ||
|
|
// fixes "2006 census for ... abq nm"
|
|
wids[j] == h_for )
|
|
goodStreet = false;
|
|
|
|
// fix 'b "9 st n" of boardwalk'
|
|
if ( numCount == 1 &&
|
|
indCountDir == 1 &&
|
|
indCountStreet == 1 &&
|
|
// fix "357 Court NE" for
|
|
// http://www.anneryan.com/book/order.htm
|
|
lastIndStreetHash != h_court &&
|
|
lastWasDir &&
|
|
alphaCount == (indCountDir + indCountStreet) )
|
|
goodStreet = false;
|
|
|
|
// add as a street?
|
|
if ( ! goodStreet ) continue;
|
|
// only add one street per i
|
|
// UNLESS lasti ended right before a city or state
|
|
// in which case we should add both
|
|
if ( lastSpecialj == -1 )
|
|
//m_ns = ns_stack;
|
|
m_sm.setNumPtrs(ns_stack);
|
|
|
|
// record if a city/state follows us so if we end
|
|
// up absorbing that city/state to make a bigger
|
|
// street name then we create 2+ streets and do not
|
|
// erase the previous one
|
|
if ( goodStreet &&
|
|
j+4<nw &&
|
|
// "9501 Indian School NE" for
|
|
// www.cabq.gov/communitycenters/centers.html
|
|
// was thinking about "School, Nebraska" so
|
|
// let's fix that with this h_ne constraint
|
|
m_wids[j+4] != h_ne && // nebraska = NorthEast
|
|
cityAdm1Follows(j+2) &&
|
|
lastSpecialj < 0 )
|
|
lastSpecialj = j;
|
|
|
|
// . erase previous entry if same starting point
|
|
// . like "501 Copper Ave" vs "501 Copper Ave. NW"
|
|
//if ( ns > 0 && i == streets[ns-1].m_a ) ns--;
|
|
// length of current street (place)
|
|
//int32_t plen = (wptrs[j] + wlens[j]) - wptrs[i];
|
|
// shortcut
|
|
int32_t a = i;
|
|
int32_t b = j+1;
|
|
|
|
// fix "corrales bosque gallery
|
|
// 4685 Corrales Rd. *in* Corrales NM"
|
|
if ( m_wids[b-1] == h_in && alphaCount >= 2 ) {
|
|
b -= 2;
|
|
alnumsInPhrase -= 1;
|
|
}
|
|
|
|
// length of current street (place)
|
|
int32_t plen = (wptrs[b-1] + wlens[b-1]) - wptrs[a];
|
|
|
|
// add the street
|
|
Place *street = (Place *)m_sm.getMem(sizeof(Place));
|
|
if ( ! street ) return false;
|
|
street->m_a = a;
|
|
street->m_b = b;
|
|
street->m_alnumA = alnumPos;
|
|
street->m_alnumB = alnumPos + alnumsInPhrase;
|
|
street->m_type = PT_STREET;
|
|
street->m_str = wptrs[i];
|
|
street->m_strlen = plen;
|
|
//street->m_adm1[0] = 0;
|
|
//street->m_adm1[1] = 0;
|
|
street->m_adm1Bits= 0LL;
|
|
//street->m_crid = 0;
|
|
street->m_flags2 = 0;
|
|
street->m_bits = 0;
|
|
street->m_address = NULL;
|
|
street->m_alias = NULL;
|
|
// only use the purer hash if it is non-zero
|
|
//if ( h1 ) street->m_hash = h1;
|
|
//else if ( h2 ) street->m_hash = h2;
|
|
//else street->m_hash = h2b;
|
|
//street->m_streetNumHash = h3;
|
|
//street->m_streetIndHash = h4;
|
|
// set its m_hash member
|
|
setHashes ( street , m_words , m_niceness );
|
|
// prevent overlap with next street
|
|
lastb = street->m_b;
|
|
// . need to know this for getting place name
|
|
// . place name must also be in upper case if
|
|
// the street is...
|
|
if ( uc == 1 ) street->m_bits |= PLF_HAS_UPPER;
|
|
// . set some bits
|
|
// . only do this if we are the unambiguous part,
|
|
// otherwise we miss "Sandia Park" in
|
|
// "1 WILDFLOWER LANE SANDIA PARK NM" because
|
|
// the 2nd street has "SANDIA PARK" as part of it
|
|
// and it doesn't get considered as a city to add
|
|
// to m_places[] below because this bit was getting
|
|
// set -- i.e. we don't take cities from street names
|
|
if ( lastSpecialj==-1 || lastSpecialj==j ) {
|
|
for ( int32_t k = a ; bits && k < b ; k++ )
|
|
bits[k] |= D_IS_IN_STREET;
|
|
}
|
|
|
|
// this is a hack
|
|
if ( lastSpecialj >= 0 && lastSpecialj != j ) {
|
|
int32_t ns = m_sm.getNumPtrs();
|
|
Place *ps = (Place *)m_sm.getPtr(ns-2);
|
|
ps ->m_flags2 |= PLF2_COLLISION;
|
|
street->m_flags2 |= PLF2_COLLISION;
|
|
}
|
|
|
|
// had an indicator? ave rd or direction
|
|
//if ( indCountDir || indCountStreet )
|
|
// street->m_flags2 |= PLF2_HAD_INDICATOR;
|
|
// point to next street
|
|
//m_ns++;
|
|
// stop if overflowing
|
|
//if ( m_ns >= MAX_STREETS ) break;
|
|
}
|
|
// nuke this
|
|
//atPreceeds = false;
|
|
// end i loop - go to next potential start of a phrase
|
|
}
|
|
|
|
//
|
|
//
|
|
// END STREET LIST GENERATION
|
|
//
|
|
//
|
|
|
|
|
|
|
|
|
|
|
|
//
|
|
//
|
|
// SET the m_places[] array (m_np) of cities, states and zips
|
|
//
|
|
// we now allow any street address to use any city/state mentioned
|
|
// anywhere in the document.
|
|
//
|
|
|
|
// for setting Place
|
|
alnumPos = -1;
|
|
int32_t ignoreUntil = -1;
|
|
int32_t lastCityAlnumB = -1;
|
|
int64_t prevWid = 0LL;
|
|
bool inCityIndicator = false;
|
|
bool inStateIndicator = false;
|
|
|
|
// scan the entire document
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// a tag?
|
|
if ( m_tids[i] ) {
|
|
// assume not an indicator tag
|
|
inCityIndicator = false;
|
|
inStateIndicator = false;
|
|
// must be xml
|
|
if ( m_tids[i] != TAG_XMLTAG ) continue;
|
|
// it can indicate things
|
|
char *tagName = m_wptrs[i]+1;
|
|
if ( strncasecmp(tagName,"eventCity",9) == 0 )
|
|
inCityIndicator = true;
|
|
if ( strncasecmp(tagName,"eventState",10) == 0 )
|
|
inStateIndicator = true;
|
|
continue;
|
|
}
|
|
// skip if not alnum
|
|
if ( ! m_wids[i] ) continue;
|
|
// skip if in a script section
|
|
if ( sp && (sp[i]->m_flags & badFlags) ) continue;
|
|
// count alnums
|
|
alnumPos++;
|
|
// skip if in a street. avoid getting "NE" for nebraska when
|
|
// it is in a street like "1234 girard NE" or something. same
|
|
// goes for streets named after cities or states. and using
|
|
// zip codes that are street numbers
|
|
// . assume if in street not capitalized, fixes
|
|
// "123 Main Street Abq" so Abq is not in a phrase too
|
|
if ( bits && (bits[i] & D_IS_IN_STREET) ) continue;
|
|
// skip if in menu
|
|
//if ( sp[i]->m_flags & SEC_MENU ) continue;
|
|
|
|
if ( i < ignoreUntil ) continue;
|
|
|
|
// get it
|
|
int64_t lastWid = prevWid;
|
|
// update it
|
|
prevWid = m_wids[i];
|
|
|
|
// must be a zip
|
|
if ( is_digit(m_wptrs[i][0]) ) {
|
|
// shortcut
|
|
// this crashed for h=70799779105646092LL
|
|
// word="60527"
|
|
int64_t h = m_wids[i];
|
|
// 5 digits
|
|
if ( m_wlens[i] != 5 ) continue;
|
|
// check for zip code
|
|
int32_t slot = g_zips.getSlot(&h);
|
|
// skip if not
|
|
if ( slot < 0 ) continue;
|
|
// make sure only one! US-only for now...
|
|
// unfortunately we do have zips that have multiple
|
|
// city names... so we can't have this here...
|
|
// later we should add code to pick the best one...
|
|
//if(g_zips.getNextSlot(slot,&h)>=0){char*xx=NULL;*xx=0
|
|
// get the place
|
|
ZipDesc *zd =(ZipDesc *)g_zips.getValueFromSlot(slot);
|
|
// sanity check
|
|
//if ( m_np >= MAX_PLACES ) { char *xx=NULL;*xx=0; }
|
|
// ok, add this entry
|
|
Place *p = (Place *)m_pm.getMem(sizeof(Place));
|
|
if ( ! p ) return false;
|
|
// set it
|
|
p->m_adm1Bits = zd->m_adm1Bits;
|
|
p->m_adm1[0] = zd->m_adm1[0];
|
|
p->m_adm1[1] = zd->m_adm1[1];
|
|
p->m_type = PT_ZIP;
|
|
p->m_a = i;
|
|
p->m_b = i+1;
|
|
p->m_alnumA = alnumPos;
|
|
p->m_alnumB = alnumPos+1;
|
|
p->m_str = m_wptrs[i];
|
|
p->m_strlen = m_wlens[i];
|
|
p->m_hash = h;
|
|
p->m_cityHash = zd->m_cityHash;
|
|
p->m_cityStr = g_cityBuf + zd->m_cityOffset;
|
|
p->m_bits = 0;
|
|
// set PLF_FROMTITLE bit
|
|
if ( sp ) {
|
|
Section *ss = sp[p->m_a];
|
|
if ( ss->m_flags & SEC_IN_TITLE )
|
|
p->m_bits |= PLF_FROMTITLE;
|
|
}
|
|
continue;
|
|
}
|
|
|
|
// . fix <eventCity>abq</eventCity> for pageaddevent
|
|
// . update this now that we set lastWidCapitalized
|
|
if ( ! is_upper_utf8(m_wptrs[i]) &&
|
|
! inCityIndicator &&
|
|
! inStateIndicator )
|
|
continue;
|
|
|
|
// . deal with "Kansas City"
|
|
// . deal with "New Mexico" where "New" is also a city!
|
|
// . does this word start a city?
|
|
Place *pc = getCityPlace ( i , alnumPos , m_words );
|
|
// or start a state?
|
|
Place *ps = getStatePlace ( i , alnumPos , m_words );
|
|
|
|
// . ignore two letter state codes that are not both capped
|
|
// . fixes "In" "De Paul" "Co" "La"
|
|
if ( ps &&
|
|
ps->m_strlen==2 &&
|
|
// unless like <eventState>nm</eventState>
|
|
! inStateIndicator &&
|
|
!is_upper_a(m_wptrs[ps->m_a][1]) &&
|
|
// . unless we follow a city!
|
|
// . fixes "New Orleans;La;70113" for
|
|
// http://texasdrums.drums.org/new_orleansdrums.htm
|
|
lastCityAlnumB != alnumPos )
|
|
ps = NULL;
|
|
|
|
// if neither, continue on
|
|
if ( ! pc && ! ps ) continue;
|
|
|
|
// set preferred place, "pp"
|
|
Place *pp = NULL;
|
|
if ( ! pp ) pp = pc;
|
|
if ( ! pp ) pp = ps;
|
|
// . if tied prefer longer. if length tied prefer state
|
|
// . "California" is both a state and a city
|
|
if ( pc && ps ) {
|
|
// kill state if city longer
|
|
if ( pc->m_alnumB > ps->m_alnumB ) ps = NULL;
|
|
// or kill city if state is longer
|
|
else if ( pc->m_alnumB < ps->m_alnumB ) pc = NULL;
|
|
}
|
|
|
|
if ( pc )
|
|
lastCityAlnumB = pc->m_alnumB;
|
|
|
|
// set this
|
|
if ( pc ) ignoreUntil = pc->m_b;
|
|
if ( ps ) ignoreUntil = ps->m_b;
|
|
|
|
// prevent breach
|
|
// leave some room for adding places below...
|
|
//if ( m_np + 200 > MAX_PLACES ) {
|
|
// log("addr: too many cities/state to store in places "
|
|
// "array. truncating.");
|
|
// break;
|
|
// //char *xx=NULL;*xx=0;
|
|
//}
|
|
|
|
bool inTitle = false;
|
|
// do not do this if called from msg13 and have no sections
|
|
if ( sp && (sp[i]->m_flags & SEC_IN_TITLE) ) inTitle = true;
|
|
|
|
if ( pc ) {
|
|
// shortcut
|
|
Place *p = (Place *)m_pm.getMem(sizeof(Place));
|
|
if ( ! p ) return false;
|
|
// ok, good to add
|
|
gbmemcpy ( p , pc , sizeof(Place) );
|
|
// set PLF_FROMTITLE bit
|
|
if ( inTitle ) p->m_bits |= PLF_FROMTITLE;
|
|
// if last word was in,set this
|
|
if ( lastWid == h_in ) p->m_flags2 |= PLF2_REQUIRED;
|
|
}
|
|
|
|
if ( ps ) {
|
|
// shortcut
|
|
Place *p = (Place *)m_pm.getMem(sizeof(Place));
|
|
if ( ! p ) return false;
|
|
// ok, good to add
|
|
gbmemcpy ( p , ps , sizeof(Place) );
|
|
// set PLF_FROMTITLE bit
|
|
if ( inTitle ) p->m_bits |= PLF_FROMTITLE;
|
|
// if last word was in,set this
|
|
if ( lastWid == h_in ) p->m_flags2 |= PLF2_REQUIRED;
|
|
}
|
|
}
|
|
|
|
// record end of this
|
|
m_npSaved = m_pm.getNumPtrs(); // m_np;
|
|
|
|
//
|
|
// make a list of occupation names to avoid false positive
|
|
// identification of a place because it is after the word "at" but
|
|
// really it is something like "john, an engineer at HP, ..." referring
|
|
// to where that person works. fixes
|
|
// www.aliconferences.com/conf/social_media_govt1209/pre.htm which has
|
|
// "jon carpenter, digital strategist at stratacomm"
|
|
//
|
|
// left off on http://www.jobvertise.com/jobs/indexC21.html
|
|
//
|
|
// search for "One who..." in dictionary? "person that ..."
|
|
//
|
|
// AMBIGUITY:
|
|
// "meet the engineer at cisco"
|
|
// - does the at phrase modify "meet" or "engineer" ???
|
|
//
|
|
static char *s_jobs[] = {
|
|
"strategist",
|
|
"accountant",
|
|
// interim rector at St. Margaret's (www.st-margarets.org)
|
|
"rector",
|
|
"director",
|
|
"programmer",
|
|
"lawyer",
|
|
"attorney",
|
|
"engineer",
|
|
"residence", // "artist in residence at the LA county HS"
|
|
"developer",
|
|
"worker",
|
|
"ceo",
|
|
"cto",
|
|
"cmo",
|
|
"cfo",
|
|
|
|
// jobvertise.com
|
|
"lead",
|
|
"mechanic",
|
|
"technician",
|
|
"clerk",
|
|
"specialist",
|
|
"manager",
|
|
"distributor",
|
|
"salesman",
|
|
"consultant",
|
|
"developer",
|
|
"therapist",
|
|
"officer",
|
|
"coordinator",
|
|
"administrator",
|
|
"pilot",
|
|
"advisor",
|
|
"counselor",
|
|
"counsellor",
|
|
"hospitalist",
|
|
"chair",
|
|
"chairman",
|
|
"pulmonologist",
|
|
"repesentative",
|
|
"tutor",
|
|
"planner",
|
|
"assistant",
|
|
"scientist",
|
|
"nutritionist",
|
|
"aquarist",
|
|
"biologist",
|
|
"doctor",
|
|
"dentist",
|
|
"farmer",
|
|
"intern",
|
|
"expert",
|
|
"partner",
|
|
"adjuster",
|
|
"bartender",
|
|
"associate",
|
|
"supervisor",
|
|
"executive",
|
|
"typist",
|
|
"nurse",
|
|
"actor",
|
|
"actress",
|
|
"analyst",
|
|
"modeler",
|
|
"actuary",
|
|
"acupuncturist",
|
|
"poster",
|
|
"professor",
|
|
"teacher",
|
|
"student",
|
|
"senior",
|
|
"junior",
|
|
"sophomore",
|
|
"freshman",
|
|
"writer",
|
|
"blogger",
|
|
"reporter",
|
|
"instructor",
|
|
"designer",
|
|
"physician",
|
|
"driver",
|
|
"trucker",
|
|
"diver",
|
|
"carrier",
|
|
"receptionist",
|
|
"hostess",
|
|
"host",
|
|
"waiter",
|
|
"waitress",
|
|
"cook",
|
|
"chef",
|
|
"recruiter",
|
|
"secretary",
|
|
"practitioner",
|
|
"architect",
|
|
"contractor",
|
|
"plumber",
|
|
"electrician",
|
|
"janitor",
|
|
"bricklayer",
|
|
"banker",
|
|
"trainer",
|
|
"buyer",
|
|
"welder",
|
|
"assembler",
|
|
"packer",
|
|
"aesthetician",
|
|
"officer",
|
|
"policeman",
|
|
"fireman",
|
|
"cop",
|
|
"sheriff",
|
|
"deputy",
|
|
"dispatcher",
|
|
"warden",
|
|
"guard",
|
|
"chemist",
|
|
"operator",
|
|
"owner",
|
|
"producer",
|
|
"housekeeper",
|
|
"maid",
|
|
"babysitter",
|
|
"model",
|
|
"agent",
|
|
"controller",
|
|
"inspector",
|
|
"professional",
|
|
"athlete",
|
|
"facilitator",
|
|
"mover",
|
|
"biller",
|
|
"builder",
|
|
"carpenter",
|
|
"anesthesiologist",
|
|
"animator",
|
|
"investigator",
|
|
"detective",
|
|
"cleaner",
|
|
"maker",
|
|
"sewer",
|
|
"installer",
|
|
"mgr",
|
|
"eng",
|
|
"appraiser",
|
|
"telemarketer",
|
|
"interpreter",
|
|
"linguist",
|
|
"attendant",
|
|
"jeweler",
|
|
"cutter",
|
|
"lumberjack",
|
|
"laborer",
|
|
"collector",
|
|
"coach",
|
|
"counsel",
|
|
"pastor",
|
|
"priest",
|
|
"bishop",
|
|
"cardinal",
|
|
"scout",
|
|
"tester",
|
|
"auditor",
|
|
"drafter",
|
|
"submitter",
|
|
"tech",
|
|
"integrator",
|
|
"machinist",
|
|
"monkey", // grease monkey code monkey
|
|
"liaison",
|
|
"fabricator",
|
|
"wholesaler",
|
|
"baker",
|
|
"handler",
|
|
"bagger",
|
|
"teller",
|
|
"captain",
|
|
"houseperson",
|
|
"server",
|
|
"porter",
|
|
"barber",
|
|
"stylist",
|
|
"barista",
|
|
"reviewer",
|
|
"critic",
|
|
"barwoman",
|
|
"demonstrator",
|
|
"beautician",
|
|
"ambassador",
|
|
"boss",
|
|
"shopper",
|
|
"entrepreneur",
|
|
"bellperson",
|
|
"bellman",
|
|
"biostatistician",
|
|
"statistician",
|
|
"mathematician",
|
|
"biopsychologist",
|
|
"biotechnician",
|
|
"organizer",
|
|
"leader",
|
|
"foreman",
|
|
"bookeeper",
|
|
"bookkeeper",
|
|
"player",
|
|
"bowler",
|
|
"golfer",
|
|
"customer",
|
|
"visitor",
|
|
"ranger",
|
|
"broker",
|
|
"busser",
|
|
"busboy",
|
|
"dishwasher",
|
|
"washer",
|
|
"sweeper",
|
|
"purchaser",
|
|
"cabinetmaker",
|
|
"decorator",
|
|
"cameraman",
|
|
"registrar",
|
|
"canvaser",
|
|
"canvasser",
|
|
"promoter",
|
|
"announcer",
|
|
"pharmacist",
|
|
"stocker",
|
|
"cardiologist",
|
|
"surgeon",
|
|
"miner",
|
|
"dancer",
|
|
"caregiver",
|
|
"aide",
|
|
"caseworker",
|
|
"cashier",
|
|
"librarian",
|
|
"technologist",
|
|
"anchorman",
|
|
"anchor",
|
|
"employee",
|
|
"manufacturer",
|
|
"assoc",
|
|
"scheduler",
|
|
"botanist",
|
|
"grower",
|
|
"processor",
|
|
"educator",
|
|
"marketer",
|
|
"hygienist",
|
|
"coder",
|
|
"paramedic",
|
|
"anesthetist",
|
|
"midwife",
|
|
"doula",
|
|
"master",
|
|
"moderator",
|
|
"mediator",
|
|
"judge",
|
|
"member",
|
|
"juror",
|
|
"chauffeur",
|
|
"butler",
|
|
"cheesemaker",
|
|
NULL
|
|
};
|
|
static bool s_initJobs = false;
|
|
if ( ! s_initJobs ) {
|
|
// load it up
|
|
if ( ! initWordTable ( &s_jobTable,s_jobs,
|
|
//sizeof(s_jobs),
|
|
"jobstbl") )
|
|
return false;
|
|
// do not re-do
|
|
s_initJobs = true;
|
|
}
|
|
|
|
|
|
//
|
|
//
|
|
// BEGIN FAKE STREET NAME IDENTIFICATION
|
|
//
|
|
// "Tingley Coliseum"
|
|
//
|
|
// We treat POTENTIAL place names as street names for all practical
|
|
// purposes.
|
|
//
|
|
//
|
|
|
|
// flag
|
|
char lastWasBreak = 0;
|
|
// reset this since we loop anew
|
|
alnumPos = -1;
|
|
// set if "at" precedes the name
|
|
bool atFlag = false;
|
|
int64_t lastWid = 0LL;
|
|
// do not do this if we are javascript
|
|
int32_t ni = nw;
|
|
if ( m_contentType == CT_JS ) ni = 0;
|
|
// do not do this if called from msg13
|
|
if ( ! m_sections ) ni = 0;
|
|
// the first word in a td table cell
|
|
int32_t firstWordInCell;
|
|
// first we identify the candidate place names
|
|
for ( int32_t i = 0 ; i < ni ; i++ ) {
|
|
// skip tags
|
|
if ( tids[i] ) {
|
|
// input tags reset at tag, like
|
|
// Location: <input ...> for zevents.com
|
|
if ( tids[i] == TAG_INPUT ) atFlag = false;
|
|
// hit a td cell?
|
|
if ( sp[i]->m_tagId == TAG_TD )
|
|
firstWordInCell = sp[i]->m_firstWordPos;
|
|
lastWasBreak = 1;
|
|
continue;
|
|
}
|
|
// skip if in script section or whatever to keep alnumPos right
|
|
if ( sp[i]->m_flags & badFlags ) continue;
|
|
// skip if not alnum word
|
|
if ( ! wids[i] ) {
|
|
// if not just spaces, then we are a "break" in which
|
|
// case set "lastWasBreak" to true
|
|
char *p = wptrs[i];
|
|
char *pend = p + wlens[i];
|
|
for ( ; p < pend ; p++ ) {
|
|
if ( is_wspace_a(*p) ) continue;
|
|
// Dave & Buster's
|
|
if ( *p == '\'' ) continue;
|
|
// Dave & Buster's
|
|
if ( *p == '&' ) continue;
|
|
// St. John's College
|
|
if ( *p == '.' && is_wspace_a(p[1]) &&
|
|
i>0 && isAbbr(wids[i-1]) )
|
|
continue;
|
|
lastWasBreak = 1;
|
|
break;
|
|
}
|
|
// skip this now
|
|
continue;
|
|
}
|
|
// it's an alnum
|
|
alnumPos++;
|
|
// remember the previous atFlag value
|
|
bool saved = atFlag;
|
|
// and update to the new one
|
|
atFlag = false;
|
|
// save this
|
|
int64_t savedWid = lastWid;
|
|
// update it now
|
|
lastWid = wids[i];
|
|
// do not start with a date
|
|
if ( bits && (bits[i]&D_IS_IN_DATE)){lastWasBreak=1;continue;}
|
|
// a lower guy followed by an upper guy is a break
|
|
if ( is_lower_utf8 ( wptrs[i] ) &&
|
|
is_upper_utf8 ( wptrs[i] ) ) {lastWasBreak = 1;continue;}
|
|
|
|
// if it is the first word in a td cell and the column header
|
|
// is like "location" or "venue" then mark it as after at
|
|
if ( i == firstWordInCell ) {
|
|
// get column header
|
|
Section *cp = sp[i]->m_headColSection;
|
|
if ( cp &&
|
|
cp->m_firstWordPos > 0 &&
|
|
// skip the header itself
|
|
cp->m_firstWordPos != i &&
|
|
// must just be one word for now
|
|
cp->m_firstWordPos == cp->m_lastWordPos &&
|
|
( wids[cp->m_firstWordPos] == h_location ||
|
|
wids[cp->m_firstWordPos] == h_venue ||
|
|
wids[cp->m_firstWordPos] == h_where ) ) {
|
|
// assume what follows is a place name
|
|
saved = true; // atFlag = true;
|
|
lastWasBreak = 1;
|
|
//continue;
|
|
}
|
|
}
|
|
|
|
// this is a break
|
|
if ( wids[i] == h_at ) {
|
|
// ignore it though if previous word was one of
|
|
// these because it could be driving directions!!
|
|
// this fixes the "4139 prospect" event because we
|
|
// thought it had two locations and it got
|
|
// SEC_MULT_LOCATIONS because we thought "at Menaul"
|
|
// was a place name and not a driving direction
|
|
// for the salsapower.com url
|
|
if ( savedWid == h_left ||
|
|
savedWid == h_right ||
|
|
// appeared at the blah
|
|
savedWid == h_appeared ||
|
|
// had a role at the world premier
|
|
savedWid == h_role ||
|
|
savedWid == h_studied ||
|
|
// won a prize at the blah
|
|
savedWid == h_prize ||
|
|
savedWid == h_right ||
|
|
// men who stare at goats
|
|
savedWid == h_stare ||
|
|
savedWid == h_gaze ||
|
|
savedWid == h_look ||
|
|
savedWid == h_looking ||
|
|
//savedWid == ||
|
|
savedWid == h_north ||
|
|
savedWid == h_south ||
|
|
savedWid == h_east ||
|
|
savedWid == h_west ) {lastWasBreak=0;continue;}
|
|
// "at sea"
|
|
if ( i+2<nw &&
|
|
( wids[i+2] == h_sea ||
|
|
// "at discounted"
|
|
wids[i+2] == h_discounted ||
|
|
// "at www.fridaynight.com"
|
|
wids[i+2] == h_www ||
|
|
// $10 at door
|
|
wids[i+2] == h_door ||
|
|
// "at discount price"
|
|
wids[i+2] == h_discount ) ) {
|
|
lastWasBreak=0;continue;}
|
|
// skip directional at phrases like
|
|
// "(at Siler Road)" from culturemob.com
|
|
if ( i+4<nw &&
|
|
( wids[i+4]==h_road ||
|
|
// at the finish [line] (racing)
|
|
wids[i+4]==h_finish ||
|
|
// "at the door"
|
|
wids[i+4]==h_door ||
|
|
// "at [a|the] discount[ed]"
|
|
wids[i+4]==h_discount ||
|
|
wids[i+4]==h_discounted ||
|
|
wids[i+4]==h_street ||
|
|
wids[i+4]==h_avenue ||
|
|
wids[i+4]==h_ave ||
|
|
wids[i+4]==h_st ||
|
|
wids[i+4]==h_rd ) ) {
|
|
lastWasBreak=0;continue;}
|
|
// "at the entrance" but not "at the entrance to"
|
|
if ( i+4<nw &&
|
|
wids[i+4] == h_entrance &&
|
|
(i+6>=nw || wids[i+6]!=h_to ) ) {
|
|
lastWasBreak=0;continue;}
|
|
// . at the X area
|
|
// . x = registration (for races)
|
|
if ( i+6<nw &&
|
|
wids[i+2] == h_the &&
|
|
wids[i+6] == h_area ) {
|
|
lastWasBreak=0;continue;}
|
|
// "[occupation] at [company]"
|
|
if ( s_jobTable.isInTable(&savedWid) ) {
|
|
lastWasBreak=0;continue;}
|
|
// otherwise assume what follows is a place name
|
|
atFlag = true;
|
|
lastWasBreak = 1;
|
|
continue;
|
|
}
|
|
// location: or where: indicates a location too!
|
|
if ( ( wids[i]==h_location ||
|
|
wids[i]==h_venue ||
|
|
wids[i]==h_where ) &&
|
|
i+1<nw && ww->hasChar(i+1,':') &&
|
|
// fix "Events at this location:" for
|
|
// nycday.eventbrite.com
|
|
(i-2<0 || wids[i-2]!=h_this) ) {
|
|
atFlag = true;
|
|
lastWasBreak = 1;
|
|
// skip the colon-containing word
|
|
i++;
|
|
continue;
|
|
}
|
|
// . "come to" is similar to "at"
|
|
// . fixes http://www.metropolisarts.com/index.php/fuseaction/
|
|
// show.details/showid/238/metropolis-wine-tasting.html
|
|
if ( i+4<nw && wids[i] == h_come && wids[i+2]== h_to ) {
|
|
atFlag = true;
|
|
lastWasBreak = 1;
|
|
i = i + 2;
|
|
continue;
|
|
}
|
|
// skip "at least"
|
|
if ( saved && wids[i] == h_least ) {lastWasBreak=0;continue;}
|
|
if ( saved && wids[i] == h_most ) {lastWasBreak=0;continue;}
|
|
if ( saved && wids[i] == h_this ) {lastWasBreak=0;continue;}
|
|
// allow lower case "the" after "at", but skip it
|
|
if ( saved && wids[i] == h_the ) {
|
|
// check for fake at phrase
|
|
if ( i+2 < nw && (wids[i+2] == h_heart ||
|
|
wids[i+2] == h_core ) ) {
|
|
// skip it
|
|
lastWasBreak = 0; continue; }
|
|
// if it is lower case skip it so it is not
|
|
// included in the place name
|
|
if ( is_lower_utf8(wptrs[i]) ) {
|
|
atFlag = true; lastWasBreak = 1; continue; }
|
|
// otherwise do not do the lower case check right below
|
|
}
|
|
// "at the entrance"
|
|
else if ( saved && wids[i] == h_entrance ) {
|
|
atFlag = true;
|
|
// not a break because we need "at the entrance to the"
|
|
lastWasBreak = 0;
|
|
continue;
|
|
}
|
|
else if ( saved && wids[i] == h_to && savedWid == h_entrance ){
|
|
atFlag = true;
|
|
lastWasBreak = 1;
|
|
continue;
|
|
}
|
|
// does it have some kind of delimiter before it?
|
|
else if ( is_lower_utf8(wptrs[i])){lastWasBreak = 0; continue;}
|
|
// each candidate needs some kind of "break" before it
|
|
if ( ! lastWasBreak ) continue;
|
|
// skip if in a script section
|
|
if ( sp[i]->m_flags & badFlags ) continue;
|
|
// or in menu
|
|
if ( sp[i]->m_flags & SEC_MENU ) continue;
|
|
// . skip if trying to start with a date
|
|
// . fixes http://www.usadancenm.org/links.html so we do
|
|
// not start fake street names with ":30 pm ..."
|
|
if ( bits && (bits[i] & D_IS_IN_DATE) ) continue;
|
|
|
|
// skip if trying to start with something we have already
|
|
// listed as a street in the above loop
|
|
if ( bits && (bits[i] & D_IS_IN_STREET) ) continue;
|
|
|
|
// stop if streets are maxed
|
|
//if ( m_ns >= MAX_STREETS ) break;
|
|
// ok, we got a candidate, reset this
|
|
lastWasBreak = 0;
|
|
//int64_t h = 0LL;
|
|
int64_t pi = 0LL;
|
|
bool prevUpper = false;
|
|
bool prevAdded = false; // added prev to the street array?
|
|
// count em
|
|
int32_t alphaCount = 0;
|
|
int32_t numCount = 0;
|
|
// subalnum count
|
|
int32_t subAlnumCount = 0;
|
|
int64_t h = 0LL;
|
|
int64_t lastWid2 = 0LL;
|
|
// . now make a hash of all substrings of the following words
|
|
// for lookup into namedb
|
|
// . ADD CANDIDATE
|
|
for ( int32_t j = i ; j < nw ; j++ ) {
|
|
// tags stop our train
|
|
if ( tids[j] ) break;
|
|
// or if ventures into a street from above
|
|
if ( bits && (bits[j] & D_IS_IN_STREET) ) break;
|
|
// do not include a date
|
|
if ( bits && (bits[j] & D_IS_IN_DATE) ) break;
|
|
// bad punct stops our train
|
|
if ( ! wids[j] ) {
|
|
char *p = wptrs[j];
|
|
char *pend = p + wlens[j];
|
|
for ( ; p < pend ; p++ ) {
|
|
if ( is_wspace_a(*p) ) continue;
|
|
if ( *p == '\'' ) continue;
|
|
// Dave & Buster's
|
|
if ( *p == '&' ) continue;
|
|
// St. John's College
|
|
if ( *p == '.' && is_wspace_a(p[1]) &&
|
|
j>0 && isAbbr(wids[j-1]) )
|
|
continue;
|
|
break;
|
|
}
|
|
// bad punct stops the train!
|
|
if ( p < pend ) break;
|
|
// otherwise, just skip it
|
|
continue;
|
|
}
|
|
// count it
|
|
subAlnumCount++;
|
|
// . do not add the first word if its "The" into this
|
|
// . fixes "The Guild Cinema" not matching placedb
|
|
// entries for "Guild Cinema"
|
|
//if ( wids[j] == h_the && h == 0LL ) continue;
|
|
// are we upper?
|
|
bool isUpper = is_upper_utf8 ( wptrs[j] );
|
|
// fix for "North 4th Arts Center"
|
|
if ( is_digit(wptrs[j][0])){isUpper=true; numCount++; }
|
|
else alphaCount++;
|
|
// lowercase non-stopword stops our train
|
|
//if ( ! isUpper && ! ww->isStopWord(j) ) break;
|
|
if ( ! isUpper && ! s_lc.isInTable(&wids[j]) ) break;
|
|
// . convert place name word into base word
|
|
// . synonyms
|
|
// . converts 4th to fourth, theatre to theater, etc.
|
|
//int64_t *hw = getSynonymWord ( &wids[j] , &pi );
|
|
// wordid of previous word
|
|
pi = wids[j];
|
|
// shift and store
|
|
h <<= 1LL;
|
|
// xor it in
|
|
h ^= wids[j];
|
|
// save it
|
|
int64_t savedWid2 = lastWid2;
|
|
lastWid2 = wids[j];
|
|
// do not shorten "Center of Arts" to "Center" because
|
|
// it is causing the "Performing Arts Center of the
|
|
// the Steinbeck Institute of Art" to be an alias for
|
|
// "San Jose Performing Arts Center" because
|
|
// "Performing Arts Center" is a subset of
|
|
// "San Jose Performing Arts Center".
|
|
if(prevAdded&&savedWid2==h_center&&wids[j]==h_of){
|
|
m_sm.rewind(1);
|
|
prevAdded = false;
|
|
}
|
|
// do not end on a lower case stop word
|
|
if ( ! isUpper ) {
|
|
// . got hash in stop words now
|
|
// . ignore it if syn table returned 0 (ignore)
|
|
//if ( *hw ) {
|
|
// h <<= 1LL;
|
|
// h ^= *hw;//wids[j];
|
|
//}
|
|
prevUpper = false;
|
|
continue;
|
|
}
|
|
// prev was upper case and we are upper case,
|
|
// overwrite the previous entry
|
|
if ( prevAdded && prevUpper && isUpper ) {
|
|
//m_ns--;
|
|
m_sm.rewind(1);
|
|
prevAdded = false;
|
|
}
|
|
// likewise, do not split sequences of lowercase words
|
|
if ( prevAdded && ! prevUpper && ! isUpper ) {
|
|
//m_ns--;
|
|
m_sm.rewind(1);
|
|
prevAdded = false;
|
|
}
|
|
// fix "Submit a" in "Submit a New Event"
|
|
//if ( ! prevUpper && isUpper ) ns--;
|
|
// set this
|
|
prevUpper = isUpper;
|
|
// ignore it if syn table returned 0 (ignore) (school)
|
|
//if ( *hw ) {
|
|
// // mix it up
|
|
// h <<= 1LL;
|
|
// // incorporate
|
|
// h ^= *hw; // wids[j];
|
|
//}
|
|
// do not add if only a number, like 4th or 113
|
|
if ( alphaCount == 0 ) continue;
|
|
// skip if crazy - fixes graffiti.org
|
|
if ( alphaCount > 10 ) continue;
|
|
// . do not add if only one word with one letter
|
|
// . fixes javascript variables being place names
|
|
if ( alphaCount == 1 && wlens[j] == 1 ) continue;
|
|
// or if just the word "the"
|
|
if ( alphaCount == 1 && wids[j] == h_the ) continue;
|
|
// not allowed to have City or Town like in
|
|
// "City/Town: Albuquerque NM"
|
|
// fixes www.dukecityfix.com/xn/detail/1233957:Eve
|
|
// nt:391851?xg_source=activity from getting that
|
|
// as a place name in abq
|
|
if ( alphaCount ==1 && wids[j] == h_city ) continue;
|
|
if ( alphaCount ==1 && wids[j] == h_town ) continue;
|
|
// . mdw mdw mdw
|
|
// . not allowed to be a city or adm1 name!
|
|
// . fixes us getting "albuquerque" as a place name!
|
|
if ( g_cities.isInTable ( &h ) ) continue;
|
|
// or state name
|
|
if ( g_states.isInTable ( &h ) ) continue;
|
|
// or zip
|
|
if ( g_zips.isInTable ( &h ) ) continue;
|
|
// TODO: or country????
|
|
|
|
// set this flag
|
|
prevAdded = true;
|
|
// add the street
|
|
Place *street = (Place *)m_sm.getMem(sizeof(Place));
|
|
if ( ! street ) return false;
|
|
street->m_a = i;
|
|
street->m_b = j+1;
|
|
street->m_alnumA = alnumPos;
|
|
street->m_alnumB = alnumPos+subAlnumCount;
|
|
street->m_type = PT_STREET;
|
|
street->m_str = wptrs[i];
|
|
street->m_strlen = wptrs[j]+wlens[j]-wptrs[i];
|
|
//street->m_adm1[0] = 0;
|
|
//street->m_adm1[1] = 0;
|
|
street->m_adm1Bits= 0LL;
|
|
//street->m_crid = 0;
|
|
street->m_bits = 0;
|
|
street->m_address = NULL;
|
|
street->m_alias = NULL;
|
|
//street->m_hash = h;
|
|
//street->m_streetNumHash = 0;//wids[j];
|
|
//street->m_streetIndHash = 0;//h_po;
|
|
// why do we need this now?
|
|
if ( is_upper_a(wptrs[i][0]) )
|
|
street->m_bits |= PLF_HAS_UPPER;
|
|
//
|
|
// we are SPECIAL!!!!!!
|
|
//
|
|
street->m_flags2 = PLF2_IS_NAME;
|
|
// or in this
|
|
if ( saved ) street->m_flags2 |= PLF2_AFTER_AT;
|
|
// set the m_hash member
|
|
setHashes ( street , m_words , m_niceness );
|
|
// do not add if hash is zero, that usually means it
|
|
// is the single word "the"
|
|
if ( street->m_hash == 0 ) {
|
|
m_sm.rewind(1);
|
|
continue;
|
|
}
|
|
// sanity check
|
|
//if(street->m_hash == 0 ) { char *xx=NULL;*xx=0;}
|
|
//m_ns++;
|
|
// stop if full
|
|
//if ( m_ns >= MAX_STREETS ) break;
|
|
}
|
|
}
|
|
|
|
//
|
|
//
|
|
// END FAKE STREET LIST GENERATION
|
|
//
|
|
//
|
|
|
|
|
|
//
|
|
//
|
|
// add UNKNOWN addresses
|
|
//
|
|
// i.e. "location to be determined"
|
|
// i.e. "call for location"
|
|
// This will cause Events.cpp to set the EV_UNKNOWN_LOCATION bit!!!
|
|
//
|
|
int32_t b2;
|
|
bool add = false;
|
|
alnumPos = -1;
|
|
// do not do this if we are javascript
|
|
ni = nw;
|
|
if ( m_contentType == CT_JS ) ni = 0;
|
|
// do not do this if we have no sections -- call from msg13
|
|
if ( ! m_sections ) ni = 0;
|
|
// loop over every word
|
|
for ( int32_t i = 0 ; i < ni ; i++ ) {
|
|
// skip if not word
|
|
if ( ! wids[i] ) continue;
|
|
// skip if in script section or whatever to keep alnumPos right
|
|
// we need this to keep alnumPos in alignment with the other
|
|
// places!
|
|
if ( sp[i]->m_flags & badFlags ) continue;
|
|
// count this
|
|
alnumPos++;
|
|
// must match this
|
|
if ( i+6<nw &&
|
|
wids[i ] == h_location &&
|
|
wids[i+2] == h_to &&
|
|
wids[i+4] == h_be &&
|
|
wids[i+6] == h_determined ) {
|
|
add = true;
|
|
b2 = i + 7;
|
|
}
|
|
if ( i+6<nw &&
|
|
wids[i ] == h_call &&
|
|
wids[i+2] == h_for &&
|
|
wids[i+4] == h_location ) {
|
|
add = true;
|
|
b2 = i + 5;
|
|
}
|
|
// . no,no, i like looking for words that indicate events.
|
|
// getting into the meaning of the language seems to be the
|
|
// way to go, because signmeup.com's sections are all
|
|
// div tags describing the same event really.
|
|
// . no, now we fix this right with SEC_TOD_EVENT flags
|
|
// set in Dates.cpp. you can't telescope to a brother
|
|
// that has that flag set
|
|
// . "details tba"
|
|
// . fixes abtango.com where everyone uses the April 2010
|
|
// as a header
|
|
if ( i+2<nw &&
|
|
wids[i ] == h_details &&
|
|
wids[i+2] == h_tba ) {
|
|
add = true;
|
|
b2 = i + 3;
|
|
}
|
|
// call x-y-z for location
|
|
if ( i+6<nw &&
|
|
wids[i ] == h_call &&
|
|
wids[i+8] == h_for &&
|
|
wids[i+10] == h_location ) {
|
|
add = true;
|
|
b2 = i + 11;
|
|
}
|
|
// call x-y for location
|
|
if ( i+6<nw &&
|
|
wids[i ] == h_call &&
|
|
wids[i+6] == h_for &&
|
|
wids[i+8] == h_location ) {
|
|
add = true;
|
|
b2 = i + 9;
|
|
}
|
|
// skip if nothing found
|
|
if ( ! add ) continue;
|
|
// reset it
|
|
add = false;
|
|
// stop if full
|
|
//if ( m_ns >= MAX_STREETS ) break;
|
|
// add the street
|
|
Place *street = (Place *)m_sm.getMem(sizeof(Place));
|
|
if ( ! street ) return false;
|
|
street->m_a = i;//a2;
|
|
street->m_b = b2;
|
|
// do we need these?
|
|
street->m_alnumA = alnumPos;
|
|
street->m_alnumB = alnumPos + 1; // this is wrong
|
|
street->m_type = PT_STREET;
|
|
street->m_str = wptrs[i];
|
|
street->m_strlen = wptrs[b2-1]+wlens[b2-1]-wptrs[i];
|
|
//street->m_adm1[0] = 0;
|
|
//street->m_adm1[1] = 0;
|
|
street->m_adm1Bits= 0LL;
|
|
//street->m_crid = 0;
|
|
street->m_bits = 0;
|
|
street->m_address = NULL;
|
|
street->m_alias = NULL;
|
|
// why do we need this now?
|
|
if ( is_upper_a(wptrs[i][0]) )
|
|
street->m_bits |= PLF_HAS_UPPER;
|
|
// we are SPECIAL!!!!!!
|
|
street->m_flags2 = PLF2_IS_NAME | PLF2_AFTER_AT;
|
|
// set the m_hash member
|
|
setHashes ( street , m_words , m_niceness );
|
|
// do not add if hash is zero, that usually means it
|
|
// is the single word "the"
|
|
if ( street->m_hash == 0 ) continue;
|
|
// inc it
|
|
//m_ns++;
|
|
}
|
|
|
|
|
|
// update this
|
|
//m_ns = m_ns;
|
|
|
|
// sanity check
|
|
//if ( m_ns > MAX_STREETS ) { char *xx=NULL;*xx=0; }
|
|
|
|
//if ( m_ns == MAX_STREETS ) {
|
|
// log("addr: street buf is maxed out for %s!",m_url->m_url);
|
|
// //char *xx=NULL;*xx=0;
|
|
//}
|
|
|
|
// if no streets found, then bail, that is it
|
|
if ( m_sm.getNumPtrs() == 0 ) return true;
|
|
|
|
// breached?
|
|
//if ( m_sm.getNumPtrs() > 4000 )
|
|
// m_breached = true;
|
|
|
|
/////////////////////////////
|
|
//
|
|
// set PLF2_REGISTER
|
|
//
|
|
/////////////////////////////
|
|
// do not do this logic if we are javascript because we do not set
|
|
// SEC_SENTENCE if the file is javascript
|
|
int32_t imax = m_sm.getNumPtrs();//m_ns;
|
|
if ( m_contentType == CT_JS ) imax = 0;
|
|
|
|
//
|
|
// if it is a place to buy tickets or register for an event then
|
|
// let's set this flag so Events.cpp can ignore it!
|
|
for ( int32_t i = 0 ; i < imax ; i++ ) {
|
|
// not for msg13's call
|
|
if ( ! m_sections ) break;
|
|
// get the street that we center the address around
|
|
Place *street = (Place *)m_sm.getPtr(i);
|
|
// telescope up until we hit the sentence section
|
|
Section *ss = m_sections->m_sectionPtrs[street->m_a];
|
|
for ( ; ss ; ss = ss->m_parent )
|
|
if ( ss->m_flags & SEC_SENTENCE ) break;
|
|
// must have it
|
|
if ( ! ss ) { char *xx=NULL;*xx=0; }
|
|
// . if section is contained in title tag, allow it through
|
|
// . fixes "Tingley Coliseum : Buy Tickets , ... " for
|
|
// events.mapchannels.com
|
|
if ( ss->m_flags & SEC_IN_TITLE ) continue;
|
|
// . use it as the bookends
|
|
// . [a,b) may now actually expand beyond the "ss" section
|
|
// because of the new split sentence logic in
|
|
// Sections::addSentences() to deal with sentences that
|
|
// unevenly span multiple sections like in aliconference.com
|
|
// and abqtango.com
|
|
int32_t a = ss->m_senta;
|
|
int32_t b = ss->m_sentb;
|
|
// use this i guess
|
|
if ( isTicketDate ( a , b , m_wids , m_bits , m_niceness ) )
|
|
street->m_flags2 |= PLF2_TICKET_PLACE;
|
|
/*
|
|
// assume not
|
|
bool reg = false;
|
|
// now scan forward from there
|
|
for ( int32_t j = a ; j < b ; j++ ) {
|
|
// skip punct words
|
|
if ( ! m_wids[j] ) continue;
|
|
// is it register?
|
|
if ( m_wids[j] == h_register ) {
|
|
reg = true; break; }
|
|
if ( m_wids[j] == h_sign && m_wids[j+2] == h_up ) {
|
|
reg = true; break; }
|
|
if ( m_wids[j] == h_signup ) {
|
|
reg = true; break; }
|
|
if ( m_wids[j] == h_buy && m_wids[j+2] == h_tickets ) {
|
|
reg = true; break; }
|
|
if ( m_wids[j] == h_purchase&&m_wids[j+2]==h_tickets) {
|
|
reg = true; break; }
|
|
if ( m_wids[j] == h_get && m_wids[j+2] == h_tickets ) {
|
|
reg = true; break; }
|
|
// "give them tickets to" for santafe playhouse url
|
|
// to cancel out "Max's or Dish n' Spoon" as a place
|
|
if ( m_wids[j] == h_tickets&& m_wids[j+2] == h_to ) {
|
|
reg = true; break; }
|
|
if ( m_wids[j] == h_presale ) {
|
|
reg = true; break; }
|
|
if ( m_wids[j] == h_on && m_wids[j+2] == h_sale ) {
|
|
reg = true; break; }
|
|
if ( m_wids[j] == h_pre && m_wids[j+2] == h_sale ) {
|
|
reg = true; break; }
|
|
if ( m_wids[j] == h_sales && m_wids[j+2] == h_end ) {
|
|
reg = true; break; }
|
|
if ( m_wids[j] == h_sales && m_wids[j+2] == h_begin ) {
|
|
reg = true; break; }
|
|
if ( m_wids[j] == h_sales && m_wids[j+2] == h_start ) {
|
|
reg = true; break; }
|
|
}
|
|
// it is such a place
|
|
if ( reg ) street->m_flags2 |= PLF2_TICKET_PLACE;
|
|
*/
|
|
}
|
|
|
|
|
|
|
|
|
|
//
|
|
// . set Section::numStreets var
|
|
// . scan streets and set Section::m_numStreets
|
|
// . if streets are adjacent in one continuous mass, then treat as
|
|
// a single street for these purposes
|
|
/*
|
|
for ( int32_t X = 0 ; X < ns ; X++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get the street that we center the address around
|
|
Place *street = &streets[X];
|
|
// get street before it
|
|
Place *prev = NULL; if ( X > 0 ) prev = &streets[X-1];
|
|
// . if we had a street immediately before us, bail
|
|
// . we count consecutive streets as a single street
|
|
if ( prev && prev->m_alnumB == street->m_alnumA ) continue;
|
|
// get it
|
|
Section *si = sp[street->m_a];
|
|
// inc recusrively
|
|
for ( ; si ; si = si->m_parent )
|
|
// inc it
|
|
si->m_numStreets++;
|
|
}
|
|
*/
|
|
|
|
// debug
|
|
//printPlaces( streets , ns , m_pbuf , m_sections );
|
|
|
|
//
|
|
//
|
|
// . the huge address creation part
|
|
// . ultimately sets m_addresses[]/m_na array
|
|
//
|
|
//
|
|
|
|
// . make 5 lists, one for each place type, to hold all the
|
|
// Places in the shortlist[] array we just created
|
|
// . include Places in the tagRec and title as well
|
|
// . use a NULL ptr to indicate "no place"
|
|
// . then do a 6-way nested loop over all the combos
|
|
Place *pname [10]; int32_t nn = 0;
|
|
Place *padm1 [MAX_ADM1 ]; int32_t na = 0;
|
|
Place *pcity [MAX_CITIES]; int32_t nc = 0;
|
|
Place *pzip [MAX_ZIPS]; int32_t nz = 0;
|
|
Place *psuite [10]; int32_t nu = 0;
|
|
// each latlon might be tethered to a street address already
|
|
// topologically speaking. we need to telescope it out and
|
|
// tether it to the first street we hit. including afterats and
|
|
// fake street names? it might be tethered to a place venue name
|
|
// that we never recognize. and instead we tether it to a brother
|
|
// brother city/state when we shouldn't.
|
|
//Place *latlon [MAX_LATLONS];
|
|
//Place *pctry [10]; int32_t ny = 0;
|
|
|
|
//Place places [ MAX_PLACES ];
|
|
//int32_t np = 0;
|
|
|
|
// sanity check
|
|
//if ( 500 > MAX_PLACES ) { char *xx=NULL;*xx=0; }
|
|
// add places from the body!
|
|
//np = addProperPlaces ( 0 , nw , 500 , places , MAX_PLACES , np ,
|
|
// // set this flag Place::m_flags
|
|
// PLF_FROMBODY );
|
|
|
|
/*
|
|
// add in default adm1/city/zip from title
|
|
int32_t a = 0;
|
|
int32_t b = 0;
|
|
int32_t tapos = 0;
|
|
if ( ss ) {
|
|
a = ss->m_titleStart;
|
|
tapos = ss->m_titleStartAlnumPos;
|
|
}
|
|
if ( ss ) b = ss->m_titleEnd ;
|
|
// limit those nasty long titles
|
|
if ( b > a + 30 ) b = a + 30;
|
|
|
|
// add proper places from title into "places" array
|
|
np = addProperPlaces ( a , b , 20 , places , MAX_PLACES , np ,
|
|
// . set this flag Place::m_flags
|
|
PLF_FROMTITLE ,
|
|
// alnumPos, subtract -1 since it immediately
|
|
// adds 1 to the first alnum it finds
|
|
tapos - 1 ,
|
|
-1 );
|
|
// breach check
|
|
if ( np > MAX_PLACES ) { char *xx=NULL;*xx=0; }
|
|
*/
|
|
|
|
// save for popping
|
|
//int32_t np_stack = m_np;
|
|
|
|
// shortcut
|
|
char **w = wptrs;
|
|
|
|
HashTableX dat;
|
|
char datbuf[4000];
|
|
dat.set ( 4 , 4 , 256, datbuf, 4000,false,m_niceness,"adm1buf");
|
|
// . set up the base array of all states
|
|
// . "bn" = baseNum
|
|
// . TODO: make sure state we select is not in a street!
|
|
int32_t bn = 0;
|
|
// always have a NULL
|
|
padm1 [ bn++ ] = NULL;
|
|
// then
|
|
for ( int32_t i = 0 ; i < m_npSaved ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get city, state or zip
|
|
Place *p = (Place *)m_pm.getPtr(i);
|
|
// . allow state to come from anywhere in the document
|
|
// . TODO: later add meta description to get christinesaari.com
|
|
if ( p->m_type != PT_STATE ) continue;
|
|
// skip if intersects a street, like "ohio street"
|
|
if ( p->m_a >= 0 && bits && (bits[p->m_a] & D_IS_IN_STREET) )
|
|
continue;
|
|
// make the key for deduping
|
|
char key[4];
|
|
key[0] = p->m_adm1[0];
|
|
key[1] = p->m_adm1[1];
|
|
key[2] = 0;
|
|
key[3] = 0;
|
|
// skip if dup
|
|
if ( dat.isInTable ( &key ) ) continue;
|
|
// add it to the dedup table
|
|
if ( ! dat.addKey ( &key, &p ) ) return false;
|
|
// add to our array
|
|
padm1 [ bn++ ] = p;
|
|
}
|
|
// how can this happen?
|
|
if ( bn > 55 ) { char *xx=NULL;*xx=0; }
|
|
|
|
|
|
// "X" loops over all the streets we have
|
|
for ( int32_t X = 0 ; X < m_sm.getNumPtrs() ; X++ ) {
|
|
// get the street that we center the address around
|
|
Place *street = (Place *)m_sm.getPtr(X);
|
|
// debug
|
|
//logf(LOG_DEBUG,"events: ****** X=%"INT32" *****",X);
|
|
// reset these
|
|
nc = 0;
|
|
na = bn;
|
|
nz = 0;
|
|
nn = 0;
|
|
nu = 0;
|
|
//ny = 0;
|
|
// preserve the places on there from title
|
|
//np = np_stack;
|
|
// these guys are allowed to have "no place", but everyone else
|
|
// must have something
|
|
pzip [nz++] = NULL;
|
|
//padm1 [na++] = NULL;
|
|
//psuite [nu++] = NULL;
|
|
//pctry [ny++] = NULL;
|
|
//if ( dc > 0 ) pcity [nc++] = NULL;
|
|
//if ( dc > 0 ) padm1 [na++] = NULL;
|
|
//if ( dc > 0 ) pname [nn++] = NULL;
|
|
// add a NULL because if city is unique we can fill this in
|
|
//padm1 [na++] = NULL;
|
|
// likewise, if we have a zip code we can fill in the city too
|
|
pcity [nc++] = NULL;
|
|
|
|
|
|
//
|
|
// search for a suite name BEFORE the street
|
|
//
|
|
int32_t k = street->m_a - 1 ;
|
|
// re-set this
|
|
alnumPos = street->m_alnumA ;
|
|
// start of it
|
|
int32_t ak = -1;
|
|
// flag init
|
|
bool gotSuiteBefore = 0;
|
|
// ptr
|
|
Place *suiteBefore = NULL;
|
|
// suite hash
|
|
int64_t suh = 0LL;
|
|
// start alnumPos
|
|
int32_t akPos = -1;
|
|
// now scan for suite, stop after hitting our first alnum word
|
|
for ( ; k >= 0 ; k-- ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// skip if non alnum word
|
|
if ( ! wids[k] ) continue;
|
|
// skip if in a script section
|
|
if (sp&&sp[k]&&(sp[k]->m_flags & badFlags) ) continue;
|
|
// it's an alnum
|
|
alnumPos--;
|
|
// stop if we are not a suite designation
|
|
if ( wlens[k] != 1 && ! m_words->hasDigit(k) ) break;
|
|
// now before us must be a # sign
|
|
if ( k - 1 > 0 && m_words->hasChar(k-1,'#') ) {
|
|
// start of it was this punct word i guess
|
|
ak = k - 1;
|
|
// and this
|
|
akPos = alnumPos;
|
|
// update suite hash
|
|
suh = wids[k];
|
|
}
|
|
// or a suite indicator
|
|
if ( k - 2 >= 0 &&
|
|
( wids[k-2] == h_suite ||
|
|
wids[k-2] == h_ste ||
|
|
wids[k-2] == h_building ||
|
|
wids[k-2] == h_bldg ||
|
|
wids[k-2] == h_bld ||
|
|
wids[k-2] == h_pier ||
|
|
wids[k-2] == h_room ||
|
|
wids[k-2] == h_rm ||
|
|
wids[k-2] == h_unit ) ) {
|
|
// set this
|
|
akPos = alnumPos - 1;
|
|
// start here
|
|
ak = k - 2;
|
|
// update suite hash
|
|
suh = wids[k];
|
|
// skip that
|
|
//k++;
|
|
// skip punct word
|
|
//k++;
|
|
// update suite hash
|
|
suh <<= 1;
|
|
// xor it in
|
|
suh ^= wids[k];
|
|
// and the indicator
|
|
suh <<= 1;
|
|
suh ^= wids[k-2];
|
|
}
|
|
// that is it either way
|
|
break;
|
|
}
|
|
// add the suite before the place name
|
|
if ( suh ) { // && m_np < MAX_PLACES ) {
|
|
// note it
|
|
gotSuiteBefore = true;
|
|
// sanity check
|
|
//if ( m_np >= MAX_PLACES ) { char *xx=NULL;*xx=0; }
|
|
// point to the suite to add
|
|
Place *pp = (Place *)m_pm.getMem(sizeof(Place));
|
|
if ( ! pp ) return false;
|
|
// point to it
|
|
suiteBefore = pp;
|
|
// length
|
|
int32_t plen = wptrs[k]-wptrs[ak]+wlens[k];
|
|
// point to the suite
|
|
char *ps = wptrs[ak];
|
|
// skip over initial comma
|
|
if ( *ps == ',' ) { ps++; plen--; }
|
|
// set it
|
|
pp->m_a = ak;
|
|
pp->m_b = k+1;
|
|
pp->m_alnumA = akPos;
|
|
pp->m_alnumB = alnumPos+1;
|
|
pp->m_type = PT_SUITE;
|
|
pp->m_str = ps;
|
|
pp->m_strlen = plen;
|
|
pp->m_hash = 0LL;//suh;
|
|
//pp->m_adm1[0] = 0;
|
|
//pp->m_adm1[1] = 0;
|
|
//pp->m_crid = 0;
|
|
pp->m_bits = 0;
|
|
pp->m_flags2 = 0;
|
|
// that's a suite
|
|
psuite[nu++] = pp;
|
|
// now just use this
|
|
setHashes(pp,m_words,m_niceness);
|
|
// point to next place
|
|
//m_np++;
|
|
}
|
|
|
|
|
|
|
|
//
|
|
// search for a suite name after the street
|
|
//
|
|
k = street->m_b;
|
|
// re-set this
|
|
alnumPos = street->m_alnumB - 1;
|
|
// suite hash
|
|
suh = 0LL;
|
|
// remember start of suite
|
|
int32_t startk = -1;
|
|
int32_t startAlnumPos = -1;
|
|
char got = 0;
|
|
// point to next street
|
|
Place *next = NULL;
|
|
if ( X+1 < m_sm.getNumPtrs() )
|
|
next = (Place *)m_sm.getPtr(X+1);
|
|
// skip until we got a wordid
|
|
for ( ; k < nw ; k++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// skip if not an alnum word
|
|
if ( ! wids[k] ) continue;
|
|
// skip if in a script section
|
|
if (sp&&sp[k]&&(sp[k]->m_flags & badFlags) ) continue;
|
|
// it's an alnum
|
|
alnumPos++;
|
|
// start here
|
|
if ( wids[k] == h_building ) { got = 3; continue; }
|
|
if ( wids[k] == h_bldg ) { got = 3; continue; }
|
|
if ( wids[k] == h_bld ) { got = 3; continue; }
|
|
if ( wids[k] == h_unit ) { got = 3; continue; }
|
|
if ( wids[k] == h_suite ) { got = 2; continue; }
|
|
if ( wids[k] == h_ste ) { got = 2; continue; }
|
|
if ( wids[k] == h_pier ) { got = 3; continue; }
|
|
if ( wids[k] == h_room ) { got = 3; continue; }
|
|
if ( wids[k] == h_rm ) { got = 3; continue; }
|
|
// having a # sign before us is good!
|
|
if ( k-1>=0 && !tids[k-1]&& ! got &&
|
|
m_words->hasChar(k-1,'#'))
|
|
got = 1;
|
|
// stop if no suite indicator
|
|
if ( ! got ) break;
|
|
// no tag must precede us
|
|
if ( tids[k-1] ) break;
|
|
// a number follows?
|
|
bool isNum = false;
|
|
if ( is_digit(wptrs[k][0])) isNum = true;
|
|
// a single letter counts as a number too!
|
|
if ( wlens[k]==1 ) isNum = true;
|
|
// or if we end in a number
|
|
if ( is_digit(wptrs[k][wlens[k]-1])) isNum = true;
|
|
// everyone but suites need something more stringent
|
|
if ( got == 3 && ! isNum ) { got = 0; continue; }
|
|
// put back
|
|
if ( got == 3 ) got = 2;
|
|
// remember the start of it
|
|
startk = k - got;
|
|
// and this too
|
|
if ( got == 2 ) startAlnumPos = alnumPos - 1;
|
|
// if just the pound sign, do not change this
|
|
else startAlnumPos = alnumPos;
|
|
// incorporate into the suite place hash
|
|
if ( got == 2 ) suh = wids[k];
|
|
else suh = 0;
|
|
// incorporate ourselves into "suh" (suite hash)
|
|
suh <<= 1;
|
|
suh ^= wids[k];
|
|
// next is supposed to be the next street name!
|
|
// but it can run into the next list of fake street
|
|
// names that we added above, so fix that
|
|
if ( next && next->m_a <= k ) next = NULL;
|
|
// all done?
|
|
bool gotExt = true;
|
|
if ( k+1 >= nw ) gotExt = false;
|
|
else if ( wptrs[k+1][0] != '-' ) gotExt = false;
|
|
else if ( wlens[k+1] != 1 ) gotExt = false;
|
|
// fix "Suite 920-N"
|
|
//if ( ! is_digit(wptrs[k+2][0]) ) gotExt = false;
|
|
if ( next && k + 2 >= next->m_a ) gotExt = false;
|
|
// if we got something like "Suite G-2" (extension)
|
|
// then add these up
|
|
if ( gotExt ) {
|
|
k += 2;
|
|
alnumPos += 1;
|
|
// incorporate that too
|
|
suh <<= 1;
|
|
suh ^= wids[k];
|
|
}
|
|
// length
|
|
int32_t plen = wptrs[k]-wptrs[startk]+wlens[k];
|
|
// sanity check. i've seen this happen before,
|
|
// on http://cruises.priceline.com/promotion/price
|
|
// line/lm/default.asp for the $339 price, so let's
|
|
// just ignore such beasties now
|
|
if ( plen > 100 ) continue;//{ char *xx=NULL;*xx=0; }
|
|
// sanity check -- if we have no room, bail!
|
|
//if ( m_np >= MAX_PLACES ) break;
|
|
// point to the suite to add
|
|
Place *pp = (Place *)m_pm.getMem(sizeof(Place));
|
|
if ( ! pp ) return false;
|
|
// point to the suite
|
|
char *ps = wptrs[startk];
|
|
// skip over initial comma
|
|
if ( *ps == ',' ) { ps++; plen--; }
|
|
// set it
|
|
pp->m_a = startk;
|
|
pp->m_b = k+1;
|
|
pp->m_alnumA = startAlnumPos;
|
|
pp->m_alnumB = alnumPos+1;
|
|
pp->m_type = PT_SUITE;
|
|
pp->m_str = ps;
|
|
pp->m_strlen = plen;
|
|
pp->m_hash = 0;//suh;
|
|
//pp->m_adm1[0] = 0;
|
|
//pp->m_adm1[1] = 0;
|
|
//pp->m_crid = 0;
|
|
pp->m_bits = 0;
|
|
pp->m_flags2 = 0;
|
|
// that's a suite
|
|
psuite[nu++] = pp;
|
|
// now just use this
|
|
setHashes(pp,m_words,m_niceness);
|
|
// point to next place
|
|
//m_np++;
|
|
// all done
|
|
break;
|
|
}
|
|
|
|
// provide an empty suite if none
|
|
if ( nu <= 0 ) psuite [nu++] = NULL;
|
|
|
|
// "end" is the word # of first word in the street address
|
|
int32_t end = street->m_a;
|
|
int32_t endAlnum = street->m_alnumA;
|
|
// but if we had a suite before... skip over it
|
|
if ( gotSuiteBefore ) {
|
|
end = suiteBefore->m_a;
|
|
endAlnum = suiteBefore->m_alnumA;
|
|
}
|
|
|
|
///////////////////////
|
|
//
|
|
// GET THE PLACE NAME before the street (or before the suite)
|
|
//
|
|
///////////////////////
|
|
|
|
// start at word before word # end
|
|
int32_t i = end - 1;
|
|
// start here
|
|
int32_t pa2 = m_am.getNumPtrs() - 1; // m_na - 1;
|
|
// save start of place array
|
|
int32_t savednp = m_pm.getNumPtrs();//m_np;
|
|
// save start of name array
|
|
int32_t savednn = nn;
|
|
// init
|
|
Address *preva = NULL;
|
|
// assign
|
|
if ( pa2 >= 0 ) preva = (Address *)m_am.getPtr(pa2);
|
|
|
|
// count how many place names we add
|
|
int32_t pcount = 0;
|
|
|
|
////
|
|
//
|
|
// "Tingley Colesium, Abq NM"
|
|
//
|
|
// if the street is a place name, skip this next part...
|
|
//
|
|
////
|
|
if ( street->m_flags2 & PLF2_IS_NAME ) i = -1;
|
|
|
|
|
|
// we come back up here to filter out street address labels
|
|
redo:
|
|
|
|
// set this
|
|
int32_t mini = -1;
|
|
// get the prev address b boundary
|
|
if ( preva ) mini = preva->m_street->m_b;
|
|
// if preva was inlined, use zip or adm1 then
|
|
if ( preva && (preva->m_flags & AF_INLINED) ) {
|
|
if ( preva->m_zip && preva->m_zip->m_b > mini )
|
|
mini = preva->m_zip->m_b;
|
|
if ( preva->m_adm1 && preva->m_adm1->m_b > mini )
|
|
mini = preva->m_adm1->m_b;
|
|
if ( preva->m_city && preva->m_city->m_b > mini )
|
|
mini = preva->m_city->m_b;
|
|
}
|
|
|
|
int32_t parensCount = 0;
|
|
// keep an ongoing hash of alnum words in the name
|
|
//int64_t h = 0LL;
|
|
// backup until we hit an alnum
|
|
for ( ; i >= 0 ; i-- ) {
|
|
|
|
// do not cross a title tag to get place name
|
|
if ( tids[i] == TAG_TITLE ) { i = -1; break; }
|
|
if ( tids[i] == (TAG_TITLE|BACKBIT)) { i = -1; break; }
|
|
|
|
// skip if not alnum word
|
|
if ( ! wids[i] ) {
|
|
// skip tags
|
|
if ( tids[i] ) continue;
|
|
// see if this punct word has a ')' in it!
|
|
char *pp = wptrs[i];
|
|
char *ppend = pp + wlens[i];
|
|
for ( ; pp < ppend ; pp++ ) {
|
|
// count 'em
|
|
if ( *pp=='(' ) parensCount--;
|
|
if ( *pp==')' ) parensCount++;
|
|
}
|
|
continue;
|
|
}
|
|
// . skip if in bad section
|
|
// . the two trumba.com urls have quite a few
|
|
// addresses in common, causing the place names
|
|
// to get their SEC_DUP bit set. But our new algo
|
|
// plays somewhat nicely with menu cruft because
|
|
// we have to verify the place names with another
|
|
// website to really make the place name stick,
|
|
// so let's no longer use SEC_DUP or'ed in with
|
|
// the badFlags. mdw.
|
|
if ( sp && (sp[i]->m_flags & badFlags ) ) // |SEC_DUP)
|
|
continue;
|
|
|
|
// abqpeaceandjustice.org has their address on every
|
|
// web page, but on one web page it was
|
|
// "202 Hardvard SE" and another it was SouthEast...
|
|
// BUT for the most part this logic is ok!
|
|
|
|
// if the street does not have SEC_DUP set in its
|
|
// section, BUT the name does, then ignore the name!
|
|
if ( street->m_a>= 0 &&
|
|
// msg13 has no sections
|
|
sp &&
|
|
// if street section does not have SEC_DUP set
|
|
! (sp[street->m_a]->m_votesForDup) &&
|
|
// but the ith word does
|
|
( sp[i]->m_votesForDup ) )
|
|
// then skip over this word and do not
|
|
// allow it to be the place name
|
|
continue;
|
|
|
|
// . skip if "at"
|
|
// . "Post Office & Library at 950 pinetree se ..."
|
|
// http://www.xeriscapenm.com/xeriscape_gardens.php
|
|
// . no "thru October at 6718 Rio Grande NW."
|
|
// . "write elizabeth doak, treasurer at 1606 silver"
|
|
// . no no i guess we got date detection now
|
|
// . and skip "xyz [is located at] 123 main st"
|
|
if ( wids[i] == h_at && is_lower_utf8(wptrs[i]) )
|
|
continue;
|
|
|
|
if ( wids[i] == h_is && is_lower_utf8(wptrs[i]) )
|
|
continue;
|
|
|
|
if ( wids[i] == h_located && is_lower_utf8(wptrs[i]) )
|
|
continue;
|
|
|
|
// skip phone #'s
|
|
if ( i>=6 &&
|
|
wlens[i]==4 &&
|
|
m_words->isNum(i) &&
|
|
wlens[i-2]==3 &&
|
|
m_words->isNum(i-2) &&
|
|
wlens[i-4]==3 &&
|
|
m_words->isNum(i-4) ) {
|
|
i -= 4;
|
|
continue;
|
|
}
|
|
// phone with no area code
|
|
else if ( i>=4 &&
|
|
wlens[i]==4 &&
|
|
m_words->isNum(i) &&
|
|
wlens[i-2]==3 &&
|
|
m_words->isNum(i-2) ) {
|
|
i -= 2;
|
|
continue;
|
|
}
|
|
|
|
//
|
|
// . we are getting place names like "3 baths..."
|
|
// for "6769 Guadalupe Trl Nw" for the url
|
|
// http://www.realtor.com/property-detail/608-
|
|
// Bledsoe-Rd-NW_Albuquerque_NM_87107_fa9ca500
|
|
// which are in the section of a different street,
|
|
// so fix that with this logic.
|
|
// . basically expand the section around "i" and see
|
|
// if it belongs to street #X or to street #X-1.
|
|
//
|
|
|
|
// get prev street
|
|
Place *prev = NULL;
|
|
if ( X>0 ) prev = (Place *)m_sm.getPtr(X-1);
|
|
// flags
|
|
bool gotOurStreet = false;
|
|
bool gotPrevStreet = false;
|
|
// keep expanding the section around the
|
|
// place name until we get a street or multiple
|
|
// streets. if we only get a single street, then
|
|
// it must be OUR STREET, "street"
|
|
Section *si = NULL;
|
|
// msg13 has no sections
|
|
if ( sp ) si = sp[i];
|
|
// keep expanding section until we got street in it
|
|
for ( ; prev && si ; si=si->m_parent ) {
|
|
// stop when it contains our street or
|
|
// previous street
|
|
if ( si->m_a <= street->m_a &&
|
|
si->m_b >= street->m_b )
|
|
gotOurStreet = true;
|
|
if ( si->m_a <= prev->m_a &&
|
|
si->m_b >= prev->m_b )
|
|
gotPrevStreet = true;
|
|
// break on either
|
|
if ( gotOurStreet ) break;
|
|
if ( gotPrevStreet ) break;
|
|
}
|
|
// if it is more closely related to the previous street
|
|
// then do not assign this place name to us, i guess
|
|
// we do not have a good one for this street!
|
|
if ( gotPrevStreet && ! gotOurStreet )
|
|
i = -1;
|
|
|
|
// ok we got a candidate
|
|
break;
|
|
}
|
|
|
|
// . if our place name candidate is in a date, then assume
|
|
// that we have no place name!
|
|
// . fixes http://obits.abqjournal.com/obits/2004/04/13
|
|
if ( i >= 0 && i < nw && bits && ( bits[i] & D_IS_IN_DATE ) &&
|
|
// in case place name ends in midnight or noon
|
|
wids[i] != h_daily &&
|
|
wids[i] != h_noon &&
|
|
wids[i] != h_midnight )
|
|
i = -1;
|
|
|
|
// fix "copyright ; 2009 Albuquerque Journal; Abq ; NM"
|
|
// for http://obits.abqjournal.com/obits/2004/04/13
|
|
if ( i >= 0 && i < nw && wids[i] == h_copyright) {
|
|
// stop getting a name
|
|
i = -1;
|
|
// and mark street as bad
|
|
//street->m_bits |= PLF_IGNORE;
|
|
// go to next street!
|
|
continue;
|
|
}
|
|
|
|
// set that as our right side
|
|
int32_t righti = i;
|
|
// reset this count
|
|
int32_t alnumCount = 0;
|
|
int32_t alphaCount = 0;
|
|
// reset this
|
|
int32_t atPos = -1;
|
|
bool atCityName = false;
|
|
int32_t atAlnumCount = -1;
|
|
// reset this
|
|
bool hadUpper = false;
|
|
bool hadLower = false;
|
|
bool hadAnd = false;
|
|
// save last good i
|
|
int32_t lasti = -1;
|
|
bool isUpper;
|
|
bool isLower;
|
|
// . ok, go backwards up to 15 alnum words from there
|
|
// . The Harwood Museum of Art of the University of New Mexico
|
|
for ( ; i >= 0 && alnumCount < MAX_ALNUMS_IN_NAME ; i-- ) {
|
|
// ignore if in script, etc. tags
|
|
if ( sp && (sp[i]->m_flags & badFlags) ) continue;
|
|
// . ignore if in menu section
|
|
// . might be like "<td>place</td>"
|
|
// . i know for http://www.publicbroadcasting.net/kunm/
|
|
// events.eventsmain?action=showEvent&eventID=833142
|
|
// we are getting "Address: " as the place name
|
|
// because it is in the table like that.
|
|
// . TODO: for single event pages we must require at
|
|
// least another page from same site with same
|
|
// tagPairHash to prevent this kind of thing
|
|
// . likewise, for the same reason above, there are
|
|
// two trumba.com urls that share some addresses
|
|
// in common and the place name is getting its
|
|
// SEC_DUP bit set, so let's rely more on
|
|
// verifying place name 1 and 2 than this:
|
|
//if ( sp[i]->m_flags & SEC_DUP ) continue;
|
|
// stop at tag, not bold tags though
|
|
// fix for local.yahoo.com highlighting terms
|
|
// in the place name.
|
|
if ( tids[i] ) {
|
|
if ( tids[i] == TAG_B ) continue;
|
|
if ( tids[i] == (TAG_B | BACKBIT) ) continue;
|
|
break;
|
|
}
|
|
// count alnums
|
|
if ( wids[i] ) {
|
|
|
|
// do not stop something in parentheses
|
|
if ( parensCount > 0 )
|
|
goto skipbreak;
|
|
|
|
// no dates allowed in name
|
|
if ( bits && (bits[i] & D_IS_IN_DATE) &&
|
|
// "1am gallery"
|
|
(wlens[i]!=3||
|
|
to_lower_a(wptrs[i][1])!='a') &&
|
|
// high noon saloon on
|
|
// www.estrelladelnortevineyard.com/
|
|
// SFV_retloc.php
|
|
wids[i] != h_daily &&
|
|
wids[i] != h_noon &&
|
|
wids[i] != h_midnight )
|
|
break;
|
|
/*
|
|
// if we are the "last" word in the place name
|
|
// then we must always be upper case!
|
|
if ( alnumCount == 0 &&
|
|
! is_upper_utf8(wptrs[i]) &&
|
|
// digits can not be upper case
|
|
! is_digit(wptrs[i]) &&
|
|
// allow "Subway at 1300 main st."
|
|
wids[i] != h_at &&
|
|
// allow "Cable.com"
|
|
(i-1<0 || wptrs[i][-1]=='.') )
|
|
break;
|
|
*/
|
|
|
|
// "KS CITY CONFIDENTIAL and 99 RIVER STREET"
|
|
if ( alnumCount==0 && wids[i]==h_and) break;
|
|
/*
|
|
// "Property Information for 440 Bledsoe Rd"
|
|
// "Map for ..."
|
|
if ( alnumCount==0 && wids[i]==h_for) continue;
|
|
// "Map of ..."
|
|
if ( alnumCount==0 && wids[i]==h_of) continue;
|
|
*/
|
|
|
|
isLower = is_lower_utf8(wptrs[i]);
|
|
isUpper = is_upper_utf8(wptrs[i]);
|
|
|
|
// hack fix for "O'niell's Pub" (apostrop)
|
|
if ( i >= 2 &&
|
|
wlens[i-1] == 1 &&
|
|
wptrs[i-1][0]=='\''&&
|
|
wids[i-2] &&
|
|
wlens[i-2] == 1 &&
|
|
wptrs[i-2][0] =='O' ) {
|
|
// assume it is not lower case
|
|
isLower = false;
|
|
isUpper = true;
|
|
}
|
|
|
|
// if this is lower and we had an upper
|
|
if ( isLower &&
|
|
hadUpper &&
|
|
// must not be an allowable lowercase word
|
|
! s_lc.isInTable(&wids[i]) &&
|
|
// fix "Bandido's Hideout Restaurant" cuz
|
|
// it was breaking on the "s" cuz that is
|
|
// not a query stop word!
|
|
wlens[i] > 1 )
|
|
break;
|
|
// if we had a lower non-stop word, and then
|
|
// we hit an upper...
|
|
if ( isUpper && hadLower ) {
|
|
// force an abort on this street
|
|
lasti = -1;
|
|
break;
|
|
}
|
|
// if we hit a number followed by am or pm,
|
|
// that is a time so stop the scan!
|
|
//if (( wids[i] == h_am || wids[i] == h_pm ) &&
|
|
// i >= 2 && is_digit(wptrs[i-2][0]) )
|
|
// break;
|
|
// if we hit "by" and "sponsored" or
|
|
// "arranged" precedes it, stop!
|
|
// fixes: "arrangements by ..." in
|
|
// obits.abqjournal.com/obits/2004/04/13
|
|
if ( wids[i] == h_by && i-2>=0 &&
|
|
( wids[i-2] == h_arrangements ||
|
|
wids[i-2] == h_arranged ||
|
|
wids[i-2] == h_sponsored ) )
|
|
break;
|
|
// if we got something and we hit the
|
|
// previous address zip or state or city
|
|
// then just stop
|
|
if ( i < mini && lasti >= 0 )
|
|
break;
|
|
// to be more strict, no lower at all!
|
|
// NO! we lose "explora" then
|
|
//if ( is_lower_utf8(wptrs[i]) &&
|
|
// ! ww->isQueryStopWord(i) )
|
|
// break;
|
|
// . cut off here too
|
|
// . do not include the previous street name
|
|
// as part of your place name
|
|
if ( //preva &&
|
|
i < mini && // preva->m_street->m_b &&
|
|
lasti == -1 ) {
|
|
// skip over it
|
|
i = preva->m_street->m_a - 1;
|
|
// update prev
|
|
pa2--;
|
|
if ( pa2>=0 )
|
|
preva=(Address *)m_am.getPtr(pa2);
|
|
else
|
|
preva = NULL;
|
|
// now we only redo if this is the
|
|
// FIRST place name
|
|
if ( pcount == 0 ) goto redo;
|
|
// otherwise, stop it!
|
|
break;
|
|
}
|
|
// if we did have some junk in the place name
|
|
// then use that, but do not include this
|
|
// street name as part of it
|
|
if ( preva && i < preva->m_street->m_b )
|
|
break;
|
|
|
|
// if we hit previous address
|
|
|
|
skipbreak:
|
|
// store the last good word position
|
|
lasti = i;
|
|
// count it
|
|
alnumCount++;
|
|
// NO! we are looping backwards, so we
|
|
// can't do this here. we now do it below
|
|
// mix it up
|
|
//h <<= 1;
|
|
// hash it into our ongoing hash
|
|
//h ^= wids[i];
|
|
// skip words starting with a digit
|
|
if ( is_digit(wptrs[i][0]) ) continue;
|
|
// consider it alpha i guess now
|
|
alphaCount++;
|
|
// is it upper?
|
|
if ( isUpper ) hadUpper = 1;
|
|
|
|
if ( wids[i] == h_and ) hadAnd = true;
|
|
|
|
// caution "Santa Fe Co-op" or "E-mail" is ok ;
|
|
// don't set hadLower for "op" or "mail"
|
|
if ( i-2>= 0 && wptrs[i][-1]=='-' &&
|
|
is_alnum_a(wptrs[i][-2]) )
|
|
continue;
|
|
|
|
// same goes for "Cable.com"
|
|
if ( i-2>= 0 && wptrs[i][-1]=='.' &&
|
|
is_alnum_a(wptrs[i][-2]) )
|
|
continue;
|
|
|
|
// hadLower only valid if not query stop word
|
|
if ( isLower && //_lower_utf8(wptrs[i]) &&
|
|
// must not be an allowable lowercase word
|
|
! s_lc.isInTable(&wids[i])
|
|
// for some reason 's' is not a query
|
|
// stop word, and we had a bar named
|
|
// "Slim's" that we needed to get
|
|
// ... this is in s_lc table now
|
|
//! ww->isStopWord(i) )
|
|
)
|
|
hadLower = 1;
|
|
// record first at
|
|
if ( wids[i] == h_at && atPos == -1) {
|
|
atPos= i;
|
|
// save this in case we trim off
|
|
atAlnumCount = alnumCount - 1;
|
|
// get string from right after "at"
|
|
// and before the street and see
|
|
// if it is a city name. get hash
|
|
// of all those words so we can look
|
|
// it up. hashes all alnum words
|
|
// in [i+2,righti+1) interval.
|
|
atCityName = isCityName(i+2,righti+1);
|
|
}
|
|
// skip to next
|
|
continue;
|
|
}
|
|
// keep parensCount up to date
|
|
char *pp = wptrs[i];
|
|
char *ppend = pp + wlens[i];
|
|
for ( ; pp < ppend ; pp++ ) {
|
|
// count 'em
|
|
if ( *pp=='(' ) parensCount--;
|
|
if ( *pp==')' ) parensCount++;
|
|
}
|
|
// do not stop something in parentheses
|
|
if ( parensCount > 0 ) continue;
|
|
// only certain types of punct can be in a place name
|
|
if ( wlens[i] == 1 ) {
|
|
// single space ok
|
|
if ( is_wspace_a(w[i][0]) ) continue;
|
|
if ( w[i][0] == '\r' ) continue;
|
|
// hyphen ok
|
|
if ( w[i][0] == '-' ) continue;
|
|
// apostrophe ok
|
|
if ( w[i][0] == '\'' ) continue;
|
|
// / ok, "QX&V Electro/Mechanical"
|
|
// but breaks:
|
|
// "Santa Fe Playhouse/Santa Fe Little Theater"
|
|
//if ( w[i][0] == '/' ) continue;
|
|
// ampersand ok
|
|
if ( w[i][0] == '&' ) continue;
|
|
// asterisk ok ( e*trade)
|
|
if ( w[i][0] == '*' ) continue;
|
|
// period ok (xyz.com,u.s. post office)
|
|
if ( w[i][0] == '.' ) continue;
|
|
// . apostrophe ok if alnum-locked
|
|
// . "Bandido's Hideout"
|
|
if ( w[i][0]=='\'' )
|
|
if (is_alnum_a(w[i][-1]) &&
|
|
is_alnum_a(w[i][1]) )
|
|
continue;
|
|
// otherwise, not
|
|
break;
|
|
}
|
|
if ( wlens[i] == 2 ) {
|
|
// . up to one parenthetical is ok
|
|
// . "The Filling Station (Albuquerque, NM)"
|
|
// http://eventful.com/albuquerque/venues/
|
|
// the-filling-station-/V0-001-001121221-1
|
|
// . we now have parensCount for this
|
|
if ( is_wspace_a(w[i][0])&&
|
|
w[i][1]=='(')
|
|
break; // continue;
|
|
// double space ok
|
|
if ( is_wspace_a(w[i][0])&&
|
|
is_wspace_a(w[i][1]))
|
|
continue;
|
|
// . comma space
|
|
// . i was only allowing inc. or llc. to follow
|
|
// but what about:
|
|
// "NM Children, Youth, and Families Dept."
|
|
// . but then we got "St. John's College,
|
|
// Peterson Student Center" which is bad
|
|
// so now we require an and i guess
|
|
if ( w[i][0]==','&&
|
|
is_wspace_a(w[i][1]) &&
|
|
( hadAnd ||
|
|
wids[i+1] == h_inc ||
|
|
wids[i+1] == h_llc ) )
|
|
continue;
|
|
// Yahoo! or Yelp! Inc.
|
|
if ( w[i][0]=='!' &&
|
|
is_wspace_a(w[i][1]) &&
|
|
i+1<nw && wids[i+1]==h_inc )
|
|
continue;
|
|
// colon space
|
|
if ( w[i][0]==':'&&
|
|
is_wspace_a(w[i][1]) ) {
|
|
// NO NO NO, never allow names
|
|
// with colons in them now because
|
|
// we have "place name 2" to pick
|
|
// up the other name if it is a
|
|
// compound name containing a ':'
|
|
break;
|
|
// . Location: not allowed!
|
|
// . "Location: Albuquerque Dance Ctr"
|
|
if ( i-1>=0 && wids[i-1]==h_location)
|
|
break;
|
|
// . Address: not allowed!
|
|
if ( i-1>=0 && wids[i-1]==h_address)
|
|
break;
|
|
// stop at Phone: too!
|
|
if ( i-1>=0 && wids[i-1]==h_phone)
|
|
break;
|
|
// otherwise, allow it!
|
|
continue;
|
|
}
|
|
// the $1 store
|
|
if ( is_wspace_a(w[i][0])&&
|
|
w[i][1]== '$' )
|
|
continue;
|
|
// abbreviation (mtn. supply store)
|
|
if ( w[i][0]=='.'&&
|
|
// "moving co., inc." (allow comma after)
|
|
(is_wspace_a(w[i][1]) ||w[i][1]==',') &&
|
|
i-1>=0 && wids[i-1] &&
|
|
( isAbbr(wids[i-1]) || wlens[i-1]==1 ) &&
|
|
//fix "Institute Inc. All Rights Reserved"
|
|
// for www.aliconferences.com
|
|
wids[i-1] != h_inc )
|
|
continue;
|
|
// store #13
|
|
if ( is_wspace_a(w[i][0]) &&
|
|
w[i][1]== '#' )
|
|
continue;
|
|
// apostrophe space is ok (dunkin' donuts)
|
|
if ( w[i][0]=='\''&&
|
|
is_wspace_a(w[i][1]))
|
|
continue;
|
|
// otherwise, not
|
|
break;
|
|
}
|
|
if ( wlens[i] == 3 ) {
|
|
// crazy utf8 apostrophe from
|
|
// http://www.earthcare.org/guide_online/
|
|
// 197.html
|
|
if ( wptrs[i][0] == (char)0xe2 &&
|
|
wptrs[i][1] == (char)0x80 &&
|
|
wptrs[i][2] == (char)0x99 )
|
|
continue;
|
|
}
|
|
/*
|
|
if ( wlens[i] == 3 ) {
|
|
// "B & B plumbing"
|
|
if ( is_wspace_a(w[i][0])&&w
|
|
[i][1]=='&'&&
|
|
is_wspace_a(w[i][2]) )
|
|
continue;
|
|
// otherwise, not
|
|
break;
|
|
}
|
|
*/
|
|
|
|
// a string of nothing but \n and ' ' is allowed
|
|
// and i see that in quite a few pages. microsoft
|
|
// front page had this issue as i remember...
|
|
int32_t ampCount = 0;
|
|
int32_t comCount = 0;
|
|
// "Dr. Smith, Obstetrician / Gynecologist"
|
|
int32_t slashCount = 0;
|
|
// period is ok "Moving Co., Inc."
|
|
int32_t kstart = 0;
|
|
if ( w[i][0]=='.'&&
|
|
(is_wspace_a(w[i][1]) ||w[i][1]==',') &&
|
|
i-1>=0 && wids[i-1] &&
|
|
( isAbbr(wids[i-1]) || wlens[i-1]==1 ) )
|
|
kstart++;
|
|
// ok now do the loop
|
|
int32_t k ; for ( k = kstart ; k < wlens[i] ; k++ ) {
|
|
// "B & B Plumbing"
|
|
if ( w[i][k] == '&' ) {
|
|
if ( ++ampCount >= 2 ) break;
|
|
if ( comCount > 0 ) break;
|
|
if ( slashCount > 0 ) break;
|
|
continue;
|
|
}
|
|
if ( w[i][k] == '/' ) {
|
|
if ( ++slashCount >= 2 ) break;
|
|
if ( comCount > 0 ) break;
|
|
if ( ampCount > 0 ) break;
|
|
continue;
|
|
}
|
|
// . this is a good delimiter for place names
|
|
// usually, but of course if someone has
|
|
// "Gigablast, \nInc." then this will hurt!
|
|
// . i was only allowing inc. or llc. to follow
|
|
// but what about:
|
|
// "NM Children, Youth, and Families Dept."
|
|
|
|
if ( w[i][k] == ',' &&
|
|
( hadAnd ||
|
|
wids[i+1]==h_inc ||
|
|
wids[i+1]==h_llc)) {
|
|
if ( ++comCount >= 2 ) break;
|
|
if ( ampCount > 0 ) break;
|
|
if ( slashCount > 0 ) break;
|
|
continue;
|
|
}
|
|
if ( ! is_wspace_a(w[i][k]) )
|
|
break;
|
|
}
|
|
// skip if ok
|
|
if ( k == wlens[i] ) continue;
|
|
// nothing else allowed
|
|
break;
|
|
}
|
|
|
|
// forget it if too long
|
|
if ( alnumCount >= MAX_ALNUMS_IN_NAME )
|
|
lasti = -1;
|
|
|
|
// come back up here after removing the " ... at" substring
|
|
subloop:
|
|
// trim off lower case stop words from the beginning
|
|
for ( ; lasti >= 0 && lasti <= righti ; lasti++ ) {
|
|
// skip if not alnum
|
|
if ( ! wids[lasti] ) continue;
|
|
// assume nuked!
|
|
alnumCount--;
|
|
// is it like "Friday at The Source"?
|
|
if ( lasti+2 <= righti &&
|
|
//ww->isQueryStopWord(lasti+2) &&
|
|
s_lc.isInTable( &wids[lasti+2]) &&
|
|
getDayOfWeek ( wids[lasti] ) >= 1 ) continue;
|
|
// "monday, wednesday and friday at The Source"
|
|
if ( lasti+2 <= righti &&
|
|
getDayOfWeek ( wids[lasti+2]) >= 1 &&
|
|
getDayOfWeek ( wids[lasti] ) >= 1 ) continue;
|
|
// . or stopword + day of week is bad too!
|
|
// . "Every Monday at The Source"
|
|
if ( lasti+2 <= righti &&
|
|
getDayOfWeek ( wids[lasti+2] ) >= 1 &&
|
|
//( ww->isQueryStopWord(lasti) ||
|
|
( s_lc.isInTable(&wids[lasti]) ||
|
|
wids[lasti] == h_every ) ) continue;
|
|
// assume not nuked
|
|
alnumCount++;
|
|
// stop if not stop word
|
|
//if ( ! ww->isQueryStopWord(lasti) ) break;
|
|
if ( ! s_lc.isInTable(&wids[lasti]) ) break;
|
|
// stop if capitalized
|
|
if ( is_upper_utf8(wptrs[lasti]) &&
|
|
// trim a capitalized "At" off regardless
|
|
wids[lasti] != h_at ) break;
|
|
// assume nuked
|
|
alnumCount--;
|
|
}
|
|
// trim off lower case stop words from the end (Wat Center, at)
|
|
for ( ; righti >= lasti && lasti >= 0 ; righti-- ) {
|
|
// skip if not alnum
|
|
if ( ! wids[righti] ) continue;
|
|
// assume nuked
|
|
alnumCount--;
|
|
// . stop if not stop word
|
|
// . no! too strong. was removing "com" in "Cable.com"
|
|
// . "Sonic Drive-In" "Stepping Stones-Drop In"
|
|
//if ( ! ww->isQueryStopWord(righti) ) break;
|
|
if ( wids[righti] == h_at ) continue;
|
|
//if ( wids[righti] == h_in ) continue;
|
|
//if ( wids[righti] == h_by ) continue;
|
|
//if ( wids[righti] == h_and ) continue;
|
|
// not nuked
|
|
alnumCount++;
|
|
// stop it
|
|
break;
|
|
}
|
|
// if we included "at" then trim up until we hit the "at"
|
|
// UNLESS the place name starts with "The".
|
|
// we need to protect "The Lodge at Santa Fe" for instance.
|
|
if ( lasti>= 0 && lasti<=righti && wids[lasti] != h_the &&
|
|
atPos >= 0 &&
|
|
// ignore "at" in "at law" (e.g. "attorney at law")
|
|
// or really any other "at phrase" like that
|
|
wids[atPos+2] != h_law &&
|
|
// if a city name is between the "at" and the street,
|
|
// then assume the "at" is actually part of the place
|
|
// name!!
|
|
! atCityName ) {
|
|
lasti = atPos + 1;
|
|
// pop this back
|
|
alnumCount = atAlnumCount;
|
|
// undo
|
|
atPos = -1;
|
|
// redo filtering
|
|
goto subloop;
|
|
}
|
|
|
|
// "All rights reserved". no place name in this case
|
|
if ( alnumCount == 3 &&
|
|
lasti >= 0 &&
|
|
i+4<nw &&
|
|
wids[lasti ] == h_all &&
|
|
wids[lasti+2] == h_rights &&
|
|
wids[lasti+4] == h_reserved )
|
|
lasti = -1;
|
|
|
|
// "Contact Us". no place name in this case
|
|
if ( alnumCount == 2 &&
|
|
lasti >= 0 &&
|
|
i+2<nw &&
|
|
wids[lasti ] == h_contact &&
|
|
wids[lasti+2] == h_us )
|
|
lasti = -1;
|
|
|
|
// "[copyrightSign] 2000 Carrier Hotels"
|
|
if ( lasti-1>=0 &&
|
|
gb_strncasestr(wptrs[lasti-1],wlens[lasti-1],copy))
|
|
lasti = -1;
|
|
|
|
// "map of"
|
|
if ( alnumCount == 2 &&
|
|
lasti>=0 &&
|
|
wids[lasti] == h_map &&
|
|
lasti+2<nw &&
|
|
wids[lasti+2] == h_of )
|
|
lasti = -1;
|
|
|
|
// "map for"
|
|
if ( alnumCount == 2 &&
|
|
lasti>=0 &&
|
|
wids[lasti] == h_map &&
|
|
lasti+2<nw &&
|
|
wids[lasti+2] == h_for )
|
|
lasti = -1;
|
|
|
|
|
|
// fix "copyright ; 2009 Albuquerque Journal; Abq ; NM"
|
|
// for http://obits.abqjournal.com/obits/2004/04/13
|
|
if ( lasti >= 0 && alnumCount==1 && wids[lasti]==h_copyright)
|
|
lasti = -1;
|
|
|
|
// ends on lower case word with a whitespace before it
|
|
// so as to not hurt "Wendy's" or "citysearch.com"
|
|
if ( lasti >= 0 &&
|
|
hadUpper &&
|
|
righti>0 && // fix core...
|
|
is_wspace_a(wptrs[righti][-1]) &&
|
|
!is_digit(wptrs[righti][0]) &&
|
|
!is_upper_utf8(wptrs[righti]) )
|
|
lasti = -1;
|
|
|
|
// . we often get zips like "NM 87571" because the previous
|
|
// place has no official street but has a state/zip thing
|
|
// . fixes guidebookamerica.com
|
|
if ( lasti >= 0 &&
|
|
alnumCount == 2 &&
|
|
lasti + 2 < nw &&
|
|
isStateName (lasti) &&
|
|
wlens[lasti+2] == 5 &&
|
|
is_digit(wptrs[lasti+2][0]) )
|
|
lasti = -1;
|
|
|
|
// "New Mexico 87109"
|
|
if ( lasti >= 0 &&
|
|
alnumCount == 3 &&
|
|
lasti + 4 < nw &&
|
|
isStateName (lasti) &&
|
|
wlens[lasti+4] == 5 &&
|
|
is_digit(wptrs[lasti+2][0]) )
|
|
lasti = -1;
|
|
|
|
// now check to see if we should skip this place name and
|
|
// try another before it...
|
|
if ( lasti >= 0 ) {
|
|
// watch out for "Address:" which often precedes a
|
|
// street name when address is in a table
|
|
if ( alnumCount == 1 && wids[lasti] == h_address ) {
|
|
i = lasti - 1; goto redo; }
|
|
if ( alnumCount == 1 && wids[lasti] == h_street ) {
|
|
i = lasti - 1; goto redo; }
|
|
if ( alnumCount == 1 && wids[lasti] == h_where ) {
|
|
i = lasti - 1; goto redo; }
|
|
if ( alnumCount == 1 && wids[lasti] == h_location ) {
|
|
i = lasti - 1; goto redo; }
|
|
if ( alnumCount == 1 && wids[lasti] == h_office ) {
|
|
i = lasti - 1; goto redo; }
|
|
if ( alnumCount == 1 && wids[lasti] == h_map ) {
|
|
i = lasti - 1; goto redo; }
|
|
// fix "tel: xxxxxxx 9000 girard"
|
|
if ( alnumCount == 1 && wids[lasti] == h_tel ) {
|
|
i = lasti - 1; goto redo; }
|
|
if ( alnumCount == 1 && wids[lasti] == h_edit ) {
|
|
i = lasti - 1; goto redo; }
|
|
if ( alnumCount == 1 && wids[lasti] == h_email ) {
|
|
i = lasti - 1; goto redo; }
|
|
if ( alnumCount == 1 && wids[lasti] == h_added ) {
|
|
i = lasti - 1; goto redo; }
|
|
if ( alnumCount == 1 && wids[lasti] == h_copy ) {
|
|
i = lasti - 1; goto redo; }
|
|
if ( alnumCount == 1 && wids[lasti] == h_search ) {
|
|
i = lasti - 1; goto redo; }
|
|
if ( alnumCount == 1 && wids[lasti] == h_find ) {
|
|
i = lasti - 1; goto redo; }
|
|
if ( alnumCount == 1 && wids[lasti] == h_go ) {
|
|
i = lasti - 1; goto redo; }
|
|
if ( alnumCount == 1 && wids[lasti] == h_town ) {
|
|
i = lasti - 1; goto redo; }
|
|
if ( alnumCount == 1 && wids[lasti] == h_city ) {
|
|
i = lasti - 1; goto redo; }
|
|
|
|
// sometimes "phone:" wedged in there
|
|
if ( alnumCount == 1 && wids[lasti] == h_phone ) {
|
|
i = lasti - 1; goto redo; }
|
|
|
|
// "e-mail"
|
|
if ( alnumCount == 2 &&
|
|
wids[lasti] == h_e &&
|
|
wids[lasti+2] == h_mail ) {
|
|
i = lasti - 1; goto redo; }
|
|
|
|
// "mailing address"
|
|
if ( alnumCount == 2 &&
|
|
wids[lasti] == h_mailing &&
|
|
wids[lasti+2] == h_address ) {
|
|
i = lasti - 1; goto redo; }
|
|
|
|
// "mail address"
|
|
if ( alnumCount == 2 &&
|
|
wids[lasti] == h_mail &&
|
|
wids[lasti+2] == h_address ) {
|
|
i = lasti - 1; goto redo; }
|
|
|
|
// "snail mail"
|
|
if ( alnumCount == 2 &&
|
|
wids[lasti] == h_snail &&
|
|
wids[lasti+2] == h_mail ) {
|
|
i = lasti - 1; goto redo; }
|
|
|
|
// . skip over "33 miles..." or "33 mi..."
|
|
// . Carlsbad Cavern National Park
|
|
// 27 miles S of Carlsbad
|
|
// 3225 National Parks Highway
|
|
if ( alnumCount >= 2 &&
|
|
is_digit(wptrs[lasti][0]) &&
|
|
( wids[lasti+2] == h_mi ||
|
|
wids[lasti+2] == h_miles ||
|
|
wids[lasti+2] == h_km ||
|
|
wids[lasti+2] == h_kilometers ) ) {
|
|
i = lasti - 1; goto redo; }
|
|
|
|
// skip over "(1 review)" or "(33 reviews)"
|
|
if ( alnumCount == 1 &&
|
|
( wids[lasti] == h_review ||
|
|
wids[lasti] == h_reviews ) ) {
|
|
// skip number before too!
|
|
if ( lasti-2>=0 && is_digit(wptrs[lasti-2][0]))
|
|
i = lasti - 3;
|
|
else
|
|
i = lasti - 1;
|
|
goto redo;
|
|
}
|
|
|
|
// skip over "Write a Review"
|
|
if ( alnumCount == 3 &&
|
|
wids[lasti] == h_write &&
|
|
wids[lasti+2] == h_a &&
|
|
wids[lasti+4] == h_review ) {
|
|
i = lasti - 1;
|
|
// skip back until we hit a tag i guess
|
|
// if we have "Be the first to Write a Review"
|
|
for ( ; i > 0 && ! tids[i] ; i-- );
|
|
goto redo;
|
|
}
|
|
|
|
// "Fax: "
|
|
if ( alnumCount >=2 && wids[lasti] == h_fax &&
|
|
m_words->hasChar(lasti+1,':') ) {
|
|
i = lasti - 1; goto redo; }
|
|
// "Ph: "
|
|
if ( alnumCount >=2 && wids[lasti] == h_ph &&
|
|
m_words->hasChar(lasti+1,':') ) {
|
|
i = lasti - 1; goto redo; }
|
|
// "Tel: "
|
|
if ( alnumCount >=2 && wids[lasti] == h_tel &&
|
|
m_words->hasChar(lasti+1,':') ) {
|
|
i = lasti - 1; goto redo; }
|
|
// "Telephone: "
|
|
if ( alnumCount >=2 && wids[lasti] == h_telephone &&
|
|
m_words->hasChar(lasti+1,':') ) {
|
|
i = lasti - 1; goto redo; }
|
|
// "Street Address:"
|
|
if ( alnumCount ==2 && wids[lasti] == h_street &&
|
|
wids[lasti+2] == h_address ) {
|
|
i = lasti - 1; goto redo; }
|
|
// "Location Address:"
|
|
if ( alnumCount ==2 && wids[lasti] == h_location &&
|
|
wids[lasti+2] == h_address ) {
|
|
i = lasti - 1; goto redo; }
|
|
|
|
// "Add to Favorites"
|
|
if ( alnumCount == 3 &&
|
|
wids[lasti ] == h_add &&
|
|
wids[lasti+2] == h_to &&
|
|
wids[lasti+4] == h_favorites ) {
|
|
i = lasti - 1; goto redo; }
|
|
|
|
// "view favorites"
|
|
if ( alnumCount == 2 &&
|
|
wids[lasti ] == h_view &&
|
|
wids[lasti+2] == h_favorites ) {
|
|
i = lasti - 1; goto redo; }
|
|
|
|
// "more info"
|
|
if ( alnumCount == 2 &&
|
|
wids[lasti ] == h_more &&
|
|
wids[lasti+2] == h_info ) {
|
|
i = lasti - 1; goto redo; }
|
|
|
|
// "more information"
|
|
if ( alnumCount == 2 &&
|
|
wids[lasti ] == h_more &&
|
|
wids[lasti+2] == h_information ) {
|
|
i = lasti - 1; goto redo; }
|
|
|
|
// if we just had a sequence of numbers for the place
|
|
// name then ignore that. usually a phone number. fixes
|
|
// http://local.yahoo.com/NM/Albuquerque/Food+Dining/
|
|
// Restaurants/Food+Delivery+Services
|
|
if ( alphaCount == 0 && alnumCount > 0 ) {
|
|
i = lasti - 1; goto redo; }
|
|
}
|
|
|
|
// . if street had upper case words, but we had lower case,
|
|
// then we are not a good place name!
|
|
// . put this after the redo's so we can redo things like
|
|
// "map" or "reviews" which may be in lower case
|
|
if ( (street->m_bits & PLF_HAS_UPPER) && hadLower ) {
|
|
//lasti = -1;
|
|
// EXPERIMENT:
|
|
// skip back to a tag like we do for
|
|
// "Write a Review" skipping logic above
|
|
//i = lasti - 1;
|
|
// skip back until we hit a tag i guess
|
|
// if we have "Be the first to Write a Review"
|
|
for ( ; i > 0 && ! tids[i] ; i-- );
|
|
goto redo;
|
|
}
|
|
|
|
// . add the place name if we found something
|
|
// . if we broke out of the loop because of the alnumCount then
|
|
// that is NOT good because we want something that has a
|
|
// delimiter on the left!
|
|
if ( lasti >= 0 && lasti<=righti && alphaCount > 0 &&
|
|
// this is restricted above!
|
|
//alnumCount <10 &&
|
|
nn<10 ) { // && m_np<MAX_PLACES ) {
|
|
// point to it
|
|
char *p = wptrs[lasti];
|
|
// length
|
|
int32_t plen = (wptrs[righti]+wlens[righti])-wptrs[lasti];
|
|
// set end
|
|
char *pend = p + plen;
|
|
// end on period if we had it
|
|
if ( *pend == '.' ) pend++;
|
|
// include terminating ')' if any
|
|
int32_t parens = 0;
|
|
// start scan
|
|
for ( char *s = p ; s < pend ; s++ ) {
|
|
if ( *s == '(' ) parens++;
|
|
if ( *s == ')' ) parens--;
|
|
}
|
|
// term it with a ) if we had a (
|
|
if ( parens > 0 ) {
|
|
if ( *pend == ')' )
|
|
pend += 1;
|
|
else if ( is_wspace_a(*pend) && pend[1]==')')
|
|
pend += 2;
|
|
}
|
|
// re-set length
|
|
plen = pend - p;
|
|
// note it if crazy...
|
|
if ( plen >= 200 )
|
|
// note it
|
|
log("addr: got place name of %"INT32" chars int32_t",
|
|
plen);
|
|
// sanity check
|
|
//if ( m_np >= MAX_PLACES ) { char *xx=NULL;*xx=0; }
|
|
// point to the place name
|
|
Place *pp = (Place *)m_pm.getMem(sizeof(Place));
|
|
if ( ! pp ) return false;
|
|
// set the type
|
|
int32_t ptype = 0;
|
|
if ( pcount == 0 ) ptype = PT_NAME_1;
|
|
if ( pcount == 1 ) ptype = PT_NAME_2;
|
|
if ( ptype == 0 ) { char *xx=NULL;*xx=0; }
|
|
// set it
|
|
pp->m_a = lasti;
|
|
pp->m_b = righti+1;
|
|
pp->m_alnumA = -1;//alnumCount;
|
|
pp->m_alnumB = -1;//alnumCount + subcount;
|
|
pp->m_type = ptype;//PT_NAME;
|
|
pp->m_str = p;//wptrs[lasti];
|
|
pp->m_strlen = pend - p;//plen;
|
|
//pp->m_hash = h;
|
|
//pp->m_adm1[0] = 0;//pd->m_adm1[0];
|
|
//pp->m_adm1[1] = 0;//pd->m_adm1[1];
|
|
//pp->m_crid = 0;//pd->m_crid;
|
|
pp->m_bits = 0;//PLF_INFILE;
|
|
pp->m_flags2 = 0;
|
|
// reset hash
|
|
//int64_t h = 0LL;
|
|
// word id of previous word
|
|
//int64_t pi = 0LL;
|
|
// we WERE looping backwards, so we need to
|
|
// compute the hash here
|
|
setHashes ( pp , m_words , m_niceness );
|
|
// if name1/name2 is a city/state or state/city then
|
|
// do not add it
|
|
bool isGood = true;
|
|
// get previous two places, see if city/state
|
|
Place *prev1 = NULL;
|
|
Place *prev2 = NULL;
|
|
int32_t np = m_pm.getNumPtrs();
|
|
if ( np >= 2 ) {
|
|
prev1 = (Place *)m_pm.getPtr(np-1);
|
|
prev2 = (Place *)m_pm.getPtr(np-2);
|
|
}
|
|
// . fix "Kimo Theater, Albuquerque NM, 423 Central"
|
|
// for http://www.zvents.com/albuquerque-nm/venues/sh
|
|
// ow/11865-kimo-theatre
|
|
// . do not allow a city & state to be the two names
|
|
// . sometimes ppl put this before the street
|
|
// . only do this after we have two names (pcount==1)
|
|
if ( pcount == 1 &&
|
|
np > savednp && // we at least added one to np
|
|
prev1 &&
|
|
prev2 &&
|
|
isCityState3 (prev1->m_hash,prev2->m_hash)==1) {
|
|
// wipe out previous name
|
|
nn = savednn;
|
|
// wipe out previous place
|
|
//m_np = savednp;
|
|
m_pm.setNumPtrs ( savednp );
|
|
// reset this too!
|
|
pcount = 0;
|
|
// skip over these guys to get real name
|
|
i = lasti - 1;
|
|
// try again
|
|
goto redo;
|
|
// and do not add this one
|
|
//isGood = false;
|
|
}
|
|
// too long is bad
|
|
if ( plen >= 200 )
|
|
isGood = false;
|
|
if ( ! pp->m_hash )
|
|
isGood = false;
|
|
// . if nothing worth hashing, do not add it
|
|
// . only really add if length is somewhat sane!!
|
|
if ( isGood ) {
|
|
// store it
|
|
pname[nn++] = pp;
|
|
// sanity
|
|
//if (m_np>= MAX_PLACES ){char *xx=NULL;*xx=0;}
|
|
// advance it, but not if we only had "the" for
|
|
// the place name!!
|
|
//m_np++;
|
|
}
|
|
/*
|
|
for ( int32_t k = pp->m_a ; k < pp->m_b ; k++ ) {
|
|
// skip if not word
|
|
if ( ! wids[k] ) continue;
|
|
// . do not add the first word if its "The"
|
|
// into this
|
|
// . fixes "The Guild Cinema" not matching
|
|
// placedb entries for "Guild Cinema"
|
|
if ( h == 0LL && wids[k] == h_the ) continue;
|
|
// . convert place name word into base word
|
|
// . synonyms
|
|
// . converts 4th to fourth, etc.
|
|
int64_t *hw = getSynonymWord (&wids[k],&pi);
|
|
// set previous id
|
|
pi = wids[k];
|
|
// ignore it if returned 0 (ignore) (school)
|
|
if ( ! *hw ) continue;
|
|
// mix it up
|
|
h <<= 1LL;
|
|
// xor it in
|
|
h ^= *hw; // wids[k];
|
|
}
|
|
// only consummate it if not the single word "the"
|
|
if ( h ) {
|
|
// set it
|
|
pp->m_hash = h;
|
|
// store it
|
|
pname[nn++] = pp;
|
|
// advance it, but not if we only had "the" for
|
|
// the place name!!
|
|
np++;
|
|
}
|
|
*/
|
|
// point to before us!
|
|
i = lasti - 1;
|
|
// try to get another one if we only got one
|
|
if ( ++pcount == 1 )
|
|
goto redo;
|
|
}
|
|
// . if no name, beat it. go to the next street we got
|
|
// . no, some events just have a street address and no
|
|
// place name!
|
|
//else
|
|
// continue;
|
|
|
|
|
|
///////////////////////
|
|
//
|
|
// END GET THE PLACE NAME before the street
|
|
//
|
|
///////////////////////
|
|
|
|
|
|
//
|
|
// . if we had multiple streets RIGHT AFTER us, skip over them!
|
|
// . where the "po box 1293" is technically a street
|
|
// . http://www.yelp.com/biz/pizza-9-albuquerque had some too
|
|
//
|
|
// start looking for city/state here
|
|
Place *xstreet = (Place *)m_sm.getPtr(X);
|
|
int32_t start = xstreet->m_b;
|
|
int32_t startAlnum = xstreet->m_alnumB;
|
|
// as = "After Street"
|
|
int32_t as = X + 1;
|
|
// shortcut
|
|
int32_t ns = m_sm.getNumPtrs();
|
|
// scan the streets after street #X
|
|
for ( ; as < ns ; as++ ) {
|
|
// get that
|
|
Place *astreet = (Place *)m_sm.getPtr(as);
|
|
// stop if "as" is a "fake street"
|
|
if ( astreet->m_flags2 & PLF2_IS_NAME ) break;
|
|
// if we are NOT the ending word of prev street, then
|
|
// stop this loop.
|
|
if ( startAlnum != astreet->m_alnumA ) break;
|
|
// assign, and do the next
|
|
startAlnum = astreet->m_alnumB;
|
|
start = astreet->m_b;
|
|
}
|
|
// use this
|
|
Place *sss = NULL;
|
|
if ( as < ns ) sss = (Place *)m_sm.getPtr(as);
|
|
// stop if "as" is a "fake street"
|
|
if ( as<ns && (sss->m_flags2 & PLF2_IS_NAME)) as=ns;
|
|
// skip over punct
|
|
if ( start < nw && ! wids[start] ) start++;
|
|
// . skip over "in"
|
|
// . inlines "950 Pinetree SE, in Rio Rancho, NM" for
|
|
// http://www.xeriscapenm.com/xeriscape_gardens.php
|
|
if ( start<nw && wids[start] == h_in ) {
|
|
startAlnum++;
|
|
start += 2;
|
|
}
|
|
// do not scan past this then
|
|
int32_t max = nw;
|
|
if ( as < m_sm.getNumPtrs() ) max = sss->m_a;
|
|
|
|
// NO NO we had "124 ST BTWN 5 AVE AND MT MORRIS PARK WEST"
|
|
// for www.nycgovparks.org/facilities/playgrounds and
|
|
// the street was "124 ST BTWN 5 AVE" and the intersection
|
|
// "AVE AND MT MORRIS PARK WEST" intersected with that
|
|
// street and caused this to core!
|
|
// sanity check
|
|
//if ( max <= street->m_b ) { char *xx=NULL;*xx=0; }
|
|
|
|
//
|
|
// begin parsing out city/adm1/ctry/zip after street name
|
|
//
|
|
|
|
|
|
/*
|
|
|
|
THIS IS THE OLD WAY
|
|
|
|
// . start scan at street->m_b
|
|
// . end scan at "max"
|
|
// . end scan after up to 15 alnum words as well
|
|
// . adds into our places[] array we started up above that
|
|
// includes places from the title
|
|
// . i am expanding from 6 words to 15 because of :
|
|
// "111 Maple Street SE @ Maple and Central beside "
|
|
// Knadjian's Oriental Rugs in Albuquerque, New Mexico "
|
|
// 87106. "
|
|
// . and to reduce bleeding into another address i am now
|
|
// limiting based on the start of the next street, "max"
|
|
np =addProperPlaces(start,max,15,places,MAX_PLACES,np,0,
|
|
// subtract 1 since it is an OPEN ended
|
|
// half interval just like [a,b)
|
|
startAlnum - 1,-1);
|
|
// breach check
|
|
if ( np >= MAX_PLACES ) { char *xx=NULL;*xx=0; }
|
|
|
|
// check before the street, too, but stay in the sentence!
|
|
if ( nn >= 1 ) {
|
|
int32_t na = pname[0]->m_a;
|
|
int32_t nb = pname[0]->m_b;
|
|
np=addProperPlaces(na,nb,15,places,
|
|
MAX_PLACES,np,0,
|
|
pname[0]->m_alnumA - 1,-1);
|
|
}
|
|
if ( nn >= 2 ) {
|
|
int32_t na = pname[1]->m_a;
|
|
int32_t nb = pname[1]->m_b;
|
|
np=addProperPlaces(na,nb,15,places,
|
|
MAX_PLACES,np,0,
|
|
pname[1]->m_alnumA - 1,-1);
|
|
}
|
|
// breach check
|
|
if ( np >= MAX_PLACES ) { char *xx=NULL;*xx=0; }
|
|
|
|
|
|
///////////////////////////
|
|
//
|
|
// ** "... in Santa Fe 213 Washington Ave."
|
|
//
|
|
///////////////////////////
|
|
|
|
// now scan the sentence this street is in for any
|
|
// prepositional phrase beginning with the preposition "in"
|
|
// immediately followed by a city or adm1 name.
|
|
// this logic was hurting abqtango.com because our sentence
|
|
// formation was not good enough and we were allowing the
|
|
// many span tags in the sentence to break the sentence into
|
|
// many smaller sentences because we decided span tags should
|
|
// do that by default. so i made the sentence detection logic
|
|
// better so that abqtango.com would keep 213 washington ave
|
|
// in the sentence that had "in Santa Fe" still...
|
|
Section *ss = m_sections->m_sectionPtrs[street->m_a];
|
|
for ( ; ss ; ss = ss->m_parent )
|
|
if ( ss && (ss->m_flags & SEC_SENTENCE) ) break;
|
|
|
|
// might not have a sentence if we are CT_JAVASCRIPT content
|
|
// type, sense we avoid sentence setting for those doc types
|
|
int32_t sa = 0;
|
|
int32_t sb = 0;
|
|
// scan the first and last word of the senentce this street
|
|
// is in. MAY ACTUALLY BE OUTSIDE of the "ss" section because
|
|
// of the new logic in Sections::addSentences() which allows
|
|
// us to have sentences that split sections now to deal with
|
|
// aliconference.com, abqtango.com, etc.
|
|
if ( ss ) { sa = ss->m_senta; sb = ss->m_sentb; }
|
|
// init this
|
|
bool hasRequiredPlace = false;
|
|
// set this. does it matter???
|
|
int32_t alnumPos = 0;//ss->m_alnumA - 1;
|
|
bool afterIn = false;
|
|
// scan the sentence
|
|
for ( int32_t i = sa ; i < sb ; i++ ) {
|
|
// skip if not alnum word
|
|
if ( ! m_wids[i] ) continue;
|
|
// count it
|
|
alnumPos++;
|
|
// skip if not "in"
|
|
if ( m_wids[i] == h_in ) {
|
|
afterIn = true;
|
|
continue;
|
|
}
|
|
// skip if not after the word "in"
|
|
if ( ! afterIn ) continue;
|
|
// reset in case we get continued below
|
|
afterIn = false;
|
|
// to avoid "just in case" or "in time" let's
|
|
// require it be capitalized
|
|
if ( ! m_words->isCapitalized(i) ) continue;
|
|
// find the end of it
|
|
int32_t j = i + 1;
|
|
int32_t lastj = j;
|
|
// loop until we hit something lowercase or number
|
|
for ( ; j < sb ; j++ ) {
|
|
// stop on tag
|
|
if ( m_tids[j] ) break;
|
|
// check case
|
|
if ( m_wids[j] ) {
|
|
// if upper that's ok
|
|
if ( ! m_words->isCapitalized(j) &&
|
|
! s_lc.isInTable(&m_wids[j]) )
|
|
break;
|
|
// save it
|
|
lastj = j;
|
|
}
|
|
// stop on certain punct
|
|
char *p = wptrs[j];
|
|
char *pend = p + wlens[j];
|
|
for ( ; p < pend ; p++ ) {
|
|
if ( is_wspace_a(*p) )
|
|
continue;
|
|
// St. James?
|
|
if ( *p == '.' )
|
|
continue;
|
|
break;
|
|
}
|
|
if ( p < pend ) break;
|
|
}
|
|
// save
|
|
int32_t oldnp = np;
|
|
// reset
|
|
np = addProperPlaces(i,i+1,8,places,
|
|
MAX_PLACES,np,0,
|
|
alnumPos-1,
|
|
lastj);
|
|
// set the required bit
|
|
for ( int32_t k = oldnp ; k < np ; k++ )
|
|
// set this bit
|
|
places[k].m_bits |= PLF2_REQUIRED;
|
|
// must contain a required bit?
|
|
if ( np > oldnp ) hasRequiredPlace = true;
|
|
// stop
|
|
break;
|
|
}
|
|
// breach check
|
|
if ( np >= MAX_PLACES ) { char *xx=NULL;*xx=0; }
|
|
|
|
//
|
|
// parse up all our accumulated Places into arrays so we can
|
|
// loop over them all and get all the possible combinations
|
|
// of Place types, Place::m_type.
|
|
//
|
|
for ( int32_t i = 0 ; i < np ; i++ ) {
|
|
// get it
|
|
Place *pi = &places[i];
|
|
// sanity check
|
|
if ( ! pi->m_hash ) { char *xx=NULL;*xx=0; }
|
|
// parse it up
|
|
if ( pi->m_type == PT_CITY ) {
|
|
if ( nc >= MAX_CITIES2 ) continue;
|
|
pcity[nc++] = pi;
|
|
}
|
|
if ( pi->m_type == PT_STATE ) {
|
|
if ( na >= MAX_ADM1 ) continue;
|
|
padm1[na++] = pi;
|
|
}
|
|
if ( pi->m_type == PT_ZIP ) {
|
|
if ( nz >= 10 ) continue;
|
|
pzip[nz++] = pi;
|
|
}
|
|
if ( pi->m_type == PT_CTRY ) {
|
|
if ( ny >= 10 ) continue;
|
|
pctry[ny++] = pi;
|
|
}
|
|
// sanity check
|
|
if ( pi && ! pi->m_hash ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
|
|
END THE OLD WAY
|
|
|
|
*/
|
|
|
|
|
|
// . the new way is to telescope out from our street section
|
|
// looking for cities
|
|
// . we note the telescope depth of each city/state/zip place
|
|
// we encounter so that we prefer the city topologically
|
|
// closest to us
|
|
int32_t sa = xstreet->m_a;
|
|
if ( sa < 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// shortcut
|
|
Place *st = xstreet;//&streets[X];
|
|
|
|
// are we a street or place name in the title?
|
|
bool streetInTitle = false;
|
|
if ( st->m_a > 0 && sp )
|
|
streetInTitle = (sp[st->m_a]->m_flags & SEC_IN_TITLE);
|
|
|
|
Section *ss = NULL;
|
|
int32_t senta = -1;
|
|
int32_t sentb = -1;
|
|
if ( m_sections ) {
|
|
ss = m_sections->m_sectionPtrs[street->m_a];
|
|
senta = ss->m_senta;
|
|
sentb = ss->m_sentb;
|
|
}
|
|
|
|
int32_t maxZips = nz + 1;
|
|
bool hasRequiredCity = false;
|
|
bool hasRequiredState = false;
|
|
|
|
////////////
|
|
//
|
|
// set pcity[], array of potential cities for this street
|
|
//
|
|
////////////
|
|
for ( int32_t i = 0 ; i < m_npSaved ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get city, state or zip
|
|
Place *p = (Place *)m_pm.getPtr(i);
|
|
// sanity check
|
|
if ( p->m_alnumA < st->m_alnumA && p->m_a > st->m_a ) {
|
|
char *xx=NULL;*xx=0; }
|
|
|
|
// skip city if it intersects street
|
|
if ( p->intersects ( xstreet ) ) continue;
|
|
// skip city if it intersects the name too now
|
|
//if(nn>0&&pname[0]&&p->intersects(pname[0])) continue;
|
|
// or name2, to fix omnimedicalsearch.com
|
|
//if(nn>1&&pname[1]&&p->intersects(pname[1])) continue;
|
|
// for zips really, should not be in the suite
|
|
if (psuite[0]&&p->intersects(psuite[0])) continue;
|
|
|
|
// is it required
|
|
bool isRequired = ( p->m_flags2 & PLF2_REQUIRED );
|
|
|
|
// . allow state to come from anywhere in the document
|
|
// . TODO: later add meta description to get
|
|
// christinesaari.com etc.
|
|
if ( p->m_type == PT_STATE ) {
|
|
// is it in our sentence
|
|
bool inSent = (p->m_a>=senta&&p->m_a<sentb);
|
|
// if in our sentence and required, set this
|
|
if ( inSent &&
|
|
isRequired &&
|
|
// fix "in NE Albuquerque" so we do not
|
|
// think that means nebraska... this
|
|
// fixes address in local.yahoo.com/NM/
|
|
// Albuquerque/Food+Dining/Restaurants/
|
|
// Food+Delivery+Services
|
|
m_wids[p->m_a]!= h_ne)
|
|
hasRequiredState = true;
|
|
// make the key for deduping
|
|
char key[4];
|
|
key[0] = p->m_adm1[0];
|
|
key[1] = p->m_adm1[1];
|
|
key[2] = 0;
|
|
key[3] = 0;
|
|
// get if already in padm1[] array
|
|
Place **pp = (Place **)dat.getValue ( &key );
|
|
// if it is us already, skip for sure
|
|
if ( pp && *pp == p ) continue;
|
|
// if we are not near street, skip us
|
|
int32_t dist1 = p->m_alnumA - st->m_alnumA;
|
|
int32_t dist2 = p->m_alnumA - st->m_alnumB;
|
|
if ( dist1 < 0 ) dist1 *= -1;
|
|
if ( dist2 < 0 ) dist2 *= -1;
|
|
int32_t mdist = dist1;
|
|
if ( dist2 < mdist ) mdist = dist2;
|
|
if ( mdist > 10 && ! inSent ) continue;
|
|
// sanity
|
|
if ( na >= 80 ) continue;
|
|
// ok, add it in even though this state might
|
|
// already be represented by another word
|
|
// somewhere else in the document
|
|
padm1 [ na++ ] = p;
|
|
// that's it
|
|
continue;
|
|
}
|
|
|
|
// . stop if far beyond the street
|
|
// . if in venue tag then m_a will be < 0
|
|
if ( p->m_a >= 0 &&
|
|
p->m_alnumA > st->m_alnumB + 10 )
|
|
continue;
|
|
|
|
// is place in title?
|
|
bool inTitle = (p->m_bits & PLF_FROMTITLE);
|
|
|
|
// if we are an xml doc they often have multiple
|
|
// <title> tags, one for each element, so do not
|
|
// consider in that case. this was causing trumba.com
|
|
// to miss its city after the address.
|
|
if ( m_contentType == CT_XML ) inTitle = false;
|
|
|
|
// skip if before us and not in title
|
|
if ( p->m_a >= 0 &&
|
|
p->m_a < st->m_a &&
|
|
// well, allow it to be a few words before us
|
|
// to fix some addresses that have the city
|
|
// before the street. like menuism.com
|
|
// christinesaari.com salsapower.com
|
|
p->m_alnumB < st->m_alnumA - 5 &&
|
|
! inTitle )
|
|
continue;
|
|
|
|
// zip is not allowed to be before us ever though
|
|
// even if in title, which is not allowed
|
|
if ( p->m_type == PT_ZIP &&
|
|
p->m_a >= 0 &&
|
|
p->m_a < st->m_a )
|
|
continue;
|
|
|
|
// only use first zip, no because one zip may be
|
|
// in the title and the other in the body
|
|
if ( p->m_type == PT_ZIP && nz >= MAX_ZIPS )
|
|
continue;
|
|
|
|
// skip zip codes in the title
|
|
if ( p->m_type == PT_ZIP &&
|
|
p->m_a >= 0 &&
|
|
inTitle &&
|
|
! streetInTitle )
|
|
continue;
|
|
|
|
// skip zip codes in the tag
|
|
if ( p->m_type == PT_ZIP && p->m_a < 0 )
|
|
continue;
|
|
|
|
// only allow one zip from what we started with
|
|
if ( p->m_type == PT_ZIP && nz >= maxZips )
|
|
continue;
|
|
|
|
if ( p->m_type == PT_ZIP ) {
|
|
pzip [nz++] = p;
|
|
continue;
|
|
}
|
|
|
|
// limit to like 5 or so, that is indicative of
|
|
// a list of cities after us...
|
|
if ( nc >= MAX_CITIES )
|
|
continue;
|
|
|
|
// this can be a type of PT_NAME since we add tags
|
|
// from a tagrec like
|
|
// "Albuquerque Center for Peace and Justice;;;202
|
|
// Harvard Southeast;Albuquerque;nm;87106;;165445..."
|
|
// and that adds its places into m_places[] and
|
|
// incs m_np
|
|
if ( p->m_type != PT_CITY ) continue;
|
|
|
|
// add it to place table like how addProperPlaces() did
|
|
if ( p->m_type == PT_CITY ) pcity[nc++] = p;
|
|
|
|
// if in our sentence and required, set this
|
|
if ( p->m_a>= senta && p->m_a < sentb && isRequired )
|
|
hasRequiredCity = true;
|
|
}
|
|
|
|
// complain
|
|
if ( nn >= 10 ) {
|
|
if ( ! printed ) log("events: name breach");
|
|
printed = true;
|
|
//char *xx=NULL;*xx=0;
|
|
}
|
|
if ( nc >= MAX_CITIES ) {
|
|
if ( ! printed ) log("addr: cities breach");
|
|
printed = true;
|
|
// just bail out now to fix the slow parsing of
|
|
// www.soul-patrol.com
|
|
g_errno = EBUFOVERFLOW;
|
|
m_breached = true;
|
|
return false;
|
|
//char *xx=NULL;*xx=0;
|
|
}
|
|
if ( na >= MAX_ADM1 ) {
|
|
if ( ! printed ) log("events: adm1 breach");
|
|
printed = true;
|
|
//char *xx=NULL;*xx=0;
|
|
}
|
|
//if ( nc >= MAX_CITIES || nc <= 0 ) {
|
|
// log("events: city breach");
|
|
// char *xx=NULL;*xx=0;
|
|
//}
|
|
|
|
// need at least one city or zip to make an address
|
|
if ( nc <= 1 && nz <= 1 ) continue;
|
|
|
|
// . PO Boxes do not have names
|
|
// . YES THEY DO!
|
|
// . was picking up "yahoo" as the place name for:
|
|
// http://www.usadancenm.org/links.html :
|
|
// "usadancenm@yahoo.com ** P.O. Box 94766, Albuquerque"
|
|
//if ( to_lower_a(street->m_str[0])=='p' ) nn = 0;
|
|
|
|
// . allow for a null place name
|
|
// . some events just have a street address with no official
|
|
// place name
|
|
if ( nn < 2 ) pname[nn++] = NULL;
|
|
if ( nn < 2 ) pname[nn++] = NULL;
|
|
|
|
//
|
|
// TODO: filter out places using the hashtable adm1/ctryId algo
|
|
//
|
|
|
|
// adjust nc
|
|
//int32_t fakena = na + dc;
|
|
|
|
// . now the heavily nested loop (BIG LOOP)
|
|
// . first over addresses to inherit from
|
|
// . default addresses (from tagdb rec - contact info)
|
|
// . TODO: fix this i1 < 2 HACK!
|
|
for ( int32_t i1 = 0 ; i1 < dc && i1 < 2 ; i1++ ) {
|
|
// loop over default address again, but ignore city and
|
|
// just use the adm1 (state).
|
|
// should fix "913 W. Alameda - Santa Fe" which has no state,
|
|
// but "Albuquerque, New Mexico" is in the tag!
|
|
for ( int32_t i1b = 0 ; i1b < 2 /*3*/ ; i1b++ ) {
|
|
// adm1
|
|
for ( int32_t i2 = 0 ; i2 < na ; i2++ ) {
|
|
// city
|
|
for ( int32_t i3 = 0 ; i3 < nc ; i3++ ) {
|
|
// ctry
|
|
//for ( int32_t i4 = 0 ; i4 < ny ; i4++ ) {
|
|
// zip
|
|
for ( int32_t i5 = 0 ; i5 < nz ; i5++ ) {
|
|
// suite
|
|
for ( int32_t i6 = 0 ; i6 < nu ; i6++ ) {
|
|
// place name
|
|
//for ( int32_t i7 = 0 ; i7 < nn ; i7++ ) {
|
|
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// we only use i1b for default addresses in da[]
|
|
if ( i1b > 0 && i1 == 0 ) continue;
|
|
|
|
// shortcuts
|
|
Place *adm1 = padm1 [i2];
|
|
//Place *ctry = pctry [i4];
|
|
Place *zip = pzip [i5];
|
|
Place *suite = psuite [i6];
|
|
Place *name1 = pname [0];
|
|
Place *name2 = pname [1];
|
|
Place *city = pcity [i3];
|
|
|
|
// now if city is out of bounds use the venue address
|
|
if ( i1 > 0 ) {
|
|
// set it
|
|
Address *addr = &da [i1];
|
|
// always use venue's state!
|
|
adm1 = addr->m_adm1;
|
|
// 1 means inherit city too!
|
|
if ( i1b == 1 )
|
|
city = addr->m_city;
|
|
// don't take the zip!!
|
|
//zip = addr->m_zip;
|
|
zip = NULL;
|
|
}
|
|
|
|
if ( hasRequiredCity ) {
|
|
// skip if no city
|
|
if ( ! city ) continue;
|
|
// skip if city is not "required"
|
|
if ( ! ( city->m_flags2 & PLF2_REQUIRED ) )
|
|
continue;
|
|
// must be in our sentence! this fixes
|
|
// when we had "... in Central New Mexico"
|
|
// in the title, it thought Central was the
|
|
// city. but we had "in Abq" in our sentence.
|
|
// and both cities had this bit set but
|
|
// only Abq should have applied!
|
|
if ( city->m_a < senta ) continue;
|
|
if ( city->m_a >= sentb ) continue;
|
|
}
|
|
|
|
if ( hasRequiredState ) {
|
|
// skip if no state
|
|
if ( ! adm1 ) continue;
|
|
// skip if state is not "required"
|
|
if ( ! ( adm1->m_flags2 & PLF2_REQUIRED ) )
|
|
continue;
|
|
// see the "city" fix right above
|
|
if ( adm1->m_a < senta ) continue;
|
|
if ( adm1->m_a >= sentb ) continue;
|
|
}
|
|
|
|
// no overlap of adm1 and city
|
|
if ( adm1 && city &&
|
|
adm1->m_a >= 0 &&
|
|
adm1->m_a == city->m_a ) continue;
|
|
|
|
// if we had a prepositional phrase starting with "in"
|
|
// then we must contain its city/adm1 name if it
|
|
// had one...
|
|
/*
|
|
if ( hasRequiredPlace ) {
|
|
bool gotIt = false;
|
|
if ( city && (city->m_bits & PLF2_REQUIRED ) )
|
|
gotIt = true;
|
|
if ( adm1 && (adm1->m_bits & PLF2_REQUIRED ) )
|
|
gotIt = true;
|
|
if ( ! gotIt )
|
|
continue;
|
|
}
|
|
*/
|
|
|
|
/*
|
|
// . inherit!
|
|
// . "addr" i think is just the default venue addr now
|
|
if ( i1b == 0 ) {
|
|
// if addr is supplying these, skip if there
|
|
// was a collision.
|
|
if ( addr->m_adm1 && adm1 ) continue;
|
|
if ( addr->m_city && city ) continue;
|
|
//if(addr->m_name.m_str && name ) continue;
|
|
if ( addr->m_adm1 ) adm1 = addr->m_adm1;
|
|
if ( addr->m_city ) city = addr->m_city;
|
|
//if(addr->m_name.m_str ) name = &addr->m_name;
|
|
}
|
|
// . if i1b is 1 then we only inherit adm1!!!
|
|
// . this fixes the bug for 913 W. Alameda described
|
|
// above.
|
|
else if ( i1b == 1 ) {
|
|
// if addr is supplying these, skip if there
|
|
// was a collision.
|
|
if ( addr->m_adm1 && adm1 ) continue;
|
|
if ( addr->m_adm1 ) {
|
|
adm1 = addr->m_adm1;
|
|
if(!adm1->m_hash){char *xx=NULL;*xx=0;}
|
|
}
|
|
}
|
|
*/
|
|
|
|
// need a city, can be implied by a zip
|
|
if ( ! city && ! zip ) continue;
|
|
|
|
// the CF_UNIQUE is too inaccurate for this!!
|
|
//bool hasState = false;
|
|
//if ( adm1 ) hasState = true;
|
|
//if ( zip ) hasState = true;
|
|
//if ( city && city->m_alnumA == st->m_alnumB &&
|
|
// city->m_adm1[0] )
|
|
// hasState = true;
|
|
//if ( ! hasState ) continue;
|
|
// . need a state too, can be implied by a zip
|
|
// . certain unique cities can also imply the state,
|
|
// like "Albuquerque" or "Washington DC"
|
|
if ( ! adm1 && ! zip ) continue;
|
|
|
|
|
|
|
|
// . how to fix "1024 4th St SW in downtown
|
|
// Albuquerque" which has no adm1?
|
|
// . get the adm1/state from the city, BUT
|
|
// only if city is UNIQUE!!!
|
|
/*
|
|
if ( ! adm1 && city->m_bits & PLF_UNIQUE ) {
|
|
tap.m_crid = city->m_crid;
|
|
tap.m_str = city->m_adm1;
|
|
tap.m_strlen = 2;
|
|
tap.m_adm1[0] = city->m_adm1[0];
|
|
tap.m_adm1[1] = city->m_adm1[1];
|
|
adm1 = &tap;
|
|
//continue;
|
|
}
|
|
*/
|
|
// this is required
|
|
//if ( ! adm1 ) continue;
|
|
|
|
//if ( ! name ) continue;
|
|
|
|
// quickly check adm1 vs. city
|
|
//if ( adm1->m_adm1[0] != city->m_adm1[0] ) continue;
|
|
//if ( adm1->m_adm1[1] != city->m_adm1[1] ) continue;
|
|
//if ( adm1->m_crid != city->m_crid ) continue;
|
|
if ( adm1 && city &&
|
|
!(adm1->m_adm1Bits & city->m_adm1Bits))
|
|
continue;
|
|
|
|
/*
|
|
// sanity check
|
|
if ( zip && ! zip->m_hash ) { char *xx=NULL;*xx=0; }
|
|
// cancel out bad zips
|
|
if ( zip && adm1 && adm1->m_adm1Bits!=zip->m_adm1Bits)
|
|
zip = NULL;//continue;
|
|
//if ( adm1->m_crid !=zip->m_crid )continue;
|
|
// cut the int64_t to a int32_t for this compare
|
|
if ( zip && city && city->m_hash != zip->m_cityHash )
|
|
zip = NULL;//continue;
|
|
*/
|
|
|
|
/*
|
|
// debug
|
|
Address tmp;
|
|
memset ( &tmp , 0 , sizeof(Address) );
|
|
if ( street ) tmp.m_street = street;
|
|
if ( adm1 ) tmp.m_adm1 = adm1;
|
|
if ( city ) tmp.m_city = city;
|
|
//if ( ctry ) tmp.m_ctry = ctry;
|
|
if ( zip ) tmp.m_zip = zip;
|
|
if ( suite ) tmp.m_suite = suite;
|
|
if ( name1 ) tmp.m_name1 = name1;
|
|
if ( name2 ) tmp.m_name2 = name2;
|
|
tmp.print();
|
|
*/
|
|
|
|
//if ( street->m_str[0]=='4' && city->m_str[0]=='R'
|
|
// && name->m_str[0]=='E' && adm1->m_str[1]=='M'
|
|
// && name->m_str[20]=='n' ) {
|
|
//printAddress ( &tmp,NULL,0);
|
|
//log("hey");
|
|
//}
|
|
|
|
|
|
//log("events: i1=%"INT32" i2=%"INT32" i3=%"INT32" i4=%"INT32" "
|
|
// "i5=%"INT32" i6=%"INT32" i7=%"INT32"",
|
|
// i1,i2,i3,i4,i5,i6,i7);
|
|
|
|
// clear
|
|
char flags3 = 0;
|
|
// this should be an address flag because we might
|
|
// be using a city/state from another sentence
|
|
// in which it is required, but it is not for us
|
|
// if we are in a different sentence
|
|
if ( hasRequiredCity )
|
|
flags3 |= AF2_HAS_REQUIRED_CITY;
|
|
if ( hasRequiredState )
|
|
flags3 |= AF2_HAS_REQUIRED_STATE;
|
|
|
|
// . now try to add place vec to our array of addresses
|
|
// . we now supply the containing section, "sec"
|
|
// so we can vote on which tag hash supplied the best
|
|
// addresses
|
|
if ( ! addAddress ( name1 ,
|
|
name2 ,
|
|
suite ,
|
|
street ,
|
|
city ,
|
|
adm1 ,
|
|
zip ,
|
|
NULL , // ctry ,
|
|
NULL ,
|
|
startAlnum ,
|
|
flags3 ,
|
|
NULL ) ) return false;
|
|
|
|
//if ( m_breached )
|
|
// goto bustout;
|
|
|
|
} // i1
|
|
} // i1b
|
|
} //adm1
|
|
}
|
|
//} ctry
|
|
} //i5 nz
|
|
}
|
|
// end the BIG LOOP
|
|
}
|
|
|
|
// CRAP! this algo was causing many streets to be ignored on
|
|
// http://www.estrelladelnortevineyard.com/SFV_retloc.php
|
|
// because it has like "main st" and "central" in multiple cities!
|
|
// so comment this algo out and try to think of a better way
|
|
/*
|
|
//
|
|
// now if all street names are the same but with a
|
|
// different city then i would say nuke them! cuz it
|
|
// can be a list of some kind of statistic per city,
|
|
// like
|
|
// Amsterdam Netherlands (114 events)
|
|
// Anaheim CA United States (249 events)
|
|
// Ann Arbor MI United States (155 events)
|
|
// Atlanta GA United States (708 events)
|
|
// on http://events.mapchannels.com/
|
|
//
|
|
//
|
|
// only allow one city to use a streetHash
|
|
HashTableX su;
|
|
char subuf[2000];
|
|
// set allowDups to true!!!!
|
|
su.set ( 8 , 8 , 0 , subuf , 2000 , true , m_niceness );
|
|
for ( int32_t i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get it
|
|
Address *a = (Address *)m_am.getPtr(i);
|
|
// skip if not inlined
|
|
if ( ! ( a->m_flags & AF_INLINED ) ) continue;
|
|
// get street hash
|
|
int64_t sh = a->m_street->m_hash;
|
|
// get city hash
|
|
int64_t ch = a->m_city.m_hash;
|
|
// hash it. return false with g_errno set on error
|
|
if ( ! su.addKey ( &sh , &ch ) ) return false;
|
|
}
|
|
for ( int32_t i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get it
|
|
Address *a = (Address *)m_am.getPtr(i);
|
|
// skip if not inlined
|
|
if ( ! ( a->m_flags & AF_INLINED ) ) continue;
|
|
// get street hash
|
|
int64_t sh = a->m_street->m_hash;
|
|
// how many different cities have this same street?
|
|
int32_t slot = su.getSlot ( &sh );
|
|
// reset count
|
|
int32_t count = 0;
|
|
// multiple places might have this hash
|
|
for ( ; slot>=0 ; slot = su.getNextSlot ( slot , &sh ) ) {
|
|
// count it
|
|
count++;
|
|
}
|
|
// if only 1 city had this street name, keep it
|
|
if ( count <= 1 ) continue;
|
|
// otherwise, ignore this address
|
|
a->m_flags &= ~AF_INLINED;
|
|
a->m_flags |= AF_IGNORE;
|
|
}
|
|
// free mem just in case
|
|
su.reset();
|
|
*/
|
|
|
|
// bustout:
|
|
//
|
|
// set the AF_AMBIGUOUS bits of each Address if we should
|
|
//
|
|
setAmbiguousFlags();
|
|
|
|
//log("events: combos=%"INT32"",combos);
|
|
//char *xx=NULL;*xx=0;
|
|
//log("events: sleeping 3 seconds. waiting for possible Ctrl-C");
|
|
//sleep(3);
|
|
|
|
return true;
|
|
}
|
|
|
|
// . treat the single word at index "a" as a candidate zip code
// . returns NULL if that word does not begin with a digit or if its word id
//   is not a key in g_zips
// . otherwise fills in and returns a function-local static Place of type
//   PT_ZIP covering words [a,a+1) and alnum positions [alnumPos,alnumPos+1)
// . the returned Place is overwritten by the next successful call, so the
//   caller must copy it if it needs to keep it
Place *getZipPlace ( int32_t a , int32_t alnumPos , Words *words ) {
	char *zipStr = words->m_words[a];
	// a zip has to start with a digit
	if ( ! is_digit(zipStr[0]) ) return NULL;
	// the one and only result buffer
	static Place s_zip;
	// the zip table is keyed by the plain word id
	int64_t zipHash = words->m_wordIds[a];
	int32_t zipSlot = g_zips.getSlot(&zipHash);
	// not a known zip code
	if ( zipSlot < 0 ) return NULL;
	// descriptor carrying the state and city this zip belongs to
	ZipDesc *desc = (ZipDesc *)g_zips.getValueFromSlot(zipSlot);
	// what it is and where it sits in the document
	s_zip.m_type     = PT_ZIP;
	s_zip.m_bits     = 0;
	s_zip.m_a        = a;
	s_zip.m_b        = a+1;
	s_zip.m_alnumA   = alnumPos;
	s_zip.m_alnumB   = alnumPos+1;
	s_zip.m_str      = zipStr;
	s_zip.m_strlen   = words->m_wordLens[a];
	s_zip.m_hash     = zipHash;
	// state info copied from the descriptor
	s_zip.m_adm1Bits = desc->m_adm1Bits;
	s_zip.m_adm1[0]  = desc->m_adm1[0];
	s_zip.m_adm1[1]  = desc->m_adm1[1];
	// city info copied from the descriptor
	s_zip.m_cityHash = desc->m_cityHash;
	s_zip.m_cityStr  = g_cityBuf + desc->m_cityOffset;
	return &s_zip;
}
|
|
|
|
// . look for a city name that starts at word index "a"
// . walks forward from "a" folding each alnum word into a running hash,
//   hash = (hash << 1) ^ wordId, and looks the hash up in g_cities after
//   every alnum word
// . the walk examines at most 19 words of any kind and at most 4 alnum words
// . a trailing "'s" on a word is dropped before the word is hashed
// . the scan keeps going after a match, so the Place returned describes the
//   LAST (i.e. longest) prefix that matched
// . returns NULL if nothing matched, or as soon as a matching prefix has the
//   "county" hash (h_county, declared outside this function) two word slots
//   after it
// . "alnumPos" is the alnum position of word "a"; it becomes m_alnumA
// . the returned Place is a function-local static that later calls overwrite
Place *getCityPlace ( int32_t a , int32_t alnumPos , Words *words ) {
	// every match is written into this one buffer
	static Place s_city;
	// lazily computed hashes for the "University of ..." filter
	static bool    s_inited = false;
	static int64_t s_universityHash;
	static int64_t s_ofHash;
	if ( ! s_inited ) {
		s_inited         = true;
		s_universityHash = hash64n("university");
		s_ofHash         = hash64n("of");
	}
	int32_t numWords = words->m_numWords;
	// running hash over the alnum words consumed so far
	int64_t nameHash = 0LL;
	// number of alnum words consumed so far
	int32_t numAlnum = 0;
	// stays NULL until some prefix is found in g_cities
	Place *best = NULL;
	// "scanned" counts every word we look at, alnum or not, and the walk
	// stops once it would reach 20
	for ( int32_t k = a , scanned = 1 ;
	      k < numWords && scanned < 20 ;
	      k++ , scanned++ ) {
		// punctuation and tags do not contribute to the name
		if ( ! words->isAlnum(k) ) continue;
		// the fifth alnum word ends the search
		if ( ++numAlnum >= 5 ) break;
		// the word itself
		char   *wptr = words->m_words   [k];
		int32_t wlen = words->m_wordLens[k];
		int64_t wid  = words->m_wordIds [k];
		// hash "Joe's" as "Joe"
		bool possessive = ( wlen > 2 &&
				    wptr[wlen-2] == '\'' &&
				    to_lower_a(wptr[wlen-1]) == 's' );
		if ( possessive ) wid = hash64Lower_utf8(wptr,wlen-2);
		// shift then fold the word in
		nameHash = ( nameHash << 1 ) ^ wid;
		// do not look up "University" when "of" sits two slots later
		if ( nameHash == s_universityHash &&
		     k + 2 < numWords &&
		     words->m_wordIds[k+2] == s_ofHash )
			continue;
		// is this prefix a known city?
		CityDesc *cd = (CityDesc *)g_cities.getValue(&nameHash);
		if ( ! cd ) continue;
		// "santa fe county" is not a city name, so give up entirely,
		// even if a shorter prefix matched earlier
		if ( k + 2 < numWords && words->m_wordIds[k+2] == h_county )
			return NULL;
		// needed to compute the byte span of the name
		char   **wptrs = words->getWords();
		int32_t *wlens = words->getWordLens();
		// describe the match, words [a,k+1)
		s_city.m_type     = PT_CITY;
		s_city.m_bits     = 0;
		s_city.m_adm1Bits = cd->m_adm1Bits;
		s_city.m_a        = a;
		s_city.m_b        = k+1;
		s_city.m_alnumA   = alnumPos;
		s_city.m_alnumB   = alnumPos + numAlnum;
		s_city.m_str      = wptrs[a];
		s_city.m_strlen   = wptrs[k]+wlens[k]-wptrs[a];
		s_city.m_hash     = nameHash;
		s_city.m_cityHash = nameHash;
		// remember it, but keep scanning for a longer name
		best = &s_city;
	}
	return best;
}
|
|
|
|
// . look for a state name that starts at word index "a"
// . walks forward from "a" folding each alnum word into a running hash,
//   hash = (hash << 1) ^ wordId, and passes the hash to getStateOffset()
//   after every alnum word
// . at most 3 alnum words are tried (e.g. "district of columbia"); unlike
//   getCityPlace() the FIRST prefix that matches wins
// . a trailing "'s" on a word is dropped before the word is hashed
// . returns NULL if no prefix is a state
// . "alnumPos" is the alnum position of word "a"; it becomes m_alnumA
// . the returned Place is a function-local static that later calls overwrite
Place *getStatePlace ( int32_t a , int32_t alnumPos , Words *words ) {
	// the one and only result buffer
	static Place s_state;
	int32_t numWords = words->getNumWords();
	// running hash over the alnum words consumed so far
	int64_t nameHash = 0LL;
	// number of alnum words consumed so far
	int32_t numAlnum = 0;
	for ( int32_t k = a ; k < numWords ; k++ ) {
		// punctuation and tags do not contribute to the name
		if ( ! words->isAlnum(k) ) continue;
		// the fourth alnum word ends the search
		if ( ++numAlnum >= 4 ) break;
		// the word itself
		char   *wptr = words->m_words   [k];
		int32_t wlen = words->m_wordLens[k];
		int64_t wid  = words->m_wordIds [k];
		// hash "Texas's" as "Texas"
		bool possessive = ( wlen > 2 &&
				    wptr[wlen-2] == '\'' &&
				    to_lower_a(wptr[wlen-1]) == 's' );
		if ( possessive ) wid = hash64Lower_utf8(wptr,wlen-2);
		// shift then fold the word in
		nameHash = ( nameHash << 1 ) ^ wid;
		// negative means this prefix is not a state
		int32_t pos = getStateOffset ( &nameHash );
		if ( pos < 0 ) continue;
		// needed to compute the byte span of the name
		char   **wptrs = words->getWords();
		int32_t *wlens = words->getWordLens();
		// each state is represented by one bit, chosen by its offset
		int64_t stateBit = 1LL << pos;
		// state descriptor supplying the two-letter adm1 code
		StateDesc *sd = &s_states[pos];
		// describe the match, words [a,k+1)
		s_state.m_type     = PT_STATE;
		s_state.m_bits     = 0;
		s_state.m_adm1Bits = stateBit;
		s_state.m_adm1[0]  = sd->m_adm1[0];
		s_state.m_adm1[1]  = sd->m_adm1[1];
		s_state.m_a        = a;
		s_state.m_b        = k+1;
		s_state.m_alnumA   = alnumPos;
		s_state.m_alnumB   = alnumPos + numAlnum;
		s_state.m_str      = wptrs[a];
		s_state.m_strlen   = wptrs[k]+wlens[k]-wptrs[a];
		// the place hash of a state is just its bit
		s_state.m_hash     = s_state.m_adm1Bits;
		return &s_state;
	}
	return NULL;
}
|
|
|
|
// . returns -1 and sets g_errno on error
|
|
// . returns false if not city/state combo, true otherwise
|
|
int32_t Addresses::isCityState3 ( int64_t h1 , int64_t h2 ) {
|
|
|
|
int64_t nh1 = h1;
|
|
int64_t nh2 = h2;
|
|
|
|
// we now put the aliases into g_cities as if they were their own
|
|
// cities!
|
|
// convert aliases -- only for cities methinks
|
|
//int64_t *ah1 = (int64_t *) g_aliases.getValue(&h1);
|
|
//if ( ah1 ) nh1 = *ah1;
|
|
//int64_t *ah2 = (int64_t *) g_aliases.getValue(&h2);
|
|
//if ( ah2 ) nh2 = *ah2;
|
|
|
|
// get the places
|
|
bool c1 = g_cities.isInTable ( &nh1 );
|
|
bool c2 = g_states.isInTable ( & h1 );
|
|
if ( ! c1 && ! c2 ) return false;
|
|
|
|
bool d1 = g_cities.isInTable ( &nh2 );
|
|
bool d2 = g_states.isInTable ( & h2 );
|
|
if ( ! d1 && ! d2 ) return false;
|
|
|
|
// "Coutrnyside Mobile Home Park" is a PPL (popluated place) in MN
|
|
// so we assume it to be a city. then it is mentioned on the new mexico
|
|
// page http://www.thecityofalbuquerque.com/mobilehome/ in new mexico.
|
|
// so make sure the city is in that state i guess...
|
|
if ( d1 && c2 ) {
|
|
CityDesc *cd = (CityDesc *)g_cities.getValue(&nh2);
|
|
uint64_t sb = getStateBitFromHash ( &h1 );
|
|
if ( ! ( (cd->m_adm1Bits) & sb ) ) { d1 = false; c2 = false; }
|
|
}
|
|
|
|
if ( d2 && c1 ) {
|
|
CityDesc *cd = (CityDesc *)g_cities.getValue(&nh1);
|
|
uint64_t sb = getStateBitFromHash ( &h2 );
|
|
if ( ! ( (cd->m_adm1Bits) & sb ) ) { d2 = false; c1 = false; }
|
|
}
|
|
|
|
if ( c1 && d2 ) return true;
|
|
if ( c2 && d1 ) return true;
|
|
return false;
|
|
}
|
|
|
|
// words range is [a,b)
|
|
bool Addresses::isCityName ( int32_t a , int32_t b ) {
|
|
// init hash to zero
|
|
int64_t h = 0LL;
|
|
// loop over words in [a,b)
|
|
for ( int32_t k = a ; k < b ; k++ ) {
|
|
// skip if not alnum
|
|
if ( ! m_words->isAlnum(k) ) continue;
|
|
// mix it up
|
|
h <<= 1;
|
|
// hash it into our ongoing hash
|
|
h ^= m_wids[k];
|
|
}
|
|
// might be alias
|
|
//int64_t *ah1 = (int64_t *) g_aliases.getValue(&h);
|
|
//if ( ah1 ) h = *ah1;
|
|
// get it
|
|
return g_cities.isInTable(&h);
|
|
}
|
|
|
|
// words range is [a,b)
|
|
bool Addresses::isStateName ( int32_t a ) {
|
|
// init hash to zero
|
|
int64_t h = 0LL;
|
|
// max count
|
|
int32_t count = 0;
|
|
// loop over words in [a,b)
|
|
for ( int32_t k = a ; k < m_nw ; k++ ) {
|
|
// skip if not alnum
|
|
if ( ! m_words->isAlnum(k) ) continue;
|
|
// only up to "district of columbia"
|
|
if ( ++count >= 4 ) break;
|
|
// mix it up
|
|
h <<= 1;
|
|
// hash it into our ongoing hash
|
|
h ^= m_wids[k];
|
|
// get it
|
|
if ( g_states.isInTable(&h) ) return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
// . words range is [a,b)
|
|
// . used by Events.cpp to demote title score
|
|
bool Addresses::isCityState ( Section *si ) {
|
|
|
|
// skip if too many words
|
|
int32_t na = si->m_lastWordPos - si->m_firstWordPos;
|
|
if ( na <= 0 ) return false;
|
|
if ( na >= 2*10 ) return false;
|
|
|
|
int32_t a = si->m_a;
|
|
int32_t b = si->m_lastWordPos + 1;
|
|
|
|
int32_t lastb = isCityState2 ( a , b );
|
|
|
|
if ( lastb <= 0 ) return false;
|
|
if ( lastb == si->m_lastWordPos ) return true;
|
|
return false;
|
|
}
|
|
|
|
// . returns -1 and sets g_errno on error
|
|
// . returns 0 or 1 otherwise
|
|
int32_t Addresses::cityAdm1Follows ( int32_t a ) {
|
|
// returns -1 if does not follow
|
|
if ( isCityState2 ( a , m_nw ) < 0 ) return 0;
|
|
// it did follow
|
|
return 1;
|
|
}
|
|
|
|
int32_t Addresses::isCityState2 ( int32_t a , int32_t b ) {
|
|
|
|
// m must lie on a punt word or tag
|
|
for ( ; a < b ; a++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// stop on wid
|
|
if ( m_wids[a] ) break;
|
|
}
|
|
// bail if no wid
|
|
if ( a >= b ) return -1;
|
|
|
|
Place *cp = getCityPlace ( a , 0 , m_words );
|
|
if ( ! cp ) return -1;
|
|
// point to start of state
|
|
int32_t sta = cp->m_b;
|
|
for ( ; sta < b ; sta++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// need a wid
|
|
if ( m_wids[sta] ) break;
|
|
}
|
|
// bail if no room
|
|
if ( sta >= b ) return -1;
|
|
// otherwise, see if its a state
|
|
Place *sp = getStatePlace ( sta , cp->m_alnumB , m_words );
|
|
// skip if not
|
|
if ( ! sp ) return -1;
|
|
// now we make sure city supports state
|
|
if ( ! ( sp->m_adm1Bits & cp->m_adm1Bits ) ) return -1;
|
|
// return last word we match otherwise
|
|
return sp->m_b - 1;
|
|
}
|
|
|
|
void Addresses::setAmbiguousFlags ( ) {
|
|
|
|
// clear those flags first
|
|
for ( int32_t i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
|
|
Address *ai = (Address *)m_am.getPtr(i);
|
|
ai->m_flags &= ~AF_AMBIGUOUS;
|
|
}
|
|
|
|
// . loop over the addresses we got
|
|
// . determine which addresses we want to add to placedb and namedb
|
|
// . placedb key is based on street address, city, adm1,crid(ctry),name
|
|
// . namedb key is based on name, city, adm1, crid
|
|
// . only add in addresses that are definitive
|
|
// . must have zip code, must not have another address with the same
|
|
// street address
|
|
for ( int32_t i = 0 ; i < m_am.getNumPtrs() - 1 ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get it
|
|
Address *a = (Address *)m_am.getPtr(i);
|
|
// do not do fake street names
|
|
if ( a->m_street->m_a < 0 ) continue;
|
|
// reset verified counts
|
|
int32_t verified1 = 0;
|
|
int32_t verified2 = 0;
|
|
int32_t verified3 = 0;
|
|
int32_t verified4 = 0;
|
|
// count dups, addresses using the same street
|
|
int32_t dups = 0;
|
|
// do we have other verified addresses using this street?
|
|
for ( int32_t j = i ; j < m_am.getNumPtrs() ; j++ ) {
|
|
// get one before us
|
|
Address *b = (Address *)m_am.getPtr(j);
|
|
// stop when street is different
|
|
if ( b->m_street->m_a != a->m_street->m_a ) break;
|
|
// count dups
|
|
dups++;
|
|
// is "b" verified?
|
|
if ( b->m_flags & AF_VERIFIED_STREET )
|
|
verified1++;
|
|
if ( b->m_flags & AF_VERIFIED_STREET_NUM )
|
|
verified2++;
|
|
if ( b->m_flags & AF_VERIFIED_PLACE_NAME_1 )
|
|
verified3++;
|
|
if ( b->m_flags & AF_VERIFIED_PLACE_NAME_2 )
|
|
verified4++;
|
|
}
|
|
|
|
// loop over all the dups
|
|
for ( int32_t j = i ; dups >= 2 && j < m_am.getNumPtrs() ; j++ ) {
|
|
// get one before us
|
|
Address *b = (Address *)m_am.getPtr(j);
|
|
// stop when street is different
|
|
if ( b->m_street->m_a != a->m_street->m_a ) break;
|
|
// if we are the only verified, we are not ambiguous
|
|
if((b->m_flags&AF_VERIFIED_STREET )&&verified1==1)
|
|
continue;
|
|
if((b->m_flags&AF_VERIFIED_STREET_NUM)&&verified2==1)
|
|
continue;
|
|
if((b->m_flags&AF_VERIFIED_PLACE_NAME_1)&&verified3==1)
|
|
continue;
|
|
if((b->m_flags&AF_VERIFIED_PLACE_NAME_2)&&verified4==1)
|
|
continue;
|
|
// otherwise, we are!
|
|
b->m_flags |= AF_AMBIGUOUS;
|
|
// this now too only if some street made it through
|
|
if ( verified2 ) b->m_flags3 |= AF2_BADCITYSTATE;
|
|
}
|
|
}
|
|
}
|
|
|
|
// . one row of the place-name synonym table (s_synList)
// . m_s1 is the spelling to look for and m_s2 is the spelling it is
//   normalized to; getSynonymWord() hashes both, lower-cased
struct SynTwin {
	char *m_s1;
	char *m_s2;
};
|
|
|
|
// map the place name synonyms here
|
|
static SynTwin s_synList[] = {
|
|
{"1st","first"}
|
|
,{"2nd","second"}
|
|
,{"3rd","third"}
|
|
,{"4th","fourth"} // North 4th Arts Center, Abq NM
|
|
,{"5th","fifth"}
|
|
,{"6th","sixth"}
|
|
,{"7th","seventh"}
|
|
,{"8th","eighth"}
|
|
,{"9th","ninth"}
|
|
,{"10th","tenth"}
|
|
,{"11th","eleventh"}
|
|
,{"12th","twelfth"}
|
|
|
|
,{"theatre","theater"} // Kimo Theatre
|
|
|
|
,{"n","north"}
|
|
,{"s","south"}
|
|
,{"e","east"}
|
|
,{"w","west"}
|
|
|
|
,{"ne","northeast"}
|
|
,{"se","southeast"}
|
|
|
|
,{"nw","northwest"}
|
|
,{"sw","southwest"}
|
|
|
|
// smith elementary should equal smith elementary school
|
|
//,{"school",""}
|
|
|
|
//
|
|
// how about road stuff
|
|
//
|
|
// from http://www.usps.com/ncsc/lookups/usps_abbreviations.html
|
|
//
|
|
// cat usps_abbreviations.html | grep -v "*" | grep -v "back to" | awk '{print ",{\""$2"\",\""$1"\"}"}' > foo
|
|
// cat usps_abbreviations.html | grep -v "*" | grep -v "back to" | awk '{print ",{\""$3"\",\""$1"\"}"}' >> foo
|
|
// cat foo | sort | uniq >> Address.cpp
|
|
,{"ALLEE","ALLEY"}
|
|
,{"ALLEY","ALLEY"}
|
|
,{"ALLY","ALLEY"}
|
|
,{"ALY","ALLEY"}
|
|
,{"ANEX","ANNEX"}
|
|
,{"ANNEX","ANNEX"}
|
|
,{"ANNX","ANNEX"}
|
|
,{"ANX","ANNEX"}
|
|
,{"ARCADE","ARCADE"}
|
|
,{"ARC","ARCADE"}
|
|
,{"AV","AVENUE"}
|
|
,{"AVE","AVENUE"}
|
|
,{"AVEN","AVENUE"}
|
|
,{"AVENU","AVENUE"}
|
|
,{"AVENUE","AVENUE"}
|
|
,{"AVN","AVENUE"}
|
|
,{"AVNUE","AVENUE"}
|
|
,{"BAYOO","BAYOO"}
|
|
,{"BAYOU","BAYOO"}
|
|
,{"BCH","BEACH"}
|
|
,{"BEACH","BEACH"}
|
|
,{"BEND","BEND"}
|
|
,{"BG","BURG"}
|
|
,{"BGS","BURGS"}
|
|
,{"BLF","BLUFF"}
|
|
,{"BLFS","BLUFFS"}
|
|
,{"BLUF","BLUFF"}
|
|
,{"BLUFF","BLUFF"}
|
|
,{"BLUFFS","BLUFFS"}
|
|
,{"BLVD","BOULEVARD"}
|
|
,{"BND","BEND"}
|
|
,{"BOT","BOTTOM"}
|
|
,{"BOTTM","BOTTOM"}
|
|
,{"BOTTOM","BOTTOM"}
|
|
,{"BOUL","BOULEVARD"}
|
|
,{"BOULEVARD","BOULEVARD"}
|
|
,{"BOULV","BOULEVARD"}
|
|
,{"BRANCH","BRANCH"}
|
|
,{"BR","BRANCH"}
|
|
,{"BRDGE","BRIDGE"}
|
|
,{"BRG","BRIDGE"}
|
|
,{"BRIDGE","BRIDGE"}
|
|
,{"BRK","BROOK"}
|
|
,{"BRKS","BROOKS"}
|
|
,{"BRNCH","BRANCH"}
|
|
,{"BROOK","BROOK"}
|
|
,{"BROOKS","BROOKS"}
|
|
,{"BTM","BOTTOM"}
|
|
,{"BURG","BURG"}
|
|
,{"BURGS","BURGS"}
|
|
,{"BYPA","BYPASS"}
|
|
,{"BYPAS","BYPASS"}
|
|
,{"BYPASS","BYPASS"}
|
|
,{"BYP","BYPASS"}
|
|
,{"BYPS","BYPASS"}
|
|
,{"BYU","BAYOO"}
|
|
,{"CAMP","CAMP"}
|
|
,{"CANYN","CANYON"}
|
|
,{"CANYON","CANYON"}
|
|
,{"CAPE","CAPE"}
|
|
,{"CAUSEWAY","CAUSEWAY"}
|
|
,{"CAUSWAY","CAUSEWAY"}
|
|
,{"CEN","CENTER"}
|
|
,{"CENT","CENTER"}
|
|
,{"CENTER","CENTER"}
|
|
,{"CENTERS","CENTERS"}
|
|
,{"CENTR","CENTER"}
|
|
,{"CENTRE","CENTER"}
|
|
,{"CIRC","CIRCLE"}
|
|
,{"CIR","CIRCLE"}
|
|
,{"CIRCL","CIRCLE"}
|
|
,{"CIRCLE","CIRCLE"}
|
|
,{"CIRCLES","CIRCLES"}
|
|
,{"CIRS","CIRCLES"}
|
|
,{"CLB","CLUB"}
|
|
,{"CLF","CLIFF"}
|
|
,{"CLFS","CLIFFS"}
|
|
,{"CLIFF","CLIFF"}
|
|
,{"CLIFFS","CLIFFS"}
|
|
,{"CLUB","CLUB"}
|
|
,{"CMN","COMMON"}
|
|
,{"CMNS","COMMONS"}
|
|
,{"CMP","CAMP"}
|
|
,{"CNTER","CENTER"}
|
|
,{"CNTR","CENTER"}
|
|
,{"CNYN","CANYON"}
|
|
,{"COMMON","COMMON"}
|
|
,{"COMMONS","COMMONS"}
|
|
,{"COR","CORNER"}
|
|
,{"CORNER","CORNER"}
|
|
,{"CORNERS","CORNERS"}
|
|
,{"CORS","CORNERS"}
|
|
,{"COURSE","COURSE"}
|
|
,{"COURT","COURT"}
|
|
,{"COURTS","COURTS"}
|
|
,{"COVE","COVE"}
|
|
,{"COVES","COVES"}
|
|
,{"CP","CAMP"}
|
|
,{"CPE","CAPE"}
|
|
,{"CRCL","CIRCLE"}
|
|
,{"CRCLE","CIRCLE"}
|
|
,{"CREEK","CREEK"}
|
|
,{"CRESCENT","CRESCENT"}
|
|
,{"CRES","CRESCENT"}
|
|
,{"CREST","CREST"}
|
|
,{"CRK","CREEK"}
|
|
,{"CROSSING","CROSSING"}
|
|
,{"CROSSROAD","CROSSROAD"}
|
|
,{"CROSSROADS","CROSSROADS"}
|
|
,{"CRSE","COURSE"}
|
|
,{"CRSENT","CRESCENT"}
|
|
,{"CRSNT","CRESCENT"}
|
|
,{"CRSSING","CROSSING"}
|
|
,{"CRSSNG","CROSSING"}
|
|
,{"CRST","CREST"}
|
|
,{"CSWY","CAUSEWAY"}
|
|
,{"CT","COURT"}
|
|
,{"CTR","CENTER"}
|
|
,{"CTRS","CENTERS"}
|
|
,{"CTS","COURTS"}
|
|
,{"CURV","CURVE"}
|
|
,{"CURVE","CURVE"}
|
|
,{"CV","COVE"}
|
|
,{"CVS","COVES"}
|
|
,{"CYN","CANYON"}
|
|
,{"DALE","DALE"}
|
|
,{"DAM","DAM"}
|
|
,{"DIV","DIVIDE"}
|
|
,{"DIVIDE","DIVIDE"}
|
|
,{"DL","DALE"}
|
|
,{"DM","DAM"}
|
|
,{"DR","DRIVE"}
|
|
,{"DRIV","DRIVE"}
|
|
,{"DRIVE","DRIVE"}
|
|
,{"DRIVES","DRIVES"}
|
|
,{"DRS","DRIVES"}
|
|
,{"DRV","DRIVE"}
|
|
,{"DVD","DIVIDE"}
|
|
,{"DV","DIVIDE"}
|
|
,{"ESTATE","ESTATE"}
|
|
,{"ESTATES","ESTATES"}
|
|
,{"EST","ESTATE"}
|
|
,{"ESTS","ESTATES"}
|
|
,{"EXP","EXPRESSWAY"}
|
|
,{"EXPRESS","EXPRESSWAY"}
|
|
,{"EXPRESSWAY","EXPRESSWAY"}
|
|
,{"EXPR","EXPRESSWAY"}
|
|
,{"EXPW","EXPRESSWAY"}
|
|
,{"EXPY","EXPRESSWAY"}
|
|
,{"EXTENSION","EXTENSION"}
|
|
,{"EXT","EXTENSION"}
|
|
,{"EXTN","EXTENSION"}
|
|
,{"EXTNSN","EXTENSION"}
|
|
,{"EXTS","EXTENSIONS"}
|
|
,{"FALL","FALL"}
|
|
,{"FALLS","FALLS"}
|
|
,{"FERRY","FERRY"}
|
|
,{"FIELD","FIELD"}
|
|
,{"FIELDS","FIELDS"}
|
|
,{"FLAT","FLAT"}
|
|
,{"FLATS","FLATS"}
|
|
,{"FLD","FIELD"}
|
|
,{"FLDS","FIELDS"}
|
|
,{"FLS","FALLS"}
|
|
,{"FLT","FLAT"}
|
|
,{"FLTS","FLATS"}
|
|
,{"FORD","FORD"}
|
|
,{"FORDS","FORDS"}
|
|
,{"FOREST","FOREST"}
|
|
,{"FORESTS","FOREST"}
|
|
,{"FORGE","FORGE"}
|
|
,{"FORGES","FORGES"}
|
|
,{"FORG","FORGE"}
|
|
,{"FORK","FORK"}
|
|
,{"FORKS","FORKS"}
|
|
,{"FORT","FORT"}
|
|
,{"FRD","FORD"}
|
|
,{"FRDS","FORDS"}
|
|
,{"FREEWAY","FREEWAY"}
|
|
,{"FREEWY","FREEWAY"}
|
|
,{"FRG","FORGE"}
|
|
,{"FRGS","FORGES"}
|
|
,{"FRK","FORK"}
|
|
,{"FRKS","FORKS"}
|
|
,{"FRRY","FERRY"}
|
|
,{"FRST","FOREST"}
|
|
,{"FRT","FORT"}
|
|
,{"FRWAY","FREEWAY"}
|
|
,{"FRWY","FREEWAY"}
|
|
,{"FRY","FERRY"}
|
|
,{"FT","FORT"}
|
|
,{"FWY","FREEWAY"}
|
|
,{"GARDEN","GARDEN"}
|
|
,{"GARDENS","GARDENS"}
|
|
,{"GARDN","GARDEN"}
|
|
,{"GATEWAY","GATEWAY"}
|
|
,{"GATEWY","GATEWAY"}
|
|
,{"GATWAY","GATEWAY"}
|
|
,{"GDN","GARDEN"}
|
|
,{"GDNS","GARDENS"}
|
|
,{"GLEN","GLEN"}
|
|
,{"GLENS","GLENS"}
|
|
,{"GLN","GLEN"}
|
|
,{"GLNS","GLENS"}
|
|
,{"GRDEN","GARDEN"}
|
|
,{"GRDN","GARDEN"}
|
|
,{"GRDNS","GARDENS"}
|
|
,{"GREEN","GREEN"}
|
|
,{"GREENS","GREENS"}
|
|
,{"GRN","GREEN"}
|
|
,{"GRNS","GREENS"}
|
|
,{"GROVE","GROVE"}
|
|
,{"GROVES","GROVES"}
|
|
,{"GROV","GROVE"}
|
|
,{"GRV","GROVE"}
|
|
,{"GRVS","GROVES"}
|
|
,{"GTWAY","GATEWAY"}
|
|
,{"GTWY","GATEWAY"}
|
|
,{"HARB","HARBOR"}
|
|
,{"HARBOR","HARBOR"}
|
|
,{"HARBORS","HARBORS"}
|
|
,{"HARBR","HARBOR"}
|
|
,{"HAVEN","HAVEN"}
|
|
,{"HBR","HARBOR"}
|
|
,{"HBRS","HARBORS"}
|
|
,{"HIGHWAY","HIGHWAY"}
|
|
,{"HIGHWY","HIGHWAY"}
|
|
,{"HILL","HILL"}
|
|
,{"HILLS","HILLS"}
|
|
,{"HIWAY","HIGHWAY"}
|
|
,{"HIWY","HIGHWAY"}
|
|
,{"HL","HILL"}
|
|
,{"HLLW","HOLLOW"}
|
|
,{"HLS","HILLS"}
|
|
,{"HOLLOW","HOLLOW"}
|
|
,{"HOLLOWS","HOLLOW"}
|
|
,{"HOLW","HOLLOW"}
|
|
,{"HOLWS","HOLLOW"}
|
|
,{"HRBOR","HARBOR"}
|
|
,{"HT","HEIGHTS"}
|
|
,{"HTS","HEIGHTS"}
|
|
,{"HVN","HAVEN"}
|
|
,{"HWAY","HIGHWAY"}
|
|
,{"HWY","HIGHWAY"}
|
|
,{"INLT","INLET"}
|
|
,{"IS","ISLAND"}
|
|
,{"ISLAND","ISLAND"}
|
|
,{"ISLANDS","ISLANDS"}
|
|
,{"ISLE","ISLE"}
|
|
,{"ISLES","ISLE"}
|
|
,{"ISLND","ISLAND"}
|
|
,{"ISLNDS","ISLANDS"}
|
|
,{"ISS","ISLANDS"}
|
|
,{"JCTION","JUNCTION"}
|
|
,{"JCT","JUNCTION"}
|
|
,{"JCTN","JUNCTION"}
|
|
,{"JCTNS","JUNCTIONS"}
|
|
,{"JCTS","JUNCTIONS"}
|
|
,{"JUNCTION","JUNCTION"}
|
|
,{"JUNCTIONS","JUNCTIONS"}
|
|
,{"JUNCTN","JUNCTION"}
|
|
,{"JUNCTON","JUNCTION"}
|
|
,{"KEY","KEY"}
|
|
,{"KEYS","KEYS"}
|
|
,{"KNL","KNOLL"}
|
|
,{"KNLS","KNOLLS"}
|
|
,{"KNOL","KNOLL"}
|
|
,{"KNOLL","KNOLL"}
|
|
,{"KNOLLS","KNOLLS"}
|
|
,{"KY","KEY"}
|
|
,{"KYS","KEYS"}
|
|
,{"LAKE","LAKE"}
|
|
,{"LAKES","LAKES"}
|
|
,{"LANDING","LANDING"}
|
|
,{"LAND","LAND"}
|
|
,{"LANE","LANE"}
|
|
,{"LCK","LOCK"}
|
|
,{"LCKS","LOCKS"}
|
|
,{"LDGE","LODGE"}
|
|
,{"LDG","LODGE"}
|
|
,{"LF","LOAF"}
|
|
,{"LGT","LIGHT"}
|
|
,{"LGTS","LIGHTS"}
|
|
,{"LIGHT","LIGHT"}
|
|
,{"LIGHTS","LIGHTS"}
|
|
,{"LK","LAKE"}
|
|
,{"LKS","LAKES"}
|
|
,{"LNDG","LANDING"}
|
|
,{"LNDNG","LANDING"}
|
|
,{"LN","LANE"}
|
|
,{"LOAF","LOAF"}
|
|
,{"LOCK","LOCK"}
|
|
,{"LOCKS","LOCKS"}
|
|
,{"LODGE","LODGE"}
|
|
,{"LODG","LODGE"}
|
|
,{"LOOP","LOOP"}
|
|
,{"LOOPS","LOOP"}
|
|
,{"MALL","MALL"}
|
|
,{"MANOR","MANOR"}
|
|
,{"MANORS","MANORS"}
|
|
,{"MDW","MEADOW"}
|
|
,{"MDW","MEADOWS"}
|
|
,{"MDWS","MEADOWS"}
|
|
,{"MEADOW","MEADOW"}
|
|
,{"MEADOWS","MEADOWS"}
|
|
,{"MEDOWS","MEADOWS"}
|
|
,{"MEWS","MEWS"}
|
|
,{"MILL","MILL"}
|
|
,{"MILLS","MILLS"}
|
|
,{"MISSN","MISSION"}
|
|
,{"ML","MILL"}
|
|
,{"MLS","MILLS"}
|
|
,{"MNR","MANOR"}
|
|
,{"MNRS","MANORS"}
|
|
,{"MNTAIN","MOUNTAIN"}
|
|
,{"MNT","MOUNT"}
|
|
,{"MNTN","MOUNTAIN"}
|
|
,{"MNTNS","MOUNTAINS"}
|
|
,{"MOTORWAY","MOTORWAY"}
|
|
,{"MOUNTAIN","MOUNTAIN"}
|
|
,{"MOUNTAINS","MOUNTAINS"}
|
|
,{"MOUNTIN","MOUNTAIN"}
|
|
,{"MOUNT","MOUNT"}
|
|
,{"MSN","MISSION"}
|
|
,{"MSSN","MISSION"}
|
|
,{"MTIN","MOUNTAIN"}
|
|
,{"MT","MOUNT"}
|
|
,{"MTN","MOUNTAIN"}
|
|
,{"MTNS","MOUNTAINS"}
|
|
,{"MTWY","MOTORWAY"}
|
|
,{"NCK","NECK"}
|
|
,{"NECK","NECK"}
|
|
,{"OPAS","OVERPASS"}
|
|
,{"ORCHARD","ORCHARD"}
|
|
,{"ORCH","ORCHARD"}
|
|
,{"ORCHRD","ORCHARD"}
|
|
,{"OVAL","OVAL"}
|
|
,{"OVERPASS","OVERPASS"}
|
|
,{"OVL","OVAL"}
|
|
,{"PARK","PARK"}
|
|
,{"PARK","PARKS"}
|
|
,{"PARKS","PARKS"}
|
|
,{"PARKWAY","PARKWAY"}
|
|
,{"PARKWAYS","PARKWAYS"}
|
|
,{"PARKWY","PARKWAY"}
|
|
,{"PASSAGE","PASSAGE"}
|
|
,{"PASS","PASS"}
|
|
,{"PATH","PATH"}
|
|
,{"PATHS","PATH"}
|
|
,{"PIKE","PIKE"}
|
|
,{"PIKES","PIKE"}
|
|
,{"PINE","PINE"}
|
|
,{"PINES","PINES"}
|
|
,{"PKWAY","PARKWAY"}
|
|
,{"PKWY","PARKWAY"}
|
|
,{"PKWY","PARKWAYS"}
|
|
,{"PKWYS","PARKWAYS"}
|
|
,{"PKY","PARKWAY"}
|
|
,{"PLAIN","PLAIN"}
|
|
,{"PLAINS","PLAINS"}
|
|
,{"PLAZA","PLAZA"}
|
|
,{"PLN","PLAIN"}
|
|
,{"PLNS","PLAINS"}
|
|
,{"PL","PLACE"}
|
|
,{"PLZA","PLAZA"}
|
|
,{"PLZ","PLAZA"}
|
|
,{"PNE","PINE"}
|
|
,{"PNES","PINES"}
|
|
,{"POINT","POINT"}
|
|
,{"POINTS","POINTS"}
|
|
,{"PORT","PORT"}
|
|
,{"PORTS","PORTS"}
|
|
,{"PRAIRIE","PRAIRIE"}
|
|
,{"PRK","PARK"}
|
|
,{"PR","PRAIRIE"}
|
|
,{"PRR","PRAIRIE"}
|
|
,{"PRT","PORT"}
|
|
,{"PRTS","PORTS"}
|
|
,{"PSGE","PASSAGE"}
|
|
,{"PT","POINT"}
|
|
,{"PTS","POINTS"}
|
|
,{"RADIAL","RADIAL"}
|
|
,{"RADIEL","RADIAL"}
|
|
,{"RADL","RADIAL"}
|
|
,{"RAD","RADIAL"}
|
|
,{"RAMP","RAMP"}
|
|
,{"RANCHES","RANCH"}
|
|
,{"RANCH","RANCH"}
|
|
,{"RAPID","RAPID"}
|
|
,{"RAPIDS","RAPIDS"}
|
|
,{"RDGE","RIDGE"}
|
|
,{"RDG","RIDGE"}
|
|
,{"RDGS","RIDGES"}
|
|
,{"RD","ROAD"}
|
|
,{"RDS","ROADS"}
|
|
,{"REST","REST"}
|
|
,{"RIDGE","RIDGE"}
|
|
,{"RIDGES","RIDGES"}
|
|
,{"RIVER","RIVER"}
|
|
,{"RIV","RIVER"}
|
|
,{"RIVR","RIVER"}
|
|
,{"RNCH","RANCH"}
|
|
,{"RNCHS","RANCH"}
|
|
,{"ROAD","ROAD"}
|
|
,{"ROADS","ROADS"}
|
|
,{"ROUTE","ROUTE"}
|
|
,{"ROW","ROW"}
|
|
,{"RPD","RAPID"}
|
|
,{"RPDS","RAPIDS"}
|
|
,{"RST","REST"}
|
|
,{"RTE","ROUTE"}
|
|
,{"RUE","RUE"}
|
|
,{"RUN","RUN"}
|
|
,{"RVR","RIVER"}
|
|
,{"SHL","SHOAL"}
|
|
,{"SHLS","SHOALS"}
|
|
,{"SHOAL","SHOAL"}
|
|
,{"SHOALS","SHOALS"}
|
|
,{"SHOAR","SHORE"}
|
|
,{"SHOARS","SHORES"}
|
|
,{"SHORE","SHORE"}
|
|
,{"SHORES","SHORES"}
|
|
,{"SHR","SHORE"}
|
|
,{"SHRS","SHORES"}
|
|
,{"SKWY","SKYWAY"}
|
|
,{"SKYWAY","SKYWAY"}
|
|
,{"SMT","SUMMIT"}
|
|
,{"SPG","SPRING"}
|
|
,{"SPGS","SPRINGS"}
|
|
,{"SPNG","SPRING"}
|
|
,{"SPNGS","SPRINGS"}
|
|
,{"SPRING","SPRING"}
|
|
,{"SPRINGS","SPRINGS"}
|
|
,{"SPRNG","SPRING"}
|
|
,{"SPRNGS","SPRINGS"}
|
|
,{"SPUR","SPUR"}
|
|
,{"SPUR","SPURS"}
|
|
,{"SPURS","SPURS"}
|
|
,{"SQRE","SQUARE"}
|
|
,{"SQR","SQUARE"}
|
|
,{"SQRS","SQUARES"}
|
|
,{"SQ","SQUARE"}
|
|
,{"SQS","SQUARES"}
|
|
,{"SQUARE","SQUARE"}
|
|
,{"SQUARES","SQUARES"}
|
|
,{"SQU","SQUARE"}
|
|
,{"STA","STATION"}
|
|
,{"STATION","STATION"}
|
|
,{"STATN","STATION"}
|
|
,{"STN","STATION"}
|
|
,{"STRA","STRAVENUE"}
|
|
,{"STRAVEN","STRAVENUE"}
|
|
,{"STRAVENUE","STRAVENUE"}
|
|
,{"STRAVN","STRAVENUE"}
|
|
,{"STRAV","STRAVENUE"}
|
|
,{"STREAM","STREAM"}
|
|
,{"STREETS","STREETS"}
|
|
,{"STREET","STREET"}
|
|
,{"STREME","STREAM"}
|
|
,{"STRM","STREAM"}
|
|
,{"STR","STREET"}
|
|
,{"STRT","STREET"}
|
|
,{"STRVN","STRAVENUE"}
|
|
,{"STRVNUE","STRAVENUE"}
|
|
,{"STS","STREETS"}
|
|
,{"ST","STREET"}
|
|
,{"SUMIT","SUMMIT"}
|
|
,{"SUMITT","SUMMIT"}
|
|
,{"SUMMIT","SUMMIT"}
|
|
,{"TERRACE","TERRACE"}
|
|
,{"TERR","TERRACE"}
|
|
,{"TER","TERRACE"}
|
|
,{"THROUGHWAY","THROUGHWAY"}
|
|
,{"TPKE","TURNPIKE"}
|
|
,{"TRACES","TRACE"}
|
|
,{"TRACE","TRACE"}
|
|
,{"TRACKS","TRACK"}
|
|
,{"TRACK","TRACK"}
|
|
,{"TRAFFICWAY","TRAFFICWAY"}
|
|
,{"TRAILER","TRAILER"}
|
|
,{"TRAILS","TRAIL"}
|
|
,{"TRAIL","TRAIL"}
|
|
,{"TRAK","TRACK"}
|
|
,{"TRCE","TRACE"}
|
|
,{"TRFY","TRAFFICWAY"}
|
|
,{"TRKS","TRACK"}
|
|
,{"TRK","TRACK"}
|
|
,{"TRLRS","TRAILER"}
|
|
,{"TRLR","TRAILER"}
|
|
,{"TRLS","TRAIL"}
|
|
,{"TRL","TRAIL"}
|
|
,{"TRNPK","TURNPIKE"}
|
|
,{"TRWY","THROUGHWAY"}
|
|
,{"TUNEL","TUNNEL"}
|
|
,{"TUNLS","TUNNEL"}
|
|
,{"TUNL","TUNNEL"}
|
|
,{"TUNNELS","TUNNEL"}
|
|
,{"TUNNEL","TUNNEL"}
|
|
,{"TUNNL","TUNNEL"}
|
|
,{"TURNPIKE","TURNPIKE"}
|
|
,{"TURNPK","TURNPIKE"}
|
|
,{"UNDERPASS","UNDERPASS"}
|
|
,{"UNIONS","UNIONS"}
|
|
,{"UNION","UNION"}
|
|
,{"UNS","UNIONS"}
|
|
,{"UN","UNION"}
|
|
,{"UPAS","UNDERPASS"}
|
|
,{"VALLEYS","VALLEYS"}
|
|
,{"VALLEY","VALLEY"}
|
|
,{"VALLY","VALLEY"}
|
|
,{"VDCT","VIADUCT"}
|
|
,{"VIADCT","VIADUCT"}
|
|
,{"VIADUCT","VIADUCT"}
|
|
,{"VIA","VIADUCT"}
|
|
,{"VIEWS","VIEWS"}
|
|
,{"VIEW","VIEW"}
|
|
,{"VILLAGES","VILLAGES"}
|
|
,{"VILLAGE","VILLAGE"}
|
|
,{"VILLAG","VILLAGE"}
|
|
,{"VILLE","VILLE"}
|
|
,{"VILLG","VILLAGE"}
|
|
,{"VILLIAGE","VILLAGE"}
|
|
,{"VILL","VILLAGE"}
|
|
,{"VISTA","VISTA"}
|
|
,{"VIST","VISTA"}
|
|
,{"VIS","VISTA"}
|
|
,{"VLGS","VILLAGES"}
|
|
,{"VLG","VILLAGE"}
|
|
,{"VLLY","VALLEY"}
|
|
,{"VL","VILLE"}
|
|
,{"VLYS","VALLEYS"}
|
|
,{"VLY","VALLEY"}
|
|
,{"VSTA","VISTA"}
|
|
,{"VST","VISTA"}
|
|
,{"VWS","VIEWS"}
|
|
,{"VW","VIEW"}
|
|
,{"WALKS","WALKS"}
|
|
,{"WALK","WALK"}
|
|
,{"WALK","WALKS"}
|
|
,{"WALL","WALL"}
|
|
,{"WAYS","WAYS"}
|
|
,{"WAY","WAY"}
|
|
,{"WELLS","WELLS"}
|
|
,{"WELL","WELL"}
|
|
,{"WLS","WELLS"}
|
|
,{"WL","WELL"}
|
|
,{"WY","WAY"}
|
|
,{"XING","CROSSING"}
|
|
,{"XRD","CROSSROAD"}
|
|
,{"XRDS","CROSSROADS"}
|
|
|
|
// . cities and states
|
|
// . helps with "abq square dance center" i guess
|
|
// . "abq jump" --> "albuquerque jump"
|
|
,{"abq","albuquerque"}
|
|
,{"alb","albuquerque"}
|
|
,{"cinti","cincinnati"}
|
|
,{"cincy","cincinnati"}
|
|
|
|
};
|
|
|
|
static HashTableX s_syn;
|
|
static bool s_synInit = false;
|
|
|
|
// . normalize some words in the place name
|
|
// . synonyms
|
|
// . 4th --> fourth
|
|
// . theatre --> theater
|
|
// . school --> {0}
|
|
int64_t *getSynonymWord ( int64_t *h, int64_t *prevId, bool isStreet ) {
|
|
|
|
static int64_t h_cafeteria;
|
|
static int64_t h_auditorium;
|
|
static int64_t h_school;
|
|
static int64_t h_library;
|
|
static int64_t h_zero;
|
|
static int64_t h_the;
|
|
// set syn table?
|
|
if ( ! s_synInit ) {
|
|
// init it
|
|
if ( ! s_syn.set ( 8,8,1024,NULL,0,false,0,"syntbl")){
|
|
// core dump if this fails
|
|
char *xx=NULL;*xx=0;}
|
|
// stock it
|
|
int32_t n = (int32_t)sizeof(s_synList)/ sizeof(SynTwin);
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
// breathe
|
|
//QUICKPOLL ( m_niceness );
|
|
char *s1 = s_synList[i].m_s1;
|
|
char *s2 = s_synList[i].m_s2;
|
|
int32_t len1 = gbstrlen ( s1 );
|
|
int32_t len2 = gbstrlen ( s2 );
|
|
int64_t sh1 = hash64Lower_utf8 ( s1 , len1 );
|
|
int64_t sh2 = hash64Lower_utf8 ( s2 , len2 );
|
|
// skip if the same
|
|
if ( sh1 == sh2 ) continue;
|
|
// sanity check
|
|
if ( sh1 == 0 ) { char *xx=NULL;*xx=0; }
|
|
// core on failure here, this is critical
|
|
if ( ! s_syn.addKey (&sh1,&sh2)){char *xx=NULL;*xx=0;}
|
|
}
|
|
|
|
// set these
|
|
h_cafeteria = hash64b ( "cafeteria" );
|
|
h_auditorium = hash64b ( "auditorium" );
|
|
h_school = hash64b ( "school" );
|
|
h_library = hash64b ( "library" );
|
|
h_the = hash64b ( "the" );
|
|
h_zero = 0LL;
|
|
// only call once
|
|
s_synInit = true;
|
|
}
|
|
|
|
|
|
if ( ! isStreet ) {
|
|
// . fix for "Grant Middle School Cafeteria"
|
|
// . blank out "school cafeteria"
|
|
if ( *h==h_cafeteria && *prevId == h_school ) return &h_zero;
|
|
// blank out "school auditorium"
|
|
if ( *h==h_auditorium && *prevId == h_school ) return &h_zero;
|
|
// try for "Loma Colorado Main Library Auditorium"?
|
|
if ( *h==h_auditorium && *prevId == h_library ) return &h_zero;
|
|
// smith elementary should equal smith elementary school
|
|
if ( *h==h_school ) return &h_zero;
|
|
}
|
|
|
|
// TODO: uncomment this later and replace h_the logic above
|
|
if ( *h == h_the && *prevId == 0LL ) return &h_zero;
|
|
|
|
int64_t *p = (int64_t *)s_syn.getValue64 ( *h );
|
|
|
|
// check city aliases table. we no longer store city aliases
|
|
// in the synonym list
|
|
// . no! might have "SF Smith" not "Santa Fe Smith"
|
|
//if ( ! p ) {
|
|
// int64_t *ah1 = (int64_t *) g_aliases.getValue(h);
|
|
// if ( ah1 ) return ah1;
|
|
//}
|
|
|
|
// return what we had if not in syn table
|
|
if ( ! p ) return h;
|
|
// . if *p is 0, that means to ignore it!
|
|
// . return the mapped guy otherwise
|
|
return p;
|
|
}
|
|
|
|
void Addresses::print ( SafeBuf *pbuf , int64_t uh64 ) {
|
|
|
|
// print the streets first
|
|
printPlaces( &m_sm , pbuf , m_sections , NULL);//&m_addresses[0] );
|
|
|
|
// print NAMES then
|
|
printPlaces( &m_pm , pbuf , m_sections , NULL);//&m_addresses[0] );
|
|
|
|
char *hdrFormat =
|
|
"<table cellpadding=3 border=1>\n"
|
|
"<tr>"
|
|
"<td colspan=40>"
|
|
// table header row
|
|
"%s"
|
|
"</tr>"
|
|
"<tr>"
|
|
"<td><b><nobr>start word</nobr></b></td>"
|
|
"<td><nobr><b>place name 1</b></nobr></td>"
|
|
"<td><nobr><b>place name 2</b></nobr></td>"
|
|
"<td><b>suite</b></td>"
|
|
"<td><b>street</b></td>"
|
|
"<td><b>city</b></td>"
|
|
"<td><b>adm1</b></td>"
|
|
"<td><b>zip</b></td>"
|
|
"<td><b>ctry</b></td>"
|
|
|
|
"<td><b>geolat</b></td>"
|
|
"<td><b>geolon</b></td>"
|
|
|
|
"<td><b>minedlat</b></td>"
|
|
"<td><b>minedlon</b></td>"
|
|
|
|
"<td><b><nobr>importlat</nobr></b></td>"
|
|
"<td><b><nobr>importlon</nobr></b></td>"
|
|
|
|
"<td><b>flags</b></td>"
|
|
"<td><b>addrptr</b></td>"
|
|
"<td><b>addrhash</b></td>"
|
|
"<td><b>altnames</b></td>"
|
|
"<td><b>hashes</b></td>"
|
|
"</tr>\n" ;
|
|
|
|
|
|
// print address table header
|
|
pbuf->safePrintf ( hdrFormat , "Invalid Addresses" );
|
|
|
|
// print the final winning addresses
|
|
for ( int32_t i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get it
|
|
Address *aa = (Address *)m_am.getPtr(i);
|
|
// is inlined or verified?
|
|
bool valid = false;
|
|
if ( aa->m_flags & AF_INLINED ) valid = true;
|
|
// but unverified streetisname is not good
|
|
if ( aa->m_street && (aa->m_street->m_flags2 & PLF2_IS_NAME))
|
|
valid = false;
|
|
if ( aa->m_flags & AF_VERIFIED_PLACE_NAME_1 ) valid = true;
|
|
if ( aa->m_flags & AF_VERIFIED_PLACE_NAME_2 ) valid = true;
|
|
if ( aa->m_flags & AF_VERIFIED_STREET ) valid = true;
|
|
// we are only printing INvalids in this table
|
|
if ( valid ) continue;
|
|
// print to page parser pbuf
|
|
Address *ai = (Address *)m_am.getPtr(i);
|
|
ai->print2 ( i,pbuf , 0 );
|
|
}
|
|
|
|
pbuf->safePrintf("</table>\n");
|
|
pbuf->safePrintf("<br><br>\n");
|
|
|
|
pbuf->safePrintf("<a name=events>\n");
|
|
|
|
// Spider.cpp when storing parse.* file will also store an
|
|
// abbreviate file called parse-int16_tdisplay.* consisting only
|
|
// of these div tags for rendering within the qa.html file! that
|
|
// way the qa person can easily check/uncheck all the checkboxes
|
|
// right in the qa.html file
|
|
pbuf->safePrintf("<div class=int16_tdisplay>\n");
|
|
|
|
// print checkbox to indicate if events are wrong
|
|
pbuf->safePrintf ( "<!--ignore-->" // ignore for Test.cpp diff
|
|
"<br>"
|
|
"<nobr>"
|
|
// light blue background
|
|
"<span class=validated "
|
|
"style=background-color:#9090e0>"
|
|
"<input type=checkbox "
|
|
"onclick=\"senddiv(this,'%"INT64"');\" "
|
|
"unchecked>"
|
|
"<div class=validated style=display:inline>"
|
|
" Has <b>address</b> parsing issue. Flag to fix."
|
|
"</div>"
|
|
"</span>"
|
|
"</nobr>"
|
|
"<br>"
|
|
"<br>\n" ,
|
|
uh64 );
|
|
|
|
// print address table header
|
|
pbuf->safePrintf ( hdrFormat , "Inlined and Verified Addresses" );
|
|
|
|
// . first print only the INLINED (valid) addresses
|
|
// . i guess if they are verified that is considered valid too!
|
|
for ( int32_t i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
|
|
// get it
|
|
Address *aa = (Address *)m_am.getPtr(i);
|
|
// is inlined or verified?
|
|
bool valid = false;
|
|
if ( aa->m_flags & AF_INLINED ) valid = true;
|
|
// but unverified streetisname is not good
|
|
// but unverified streetisname is not good
|
|
if ( aa->m_street && (aa->m_street->m_flags2 & PLF2_IS_NAME))
|
|
valid = false;
|
|
if ( aa->m_flags & AF_VERIFIED_PLACE_NAME_1 ) valid = true;
|
|
if ( aa->m_flags & AF_VERIFIED_PLACE_NAME_2 ) valid = true;
|
|
if ( aa->m_flags & AF_VERIFIED_STREET ) valid = true;
|
|
if ( ! valid ) continue;
|
|
// print to page parser pbuf
|
|
aa->print2 ( i,pbuf , uh64 );//&m_addresses[0]);
|
|
}
|
|
pbuf->safePrintf("</table>\n");
|
|
pbuf->safePrintf("</div class=int16_tdisplay>\n");
|
|
pbuf->safePrintf("<i>NOTE: a name must be VERIFIED before it will "
|
|
"be a KEY in placedb. So you generally need two "
|
|
"places inlining the same name before that will "
|
|
"happen.</i>");
|
|
pbuf->safePrintf("<br>\n");
|
|
|
|
}
|
|
|
|
// . looks up each word/phrase in our table of known places
|
|
// . table includes cities, countries, states (adm1), counties, zipcodes
|
|
/*
|
|
int32_t Addresses::addProperPlaces ( int32_t a ,
|
|
int32_t b ,
|
|
int32_t maxAlnumCount ,
|
|
Place *places ,
|
|
int32_t maxPlaces ,
|
|
int32_t np ,
|
|
pbits_t flags ,
|
|
// this count excludes "a"?
|
|
int32_t alnumPos ,
|
|
int32_t forcedEnd ) {
|
|
// shortcuts
|
|
Words *ww = m_words;
|
|
int32_t nw = ww->getNumWords();
|
|
int64_t *wids = ww->getWordIds();
|
|
char **wptrs = ww->getWordPtrs();
|
|
int32_t *wlens = ww->getWordLens();
|
|
nodeid_t *tids = ww->getTagIds();
|
|
// "4 miles" and "miles" does not mean "miles, california", the city
|
|
int64_t h_miles = hash64 ( "miles",5);
|
|
int64_t h_mi = hash64 ( "mi",2);
|
|
int64_t h_kilometers= hash64 ( "kilometers",10);
|
|
int64_t h_km = hash64 ( "km",2);
|
|
// reset this count again
|
|
int32_t alnumCount = 0;
|
|
// after the street is an optional city
|
|
for ( int32_t j = a ; j<b && alnumCount<maxAlnumCount ; j++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// skip if not alnum
|
|
if ( ! wids[j] ) continue;
|
|
// count alnums
|
|
alnumCount++;
|
|
// skip "miles" in "4 miles"
|
|
if ( wids[j] == h_miles && j-2>= 0 && is_digit(wptrs[j-2][0]))
|
|
continue;
|
|
if ( wids[j] == h_mi && j-2>= 0 && is_digit(wptrs[j-2][0]))
|
|
continue;
|
|
if ( wids[j] == h_km && j-2>= 0 && is_digit(wptrs[j-2][0]))
|
|
continue;
|
|
if ( wids[j] == h_kilometers&&j-2>=0&&is_digit(wptrs[j-2][0]))
|
|
continue;
|
|
// . skip if only one char
|
|
// . no! might be like "N. M." to be "new mexico"
|
|
//if ( wlens[j] == 1 ) continue;
|
|
// . skip if two chars and not capitalized
|
|
// . no! misses "123 main st, albuquerque, nm"
|
|
//if ( wlens[j] == 2 && ! is_upper_utf8(wptrs[j]) ) continue;
|
|
// try just doing caps only for now
|
|
if ( is_lower_utf8(wptrs[j]) ) continue;
|
|
// do not skip too far
|
|
int32_t max = j + 6;
|
|
// truncate?
|
|
if ( max > nw ) max = nw;
|
|
// init hash
|
|
int64_t h = 0LL;
|
|
// the alnumcount for this
|
|
int32_t subcount = 0;
|
|
// scan for city/adm1/zip after this street address
|
|
for ( int32_t k = j ; k < max ; k++ ) {
|
|
// stop if tag
|
|
if ( tids[k] ) {
|
|
// skip non-breaking tags
|
|
if ( !isBreakingTagId(tids[k]) ) continue;
|
|
// allow <br> too since microsoft front page
|
|
// inserts those to break a line
|
|
if ( tids[k] == TAG_BR ) continue;
|
|
// other tags, stop us
|
|
break;
|
|
}
|
|
// is it punct?
|
|
if ( ! wids[k] ) {
|
|
// . big punct is a show stopper
|
|
// . no, we had "New\n Mexico"
|
|
//if ( wlens[k] >= 4 ) break;
|
|
// just skip otherwise
|
|
continue;
|
|
}
|
|
// count it
|
|
subcount++;
|
|
// mix it up
|
|
h <<= 1;
|
|
// hash it into our ongoing hash
|
|
h ^= wids[k];
|
|
// look it up
|
|
int32_t slot = g_cities.getSlot(&h);
|
|
// length
|
|
int32_t plen = (wptrs[k] + wlens[k]) - wptrs[j];
|
|
// skip otherwise
|
|
if ( forcedEnd >= 0 && k < forcedEnd ) continue;
|
|
// clear this
|
|
//int32_t cityCount = 0;
|
|
// init
|
|
Place *pp;
|
|
// multiple places might have this hash
|
|
for ( ; slot>=0 ; slot=g_cities.getNextSlot(slot,&h)){
|
|
// get the place
|
|
PlaceDesc *pd =(PlaceDesc *)g_cities.
|
|
getValueFromSlot(slot);
|
|
|
|
// it might be an alias to another slot!
|
|
int32_t slot2 = -1;
|
|
if ( pd->m_bits & PLF_ALIAS ) {
|
|
// get the slot we alias
|
|
slot2 = pd->getSlot();
|
|
// sanity check
|
|
if ( slot2 < 0 ) {char *xx=NULL;*xx=0;}
|
|
// re-get
|
|
pd=(PlaceDesc *)g_cities.getValueFromSlot(slot2);
|
|
}
|
|
|
|
// skip if not a recognized place
|
|
if ( pd->m_type != PT_CITY &&
|
|
pd->m_type != PT_STATE &&
|
|
//pd->m_type != PT_ZIP &&
|
|
pd->m_type != PT_CTRY )
|
|
continue;
|
|
// city count
|
|
//if(pd->m_type == PT_CITY) cityCount++;
|
|
// skip if full
|
|
if ( np >= maxPlaces ) continue;
|
|
// point to the right place to store into
|
|
pp = &places[np];
|
|
// sanity check
|
|
if ( ! h ) { char *xx=NULL;*xx=0; }
|
|
// make a place
|
|
pp->m_a = j;
|
|
pp->m_b = k+1;
|
|
pp->m_alnumA = alnumPos + alnumCount;
|
|
pp->m_alnumB = alnumPos + alnumCount+subcount;
|
|
pp->m_type = pd->m_type;
|
|
pp->m_str = wptrs[j];
|
|
pp->m_strlen = plen;
|
|
pp->m_hash = h;
|
|
|
|
// . use the aliased city, etc. if we had it
|
|
// . that way when we lookup this place in
|
|
// placedb it will use the right hash
|
|
if ( slot2 >= 0 )
|
|
pp->m_hash = *(int64_t *)g_cities.getKeyFromSlot(slot2);
|
|
|
|
pp->m_adm1[0] = pd->m_adm1[0];
|
|
pp->m_adm1[1] = pd->m_adm1[1];
|
|
pp->m_crid = pd->m_crid;
|
|
pp->m_bits = PLF_INFILE | flags ;
|
|
// we use these for zip codes mostly
|
|
pp->m_cityHash= 0;//pd->m_cityHash;
|
|
// inc it
|
|
np++;
|
|
// sanity check
|
|
if ( np >= maxPlaces ) {char*xx=NULL;*xx=0;}
|
|
}
|
|
|
|
// only one word for zip code
|
|
if ( k != j ) continue;
|
|
|
|
//
|
|
// check if zip code
|
|
//
|
|
|
|
// look it up
|
|
slot = g_zips.getSlot(&h);
|
|
// multiple places might have this hash
|
|
for ( ; slot>=0 ; slot=g_zips.getNextSlot(slot,&h)){
|
|
// get the place
|
|
ZipDesc *zd =(ZipDesc *)g_zips.
|
|
getValueFromSlot(slot);
|
|
// skip if full
|
|
if ( np >= maxPlaces ) continue;
|
|
// point to the right place to store into
|
|
pp = &places[np];
|
|
// sanity check
|
|
if ( ! h ) { char *xx=NULL;*xx=0; }
|
|
// make a place
|
|
pp->m_a = j;
|
|
pp->m_b = k+1;
|
|
pp->m_alnumA = alnumPos + alnumCount;
|
|
pp->m_alnumB = alnumPos + alnumCount+subcount;
|
|
pp->m_type = PT_ZIP;
|
|
pp->m_str = wptrs[j];
|
|
pp->m_strlen = plen;
|
|
pp->m_hash = h;
|
|
pp->m_adm1[0] = zd->m_adm1[0];
|
|
pp->m_adm1[1] = zd->m_adm1[1];
|
|
pp->m_crid = zd->m_crid;
|
|
pp->m_bits = PLF_INFILE | flags ;
|
|
// we use these for zip codes mostly
|
|
pp->m_cityHash= zd->m_cityHash;
|
|
pp->m_cityStr = g_cityBuf + zd->m_cityOffset;
|
|
// inc it
|
|
np++;
|
|
// sanity check
|
|
if ( np >= maxPlaces ) {char*xx=NULL;*xx=0;}
|
|
}
|
|
}
|
|
}
|
|
return np;
|
|
}
|
|
*/
|
|
|
|
uint32_t getCityId32 ( uint64_t cityHash64, char *adm1Str ) {
|
|
// sanity checks
|
|
//if ( is_upper_a(adm1Str[0]) ) { char *xx=NULL;*xx=0; }
|
|
//if ( is_upper_a(adm1Str[1]) ) { char *xx=NULL;*xx=0; }
|
|
//if ( adm1Str[2] ) { char *xx=NULL;*xx=0; }
|
|
// make it lower case to normalize hash
|
|
char na[3];
|
|
na[0] = to_lower_a(adm1Str[0]);
|
|
na[1] = to_lower_a(adm1Str[1]);
|
|
na[2] = '\0';
|
|
// simple hash value
|
|
uint32_t adm1Hash32 = (uint32_t)*((uint16_t *)na);//adm1Str);
|
|
// get the hash
|
|
uint32_t cid32 = hash32h ( (uint32_t)cityHash64 , adm1Hash32 );
|
|
// . now normalize city if its an abbreviation
|
|
// . if we got the citystatehash for "SF, CA" we want to map it to
|
|
// "San Francisco, CA"'s citystatehash. this normalizes the cityid.
|
|
// . likewise "SF, NM" --> "Santa Fe, NM"
|
|
uint32_t *ah = (uint32_t *)g_aliases.getValue (&cid32);
|
|
// use that if we had it
|
|
if ( ah ) return *ah;
|
|
// otherwise, we were the real deal
|
|
return cid32;
|
|
}
|
|
|
|
// . make all possible addresses from Places in that section
|
|
// . use the Address class
|
|
// . only keep the address with maximum score/probability
|
|
// . record the section it was found in as well via the Section ptr
|
|
// . assign an address probability/score from 0 to 1.0
|
|
// . allow inheriting of city or adm1 from title tag or tagdb rec
|
|
// (consider other inheritable places and areas later)
|
|
// . must have agreeing street,placeName,adm1 and city
|
|
// . zip is optional
|
|
// . base score is .20
|
|
// . then add streetScore*0.30 + placeScore*0.30
|
|
// . add .10 if we got a valid agreeable zip code
|
|
// . add .03 if we got a valid suite
|
|
// . add (20-X)/20 * .07 where X is the avg # of alnum words between
|
|
// all possible pairs of the places involved. do not consider
|
|
// inherited Places in this calculation. actually weight the distance
|
|
// involving the place name half as much as other pairs since
|
|
// place name is often in a subtitle...
|
|
// . if first section's m_numOccurences > 1, stop... otherwise...
|
|
// . get parent section of that first section
|
|
// . and repeat as if it were the first section
|
|
// . "startAlnum" is where we expect the city to be in order to set the
|
|
// AF_INLINED bit for this address
|
|
// . zip code does NOT override a non-zip code address if the city or adm1
|
|
// are derived from the zip code! or from title or tag!
|
|
bool Addresses::addAddress ( Place *name1 ,
|
|
Place *name2 ,
|
|
Place *suite ,
|
|
Place *street ,
|
|
Place *city ,
|
|
Place *adm1 ,
|
|
Place *zip ,
|
|
Place *ctry ,
|
|
Section *addrSec ,
|
|
// where we expect the city to be in an inlined
|
|
// address. because we can have multiple streets
|
|
// for one place name we need this to be
|
|
// after all such streets.
|
|
// "abq conv ctr 401 2nd st nw po box 1293 abq nm"
|
|
// http://www.yelp.com/biz/pizza-9-albuquerque too
|
|
int32_t startAlnum ,
|
|
char flags3 ,
|
|
Address **retAddr ) {
|
|
|
|
if ( retAddr ) *retAddr = NULL;
|
|
|
|
if ( flags3 & AF2_LATLON ) {
|
|
// assume to store the new address here, the destination
|
|
Address *dst = NULL;
|
|
if ( ! dst ) dst = (Address *)m_am.getMem(sizeof(Address));
|
|
if ( ! dst ) return false;
|
|
if ( retAddr ) *retAddr = dst;
|
|
dst->m_hash = 0;
|
|
dst->m_score2 = 0;
|
|
// now just do ptrs
|
|
dst->m_name1 = name1;
|
|
dst->m_name2 = name2;
|
|
dst->m_suite = suite;
|
|
dst->m_street = street;
|
|
dst->m_city = city;
|
|
dst->m_adm1 = adm1;
|
|
dst->m_zip = zip;
|
|
dst->m_placedbNames = NULL;
|
|
dst->m_alias = NULL;
|
|
dst->m_latitude = NO_LATITUDE; // 999.0;
|
|
dst->m_longitude = NO_LONGITUDE; // 999.0;
|
|
dst->m_latLonScore = 0;
|
|
dst->m_latLonDist = 9999999;
|
|
// reset this for the geocoder lookup
|
|
dst->m_geocoderLat = NO_LATITUDE;
|
|
dst->m_geocoderLon = NO_LONGITUDE;
|
|
// make placedbkey
|
|
//dst->m_placedbKey = dst->makePlacedbKey(m_docId,false,false);
|
|
dst->m_bestPlacedbName = NULL;
|
|
// sanity check
|
|
//if ( dst->m_placedbKey.n1 == 0LL ) { char *xx=NULL;*xx=0; }
|
|
// force this to true
|
|
dst->m_flags = AF_INLINED;
|
|
dst->m_replyFlags = 0;
|
|
dst->m_domHash32 = m_domHash32;
|
|
dst->m_ip = m_ip;
|
|
dst->m_section = NULL;
|
|
dst->m_flags3 = flags3;
|
|
dst->m_importedLatitude = NO_LATITUDE;
|
|
dst->m_importedLongitude = NO_LONGITUDE;
|
|
dst->m_importedVotes = -1;
|
|
return true;
|
|
}
|
|
|
|
// no room left?
|
|
//if ( m_na >= MAX_ADDRESSES ) {
|
|
// // note it
|
|
// if ( ! m_firstBreach ) return true;
|
|
// m_firstBreach = false;
|
|
// log("addr: got address breach for %s",m_url->getUrl());
|
|
// return true;
|
|
// char *xx=NULL; *xx=0;
|
|
// return true;
|
|
//}
|
|
|
|
// maybe we should try to speed up msg2c by quickly validating
|
|
// whether the street is in that city/state using zak's db... but
|
|
// i don't think truncating the addresses is the right approach
|
|
/*
|
|
if ( m_am.getNumPtrs() >= 10000 ) {
|
|
// note it
|
|
if ( ! m_firstBreach ) return true;
|
|
m_firstBreach = false;
|
|
m_breached = true;
|
|
log("addr: got address breach for %s",m_url->getUrl());
|
|
return true;
|
|
}
|
|
*/
|
|
|
|
// if we have a city and the zip does not agree and the
|
|
// zip is after the city, the nuke the zip
|
|
//if ( city && zip && zip->m_cityHash != city->m_hash &&
|
|
// zip->m_a > city->m_a )
|
|
// zip = NULL;
|
|
|
|
// skip if zip does not agree with state
|
|
if ( adm1 && zip && zip->m_adm1Bits != adm1->m_adm1Bits )
|
|
return true;
|
|
// or agree with city
|
|
if ( city && zip && ! (zip->m_adm1Bits & city->m_adm1Bits ) )
|
|
return true;
|
|
|
|
static bool hset = false;
|
|
static int64_t h_zip;
|
|
static int64_t h_code;
|
|
static int64_t h_postal;
|
|
static int64_t h_zipcode;
|
|
static int64_t h_usa;
|
|
if ( ! hset ) {
|
|
hset = true;
|
|
h_zip = hash64n("zip");
|
|
h_code = hash64n("code");
|
|
h_postal = hash64n("postal");
|
|
h_zipcode = hash64n("zipcode");
|
|
h_zipcode = hash64n("usa");
|
|
}
|
|
|
|
//
|
|
// set zipAlnumA
|
|
//
|
|
int32_t zipAlnumA ;
|
|
if ( zip ) zipAlnumA = zip->m_alnumA;
|
|
// scan to left of zip to change zipAlnumA to allow for acceptable
|
|
// words in between it
|
|
int32_t zipa = -1; if ( zip ) zipa = zip->m_a - 1;
|
|
int32_t mini = zipa - 10;
|
|
if ( mini < 0 ) mini = 0;
|
|
int32_t count = 0;
|
|
for ( int32_t i = zipa ; i >= mini ; i-- ) {
|
|
if ( ! m_wids[i] ) continue;
|
|
if ( m_wids[i] == h_zip ) count++;
|
|
else if ( m_wids[i] == h_code ) count++;
|
|
else if ( m_wids[i] == h_postal ) count++;
|
|
else if ( m_wids[i] == h_zipcode ) count++;
|
|
else if ( m_wids[i] == h_usa ) count++;
|
|
else break;
|
|
}
|
|
//if ( count > 0 )
|
|
// log("hey");
|
|
// adjust it to allow for words in between
|
|
zipAlnumA -= count;
|
|
|
|
|
|
/*
|
|
// if zip and no state or city,do not allow unless right next to street
|
|
if ( zip && ! adm1 && ! city && zipAlnumA != startAlnum )
|
|
return true;
|
|
// or if no state, but we have a city and zip, then zip must follow
|
|
// the city or the street
|
|
if ( zip && ! adm1 && city &&
|
|
zipAlnumA != startAlnum &&
|
|
zipAlnumA != city->m_alnumB &&
|
|
zip->m_alnumB != city->m_alnumA )
|
|
return true;
|
|
|
|
// or if a state and no city...
|
|
if ( zip && adm1 && ! city &&
|
|
zipAlnumA != startAlnum &&
|
|
zipAlnumA != adm1->m_alnumB &&
|
|
zip->m_alnumB != adm1->m_alnumA )
|
|
return true;
|
|
*/
|
|
|
|
// set cityhash immediately
|
|
uint64_t cityHash = 0;
|
|
if ( city ) cityHash = city->m_hash;
|
|
else if ( zip ) cityHash = zip->m_cityHash;
|
|
if ( ! cityHash ) return true;
|
|
|
|
// set these
|
|
uint64_t adm1Bits;
|
|
char *adm1Str = NULL;
|
|
if ( adm1 ) {
|
|
adm1Bits = adm1->m_adm1Bits;
|
|
adm1Str = adm1->m_adm1;
|
|
}
|
|
else if ( zip ) {
|
|
adm1Bits = zip->m_adm1Bits;
|
|
adm1Str = zip->m_adm1;
|
|
}
|
|
//else if ( city && (city->m_adm1Bits & CF_UNIQUE ) )
|
|
// adm1Bits = city->m_adm1Bits;
|
|
else
|
|
return true;
|
|
|
|
// zip cannot be suite #
|
|
if ( suite && zip && zip ->intersects ( suite ) ) return true;
|
|
if ( suite && zip && suite->intersects ( zip ) ) return true;
|
|
|
|
bool inlined = true;
|
|
|
|
// . are we an inlined address? that means the city and adm1 (state)
|
|
// are right after the street address
|
|
// . therefore we are not inlined if we inherited the city or the
|
|
// adm1 (state) from a tag or the title of the doc
|
|
pbits_t flags = PLF_FROMTAG | PLF_FROMTITLE;
|
|
// do not use PLF_FROMTITLE if street is in title too though
|
|
if ( m_sections &&
|
|
(m_sections->m_sectionPtrs[street->m_a]->m_flags & SEC_IN_TITLE) )
|
|
flags = PLF_FROMTAG;
|
|
bool cityOut = false;
|
|
bool adm1Out = false;
|
|
bool zipOut = false;
|
|
if ( ! city ) cityOut = true;
|
|
if ( ! adm1 ) adm1Out = true;
|
|
if ( ! zip ) zipOut = true;
|
|
if ( city && ( city->m_bits & flags ) ) cityOut = true;
|
|
if ( adm1 && ( adm1->m_bits & flags ) ) adm1Out = true;
|
|
if ( zip && ( zip ->m_bits & flags ) ) zipOut = true;
|
|
|
|
|
|
// if we have a suite to the right of the street, it must be
|
|
// RIGHT after the street for now (TODO: allow colon)
|
|
if ( suite && suite->m_a > street->m_a && startAlnum !=suite->m_alnumA)
|
|
inlined = false;
|
|
|
|
|
|
bool cityInline = false;
|
|
// what is between street and city.
|
|
if ( city && ! suite && startAlnum == city->m_alnumA )
|
|
cityInline = true;
|
|
// suite to the right of street
|
|
if ( city && suite && suite->m_alnumB == city->m_alnumA )
|
|
cityInline = true;
|
|
// suite to the left of street
|
|
if ( city && suite &&
|
|
suite->m_a < street->m_a &&
|
|
startAlnum == city->m_alnumA )
|
|
cityInline = true;
|
|
// or if a colon is before city
|
|
// "Street: 4904 4th St NW \nCity/Town: Albuquerque, NM"
|
|
//www.dukecityfix.com/xn/detail/1233957:Event:391851?xg_source=activity
|
|
bool gotColon = false;
|
|
bool gotWord = false;
|
|
int32_t x;
|
|
if ( city ) x = city->m_a - 1;
|
|
// only loop if city not inlined from above
|
|
for ( ; city && ! cityInline && x >= street->m_alnumB ; x-- ) {
|
|
// skip if tag
|
|
if ( m_tids[x] ) {
|
|
// just ignore
|
|
if ( ! gotColon ) continue;
|
|
// must have had a word
|
|
if ( ! gotWord ) continue;
|
|
// we need a breaking tag now!
|
|
if ( ! isBreakingTagId ( m_tids[x] ) ) continue;
|
|
// all done!
|
|
cityInline = true;
|
|
// stop
|
|
break;
|
|
}
|
|
// alnum word???
|
|
if ( m_wids[x] ) {
|
|
// if got alnum word before getting colon, no good!
|
|
if ( ! gotColon ) break;
|
|
// mark this
|
|
gotWord = true;
|
|
// otherwise ignore
|
|
continue;
|
|
}
|
|
// got colon?
|
|
if ( m_words->hasChar(x,':') ) gotColon = true;
|
|
}
|
|
|
|
// assume we have no city right after the street...
|
|
x = 0;
|
|
int32_t xend = -1;
|
|
char c = 0;
|
|
// also allow something like "123 main st (downtown mall) las cruces"
|
|
// to fix http://www.newmexico.org/calendar/events/index.php?com=
|
|
// detail&eID=22180&year=2011&month=01
|
|
if ( city &&
|
|
city->m_a >= 0 &&
|
|
city->m_a > street->m_b &&
|
|
city->m_a - street->m_b < 20 ) {
|
|
x = street->m_b;
|
|
xend = city->m_a;
|
|
}
|
|
// loop from end of street to beginning of city looking for '('
|
|
for ( ; x < xend ; x++ ) {
|
|
// skip if tag
|
|
if ( m_tids[x] ) continue;
|
|
// stop on word!
|
|
if ( m_wids[x] ) {
|
|
// unless in parens!
|
|
if ( c ) continue;
|
|
// crap... Msg13.cpp when it sets the dates does not
|
|
// filter out html entities for speed, so watch
|
|
// out for crap after an ampersand or &#. this
|
|
// was causing some americantowns.com urls to
|
|
// not get their address inlined!
|
|
if (x>0 && m_wptrs[x][-1] =='&' ) continue;
|
|
if (x>1 && m_wptrs[x][-1] =='#'&&m_wptrs[x][-2]=='&' )
|
|
continue;
|
|
// otherwise, really stop
|
|
break;
|
|
}
|
|
// check for '(' or '['
|
|
char *p = m_wptrs[x];
|
|
char *pend = p + m_wlens[x];
|
|
for ( ; p < pend ; p++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// check for ( or [
|
|
if ( *p=='(' ) c = '(';
|
|
if ( *p=='[' ) c = '[';
|
|
if ( *p==')' ) c = 0;
|
|
if ( *p==']' ) c = 0;
|
|
continue;
|
|
}
|
|
}
|
|
// if we scanned all the way through, that's great, we are inlined
|
|
if ( x == xend ) cityInline = true;
|
|
|
|
|
|
// turn it off
|
|
if ( city && ! cityInline ) inlined = false;
|
|
|
|
|
|
// this restriction was inspidered by "The TAVERN, 4701 Menaul,
|
|
// between Washington and Carlisle..." making gb think that it is
|
|
// in the city of Carlisle in Washington...
|
|
if (city && adm1 && city->m_alnumB != adm1->m_alnumA )
|
|
// but if city is "unique" like albuquerque, we allow it
|
|
//!(city->m_adm1Bits & CF_UNIQUE) )
|
|
inlined = false;
|
|
|
|
/*
|
|
// . wow, "less than 1 mile away from Abq NM" inspired me to require
|
|
// that the street be adjacent to the city now!
|
|
// . but i am seeing more false positives, so restrict things more
|
|
//if ( ! suite && street->m_alnumB != city->m_alnumA )
|
|
if ( ! suite && startAlnum != city->m_alnumA )
|
|
inlined = false;
|
|
// if we have a suite, and it is left of the street, that is ok too
|
|
if ( suite && suite->m_a < street->m_a &&
|
|
//street->m_alnumB != city->m_alnumA )
|
|
startAlnum != city->m_alnumA )
|
|
inlined = false;
|
|
if ( suite && suite->m_a > street->m_a &&
|
|
suite->m_alnumB != city->m_alnumA )
|
|
inlined = false;
|
|
*/
|
|
|
|
// if you got a zip, must follow adm1 immediately
|
|
// fixes http://www.estrelladelnortevineyard.com/SFV_retloc.php
|
|
//if ( zip && adm1 && adm1->m_alnumB != zipAlnumA )
|
|
// inlined = false;
|
|
|
|
bool zipInline = false;
|
|
// . zip right after street is good
|
|
// . but the city/adm1 must in title or tag, not after the zip
|
|
// otherwise we end up inlining bad cities after the zip like
|
|
// "house, nm"
|
|
if ( zip ) {
|
|
if ( startAlnum == zipAlnumA ) zipInline = true;
|
|
if ( suite && suite->m_alnumB == zipAlnumA ) zipInline = true;
|
|
// . or if zip follows city where city is tight
|
|
// . "114 Coronado Road, Corrales, 87048"
|
|
if ( city && city->m_alnumB == zipAlnumA ) zipInline = true;
|
|
if ( adm1 && adm1->m_alnumB == zipAlnumA ) zipInline = true;
|
|
// turn it off
|
|
if ( ! zipInline ) inlined = false;
|
|
}
|
|
|
|
// set this
|
|
bool adm1Inline = false;
|
|
if ( adm1 ) {
|
|
if ( adm1->m_alnumA == street->m_alnumB )
|
|
adm1Inline = true;
|
|
if ( city && adm1->m_alnumA == city->m_alnumB )
|
|
adm1Inline = true;
|
|
if ( ! adm1Inline ) inlined = false;
|
|
}
|
|
|
|
// fix for http://www.ucomparehealthcare.com/drs/washington/
|
|
// obstetrics_and_gynecology/Seattle.html
|
|
// 1959 NE Pacific St
|
|
// University Washington Medical Center
|
|
// Seattle, WA 98195
|
|
// gets "University" as a city in "Washington" state!
|
|
if ( adm1 ) {
|
|
int32_t ab = adm1->m_b;
|
|
int64_t *wids = m_words->getWordIds();
|
|
char **wptrs = m_words->getWordPtrs();
|
|
int32_t *wlens = m_words->getWordLens();
|
|
nodeid_t *tids = m_words->getTagIds();
|
|
int32_t nw = m_words->getNumWords();
|
|
if ( inlined && ab-1>= 0 && ab+1 < nw && ! tids[ab] &&
|
|
! wids[ab] &&
|
|
wlens[ab]==1 &&
|
|
// this was hurting
|
|
// "195 Crystie Street, Suite 20<br>\nNew York, NY USA"
|
|
// so i added this constraint
|
|
wlens[ab-1] == 1 &&
|
|
is_wspace_utf8(wptrs[ab]) &&
|
|
is_upper_utf8(wptrs[adm1->m_a]) &&
|
|
is_upper_utf8(wptrs[ab+1]) )
|
|
inlined = false;
|
|
}
|
|
|
|
// TEMPORARY HACK TO DEBUG URL
|
|
//if ( city && (city->m_flags2 & PLF2_REQUIRED) )
|
|
// inlined = true;
|
|
|
|
/////////////////////
|
|
//
|
|
// set the address hash (combo of street,city,adm1)
|
|
//
|
|
/////////////////////
|
|
uint64_t ch = getAddressHash ( street, city, adm1, zip );
|
|
|
|
// do not add it if street name is lower case and adm1 and city
|
|
// are inlined and upper. should fix "4 barrack Oakland CA" and
|
|
// "3 spacios - Seattle WA" for graffiti.org
|
|
//if ( inlined &&
|
|
// ! (street->m_bits & PLF_HAS_UPPER ) &&
|
|
// ! (street->m_flags2 & PLF2_HAD_INDICATOR ) &&
|
|
// is_upper_utf8(wptrs[adm1->m_a]) &&
|
|
// is_upper_utf8(wptrs[city->m_a]) )
|
|
// return true;
|
|
|
|
|
|
// . now compare to other address with this same street
|
|
for ( int32_t i = m_am.getNumPtrs() - 1 ; i >= 0 ; i-- ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get it
|
|
Address *prev = (Address *)m_am.getPtr(i);
|
|
// if not our street, bail!
|
|
if ( prev->m_street->m_a != street->m_a ) break;
|
|
// if he is inlined and we are not!
|
|
if ( prev->m_flags & AF_INLINED ) {
|
|
// if we are not, bail, do not add us
|
|
if ( ! inlined ) return true;
|
|
}
|
|
// if he is not inlined and we are, we overwrite him
|
|
else if ( inlined ) {
|
|
// overwrite him
|
|
//dst = prev;
|
|
// kill him
|
|
//m_na--;
|
|
m_am.rewind ( 1 );
|
|
// print him
|
|
//log("DELETING the following address 1:");
|
|
//dst->print();
|
|
// try to kill more
|
|
continue;
|
|
}
|
|
// ok, we are not inlined and previous got isn't either...
|
|
break;
|
|
}
|
|
|
|
// . now for the remaining address with this same street, they are
|
|
// all, including ourselves, either inlined or not inlined
|
|
// . assign a score to each address for a particular street
|
|
// . the address with the highest score wins and the others
|
|
// are removed. in the case of a tie we keep all of them.
|
|
// . we only do this comparison to addresses that have the same
|
|
// address hash,
|
|
int32_t score = 0;
|
|
// inlining always trumps all others
|
|
//if ( inlined ) score += 10000;
|
|
// and then if all else is equal, having a zip is better than just
|
|
// a city because it is more specific
|
|
if ( zip ) score += 1000;
|
|
// having a valid adm1 is good (might not have one explicitly if city
|
|
// is unique to a particular state)
|
|
if ( adm1 ) score += 100;
|
|
// prefer city over no city
|
|
if ( city ) score += 10;
|
|
// sanity check
|
|
if ( score <= 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
Address *dst = NULL;
|
|
|
|
// now compare to other address with this same address hash
|
|
for ( int32_t i = m_am.getNumPtrs() - 1 ; i >= 0 ; i-- ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get it
|
|
Address *prev = (Address *)m_am.getPtr(i);
|
|
// stop if for a different street
|
|
if ( prev->m_street->m_a != street->m_a ) break;
|
|
if ( prev->m_street->m_b != street->m_b ) break;
|
|
// skip if should not compare
|
|
if ( prev->m_hash != ch ) continue;
|
|
// do not add us if he is higher score
|
|
if ( prev->m_score2 > score ) return true;
|
|
// if a tie, that is strange!
|
|
if ( prev->m_score2 == score ) return true;
|
|
// overwrite him
|
|
dst = prev;
|
|
// an undo for the m_na down below
|
|
//m_na--;
|
|
//m_am.rewind ( 1 );
|
|
// one at a time
|
|
break;
|
|
// print him
|
|
//log("DELETING the following address 1:");
|
|
//dst->print();
|
|
// try to kill more
|
|
//continue;
|
|
}
|
|
|
|
// assume to store the new address here, the destination
|
|
if ( ! dst ) dst = (Address *)m_am.getMem(sizeof(Address));
|
|
if ( ! dst ) return false;
|
|
|
|
if ( retAddr ) *retAddr = dst;
|
|
|
|
//dst->m_cityHash = cityHash;
|
|
//dst->m_adm1Bits = adm1Bits;
|
|
dst->m_cityId32 = getCityId32 ( cityHash , adm1Str );
|
|
dst->m_hash = ch;
|
|
dst->m_score2 = score;
|
|
|
|
// now just do ptrs
|
|
dst->m_name1 = name1;
|
|
dst->m_name2 = name2;
|
|
dst->m_suite = suite;
|
|
dst->m_street = street;
|
|
dst->m_city = city;
|
|
dst->m_adm1 = adm1;
|
|
dst->m_zip = zip;
|
|
|
|
dst->m_placedbNames = NULL;
|
|
|
|
// nuke this for comparing for setting AF_AMBIGUOUS bit
|
|
//if ( zip ) dst->m_zip->m_hash = 0;
|
|
|
|
// reset this too
|
|
dst->m_alias = NULL;
|
|
|
|
dst->m_latitude = NO_LATITUDE; // 999.0;
|
|
dst->m_longitude = NO_LONGITUDE; // 999.0;
|
|
dst->m_latLonScore = 0;
|
|
dst->m_latLonDist = 9999999;
|
|
|
|
// reset this for the geocoder lookup
|
|
dst->m_geocoderLat = NO_LATITUDE;
|
|
dst->m_geocoderLon = NO_LONGITUDE;
|
|
|
|
// make placedbkey
|
|
dst->m_placedbKey = dst->makePlacedbKey ( m_docId , false, false );
|
|
|
|
dst->m_bestPlacedbName = NULL;
|
|
|
|
// the address voting table key is based on the placedb key but needs
|
|
// to be unique for each address! there are often times the same
|
|
// street address with a different place name, and since the placedb
|
|
// key does not even take the place name into account, we need to
|
|
// for this...
|
|
//dst->m_avtKey = dst->makeAddressVotingTableKey ( );
|
|
|
|
// need these
|
|
//if ( ! tmp->m_name ) { char *xx=NULL;*xx=0; }
|
|
if ( ! street ) { char *xx=NULL;*xx=0; }
|
|
if ( ! city && ! zip ) { char *xx=NULL;*xx=0; }
|
|
// unique cities like Albuquerque imply a state
|
|
if ( ! adm1 && ! zip && ! city->m_adm1[0] ) { char *xx=NULL;*xx=0; }
|
|
|
|
|
|
// sanity check
|
|
if ( ! street->m_hash ) { char *xx=NULL;*xx=0; }
|
|
//if ( ! street->m_streetNumHash ) { char *xx=NULL;*xx=0; }
|
|
if ( city && ! city->m_hash ) { char *xx=NULL;*xx=0; }
|
|
if ( adm1 && ! adm1->m_adm1Bits ) { char *xx=NULL;*xx=0; }
|
|
|
|
// sanity check
|
|
if ( dst->m_placedbKey.n1 == 0LL ) { char *xx=NULL;*xx=0; }
|
|
|
|
// reset flags
|
|
dst->m_flags = 0;
|
|
dst->m_replyFlags = 0;
|
|
|
|
if ( inlined ) dst->m_flags |= AF_INLINED;
|
|
|
|
// . HACK! if our m_str referenced our m_adm1, fix that!
|
|
// . see "HACK" above to where we did this
|
|
//if ( adm1->m_str == adm1->m_adm1 )
|
|
// dst->m_adm1->m_str = dst->m_adm1->m_adm1;
|
|
|
|
// set m_b for the address so we can use it when as a boundary
|
|
// for harvesting place names for following addresses above
|
|
/*
|
|
int32_t max = -1;
|
|
if ( dst->m_street->m_b > max ) max = dst->m_street->m_b;
|
|
if ( dst->m_adm1->m_b > max && inlined ) max = dst->m_adm1->m_b;
|
|
if ( dst->m_city->m_b > max && inlined ) max = dst->m_city->m_b;
|
|
// do not require inlineness for a zip!
|
|
if ( zip && dst->m_zip->m_b > max ) max = dst->m_zip->m_b;
|
|
// or for a suite!
|
|
if ( suite && suite->m_b > max ) max = suite->m_b;
|
|
dst->m_b = max;
|
|
|
|
// and the left most point not including place name
|
|
dst->m_a = dst->m_street->m_a;
|
|
// suite might be before street sometimes
|
|
if ( suite && suite->m_a < dst->m_a ) dst->m_a = suite->m_a;
|
|
*/
|
|
|
|
// add these in
|
|
dst->m_domHash32 = m_domHash32;
|
|
dst->m_ip = m_ip;
|
|
|
|
// get the section containing all components
|
|
int32_t a = dst->m_street->m_a;
|
|
int32_t b = dst->m_street->m_b;
|
|
// increase address range?
|
|
if ( suite && suite->m_a < a ) a = suite->m_a;
|
|
if ( suite && suite->m_b > b ) b = suite->m_b;
|
|
// sometimes the city/adm1/zip is in the title or something
|
|
// so only use it if within reach!!
|
|
if ( ! cityOut && city && city->m_b > b && city->m_b < b + 20 )
|
|
b = city->m_b;
|
|
if ( ! adm1Out && adm1 && adm1->m_b > b && adm1->m_b < b + 20 )
|
|
b = adm1->m_b;
|
|
if ( ! zipOut && zip && zip ->m_b > b && zip->m_b < b + 20 )
|
|
b = zip->m_b;
|
|
|
|
//if ( ! cityOut && city && city->m_a < a ) a = city->m_a;
|
|
//if ( ! adm1Out && adm1 && adm1->m_a < a ) a = adm1->m_a;
|
|
//if ( ! zipOut && zip && zip ->m_a < a ) a = zip->m_a;
|
|
|
|
if ( a < 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// get section
|
|
Section *as = NULL;
|
|
if ( m_sections ) as = m_sections->m_sectionPtrs[a];
|
|
// telescope up until contains all inlined things in address
|
|
//for ( ; as ; as = as->m_parent )
|
|
// // stop if contained
|
|
// if ( as->m_a <= a && as->m_b >= b ) break;
|
|
// store that
|
|
dst->m_section = as;
|
|
dst->m_flags3 = flags3;
|
|
//dst->m_latitude = latitude;
|
|
//dst->m_longitude = longitude;
|
|
|
|
// reset the imported lat/lon
|
|
dst->m_importedLatitude = NO_LATITUDE;
|
|
dst->m_importedLongitude = NO_LONGITUDE;
|
|
dst->m_importedVotes = -1;
|
|
|
|
// advance m_na iff we did not overwrite a previous address
|
|
//m_na++;
|
|
|
|
//log("addr: u=%s addr # = %"INT32"",m_url->m_url,m_na-1);
|
|
// uncomment this for debug to the log
|
|
//dst->print ( );
|
|
|
|
return true;
|
|
}
|
|
|
|
// . make a hash that identifies an address by its street and its city
// . xors the three street hashes (name, number, indicator) with the
//   normalized 32-bit city id from getCityId32()
// . the state abbreviation handed to getCityId32() comes from "adm1", else
//   from "zip", else from the city itself. we core if none of them has it.
// . the city hash comes from "city", else from "zip". we core if it is zero.
// . "street" must not be NULL
uint64_t getAddressHash ( Place *street ,
			  Place *city   ,
			  Place *adm1   ,
			  Place *zip    ) {

	// start with the street components
	int64_t h = 0;
	h ^= street->m_hash;
	h ^= street->m_streetNumHash;
	h ^= street->m_streetIndHash;

	// which place supplies the two letter state code?
	char *stateAbbr = NULL;
	if ( adm1 ) {
		stateAbbr = adm1->m_adm1;
	}
	else if ( zip ) {
		stateAbbr = zip->m_adm1;
	}
	else if ( city && city->m_adm1[0] ) {
		stateAbbr = city->m_adm1;
	}
	else {
		// nobody knows the state
		char *xx=NULL;*xx=0;
	}

	// which place supplies the city hash?
	uint64_t cityHash64 = 0;
	if ( city ) {
		cityHash64 = city->m_hash;
	}
	else if ( zip ) {
		cityHash64 = zip->m_cityHash;
	}
	// must have one
	if ( ! cityHash64 ) { char *xx=NULL;*xx=0; }

	// . do not xor in the raw city hash or the state code directly
	// . the city id maps aliases onto the real city, like "SF,CA" to
	//   "San Francisco", by consulting the g_aliases table
	uint32_t cityId = getCityId32 ( cityHash64 , stateAbbr );
	h ^= cityId;

	return h;
}
|
|
|
|
// . set the Address "a" from a serialized, ';'-separated address string "s"
// . the fields are, in order:
//   name1;name2;suite;street;city;adm1;zip;ctry;domhash;ip;origurl;lat;lon;addrHash\0
// . each non-empty field from name1 through zip gets a Place allocated from
//   "pm". the Place's m_str points INTO "s" (it is not copied and not
//   NUL-terminated, use m_strlen) so "s" must outlive the Places
// . "flags" is stored as the m_bits of every Place made, usually PLF_FROMTAG
// . ctry and origurl are parsed over but not stored
// . cores if the domhash field is empty, if the ip is missing, 0 or -1, or
//   if "pm" is out of memory
// . as written it always returns true
bool setFromStr ( Address  *a        ,
		  char     *s        ,
		  pbits_t   flags    ,
		  PlaceMem *pm       ,
		  int32_t   niceness ) {
	// clear it up
	a->reset();
	a->m_latitude    = NO_LATITUDE;
	a->m_longitude   = NO_LONGITUDE;
	a->m_geocoderLat = NO_LATITUDE;
	a->m_geocoderLon = NO_LONGITUDE;

	// . loop over the fields. "i" is the field number.
	// . when an iteration completes, "s" is on the ';' that ended the
	//   field or on the terminating \0. only step over a ';'. stepping
	//   over the \0 too, as this used to do, made the next iteration
	//   read beyond the end of the string.
	for ( int32_t i = 0 ; i <= 13 ; i++ , s += ( *s ? 1 : 0 ) ) {
		// stop if no more fields
		if ( ! *s ) break;
		// save it
		char *start = s;
		// advance s to ;
		while ( *s && *s != ';' ) s++;

		// domain hash?
		if ( i == 8 ) {
			a->m_domHash32 = 0;
			// panic if none!
			if ( *start == ';' ) { char *xx=NULL;*xx=0;}
			a->m_domHash32 = (uint32_t)atoll(start);
			continue;
		}
		// ip?
		if ( i == 9 ) {
			a->m_ip = 0;
			// an empty ip is caught by the check after the loop
			if ( *start == ';' ) continue;
			a->m_ip = atoip(start,s-start);
			// 0 -1 not allowed
			if ( a->m_ip==0 || a->m_ip==-1) {char *xx=NULL;*xx=0;}
			continue;
		}
		// skip orig url
		if ( i == 10 ) continue;
		// latitude
		if ( i == 11 ) {
			// skip if empty
			if ( *start == ';' ) continue;
			a->m_latitude = atod2 (start,s-start);
			continue;
		}
		// longitude
		if ( i == 12 ) {
			// skip if empty
			if ( *start == ';' ) continue;
			a->m_longitude = atod2 (start,s-start);
			// no addrHash field following?
			if ( ! *s ) break;
			continue;
		}
		// addrHash, the last field
		if ( i == 13 ) {
			// skip if empty
			if ( *start == ';' ) continue;
			a->m_hash = strtoull(start,NULL,10);
			break;
		}

		//
		// fields 0 through 7 are places
		//

		// get length of place
		int32_t slen = s - start;
		// skip if empty
		if ( slen <= 0 ) continue;
		// . ctry is always empty, because its always the US
		// . if it is not empty we ignore it. check this BEFORE
		//   allocating so we do not waste a Place on it.
		if ( i == 7 ) continue;

		// ok, add this entry
		Place *p = (Place *)pm->getMem(sizeof(Place));
		if ( ! p ) { char *xx=NULL;*xx=0; }

		// . hook it into the address
		// . pt = "place type"
		int32_t pt = 0;
		switch ( i ) {
		case 0: a->m_name1  = p; pt = PT_NAME_1; break;
		case 1: a->m_name2  = p; pt = PT_NAME_2; break;
		case 2: a->m_suite  = p; pt = PT_SUITE;  break;
		case 3: a->m_street = p; pt = PT_STREET; break;
		case 4: a->m_city   = p; pt = PT_CITY;   break;
		case 5: a->m_adm1   = p; pt = PT_STATE;  break;
		case 6: a->m_zip    = p; pt = PT_ZIP;    break;
		}

		// clear it
		p->reset();
		// set it
		p->m_type   = pt;
		// it is not in the document, so use negative word positions.
		// this makes the word non-overlappable.
		p->m_a      = -3;
		p->m_b      = -2;
		p->m_alnumA = -5;
		p->m_alnumB = -4;
		p->m_str    = start;
		p->m_strlen = slen;
		// incorporate the flags. usually PLF_FROMTAG
		p->m_bits   = flags;
		// clear these
		p->m_flags2 = 0;

		// set adm1 bits if adm1
		if ( pt == PT_STATE ) {
			p->m_adm1Bits = getAdm1Bits ( start );
			// set the state two-letter abbr as well
			p->m_adm1[0] = start[0];
			p->m_adm1[1] = start[1];
		}

		// null these out before setHashes() fills them in
		p->m_hash          = 0LL;
		p->m_streetIndHash = 0LL;
		p->m_streetNumHash = 0LL;

		// set m_hash, m_streetIndHash, m_streetNumHash of this
		// Place, p
		setHashes ( p , NULL , niceness );

		// NOTE(review): the old code ended with "continue" statements
		// for name1, name2, suite and street when PLF_FROMTAG was set
		// ("do not take streets from tag, must be on the page
		// itself"). they were the last statements of the loop body
		// and so had no effect: such places ARE still stored on the
		// address. that behavior is kept. confirm whether callers
		// are supposed to filter them.
	}

	// if it was a lat/lon only contact address it will not have a
	// city, so this is NULL. perhaps, just give up on that?
	// this is not the case any more since we insert the foreign
	// country and state and city sometimes
	if ( ! a->m_city || ! a->m_adm1 )
		a->m_flags3 |= AF2_LATLON;

	// the city inherits the state from the adm1
	if ( a->m_city && a->m_adm1 ) {
		a->m_city->m_adm1Bits = a->m_adm1->m_adm1Bits;
		a->m_city->m_adm1[0]  = a->m_adm1->m_adm1[0];
		a->m_city->m_adm1[1]  = a->m_adm1->m_adm1[1];
	}
	// . and so does the zip
	// . this used to dereference a NULL m_adm1 if we had a zip but no
	//   adm1 field
	if ( a->m_zip && a->m_adm1 ) {
		a->m_zip->m_adm1Bits = a->m_adm1->m_adm1Bits;
		a->m_zip->m_adm1[0]  = a->m_adm1->m_adm1[0];
		a->m_zip->m_adm1[1]  = a->m_adm1->m_adm1[1];
	}

	// require ip
	if ( a->m_ip == 0 || a->m_ip == -1 ) { char *xx=NULL;*xx=0; }

	// do we need this?
	a->m_cityId32 = 0;

	return true;
}
|
|
|
|
void setFromStr2 ( char *addr ,
|
|
char **name1 ,
|
|
char **name2 ,
|
|
char **suite ,
|
|
char **street ,
|
|
char **city ,
|
|
char **adm1 ,
|
|
char **zip ,
|
|
char **country,
|
|
double *lat ,
|
|
double *lon ) {
|
|
// use this
|
|
static char s_addr[2048];
|
|
//int32_t alen = gbstrlen(addr);
|
|
//char *aend = addr + alen;
|
|
//int32_t *tzoff ) {
|
|
if ( name1 ) *name1 = NULL;
|
|
if ( name2 ) *name2 = NULL;
|
|
if ( suite ) *suite = NULL;
|
|
if ( street ) *street = NULL;
|
|
if ( city ) *city = NULL;
|
|
if ( adm1 ) *adm1 = NULL;
|
|
if ( zip ) *zip = NULL;
|
|
if ( country) *country= NULL;
|
|
if ( lon ) *lon = 999.00;
|
|
if ( lat ) *lat = 999.00;
|
|
|
|
// breach check
|
|
int32_t len = gbstrlen(addr);
|
|
if ( len + 1 > 2048 ) {
|
|
log("addr: address is too big to parse");
|
|
return;
|
|
}
|
|
|
|
// copy into our static buffer
|
|
gbmemcpy ( s_addr , addr , len+1 );
|
|
|
|
// parse it in our static buffer so we do not destroy it
|
|
char *p = s_addr;
|
|
|
|
// if we are double called on the same "addr" string we have to
|
|
// expect to encounter \0 just as we would ';'... and we do this
|
|
// now from PageResults.cpp because it uses ExpandedResults, where
|
|
// an event that has a date like "every wednesday" results in like
|
|
// 104 search results, so that search result has to be repeated
|
|
// in the listings using the same address "addr" over and over again,
|
|
// and each time it calls setFromStr2, so since this is destructive
|
|
// that way, be prepared!
|
|
if ( name1 ) *name1 = p; for ( ; *p != ';' ; p++ ); *p++ = '\0';
|
|
if ( name2 ) *name2 = p; for ( ; *p != ';' ; p++ ); *p++ = '\0';
|
|
if ( suite ) *suite = p; for ( ; *p != ';' ; p++ ); *p++ = '\0';
|
|
if ( street ) *street = p; for ( ; *p != ';' ; p++ ); *p++ = '\0';
|
|
if ( city ) *city = p; for ( ; *p != ';' ; p++ ); *p++ = '\0';
|
|
if ( adm1 ) *adm1 = p; for ( ; *p != ';' ; p++ ); *p++ = '\0';
|
|
if ( zip ) *zip = p; for ( ; *p != ';' ; p++ ); *p++ = '\0';
|
|
if ( country) *country= p; for ( ; *p != ';' ; p++ ); *p++ = '\0';
|
|
//for ( ; *p != ';' ; p++ ); p++; // was country
|
|
for ( ; *p != ';' ; p++ ); p++; // domhash?
|
|
for ( ; *p != ';' ; p++ ); p++; // ip
|
|
for ( ; *p != ';' ; p++ ); p++; // orig url
|
|
if ( lat && *p!=';' ) *lat = atof(p);
|
|
for ( ; *p != ';' ; p++ ); p++;
|
|
if ( lon && *p ) *lon = atof(p);
|
|
//if ( tzoff ) *tzoff= atol(p);
|
|
//s anity check
|
|
//if ( p > aend ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
|
|
// . year is like "2011" or whatever
|
|
// . assume we are in greenwhich england (timezone=+0)
|
|
// . BUT apply the american daylight start/end times
|
|
// . currently in affect from 2nd sunday in march to first sunday in nov @ 2am
|
|
void getDSTInterval ( int32_t year , int32_t *a , int32_t *b ) {
|
|
// find the 2nd sunday in march for this year
|
|
*a = getDOWStart ( year, 3, 1, 2); // 3=march 1=sunday, 2=2nd
|
|
// 2am?
|
|
*a += 2*3600;
|
|
// the end point now
|
|
*b = getDOWStart ( year, 11, 1, 1); // 11=nov 1=sunday 1=1st
|
|
// 2am
|
|
*b += 2*3600;
|
|
}
|
|
// . nowUTC is # secs elapsed since epoch in UTC (no DST)
|
|
// . currently in affect from 2nd sunday in march to first sunday in nov @ 2am
|
|
bool getIsDST ( int32_t nowUTC , char timezone2 ) {
|
|
// mod the time
|
|
time_t mod = (time_t)nowUTC ;
|
|
// add if known
|
|
if ( timezone2 != UNKNOWN_TIMEZONE ) {
|
|
// sanity check, make sure its the offset, not in seconds
|
|
if ( timezone2 > 13 ) { char *xx=NULL;*xx=0; }
|
|
if ( timezone2 < -13 ) { char *xx=NULL;*xx=0; }
|
|
mod += timezone2*3600;
|
|
}
|
|
// get DOW now
|
|
struct tm *timeStruct = gmtime ( &mod );
|
|
// certain months are always dst. jan = 0. goes from 0 to 11.
|
|
int32_t mon = timeStruct->tm_mon;
|
|
// feb=1,mar=2,apr=3,may=4,jun=5,jul=6,aug=7,sep=8,oct=9,nov=10,dec=11
|
|
if ( mon >= 3 && mon <= 9 ) return true;
|
|
// not in dec
|
|
if ( mon == 11 ) return false;
|
|
// not in jan or feb
|
|
if ( mon >= 0 && mon <= 1 ) return false;
|
|
// get dow. 0 to 6. 0 being sunday.
|
|
int32_t dow = timeStruct->tm_wday;
|
|
// what # of dow are we? i.e. xth monday, where x=dowCount
|
|
int32_t dowCount = 1 + timeStruct->tm_mday / 7;
|
|
// for march, if we are the 2nd dow, and not sunday, return true
|
|
if ( mon == 2 ) {
|
|
if ( dowCount <= 1 ) return false;
|
|
if ( dowCount >= 3 ) return true;
|
|
if ( dowCount == 2 && dow != 0 ) return true;
|
|
// if before 2nd sunday at 2am, not yet summer time
|
|
if ( dowCount == 2 && dow == 0 )
|
|
return ( timeStruct->tm_hour >= 2 );
|
|
}
|
|
// november
|
|
if ( mon == 10 ) {
|
|
if ( dowCount >= 2 ) return false;
|
|
if ( dowCount == 1 && dow != 0 ) return false;
|
|
// if before 1st sunday at 2am, it is still summer time
|
|
if ( dowCount == 1 && dow == 0 )
|
|
return ( timeStruct->tm_hour < 2 );
|
|
}
|
|
// how did we get here?
|
|
char *xx=NULL;*xx=0;
|
|
return false;
|
|
}
|
|
|
|
// . the per-city record the code in this file reads out of g_timeZones via
//   getValueFromSlot()
// . field order and types must not change
class CityStateDesc {
 public:
	// coordinates of the city; getNearestCityId() compares these
	// against the lat/lon it is given
	float m_latitude;
	float m_longitude;
	// offset from UTC; the readers sanity check it to be in [-13,13]
	char  m_timeZoneOffset;
	// the readers sanity check this to be 0 or 1
	char  m_useDST;
	// retired fields, kept for reference:
	//uint8_t m_crid;
	// id within that country
	//uint8_t m_stateId;
};
|
|
|
|
|
|
bool getCityLatLonFromAddrStr ( char *addr , double *lat , double *lon ) {
|
|
// get city from string
|
|
uint32_t cid32 = 0;
|
|
if ( addr[0] ) cid32 = getCityIdFromAddr ( addr );
|
|
// assume city/state not found in our list
|
|
*lat = NO_LATITUDE;
|
|
*lon = NO_LONGITUDE;
|
|
// now get lat lon of that city
|
|
bool status = getLatLon ( cid32 , lat , lon );
|
|
// returns false if city not found
|
|
return status;
|
|
}
|
|
|
|
uint32_t getCityIdFromAddr ( char *addr ) {
|
|
// get city and adm1 from address
|
|
char *p = addr;
|
|
int32_t semiCount = 0;
|
|
char *adm1 = NULL;
|
|
char *city = NULL;
|
|
for ( ; ; p++ ) {
|
|
// skip if not border
|
|
if ( *p != ';' ) continue;
|
|
// inc it
|
|
semiCount++;
|
|
// city?
|
|
if ( semiCount == 4 ) {
|
|
city = p + 1;
|
|
continue;
|
|
}
|
|
if ( semiCount == 5 ) {
|
|
adm1 = p + 1;
|
|
continue;
|
|
}
|
|
if ( semiCount != 6 ) continue;
|
|
|
|
break;
|
|
}
|
|
// if no city try lat/lon
|
|
if ( city[0] == ';' ) {
|
|
double lat = 0.0;
|
|
double lon = 0.0;
|
|
getLatLonFromStr ( addr , &lat , &lon );
|
|
float distInMilesSquared = 0.0;
|
|
uint32_t cid32 = getNearestCityId ( lat , lon , 0,
|
|
&distInMilesSquared);
|
|
if ( distInMilesSquared > 1000 )
|
|
cid32 = 0;
|
|
// how can this be 0?
|
|
//if ( cid32 == 0 ) { char *xx=NULL;*xx=0; }
|
|
return cid32;
|
|
}
|
|
// ok, we got both now
|
|
char *semi1 = adm1 - 1;
|
|
char *semi2 = p;
|
|
// temp null term
|
|
*semi1 = '\0';
|
|
*semi2 = '\0';
|
|
// fix Denver's so we do not return unknown timezone
|
|
if ( semi1[-1]=='s' && semi1[-2]=='\'' ) semi1[-2]='\0';
|
|
// get city hash
|
|
int64_t h = getWordXorHash(city);
|
|
// TODO: make state into two letter abbr?
|
|
//if ( gbstrlen(adm1) != 2 ) { char *xx=NULL;*xx=0; }
|
|
// use this now
|
|
uint32_t cid32 = (uint64_t)getCityId32(h,adm1);
|
|
// put back
|
|
*semi1 = ';';
|
|
*semi2 = ';';
|
|
// put apostrophe back if we stripped it
|
|
if ( ! semi1[-2] ) semi1[-2] = '\'';
|
|
return cid32;
|
|
}
|
|
|
|
// . parse the lat/lon out of a serialized address string and return the
//   PlaceDesc getNearestCity_new() finds for it
// . returns NULL unless the distance value it reports is under 1000
PlaceDesc *getCityPlaceDescFromAddrLatLon_new ( char *addr ) {
	double lat = 0.0;
	double lon = 0.0;
	getLatLonFromStr ( addr , &lat , &lon );
	float distInMilesSquared = 0.0;
	PlaceDesc *nearest = getNearestCity_new ( lat , lon , 0 ,
						  &distInMilesSquared );
	// only accept it if close enough
	return ( distInMilesSquared < 1000 ) ? nearest : NULL;
}
|
|
|
|
|
|
|
|
char getTimeZoneFromAddr ( char *addr , char *useDST ) {
|
|
|
|
// . try this new function
|
|
// . if no city explicitly, use lat/lon to get nearest city?
|
|
// . returns NULL if no nearby city
|
|
PlaceDesc *pd = getCityPlaceDescFromAddrLatLon_new ( addr );
|
|
|
|
if ( pd && useDST ) {
|
|
*useDST = 0;
|
|
if ( pd->m_flags & PDF_USE_DST ) *useDST = 1;
|
|
}
|
|
if ( pd ) return pd->m_timeZoneOffset;
|
|
|
|
// i guess we choose not to store the lat/lon for US cities
|
|
// because we can look them up by name here...
|
|
uint32_t cid32 = getCityIdFromAddr ( addr );
|
|
// if it had a city specified, or its lat/lon was nearby a city,
|
|
// then use that city id to get the timezone
|
|
if ( cid32 ) return getTimeZone3 ( cid32 , useDST );
|
|
// if doesn't have a city or the specified lat/lon is not close
|
|
// to a city in our list then let's use the lat lon to get the
|
|
// timezone
|
|
|
|
|
|
double lat = 0.0;
|
|
double lon = 0.0;
|
|
getLatLonFromStr ( addr, &lat, &lon );
|
|
if ( lat == NO_LATITUDE ) return UNKNOWN_TIMEZONE;
|
|
if ( lon == NO_LATITUDE ) return UNKNOWN_TIMEZONE;
|
|
// ASSUME THEY USE DST! WE DON'T KNOW REALLY!!
|
|
if ( useDST ) *useDST = 1;
|
|
return (char)(int32_t)(lon / (360.0/24.0));
|
|
}
|
|
|
|
|
|
/*
|
|
// . hash city and state together then lookup in g_timeZones table
|
|
// . name1;name2;suite;street;city;adm1;zip;domhash;ip;origurl;lat;lon\0
|
|
// . uint32_t getCityHash32 ( char *addr , uint32_t *adm1Hash ) {
|
|
char getTimeZoneFromAddr ( char *addr , char *useDST ) {
|
|
|
|
// get city and adm1 from address
|
|
char *p = addr;
|
|
int32_t semiCount = 0;
|
|
char *adm1 = NULL;
|
|
char *city = NULL;
|
|
for ( ; ; p++ ) {
|
|
// skip if not border
|
|
if ( *p != ';' ) continue;
|
|
// inc it
|
|
semiCount++;
|
|
// city?
|
|
if ( semiCount == 4 ) {
|
|
city = p + 1;
|
|
continue;
|
|
}
|
|
if ( semiCount == 5 ) {
|
|
adm1 = p + 1;
|
|
continue;
|
|
}
|
|
if ( semiCount != 6 ) continue;
|
|
|
|
break;
|
|
}
|
|
// ok, we got both now
|
|
char *semi1 = adm1 - 1;
|
|
char *semi2 = p;
|
|
// temp null term
|
|
*semi1 = '\0';
|
|
*semi2 = '\0';
|
|
|
|
// fix Denver's so we do not return unknown timezone
|
|
if ( semi1[-1]=='s' && semi1[-2]=='\'' ) semi1[-2]='\0';
|
|
|
|
char tzoff = getTimeZone2 ( city , adm1 , useDST );
|
|
// put back
|
|
*semi1 = ';';
|
|
*semi2 = ';';
|
|
// put apostrophe back if we stripped it
|
|
if ( ! semi1[-2] ) semi1[-2] = '\'';
|
|
return tzoff;
|
|
}
|
|
*/
|
|
|
|
char getTimeZone2 ( char *city , char *state , char *useDST ) {
|
|
// get the words
|
|
//Words ww; ww.set3 ( city );
|
|
// int16_tcut
|
|
//int64_t *wids = ww.m_wordIds;
|
|
// limit hash
|
|
//int32_t count = 0;
|
|
// get city hash
|
|
int64_t h = getWordXorHash(city);
|
|
// TODO: make state into two letter abbr?
|
|
// crap, if state is taken from class ZipDesc it is only
|
|
// 2 letters and has no \0 in it
|
|
//if ( gbstrlen(state) != 2 ) { char *xx=NULL;*xx=0; }
|
|
// use this now
|
|
uint32_t cid32 = (uint64_t)getCityId32(h,state);
|
|
// and call this
|
|
return getTimeZone3 ( cid32 , useDST );
|
|
}
|
|
|
|
char getTimeZone3 ( uint32_t cid32 , char *useDST ) {
|
|
// now lookup timezone
|
|
int32_t slot = g_timeZones.getSlot ( &cid32 );//&cityStateHash );
|
|
// return 0 if not found
|
|
if ( slot < 0 ) {
|
|
log("addr: gettimezone3: unknown timezone");
|
|
return UNKNOWN_TIMEZONE;
|
|
// Denver Art Museum;;;100 West 14th Avenue Parkway;Denver's;
|
|
// co;;;1993583704;173.203.24.218;;;
|
|
}
|
|
// otherwise, set m_timeZoneOffset appropriately
|
|
CityStateDesc *csd=(CityStateDesc *)g_timeZones.getValueFromSlot(slot);
|
|
*useDST = csd->m_useDST;
|
|
// sanity corruption check
|
|
if ( *useDST != 0 && *useDST != 1 ) { char *xx=NULL;*xx=0; }
|
|
char tz = csd->m_timeZoneOffset;
|
|
if ( tz < -13 || tz > 13 ) { char *xx=NULL;*xx=0; }
|
|
return tz;
|
|
}
|
|
|
|
|
|
// . for now just get the closest city to the user and use that timezone
|
|
// . this is not 100% accurate but should be like 99.9%
|
|
// . no, just use the GeoCityLite.dat call, that returns the city/state already
|
|
char getTimeZoneFromUserIP ( int32_t uip , int32_t niceness , char *useDST ) {
|
|
double lat;
|
|
double lon;
|
|
double radius;
|
|
char *city,*state,*ctry;
|
|
// use this by default
|
|
//int32_t ip = r->m_userIP;
|
|
// ip for testing?
|
|
//int32_t iplen;
|
|
//char *ips = r->getString("uip",&iplen);
|
|
//if ( ips ) ip = atoip(ips);
|
|
// returns true if found in db
|
|
char buf[128];
|
|
getIPLocation ( uip ,
|
|
&lat ,
|
|
&lon ,
|
|
&radius,
|
|
&city ,
|
|
&state ,
|
|
&ctry ,
|
|
buf ,
|
|
128 ) ;
|
|
// 999 means unknown timezone offset
|
|
if ( ! city || ! state ) {
|
|
log("addr: got unknown timezone for user");
|
|
return UNKNOWN_TIMEZONE;
|
|
}
|
|
// get timezone offset from this
|
|
return getTimeZone2 ( city , state , useDST );
|
|
}
|
|
|
|
// used by SearchInput.cpp to get timezone of the user from user's lat/lon
|
|
char getTimeZoneFromLatLon ( float lat,float lon,int32_t niceness,char *useDST ) {
|
|
// get nearest city/state
|
|
float distInMilesSquared = 0.0;
|
|
uint32_t cid32 = getNearestCityId ( lat , lon , niceness ,
|
|
&distInMilesSquared );
|
|
if ( distInMilesSquared > 1000 )
|
|
cid32 = 0;
|
|
// then its easy
|
|
return getTimeZone3 ( cid32 , useDST );
|
|
}
|
|
|
|
static int32_t *s_latList = NULL;
|
|
static int32_t s_latListSize = 0;
|
|
//static int32_t *s_lonList = NULL;
|
|
static int32_t s_ni = 0;
|
|
|
|
// . we need a list of the city ids sorted by lat, and a list sorted by lon
|
|
// . then we do b-stepping on each list
|
|
// . bstep down to a 20 mile by 20 mile box
|
|
// . then intersect using a hashtable
|
|
// . if empty, then increase to 30 by 30 mile box, etc.
|
|
// . there are 123k US cities in cities.dat
|
|
// . these 2 lists should be about 2MB then
|
|
// . then lookup cityid in g_timezones to get timezone
|
|
uint32_t getNearestCityId ( float lat ,
|
|
float lon ,
|
|
int32_t niceness ,
|
|
float *distInMilesSquared ) {
|
|
|
|
// radius is 5 miles, put miles into degrees
|
|
float radius = 5.0 / 69.0;
|
|
CityStateDesc *csd;
|
|
|
|
tryagain:
|
|
|
|
int32_t step = s_ni / 2;
|
|
// get lat boundaries using bstep
|
|
int32_t start = s_ni / 2;
|
|
// do the bstepping
|
|
for ( ; ; ) {
|
|
// get that city
|
|
int32_t citySlot = s_latList[start];
|
|
// get csd
|
|
csd = (CityStateDesc *)g_timeZones.getValueFromSlot(citySlot);
|
|
if ( ! csd ) { char *xx=NULL;*xx=0; }
|
|
// increase resolution for next round
|
|
step /= 2;
|
|
//if ( step <= 0 ) step = 1;
|
|
// step it down?
|
|
if ( lat < csd->m_latitude ) start -= step;
|
|
// use " - radius" here as well to avoid infinite loop?
|
|
else if ( lat > csd->m_latitude ) start += step;
|
|
// ok, we are in range, done
|
|
else break;
|
|
// avoid breaching!
|
|
if ( start < 0 ) { start = 0 ; break; }
|
|
if ( start >= s_ni ) { start = s_ni-1; break; }
|
|
// stop if we hit steps of 0
|
|
if ( step <= 0 ) break;
|
|
// if step was 0 and we failed, than need to increase radius
|
|
//if ( step > 0 ) continue;
|
|
// ok, we failed, we will increase radius below and try again
|
|
// increase stripe width
|
|
//radius += 5.0;
|
|
// try again
|
|
//goto tryagain;
|
|
}
|
|
|
|
//getCityRange ( s_latList , lat , radius , &lata , &latb );
|
|
//getCityRange ( s_lonList , lon , radius , &lona , &lonb );
|
|
// now take intersection of the ranges
|
|
//int32_t numCities = lata - latb;
|
|
//HashTableX ih;
|
|
//if(! ih.set ( 4 , 0 , numCities , ihbuf, 3000 , false , niceness )){
|
|
// char *xx=NULL;*xx=0; }
|
|
|
|
int32_t lata = start;
|
|
int32_t latb = start;
|
|
int32_t count = 0;
|
|
// TODO: do b-step on these too, takes like 3500 iterations for
|
|
// both of these loops
|
|
// adjust lata/latb until just out of range
|
|
for ( ; lata > 0 ; lata-- ) {
|
|
// get csd
|
|
int32_t slot = s_latList[lata];
|
|
csd = (CityStateDesc *)g_timeZones.getValueFromSlot(slot);
|
|
if ( csd->m_latitude < lat - radius ) break;
|
|
count++;
|
|
}
|
|
for ( ; latb < s_ni ; latb++ ) {
|
|
// get csd
|
|
int32_t slot = s_latList[latb];
|
|
csd = (CityStateDesc *)g_timeZones.getValueFromSlot(slot);
|
|
if ( csd->m_latitude > lat + radius ) break;
|
|
count++;
|
|
}
|
|
|
|
float min = -1.0;
|
|
int32_t minSlot = -1;
|
|
// add in the lat cities
|
|
for ( int32_t i = lata ; i <= latb ; i++ ) {
|
|
// break?
|
|
if ( i >= s_ni ) break;
|
|
// breathe
|
|
QUICKPOLL(niceness);
|
|
// get that city
|
|
int32_t citySlot = s_latList[i];
|
|
// get cd
|
|
CityStateDesc *csd;
|
|
csd = (CityStateDesc *)g_timeZones.getValueFromSlot(citySlot);
|
|
// just compute distance
|
|
float latDiff = csd->m_latitude - lat;
|
|
float lonDiff = csd->m_longitude - lon;
|
|
// add up
|
|
float dist = latDiff*latDiff + lonDiff*lonDiff;
|
|
// min?
|
|
if ( dist > min && minSlot >= 0 ) continue;
|
|
// set it
|
|
min = dist;
|
|
minSlot = citySlot;
|
|
}
|
|
|
|
// must have one
|
|
if ( minSlot == -1 ) {
|
|
// note it
|
|
log("addr: what the hell.");
|
|
// increase stripe width
|
|
radius += 10.0;
|
|
// try again
|
|
goto tryagain;
|
|
}
|
|
|
|
if ( distInMilesSquared ) *distInMilesSquared = min;
|
|
|
|
uint32_t *cidp = (uint32_t *)g_timeZones.getKeyFromSlot(minSlot);
|
|
|
|
// get that then
|
|
return *cidp;
|
|
}
|
|
|
|
|
|
int latcmp ( const void *arg1 , const void *arg2 ) {
|
|
int32_t slot1 = *(int32_t *)arg1;
|
|
int32_t slot2 = *(int32_t *)arg2;
|
|
// get the addresses
|
|
CityStateDesc *cd1;
|
|
CityStateDesc *cd2;
|
|
cd1 = (CityStateDesc *)g_timeZones.getValueFromSlot(slot1);
|
|
cd2 = (CityStateDesc *)g_timeZones.getValueFromSlot(slot2);
|
|
// simple compare
|
|
if ( cd1->m_latitude < cd2->m_latitude ) return -1;
|
|
if ( cd1->m_latitude > cd2->m_latitude ) return 1;
|
|
return 0;
|
|
}
|
|
|
|
//int loncmp ( const void *arg1 , const void *arg2 ) {
|
|
// // get the addresses
|
|
// CityDesc *cd1 = *(CityDesc **)arg1;
|
|
// CityDesc *cd2 = *(CityDesc **)arg2;
|
|
// // simple compare
|
|
// return ( cd1->m_longitude - cd2->m_longitude );
|
|
//}
|
|
|
|
// . our data is used by getNearestCityId
|
|
// . about 123k cities, sort them by lat in one list, lon in the other
|
|
// . 4 bytes per entry, we are talking 1.2MB for both lists
|
|
bool initCityLists ( ) {
|
|
// scan city table
|
|
int32_t ns = g_timeZones.m_numSlots;
|
|
// need this
|
|
int32_t used = g_timeZones.m_numSlotsUsed;
|
|
// how much space to alloc?
|
|
int32_t need = used * 4;
|
|
// alloc it
|
|
char *space = (char *)mmalloc(need,"latlist");
|
|
if ( ! space ) return false;
|
|
char *p = space;
|
|
s_latList = (int32_t *)p;
|
|
s_latListSize = need;
|
|
//p += 4 * used;
|
|
//s_lonList = (CityDesc **)p;
|
|
// reset
|
|
s_ni = 0;
|
|
// scan the slots
|
|
for ( int32_t i = 0 ; i < ns ; i++ ) {
|
|
// skip empties
|
|
if ( ! g_timeZones.m_flags[i] ) continue;
|
|
// get it
|
|
CityStateDesc *csd;
|
|
csd = (CityStateDesc *)g_timeZones.getValueFromSlot(i);
|
|
// add to the list
|
|
s_latList[s_ni] = i;
|
|
//s_lonList[s_ni] = cd;
|
|
s_ni++;
|
|
}
|
|
// now sort each list
|
|
gbqsort ( s_latList , s_ni , 4 , latcmp , 0 );
|
|
//gbqsort ( s_lonList , s_ni , 4 , loncmp , 0 );
|
|
return true;
|
|
}
|
|
|
|
char Address::getTimeZone ( char *useDST ) {
|
|
|
|
// need this
|
|
char *adm1Str = NULL;
|
|
char *cityStr = NULL;
|
|
if ( m_adm1 ) adm1Str = m_adm1->m_adm1;
|
|
else if ( m_zip ) {
|
|
cityStr = m_zip->m_cityStr;
|
|
adm1Str = m_zip->m_adm1;
|
|
}
|
|
else if ( m_city && m_city->m_adm1[0] ) {
|
|
adm1Str = m_city->m_adm1;
|
|
}
|
|
// this sets m_cityId32 to the nearest city to the lat/lon
|
|
else if ( (m_flags3 & AF2_LATLON) && m_cityId32 ) ;
|
|
// if we failed to set city id because no city was nearby
|
|
// then just guess based on lat/lon
|
|
else if ( m_flags3 & AF2_LATLON ) {
|
|
// ASSUME THEY USE IT! WE DON'T KNOW REALLY!!
|
|
if ( useDST ) *useDST = 1;
|
|
char timeZone = (char)(int32_t)(m_longitude / (360.0/24.0));
|
|
if ( timeZone < -12 || timeZone > 12 ) { char *xx=NULL;*xx=0;}
|
|
return timeZone;
|
|
}
|
|
else { char *xx=NULL;*xx=0; }
|
|
// normalize this
|
|
//char aa[3];
|
|
//aa[0] = to_lower_a(adm1Str[0]);
|
|
//aa[1] = to_lower_a(adm1Str[1]);
|
|
//aa[2] = 0;
|
|
// hash state hash
|
|
//uint32_t adm1Hash32 = (uint32_t)*((uint16_t *)aa);
|
|
//uint32_t cityHash32 = (uint32_t)m_cityHash;
|
|
// combine the two hashes
|
|
//uint32_t cityStateHash = hash32h(cityHash32,adm1Hash32);
|
|
|
|
// use this now
|
|
//uint32_t cid32 = (uint32_t)m_cityId64;
|
|
|
|
// now lookup timezone
|
|
int32_t slot = g_timeZones.getSlot ( &m_cityId32 );
|
|
// return 0 if not found
|
|
if ( slot < 0 ) {
|
|
// nte it
|
|
if ( cityStr && adm1Str ) {
|
|
log("addr: could not find timezone in g_timezones, "
|
|
"trying to call getTimeZone2");
|
|
char tzoff = getTimeZone2 ( cityStr, adm1Str, useDST );
|
|
if ( tzoff != UNKNOWN_TIMEZONE )
|
|
return tzoff;
|
|
}
|
|
log("addr: got unknown timezone for addr");
|
|
*useDST = 1;
|
|
return UNKNOWN_TIMEZONE;
|
|
}
|
|
// otherwise, set m_timeZoneOffset appropriately
|
|
CityStateDesc *csd;
|
|
csd = (CityStateDesc *)g_timeZones.getValueFromSlot(slot);
|
|
char tzoff = csd->m_timeZoneOffset;
|
|
if ( tzoff < - 13 || tzoff > 13 ) { char *xx=NULL;*xx=0; }
|
|
*useDST = csd->m_useDST;
|
|
return tzoff;
|
|
}
|
|
|
|
/*
|
|
bool Addresses::addToTagRec ( TagRec *gr , int32_t ip , int32_t timestamp ,
|
|
char *origUrl , int32_t maxAddrBytes ,
|
|
char *tagName ) {
|
|
// inherit Places that all the Addresses in the list agree on
|
|
for ( int32_t i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
|
|
// get it
|
|
Address *ai = (Address *)m_am.getPtr(i);
|
|
// do not add this to tagdb if not inlined!
|
|
if ( ! ( ai->m_flags & AF_INLINED ) ) continue;
|
|
// add address #i
|
|
if ( ! ai->addToTagRec (gr,ip,timestamp,origUrl,
|
|
maxAddrBytes,tagName) )
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// can xmldoc use this for venue addresses?
|
|
bool Address::addToTagRec ( TagRec *gr , int32_t ip , int32_t timestamp ,
|
|
char *origUrl , int32_t maxAddrBytes ,
|
|
char *tagName ) {
|
|
|
|
//
|
|
// we are no longer storing contact info addresses
|
|
//
|
|
//return true;
|
|
|
|
// use ; as delimter
|
|
char buf[5003];
|
|
// . size includes the terminating \0
|
|
// . include the Address::m_hash for deduping in XmlDoc.cpp
|
|
int32_t size = serialize ( buf , 5000 , origUrl , false , true );
|
|
// returns -1 and sets g_errno on error
|
|
if ( size < 0 ) return false;
|
|
|
|
//
|
|
// point to end of data excluding the origUrl for deduping
|
|
//
|
|
char *end1 = buf + size - 1;
|
|
for ( ; end1 > buf && *end1 != ';' ; end1-- ) ;
|
|
// the length without that
|
|
int32_t len1 = end1 - buf;
|
|
|
|
//
|
|
// how many address bytes are we using currently? only need to
|
|
// compute this if we have a limit, i.e. "maxAddrBytes" >= 0
|
|
//
|
|
// count those bytes
|
|
int32_t used = 0;
|
|
if ( maxAddrBytes >= 0 ) {
|
|
// our tag type
|
|
int32_t tt = getTagTypeFromStr ( tagName );//"contactaddress" );
|
|
// taken from TagRec::getTag() function
|
|
Tag *tag = gr->getFirstTag();
|
|
// loop over all contact info addresses in the TagRec
|
|
for ( ; tag ; tag = gr->getNextTag(tag) ){
|
|
// skip if not a "address" tag (ci=contactInfo)
|
|
if ( tag->m_type != tt ) continue;
|
|
// get str
|
|
used += tag->m_dataSize;
|
|
// point to end of data excluding the origUrl for
|
|
// deduping contact addresses in the tag rec
|
|
char *end2 = tag->m_data + tag->m_dataSize - 1;
|
|
for ( ; end2 > tag->m_data && *end2 != ';' ; end2-- ) ;
|
|
// get lengths
|
|
int32_t len2 = end2 - tag->m_data;
|
|
// is it a dup?
|
|
if ( len1 != len2 ) continue;
|
|
if ( memcmp(tag->m_data, buf, len1 ) ) continue;
|
|
// it was a dup!
|
|
return true;
|
|
}
|
|
}
|
|
// can we fit it? if not, do not add it
|
|
if ( maxAddrBytes >= 0 && used + size > maxAddrBytes ) return true;
|
|
|
|
// store it
|
|
//int32_t now = getTimeGlobal();
|
|
// returns false and sets g_errno on error
|
|
return gr->addTag (tagName,timestamp,"xmldoc",ip,buf,size);
|
|
}
|
|
*/
|
|
|
|
// . hash city and state together then lookup in g_timeZones table
|
|
// . name1;name2;suite;street;city;adm1;zip;country;domhash;ip;origurl;lat;lon;hash\0
|
|
// . uint32_t getCityHash32 ( char *addr , uint32_t *adm1Hash ) {
|
|
uint64_t getHashFromAddr ( char *addr ) {
|
|
char *p = addr;
|
|
int32_t semiCount = 0;
|
|
for ( ; *p ; p++ ) {
|
|
// skip if not border
|
|
if ( *p != ';' ) continue;
|
|
// inc it
|
|
semiCount++;
|
|
// hash?
|
|
if ( semiCount != 13 ) continue;
|
|
// got it
|
|
break;
|
|
}
|
|
// none?
|
|
if ( ! *p ) { char *xx=NULL;*xx=0; }
|
|
// skip semi
|
|
p++;
|
|
// must be digit
|
|
if ( ! is_digit(*p) ) { char *xx=NULL;*xx=0; }
|
|
// get that value
|
|
uint64_t ah = strtoull(p,NULL,10);//atoll(p);
|
|
// that's what we want
|
|
return ah;
|
|
}
|
|
|
|
// . used by Address::serialize
|
|
// . filter out back to back spaces
|
|
// . covert \n and \t to ' '
|
|
int32_t memcpy2 ( char *dst , char *src , int32_t bytes , bool filterCommas ,
|
|
int32_t dstMaxBytes ) {
|
|
char *srcEnd = src + bytes;
|
|
// do not start with a space, so set this to 1
|
|
char lastWasSpace = 1;
|
|
char *dstStart = dst;
|
|
char fc = ' ';
|
|
if ( filterCommas ) fc = ',';
|
|
bool inTag = false;
|
|
char *dstEnd = NULL;
|
|
if ( dstMaxBytes >= 0 ) dstEnd = dstStart + dstMaxBytes;
|
|
char cs ;
|
|
//if ( src[0]=='G' && src[1]=='o' && src[2]=='n' )
|
|
// log("hey");
|
|
for ( ; src < srcEnd ; src += cs ) {
|
|
// set it
|
|
cs = getUtf8CharSize(src);
|
|
// remove tags
|
|
if ( *src == '<' ) {
|
|
inTag = true;
|
|
// skip if bold tag
|
|
if ( to_lower_a(src[1])=='b' && src[2]=='>' ) continue;
|
|
// skip if italic
|
|
if ( to_lower_a(src[1])=='i' && src[2]=='>' ) continue;
|
|
// skip if already had printed space
|
|
if ( lastWasSpace ) continue;
|
|
// otherwise print the space
|
|
*dst++ = ' ';
|
|
// and set this flag
|
|
lastWasSpace = true;
|
|
continue;
|
|
}
|
|
if ( *src == '>' ) { inTag = false; continue;}
|
|
if ( inTag ) continue;
|
|
// . when serializing address semicolons have special meaning
|
|
// . deal special with spaces. treat comma as a space too now!
|
|
if ( is_wspace_utf8 (src) || *src == fc || *src == ';' ) {
|
|
// stop if would breach
|
|
if ( dstEnd && dst + 1 > dstEnd ) break;
|
|
if ( ! lastWasSpace ) *dst++ = ' ';//*src;
|
|
lastWasSpace = 1;
|
|
continue;
|
|
}
|
|
// reset
|
|
lastWasSpace = 0;
|
|
// stop if would breach
|
|
if ( dstEnd && dst + cs > dstEnd ) break;
|
|
// everything else
|
|
if( cs == 1 ) { *dst++ = *src; continue; }
|
|
// otherwise characters is > 1 byte
|
|
gbmemcpy ( dst , src , cs );
|
|
dst += cs;
|
|
}
|
|
// return bytes written
|
|
return dst - dstStart;
|
|
}
|
|
|
|
// "olen" is length of origUrl to be stored
|
|
int32_t Address::getStoredSize ( int32_t ulen , bool includeHash ) {
|
|
// how much buffer space do we need?
|
|
int32_t need = 0;
|
|
if ( m_name1 ) need += m_name1 ->m_strlen + 1;
|
|
if ( m_name2 ) need += m_name2 ->m_strlen + 1;
|
|
if ( m_suite ) need += m_suite ->m_strlen + 1;
|
|
if ( m_street ) need += m_street->m_strlen + 1;
|
|
if ( m_city ) need += m_city ->m_strlen + 1;
|
|
if ( m_zip ) need += m_zip ->m_strlen + 1;
|
|
if ( m_adm1 ) need += m_adm1 ->m_strlen + 1;
|
|
//if ( m_ctry ) need += m_ctry ->m_strlen + 1;
|
|
// if city our adm1 or country is NULL, guess because it
|
|
// will be looked up and supplied based on lat/lon
|
|
if ( ! m_city ) need += 64 + 1;
|
|
if ( ! m_adm1 ) need += 2 + 1;
|
|
// country!
|
|
need += 3;
|
|
// country is now just ;
|
|
//need++;
|
|
// domainhash
|
|
need += 10 + 1;
|
|
// ip string
|
|
need += 16;
|
|
// this includes the "..." of truncated urls
|
|
need += ulen;
|
|
// latitude
|
|
need += 12;
|
|
// longitude
|
|
need += 12;
|
|
// address hash -- printing out a uint64_t in ascii
|
|
// 18446744073709551615LL = 20 digits + semicolon before it
|
|
need += 21;
|
|
// null term
|
|
need++;
|
|
// timezoneoffset
|
|
//need += 4;
|
|
return need;
|
|
}
|
|
|
|
// . append this address to "sb" in the serialize() format, with no url and
//   no address hash, and with only the verified place names
// . returns false if we could not make room in "sb" or serialize() failed
bool Address::serializeVerified ( SafeBuf *sb ) {
	// get min # of bytes needed
	int32_t need = getStoredSize ( 0 , false );
	// make room
	if ( ! sb->reserve ( need ) ) return false;
	// store it here
	char *buf = sb->getBuf();
	// do it
	int32_t written = serialize ( buf , need , NULL , true , false );
	// . serialize() returns -1 on error
	// . that used to slip past the check below and shrink the buffer
	//   length by one
	if ( written < 0 ) return false;
	// sanity check
	if ( written > need ) { char *xx=NULL;*xx=0; }
	// update it
	sb->incrementLength ( written );
	// success
	return true;
}
|
|
|
|
// . returns -1 and sets g_errno on error
|
|
// . name1;name2;suite;street;city;adm1;zip;country;domHash32;ipStr;url;lat;lon;addHash
|
|
// . setfromstr() above
|
|
int32_t Address::serialize ( char *buf , int32_t bufSize , char *origUrl ,
|
|
bool verifiedOnly , bool includeHash ) {
|
|
|
|
char *p = buf;
|
|
|
|
// sanity check. these should be filtered out
|
|
//if ( m_score <= 0.0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// also truncate at semicolon in urls since that is our delimiter
|
|
char *o = origUrl;
|
|
for ( ; o && *o && *o !=';' ; o++ );
|
|
// truncate this if we should
|
|
int32_t olen = o - origUrl; // gbstrlen(origUrl);
|
|
bool trunc = false;
|
|
if ( olen > 128 ) { olen = 96; trunc = true; }
|
|
// if a semicolon kicked us out, we were truncated as well
|
|
else if ( o && *o == ';' ) trunc = true;
|
|
// include ...
|
|
int32_t extra = 0;
|
|
if ( trunc ) extra = 3;
|
|
|
|
// how much buffer space do we need?
|
|
int32_t need = getStoredSize( olen + extra , includeHash );
|
|
|
|
// silently ignore overflow errors
|
|
if ( need > bufSize ) return -1;
|
|
|
|
PlaceDesc *pd = NULL;
|
|
// guess the city/state names if we got lat/lon only
|
|
if ( m_flags3 & AF2_LATLON ) {
|
|
float distInMilesSquared = 0.0;
|
|
pd = getNearestCity_new ( m_latitude ,
|
|
m_longitude ,
|
|
0 , // niceness
|
|
&distInMilesSquared );
|
|
if ( distInMilesSquared >= 1000 ) pd = NULL;
|
|
}
|
|
|
|
|
|
Place *d ;
|
|
|
|
char flags = m_flags;
|
|
if ( ! verifiedOnly ) flags |= AF_VERIFIED_PLACE_NAME_1;
|
|
if ( ! verifiedOnly ) flags |= AF_VERIFIED_PLACE_NAME_2;
|
|
|
|
d = m_name1;
|
|
if ( d && (flags & AF_VERIFIED_PLACE_NAME_1) ) {
|
|
// bytes written may be different than d->m_strlen since
|
|
// memcpy2() filters out back-to-back spaces
|
|
// should also remove semicolons
|
|
p += memcpy2(p,d->m_str,d->m_strlen,false);
|
|
}
|
|
*p++ = ';';
|
|
|
|
d = m_name2;
|
|
if ( d && (flags & AF_VERIFIED_PLACE_NAME_2) ) {
|
|
// bytes written may be different than d->m_strlen since
|
|
// memcpy2() filters out back-to-back spaces
|
|
p += memcpy2(p,d->m_str,d->m_strlen,false);
|
|
}
|
|
*p++ = ';';
|
|
|
|
d = m_suite;
|
|
if ( d ) {
|
|
// bytes written may be different than d->m_strlen since
|
|
// memcpy2() filters out back-to-back spaces
|
|
p += memcpy2(p,d->m_str,d->m_strlen,true);
|
|
}
|
|
*p++ = ';';
|
|
|
|
d = m_street;
|
|
if ( d ) {
|
|
// bytes written may be different than d->m_strlen since
|
|
// memcpy2() filters out back-to-back spaces
|
|
p += memcpy2(p,d->m_str,d->m_strlen,true);
|
|
}
|
|
*p++ = ';';
|
|
|
|
d = m_city;
|
|
if ( d ) {
|
|
// bytes written may be different than d->m_strlen since
|
|
// memcpy2() filters out back-to-back spaces
|
|
p += memcpy2(p,d->m_str,d->m_strlen,true);
|
|
// append the adm1 code
|
|
//if ( d->m_adm1[0] ) {
|
|
// *p++ = '(';
|
|
// gbmemcpy(p,d->m_adm1,2);
|
|
// p += 2;
|
|
// *p++ = ')';
|
|
//}
|
|
}
|
|
// if city is NULL it must be implied from zip code
|
|
else if ( m_zip ) {
|
|
char *cs = m_zip->m_cityStr;
|
|
if ( gbstrlen(cs) == 0 ) { char *xx=0;*xx=0; }
|
|
p += memcpy2(p,cs,gbstrlen(cs),true);
|
|
}
|
|
else if ( m_flags3 & AF2_LATLON ) {
|
|
if ( pd ) {
|
|
char *str = pd->m_officialNameOffset + g_pbuf;
|
|
int32_t slen = gbstrlen(str);
|
|
// limit to 64 since that is getStoredSize() number
|
|
if ( slen > 64 ) slen = 64;
|
|
gbmemcpy ( p , str ,slen );
|
|
p += slen;
|
|
}
|
|
}
|
|
// otherwise, we have an issue, it must be impliable
|
|
else {
|
|
char *xx=NULL;*xx=0;
|
|
}
|
|
*p++ = ';';
|
|
|
|
// mdw mdw
|
|
d = m_adm1;
|
|
if ( d ) {
|
|
// bytes written may be different than d->m_strlen since
|
|
// memcpy2() filters out back-to-back spaces
|
|
//p += memcpy2(p,d->m_str,d->m_strlen,true);
|
|
// to save space use two letter abbr
|
|
p += memcpy2(p,d->m_adm1,2,true);
|
|
// append the adm1 code
|
|
//if ( d->m_adm1[0] ) {
|
|
// *p++ = '(';
|
|
// gbmemcpy(p,d->m_adm1,2);
|
|
// p += 2;
|
|
// *p++ = ')';
|
|
//}
|
|
}
|
|
// if city is NULL it must be implied from zip code
|
|
else if ( m_zip ) {
|
|
p += memcpy2(p,m_zip->m_adm1,2,true);
|
|
}
|
|
// imply from city if city is unique
|
|
//else if ( m_city && (m_city->m_adm1Bits & CF_UNIQUE) ) {
|
|
// p += memcpy2(p,m_city->m_adm1,2,true);
|
|
//}
|
|
else if ( m_flags3 & AF2_LATLON ) {
|
|
// this is the nearest city's state based on our lat/lon
|
|
if ( pd && pd->m_adm1[0] && pd->m_adm1[1] ) {
|
|
gbmemcpy ( p , pd->m_adm1 ,2 );
|
|
p += 2;
|
|
}
|
|
}
|
|
// otherwise, we have an issue, it must be impliable
|
|
else {
|
|
char *xx=NULL;*xx=0;
|
|
}
|
|
*p++ = ';';
|
|
|
|
d = m_zip;
|
|
if ( d ) {
|
|
// bytes written may be different than d->m_strlen since
|
|
// memcpy2() filters out back-to-back spaces
|
|
p += memcpy2(p,d->m_str,d->m_strlen,true);
|
|
// append the adm1 code
|
|
//if ( d->m_adm1[0] ) {
|
|
// *p++ = '(';
|
|
// gbmemcpy(p,d->m_adm1,2);
|
|
// p += 2;
|
|
// *p++ = ')';
|
|
//}
|
|
}
|
|
*p++ = ';';
|
|
|
|
// use country code from "crid"
|
|
//char *cn = (char *)g_countryCode.getAbbr(m_adm1->m_crid-1);
|
|
//if ( cn ) {
|
|
// gbmemcpy(p,cn,gbstrlen(cn));
|
|
// p += gbstrlen(cn);
|
|
//}
|
|
if ( m_flags3 & AF2_LATLON ) {
|
|
if ( pd && pd->m_crid ) {
|
|
char *cc = getCountryCode(pd->m_crid);
|
|
gbmemcpy ( p , cc , 2 );
|
|
p += 2;
|
|
}
|
|
}
|
|
*p++ = ';';
|
|
|
|
|
|
// sanity check
|
|
if ( m_domHash32 == 0 ) { char *xx=NULL;*xx=0; }
|
|
// serialize 32-bit domain hash
|
|
p += sprintf( p , "%"UINT32"", m_domHash32 );
|
|
*p++ = ';';
|
|
|
|
// sanity check
|
|
if ( m_ip == 0 || m_ip == -1 ) { char *xx=NULL;*xx=0;}
|
|
// serialize ip string
|
|
p += sprintf( p , "%s", iptoa(m_ip));
|
|
*p++ = ';';
|
|
|
|
|
|
if ( origUrl ) {
|
|
// bytes written may be different than d->m_strlen since
|
|
// memcpy2() filters out back-to-back spaces
|
|
p += memcpy2(p,origUrl,olen,false);
|
|
if ( trunc ) p += memcpy2 (p,"...",3,false);
|
|
}
|
|
*p++ = ';';
|
|
|
|
// then latitude
|
|
if ( m_latitude != NO_LATITUDE && m_latitude != AMBIG_LATITUDE )
|
|
p += sprintf(p,"%f",m_latitude);
|
|
|
|
*p++ = ';';
|
|
|
|
// then longitude
|
|
if ( m_longitude != NO_LONGITUDE && m_longitude != AMBIG_LONGITUDE )
|
|
p += sprintf(p,"%f",m_longitude);
|
|
|
|
if ( includeHash ) {
|
|
*p++ = ';';
|
|
// finally the address hash in ascii
|
|
p += sprintf ( p , "%"UINT64"" , m_hash );
|
|
}
|
|
|
|
// . then timezone off, a single signed byte really
|
|
// . we add 100 to this to signify that it does NOT use DST
|
|
//p += sprintf(p,"%"INT32"", (int32_t)m_timeZoneOffset);
|
|
|
|
*p++ = '\0';
|
|
|
|
// count the semicolons to make sure data did not insert extra ones
|
|
char *s = buf;
|
|
int32_t semiCount = 0;
|
|
int32_t semiNeed = 12;
|
|
if ( includeHash ) semiNeed++;
|
|
for ( ; *s ; s++ ) if ( *s == ';' ) semiCount++;
|
|
if ( semiCount != semiNeed ) { char *xx=NULL;*xx=0; }
|
|
|
|
int32_t size = p - buf;
|
|
// sanity check
|
|
if ( size > bufSize ) { char *xx=NULL;*xx=0; }
|
|
// all done
|
|
return size;
|
|
}
|
|
|
|
|
|
int32_t Address::print ( ) {
|
|
return print2 ( 0,NULL,0);
|
|
}
|
|
|
|
int32_t Address::print2 ( int32_t i , SafeBuf *pbuf , int64_t uh64 ) {
|
|
|
|
// print out each candidate for debug
|
|
SafeBuf sb;
|
|
|
|
//bool validAddr = ( (m_flags) & AF_INLINED );
|
|
// old sanity checker to ensure div ids were unique
|
|
//static bool s_init = false;
|
|
//static HashTableX ht;
|
|
//if ( ! s_init ) {
|
|
// s_init = true;
|
|
// ht.set ( 4 , 4 , 128 , NULL , 0 , false , 2 );
|
|
//}
|
|
//if ( validAddr ) {
|
|
// if ( ht.isInTable ( &m_divId) ) { char *xx=NULL;*xx=0; }
|
|
// ht.addKey ( &m_divId );
|
|
//}
|
|
|
|
// print out to a table?
|
|
if ( pbuf ) {
|
|
// dump it
|
|
// . for the sake of doing delta diffs in Test.cpp
|
|
// eliminate the number!
|
|
//pbuf->safePrintf ( "<td>%"INT32"/%"INT32"</td>", num ,m_street.m_a);
|
|
//if ( m_street.m_a >= 0 )
|
|
// pbuf->safePrintf ( "<td>%"INT32"</td>", m_street.m_a);
|
|
//else
|
|
int32_t napos = -1;
|
|
if ( m_name1 ) napos = m_name1->m_a;
|
|
|
|
int32_t stra = -1;
|
|
if ( m_street ) stra = m_street->m_a;
|
|
pbuf->safePrintf ( "<td>%"INT32"/%"INT32"</td>", napos,stra );
|
|
|
|
//pbuf->safePrintf ( "<td>%.06f</td>", m_score );
|
|
//pbuf->safePrintf("<td>0x%"XINT32"</td>", m_section->m_tagHash);
|
|
|
|
printEssentials ( pbuf , false , uh64 );
|
|
|
|
// print flags
|
|
pbuf->safePrintf("<td><nobr>");
|
|
//if ( (m_flags) & AF_IGNORE )
|
|
// pbuf->safePrintf("ignore ");
|
|
if ( m_flags & AF_VENUE_DEFAULT )
|
|
pbuf->safePrintf("venueaddress ");
|
|
if ( (m_flags) & AF_INLINED )
|
|
pbuf->safePrintf("inlined ");
|
|
else
|
|
pbuf->safePrintf("notinlined ");
|
|
if ( m_alias )
|
|
pbuf->safePrintf("alias[a=%"INT32"] ",
|
|
m_alias->m_street->m_a);
|
|
|
|
if ( m_flags3 & AF2_HAS_REQUIRED_CITY )
|
|
pbuf->safePrintf("requiredcity ");
|
|
if ( m_flags3 & AF2_HAS_REQUIRED_STATE )
|
|
pbuf->safePrintf("requiredstate ");
|
|
if ( m_street && (m_street->m_flags2 & PLF2_COLLISION) )
|
|
pbuf->safePrintf("streetcollision ");
|
|
|
|
// means that we are inlined and the city FOLLOWS the state
|
|
//if ( (m_flags) & AF_BADORDER )
|
|
// pbuf->safePrintf("badorder ");
|
|
if ( (m_flags) & AF_AMBIGUOUS )
|
|
pbuf->safePrintf("ambig ");
|
|
if ( (m_flags3) & AF2_BADCITYSTATE )
|
|
pbuf->safePrintf("badcitystate ");
|
|
if ( (m_flags) & AF_VERIFIED_STREET )
|
|
pbuf->safePrintf("verifiedstreet ");
|
|
if ( (m_flags) & AF_VERIFIED_STREET_NUM )
|
|
pbuf->safePrintf("verifiedstreetnum ");
|
|
if ( (m_flags) & AF_VERIFIED_PLACE_NAME_1 )
|
|
pbuf->safePrintf("verifiedplacename1 ");
|
|
if ( (m_flags) & AF_VERIFIED_PLACE_NAME_2 )
|
|
pbuf->safePrintf("verifiedplacename2 ");
|
|
|
|
if ( m_street &&(m_street->m_flags3 & PLF3_SUPPLANTED))
|
|
pbuf->safePrintf("<b>supplanted</b> ");
|
|
if ( m_street &&(m_street->m_flags3 & PLF3_LATLONDUP))
|
|
pbuf->safePrintf("<b>latlondup</b> ");
|
|
|
|
if ( m_street &&(m_street->m_flags2 & PLF2_INTERSECTION) )
|
|
pbuf->safePrintf("intersection ");
|
|
if ( m_street &&(m_street->m_flags2 & PLF2_IS_NAME ))
|
|
pbuf->safePrintf("streetisname ");
|
|
if ( m_street &&(m_street->m_flags2 & PLF2_AFTER_AT) )
|
|
pbuf->safePrintf("afterat ");
|
|
if ( m_street &&(m_street->m_flags2 & PLF2_TICKET_PLACE) )
|
|
pbuf->safePrintf("ticketplace ");
|
|
// when the event hours are not "store hours" we flag the
|
|
// place name so as to avoid it as the event title in
|
|
// Events.cpp
|
|
//if ( m_name1 && (m_name1->m_flags2 & PLF2_STORE_NAME) )
|
|
// pbuf->safePrintf("storename ");
|
|
|
|
//if ( (m_flags) & AF_VERIFIED_STREET_IND )
|
|
// pbuf->safePrintf("verifiedstreetind ");
|
|
if ( !(m_flags) )
|
|
pbuf->safePrintf(" ");
|
|
pbuf->safePrintf("</nobr></td>");
|
|
|
|
// print the address ptr, but make it an offset so
|
|
// it doesn't show up on the test qa run diffs
|
|
//int32_t offset = this - base;
|
|
int32_t offset = i;
|
|
pbuf->safePrintf("<td>%"UINT32"</td>",(int32_t)offset);
|
|
|
|
pbuf->safePrintf("<td><nobr>0x%"XINT64" (%"INT32")</nobr></td>",
|
|
m_hash,m_score2);
|
|
|
|
|
|
// print placedb names
|
|
pbuf->safePrintf("<td><nobr>");
|
|
char *s = m_placedbNames;
|
|
char *send = m_placedbNamesEnd;
|
|
// scan them
|
|
for ( ; s && s < send ; ) {
|
|
// skip score
|
|
s += 4;
|
|
// empty? strange...
|
|
if ( ! *s ) { char *xx=NULL;*xx=0; }
|
|
if ( s > m_placedbNames + 4 )
|
|
pbuf->pushChar(',');
|
|
// print that
|
|
pbuf->safePrintf("%s",s);
|
|
// skip that and the \0
|
|
s += gbstrlen(s) + 1;
|
|
}
|
|
pbuf->safePrintf("</nobr></td>");
|
|
|
|
// adm1
|
|
char *adm1Str = "\0\0";
|
|
if ( m_adm1 ) adm1Str = m_adm1->m_adm1;
|
|
else if ( m_zip ) adm1Str = m_zip->m_adm1;
|
|
//else if ( m_city && m_city->m_adm1[0] )
|
|
// adm1Str = m_city->m_adm1;
|
|
else if ( m_flags3 & AF2_LATLON );
|
|
else { char *xx=NULL;*xx=0; }
|
|
// city
|
|
int64_t cityHash = 0LL;
|
|
if ( m_city ) cityHash = m_city->m_hash;
|
|
else if ( m_zip ) cityHash = m_zip->m_cityHash;
|
|
else if ( m_flags3 & AF2_LATLON );
|
|
else { char *xx=NULL;*xx=0; }
|
|
uint32_t cityId = getCityId32(cityHash,adm1Str);
|
|
// ripped from XmlDoc.cpp placedb logic
|
|
key128_t *k2 = &m_placedbKey;
|
|
int64_t bigHash = g_placedb.getBigHash ( k2 );
|
|
int64_t docId = g_placedb.getDocId ( k2 );
|
|
int32_t snh = g_placedb.getStreetNumHash ( k2 );
|
|
int64_t nh1 = 0;
|
|
int64_t nh2 = 0;
|
|
if ( m_name1 ) nh1 = m_name1->m_hash;
|
|
if ( m_name2 ) nh2 = m_name2->m_hash;
|
|
int64_t strh = 0LL;
|
|
if ( m_street ) strh = m_street->m_hash;
|
|
pbuf->safePrintf("<td><nobr>"
|
|
"k.n1=0x%16"XINT64" n0=0x%16"XINT64" "
|
|
//"addrhash=0x%"XINT64" "
|
|
"bigHash64=0x%016"XINT64" "
|
|
"docId=%"UINT64" "
|
|
"streetNumHash25=0x%08"XINT32" "
|
|
"cityHash=0x%016"XINT64" "
|
|
"cityId=0x08%"XINT32" "
|
|
"streetHash=0x%016"XINT64" "
|
|
"adm1Hash=0x%04"XINT32" "
|
|
"name1Hash=0x%016"XINT64" "
|
|
"name2Hash=0x%016"XINT64" "
|
|
"</nobr>"
|
|
"</td>"
|
|
,
|
|
k2->n1 , k2->n0 ,
|
|
//m_hash,
|
|
bigHash,
|
|
docId,
|
|
snh ,
|
|
cityHash,//m_city->m_hash,
|
|
(int32_t)cityId,
|
|
strh, // m_street->m_hash,
|
|
(int32_t)*(uint16_t *)adm1Str,
|
|
nh1,nh2
|
|
);
|
|
|
|
|
|
/*
|
|
char *b1 = " ";
|
|
char *b2 = " ";
|
|
char *b3 = " ";
|
|
if ( m_flags & AF_VERIFIED_STREET ) b1 = "yes";
|
|
if ( m_flags & AF_VERIFIED_STREET_NUM ) b2 = "yes";
|
|
if ( m_flags & AF_VERIFIED_PLACE_NAME ) b3 = "yes";
|
|
pbuf->safePrintf("<td>%s</td>",b1);
|
|
pbuf->safePrintf("<td>%s</td>",b2);
|
|
pbuf->safePrintf("<td>%s</td>",b3);
|
|
*/
|
|
|
|
/*
|
|
pbuf->safePrintf("<td>%.02f</td>",
|
|
m_scoreBase);
|
|
pbuf->safePrintf("<td>%.02f</td>",
|
|
m_scoreNameBeforeStreet);
|
|
pbuf->safePrintf("<td>%.02f</td>",
|
|
m_scoreDistanceNameToStreet);
|
|
pbuf->safePrintf("<td>%.02f</td>",
|
|
m_scoreOldVoteMod);
|
|
pbuf->safePrintf("<td>%.02f</td>",
|
|
m_scoreNewVoteMod);
|
|
pbuf->safePrintf("<td>%.02f</td>",
|
|
m_scoreDistanceNameToStreetValue);
|
|
*/
|
|
|
|
// wrap up the table row
|
|
pbuf->safePrintf ( "</tr>\n");
|
|
|
|
return 1;
|
|
}
|
|
|
|
|
|
if ( m_name1 ) {
|
|
sb.safePrintf("name1=");
|
|
sb.safeMemcpy(m_name1->m_str,m_name1->m_strlen);
|
|
}
|
|
if ( m_name2 && m_name2->m_str ) {
|
|
sb.safePrintf(" name2=");
|
|
sb.safeMemcpy(m_name2->m_str,m_name2->m_strlen);
|
|
}
|
|
if ( m_street ) {
|
|
sb.safePrintf(" street[%"INT32"]=",m_street->m_a);
|
|
sb.safeMemcpy(m_street->m_str,m_street->m_strlen);
|
|
}
|
|
//if ( m_zip ) {
|
|
// sb.safePrintf(" zip=");
|
|
// sb.safeMemcpy(m_zip->m_str,m_zip->m_strlen);
|
|
//}
|
|
if ( m_suite ) {
|
|
sb.safePrintf(" suite=");
|
|
sb.safeMemcpy(m_suite->m_str,m_suite->m_strlen);
|
|
}
|
|
if ( m_city ) {
|
|
sb.safePrintf(" city[%"INT32"]=",m_city->m_a);
|
|
sb.safeMemcpy(m_city->m_str,m_city->m_strlen);
|
|
}
|
|
if ( m_adm1 ) {
|
|
sb.safePrintf(" adm1[%"INT32"]=",m_adm1->m_a);
|
|
sb.safeMemcpy(m_adm1->m_str,m_adm1->m_strlen);
|
|
sb.pushChar('|');
|
|
sb.safeMemcpy(m_adm1->m_adm1,2);//str,m_adm1->m_strlen);
|
|
}
|
|
if ( m_zip ) {
|
|
sb.safePrintf(" zip=");
|
|
sb.safeMemcpy(m_zip->m_str,m_zip->m_strlen);
|
|
}
|
|
//if ( m_adm2 && m_adm2->m_str ) {
|
|
// sb.safePrintf(" adm2=");
|
|
// sb.safeMemcpy(m_adm2->m_str,m_adm2->m_strlen);
|
|
//}
|
|
//if ( m_ctry->m_str ) {
|
|
// sb.safePrintf(" country=");
|
|
// sb.safeMemcpy(m_ctry->m_str,m_ctry->m_strlen);
|
|
//}
|
|
|
|
sb.safePrintf(" score2=%"INT32"",m_score2);
|
|
|
|
sb.safePrintf(" flags=");
|
|
if ( (m_flags) & AF_INLINED )
|
|
sb.safePrintf("inlined ");
|
|
else
|
|
sb.safePrintf("notinlined ");
|
|
// means that we are inlined and the city FOLLOWS the state
|
|
//if ( (m_flags) & AF_BADORDER )
|
|
// sb.safePrintf("badorder ");
|
|
if ( (m_flags) & AF_AMBIGUOUS )
|
|
sb.safePrintf("ambig ");
|
|
if ( (m_flags) & AF_VERIFIED_STREET )
|
|
sb.safePrintf("verifiedstreet ");
|
|
if ( (m_flags) & AF_VERIFIED_STREET_NUM )
|
|
sb.safePrintf("verifiedstreetnum ");
|
|
if ( (m_flags) & AF_VERIFIED_PLACE_NAME_1 )
|
|
sb.safePrintf("verifiedplacename1 ");
|
|
if ( (m_flags) & AF_VERIFIED_PLACE_NAME_2 )
|
|
sb.safePrintf("verifiedplacename2 ");
|
|
if ( m_street && (m_street->m_flags2 & PLF2_INTERSECTION ))
|
|
sb.safePrintf("intersection ");
|
|
if ( m_street && (m_street->m_flags2 & PLF2_IS_NAME ))
|
|
sb.safePrintf("streetisname ");
|
|
if ( m_street && (m_street->m_flags2 & PLF2_AFTER_AT ))
|
|
sb.safePrintf("afterat ");
|
|
|
|
//sb.safePrintf(" a=%"INT32" b=%"INT32"",m_a,m_b);
|
|
|
|
// null term
|
|
sb.safeMemcpy ( "\0",1 );
|
|
//sb.safePrintf(" =");
|
|
//sb.safeMemcpy(m_->m_str,m_->m_strlen);
|
|
//logf(LOG_DEBUG,"events: addr score=%.06f %s",
|
|
logf(LOG_DEBUG,"events: %s",
|
|
sb.getBufStart() );
|
|
|
|
return 1;
|
|
}
|
|
|
|
void Address::printEssentials ( SafeBuf *pbuf , bool forEvents ,
|
|
int64_t uh64 ) {
|
|
|
|
|
|
pbuf->safePrintf ( "<td><nobr>");
|
|
|
|
// . this is for XmlDoc::validateOutput()
|
|
// . we use javascriptEncode() to convert &'s to & since
|
|
// the javascript escape() function does that before
|
|
// converting into a url encoded character for some
|
|
// reason, which is very annoying!!!! maybe tagInner
|
|
// does that! yeah, probably, it returns normalized output
|
|
// as i've seen it reorganize the attributes of html tags.
|
|
if ( uh64 ) {
|
|
pbuf->safePrintf(
|
|
"<!--ignore-->" // ignore for Test.cpp diff
|
|
"<span class=validated>"
|
|
"<input type=checkbox "
|
|
"onclick=\"senddiv(this,'%"INT64"');\" "
|
|
"unchecked> "
|
|
"<div class=validated style=\"display:none\">",
|
|
// this must be unsigned
|
|
uh64);
|
|
//char *p = pbuf->getBuf();
|
|
//
|
|
// map utf8 characters into &#xxxx entities because
|
|
// the senddiv() function maps all utuf8 chars to
|
|
// crap like "%u2019" for the apostrophe for instance
|
|
//
|
|
if ( m_name1 )
|
|
pbuf->javascriptEncode(m_name1->m_str,m_name1->m_strlen);
|
|
pbuf->pushChar(';');
|
|
if ( m_name2 )
|
|
pbuf->javascriptEncode(m_name2->m_str,m_name2->m_strlen);
|
|
pbuf->pushChar(';');
|
|
if ( m_suite )
|
|
pbuf->javascriptEncode(m_suite->m_str,m_suite->m_strlen);
|
|
pbuf->pushChar(';');
|
|
if ( m_street )
|
|
pbuf->javascriptEncode(m_street->m_str,m_street->m_strlen);
|
|
pbuf->pushChar(';');
|
|
if ( m_city )
|
|
pbuf->javascriptEncode(m_city->m_str,m_city->m_strlen);
|
|
else if ( m_zip )
|
|
pbuf->javascriptEncode(m_zip->m_cityStr,
|
|
gbstrlen(m_zip->m_cityStr));
|
|
else if ( m_flags3 & AF2_LATLON );
|
|
else { char *xx=NULL;*xx=0; }
|
|
pbuf->pushChar(';');
|
|
// now print adm1 abbr
|
|
char *as = NULL;
|
|
int32_t aslen = 2;
|
|
// mdw mdw
|
|
if ( m_adm1 )
|
|
as = m_adm1->m_adm1;
|
|
else if ( m_zip )
|
|
as = m_zip->m_adm1;
|
|
//else if ( m_city && (m_city->m_adm1Bits & CF_UNIQUE) )
|
|
// as = m_city->m_adm1;
|
|
else if ( m_flags3 & AF2_LATLON );
|
|
else { char *xx=NULL;*xx=0; }
|
|
if ( as ) pbuf->javascriptEncode(as,aslen);
|
|
pbuf->pushChar(';');
|
|
if ( m_zip )
|
|
pbuf->javascriptEncode(m_zip->m_str,m_zip->m_strlen);
|
|
pbuf->pushChar(';');
|
|
//if ( m_ctry->m_str )
|
|
// pbuf->javascriptEncode(m_ctry->m_str,m_ctry->m_strlen);
|
|
// now we include lat and long, but only if we got both valid
|
|
if ( m_longitude != NO_LONGITUDE &&
|
|
m_latitude != NO_LONGITUDE ) {
|
|
pbuf->pushChar(';');
|
|
pbuf->safePrintf("%f",m_latitude);
|
|
pbuf->pushChar(';');
|
|
pbuf->safePrintf("%f",m_longitude);
|
|
}
|
|
// now also check the lat/lon we import
|
|
if ( m_importedLatitude != NO_LATITUDE )
|
|
pbuf->safePrintf(";ilat=%f",m_importedLatitude);
|
|
if ( m_importedLongitude != NO_LONGITUDE )
|
|
pbuf->safePrintf(";ilon=%f",m_importedLongitude);
|
|
|
|
//char *pend = pbuf->getBuf();
|
|
pbuf->safePrintf ("\n</div>" );
|
|
pbuf->safePrintf ("</span>" );
|
|
}
|
|
|
|
|
|
// set these
|
|
int32_t nameLen1 = 0;
|
|
char *name1 = NULL;
|
|
if ( m_name1 ) {
|
|
name1 = m_name1->m_str;
|
|
nameLen1 = m_name1->m_strlen;
|
|
}
|
|
if ( forEvents && !(m_flags & AF_VERIFIED_PLACE_NAME_1) )
|
|
name1 = NULL;
|
|
if ( forEvents && m_alias ) {
|
|
name1 = m_alias->m_name1->m_str;
|
|
nameLen1 = m_alias->m_name1->m_strlen;
|
|
}
|
|
if ( ! name1 ) {
|
|
name1 = " ";
|
|
nameLen1 = gbstrlen(name1);
|
|
}
|
|
|
|
//pbuf->safePrintf("<td><nobr>");
|
|
if ( m_alias && forEvents ) {
|
|
pbuf->safePrintf("(alias = ");
|
|
// this will have STREET_IS_NAME set so use the street
|
|
// not name 1
|
|
//pbuf->safeMemcpy(m_name1->m_str,m_name1->m_strlen);
|
|
pbuf->safeMemcpy(m_street->m_str,m_street->m_strlen);
|
|
pbuf->safePrintf(") ");
|
|
};
|
|
pbuf->safeMemcpy(name1,nameLen1);
|
|
pbuf->safePrintf("</nobr></td>\n");
|
|
|
|
|
|
int32_t nameLen2 = 0;
|
|
char *name2 = NULL;
|
|
if ( m_name2 ) {
|
|
nameLen2 = m_name2->m_strlen;
|
|
name2 = m_name2->m_str;
|
|
}
|
|
if ( forEvents && !(m_flags & AF_VERIFIED_PLACE_NAME_2) )
|
|
name2 = NULL;
|
|
if ( forEvents && m_alias ) {
|
|
name2 = m_alias->m_name2->m_str;
|
|
nameLen2 = m_alias->m_name2->m_strlen;
|
|
}
|
|
if ( ! name2 ) {
|
|
name2 = " ";
|
|
nameLen2 = gbstrlen(name2);
|
|
}
|
|
pbuf->safePrintf("<td><nobr>");
|
|
pbuf->safeMemcpy(name2,nameLen2);
|
|
pbuf->safePrintf("</nobr></td>\n");
|
|
|
|
|
|
int32_t suiteLen = 0;
|
|
char *suite = NULL;
|
|
if ( m_suite ) {
|
|
suiteLen = m_suite->m_strlen;
|
|
suite = m_suite->m_str;
|
|
}
|
|
if ( forEvents && m_alias ) {
|
|
suite = m_alias->m_suite->m_str;
|
|
suiteLen = m_alias->m_suite->m_strlen;
|
|
}
|
|
if ( ! suite ) {
|
|
suite = " ";
|
|
suiteLen = gbstrlen(suite);
|
|
}
|
|
pbuf->safePrintf("<td><nobr>");
|
|
pbuf->safeMemcpy(suite,suiteLen);
|
|
pbuf->safePrintf("</nobr></td>\n");
|
|
|
|
|
|
int32_t streetLen = 0;
|
|
char *street = NULL;
|
|
if ( m_street ) {
|
|
streetLen = m_street->m_strlen;
|
|
street = m_street->m_str;
|
|
}
|
|
if ( forEvents && m_alias ) {
|
|
street = m_alias->m_street->m_str;
|
|
streetLen = m_alias->m_street->m_strlen;
|
|
}
|
|
if ( ! street ) {
|
|
street = " ";
|
|
streetLen = gbstrlen(street);
|
|
}
|
|
pbuf->safePrintf("<td><nobr>");
|
|
//pbuf->safeMemcpy(street,streetLen);
|
|
// print it right. niceness = 0
|
|
pbuf->htmlEncode ( street,streetLen, true,0);
|
|
pbuf->safePrintf("</nobr></td>\n");
|
|
|
|
|
|
int32_t cityLen = 0;
|
|
char *city = NULL;
|
|
if ( m_city ) {
|
|
cityLen = m_city->m_strlen;
|
|
city = m_city->m_str;
|
|
}
|
|
if ( forEvents && m_alias ) {
|
|
city = m_alias->m_city->m_str;
|
|
cityLen = m_alias->m_city->m_strlen;
|
|
}
|
|
if ( ! city ) {
|
|
city = " ";
|
|
cityLen = gbstrlen(city);
|
|
}
|
|
pbuf->safePrintf("<td><nobr>");
|
|
pbuf->safeMemcpy(city,cityLen);
|
|
pbuf->safePrintf("</nobr></td>\n");
|
|
|
|
|
|
|
|
int32_t adm1Len = 0;
|
|
char *adm1 = NULL;
|
|
if ( m_adm1 ) {
|
|
adm1Len = 2;//m_adm1->m_strlen;
|
|
adm1 = m_adm1->m_adm1;//str;
|
|
}
|
|
if ( forEvents && m_alias ) {
|
|
adm1 = m_alias->m_adm1->m_adm1;//str;
|
|
adm1Len = 2;//m_alias->m_adm1->m_strlen;
|
|
}
|
|
if ( ! adm1 ) {
|
|
adm1 = " ";
|
|
adm1Len = gbstrlen(adm1);
|
|
}
|
|
pbuf->safePrintf("<td><nobr>");
|
|
pbuf->safeMemcpy(adm1,adm1Len);
|
|
pbuf->safePrintf("</nobr></td>\n");
|
|
|
|
|
|
int32_t zipLen = 0;
|
|
char *zip = NULL;
|
|
if ( m_zip ) {
|
|
zipLen = m_zip->m_strlen;
|
|
zip = m_zip->m_str;
|
|
}
|
|
if ( forEvents && m_alias ) {
|
|
zip = m_alias->m_zip->m_str;
|
|
zipLen = m_alias->m_zip->m_strlen;
|
|
}
|
|
if ( ! zip ) {
|
|
zip = " ";
|
|
zipLen = gbstrlen(zip);
|
|
}
|
|
pbuf->safePrintf("<td><nobr>");
|
|
pbuf->safeMemcpy(zip,zipLen);
|
|
pbuf->safePrintf("</nobr></td>\n");
|
|
|
|
|
|
pbuf->safePrintf("<td><nobr>");
|
|
/*
|
|
// ctry is special
|
|
char *ctry = m_ctry->m_str;
|
|
if ( forEvents && m_alias ) ctry = m_alias->m_ctry->m_str;
|
|
if ( ! ctry ) {
|
|
Place *cp = &m_adm1;
|
|
char *cn = (char *)g_countryCode.getName(cp->m_crid-1);
|
|
if ( cn ) pbuf->safeMemcpy ( cn,gbstrlen(cn) );
|
|
else pbuf->safePrintf("unknown");
|
|
}
|
|
else
|
|
pbuf->safePrintf("%s",ctry);
|
|
*/
|
|
pbuf->safePrintf("</nobr></td>");
|
|
|
|
double lat = m_latitude;
|
|
double lon = m_longitude;
|
|
|
|
// geocoder lat/lon
|
|
lat = m_geocoderLat;
|
|
lon = m_geocoderLon;
|
|
pbuf->safePrintf("<td><nobr>");
|
|
if ( lat != NO_LATITUDE && lat != AMBIG_LATITUDE )
|
|
pbuf->safePrintf("%f",lat);
|
|
pbuf->safePrintf("</nobr></td>\n");
|
|
|
|
pbuf->safePrintf("<td><nobr>");
|
|
if ( lon != NO_LONGITUDE && lon != AMBIG_LONGITUDE )
|
|
pbuf->safePrintf("%f",lon);
|
|
pbuf->safePrintf("</nobr></td>\n");
|
|
|
|
// then lat/lon
|
|
lat = m_latitude;
|
|
lon = m_longitude;
|
|
pbuf->safePrintf("<td><nobr>");
|
|
if ( lat != NO_LATITUDE && lat != AMBIG_LATITUDE )
|
|
pbuf->safePrintf("%f",lat);
|
|
pbuf->safePrintf("</nobr></td>\n");
|
|
|
|
pbuf->safePrintf("<td><nobr>");
|
|
if ( lon != NO_LONGITUDE && lon != AMBIG_LONGITUDE )
|
|
pbuf->safePrintf("%f",lon);
|
|
pbuf->safePrintf("</nobr></td>\n");
|
|
|
|
// IMPORTED lat/lon
|
|
lat = m_importedLatitude;
|
|
lon = m_importedLongitude;
|
|
pbuf->safePrintf("<td><nobr>");
|
|
if ( lat != NO_LATITUDE && lat != AMBIG_LATITUDE )
|
|
pbuf->safePrintf("%f (%"INT32")",lat,m_importedVotes);
|
|
pbuf->safePrintf("</nobr></td>\n");
|
|
|
|
pbuf->safePrintf("<td><nobr>");
|
|
if ( lon != NO_LONGITUDE && lon != AMBIG_LONGITUDE )
|
|
pbuf->safePrintf("%f (%"INT32")",lon,m_importedVotes);
|
|
pbuf->safePrintf("</nobr></td>\n");
|
|
}
|
|
|
|
// . print every place in "pm" that still has a type
// . if "pbuf" is non-NULL the places go into an html table, one row per
//   place, otherwise each place is logged at LOG_DEBUG
// . "sections" is only used to look up the tag hash of the section that
//   holds the place's first word. it may be NULL.
// . "base" is not used right now
void printPlaces ( PlaceMem *pm , SafeBuf *pbuf , Sections *sections,
		   Address *base ) {

	// table header. 9 columns to match the rows below.
	if ( pbuf ) pbuf->safePrintf ( "<table cellpadding=3 border=1>"
				       "<tr>"
				       "<td><b>simple place</b></td>"
				       "<td><b>flags</b></td>"
				       "<td><b><nobr>place hash"
				       "</nobr></b></td>"
				       "<td><b><nobr>address ptr"
				       "</nobr></b></td>"
				       "<td><b><nobr>word a</nobr></b></td>"
				       "<td><b><nobr>word b</nobr></b></td>"
				       "<td><b><nobr>alnum word a</nobr>"
				       "</b></td>"
				       "<td><b><nobr>alnum word b</nobr>"
				       "</b></td>"
				       "<td><b><nobr>section tagHash</nobr>"
				       "</b></td>"
				       "</tr>\n" );

	// shortcut. NULL if we have no sections.
	Section **sp = NULL;
	if ( sections ) sp = sections->m_sectionPtrs;

	// just streets really, or fake streets
	for ( int32_t i = 0 ; i < pm->getNumPtrs() ; i++ ) {
		Place *pi = (Place *)pm->getPtr(i);
		// skip if it was filtered out (type was zeroed)
		if ( ! pi->m_type ) continue;

		// what kind of place is it? must be one we know.
		const char *typeName = NULL;
		if      ( pi->m_type == PT_STREET ) typeName = "street";
		else if ( pi->m_type == PT_CITY   ) typeName = "city";
		else if ( pi->m_type == PT_STATE  ) typeName = "state";
		else if ( pi->m_type == PT_NAME_1 ) typeName = "name1";
		else if ( pi->m_type == PT_NAME_2 ) typeName = "name2";
		else if ( pi->m_type == PT_SUITE  ) typeName = "suite";
		else if ( pi->m_type == PT_ZIP    ) typeName = "zip";
		else if ( pi->m_type == PT_LATLON ) typeName = "latlon";
		else { char *xx=NULL;*xx=0; }

		// build "type=<type> flags=<flag> <flag> ..." in fbuf
		char fbuf[1000];
		char *f = fbuf;
		f += sprintf ( f , "type=%s " , typeName );
		f += sprintf ( f , "flags=" );
		// remember where the flags start
		char *of = f;

		if ( pi->m_flags2 & PLF2_COLLISION )
			f += sprintf(f,"streetcollision ");
		if ( pi->m_flags2 & PLF2_REQUIRED )
			f += sprintf(f,"requiredplace ");
		if ( pi->m_flags2 & PLF2_TICKET_PLACE )
			f += sprintf(f,"ticketplace ");
		if ( pi->m_flags2 & PLF2_INTERSECTION )
			f += sprintf(f,"intersection ");
		if ( pi->m_flags2 & PLF2_IS_NAME )
			f += sprintf(f,"streetisname ");
		if ( pi->m_flags2 & PLF2_AFTER_AT )
			f += sprintf(f,"afterat ");
		if ( pi->m_flags2 & PLF2_IS_POBOX )
			f += sprintf(f,"ispobox ");
		if ( pi->m_address )
			f += sprintf(f,"inaddress ");
		if ( pi->m_unverifiedAddress )
			f += sprintf(f,"inunverifiedaddress ");
		if ( pi->m_alias ) {
			// the alias might not have a street
			int32_t aliasA = -1;
			if ( pi->m_alias->m_street )
				aliasA = pi->m_alias->m_street->m_a;
			f += sprintf(f,"alias[a=%"INT32"] ",aliasA);
		}

		int32_t flags = pi->m_bits;
		if ( flags & PLF_INFILE )
			f += sprintf ( f , "infile " );
		if ( flags & PLF_FROMTAG )
			f += sprintf ( f , "fromtag " );
		if ( flags & PLF_FROMTITLE )
			f += sprintf ( f , "fromtitle " );
		if ( flags & PLF_ABBR )
			f += sprintf ( f , "abbr " );
		// no flags at all? do not leave it empty.
		if ( f == of )
			f += sprintf(f," ");
		*f = '\0';

		// tag hash of the section holding our first word
		Section *sn = NULL;
		if ( sp && pi->m_a >= 0 ) sn = sp [ pi->m_a ];
		int32_t secHash = 0;
		if ( sn ) secHash = sn->m_turkTagHash32;

		// . the address we are in or the address we alias
		// . a place may have both set. the alias wins.
		Address *myaddr = NULL;
		if ( pi->m_address ) myaddr = pi->m_address;
		if ( pi->m_alias   ) myaddr = pi->m_alias;
		// print the place number rather than a ptr so the qa
		// test run diff is ok. -1 if not in any address.
		// MDW: might need to store the off in m_addressOff/
		// m_aliasOff or something.. keep an eye on this
		int32_t myoff = i;
		if ( myaddr == NULL ) myoff = -1;

		// . the place text is not \0 terminated, so temporarily
		//   put a \0 right after it while we print
		// . the overwritten char is put back below
		char *p    = pi->m_str;
		char *pend = p + pi->m_strlen;
		char  c    = *pend;
		*pend = 0;

		if ( pbuf ) {
			pbuf->safePrintf ( "<tr>"
					   "<td><nobr>" );
			// print it right. niceness = 0
			pbuf->htmlEncode ( p , gbstrlen(p) , true,0);
			pbuf->safePrintf ("</nobr></td>"
					  "<td><nobr>%s</nobr></td>"
					  "<td>0x%"XINT64"</td>"
					  "<td>%"INT32"</td>"
					  "<td>%"INT32"</td>"
					  "<td>%"INT32"</td>"
					  "<td>%"INT32"</td>"
					  "<td>%"INT32"</td>"
					  "<td>0x%"XINT32"</td>"
					  "</tr>\n" ,
					  fbuf ,
					  pi->m_hash ,
					  (int32_t)myoff,
					  (int32_t)pi->m_a ,
					  (int32_t)pi->m_b ,
					  (int32_t)pi->m_alnumA ,
					  (int32_t)pi->m_alnumB ,
					  secHash);
		}
		else
			logf(LOG_DEBUG,"events: place #%"INT32" \"%s\" "
			     "flags=%s alnuma=%"INT32" alnumb=%"INT32" "
			     ,
			     i,p,
			     fbuf,
			     pi->m_alnumA ,
			     pi->m_alnumB
			     );

		// put char back
		*pend = c;
	}

	if ( pbuf ) pbuf->safePrintf ( "</table><br>\n" );
}
|
|
|
|
|
|
|
|
// THINK ABOUT: discard phrases with number at end, no "suite" indicator, and
|
|
// has US as the country (do this last)
|
|
// "... be eligible to play. AYSO Region 1447 offers a fun..."
|
|
// "Sunday 9 . 6, Tuesday 10 - 4; " --> no street called "Tuesday"!
|
|
|
|
// . one row of the s_cityList[] alias table
// . all members are public so rows can be brace-initialized in member order
struct AliasDesc {
	// first and second strings of the row. in s_cityList[] the first
	// is the short form and the second is the spelled-out city name.
	char *m_s1;
	char *m_s2;
	// two-letter adm1 (state) code given for the row
	char *m_adm1;
	// two-letter state code stored as the "most popular" state
	char *m_mostPopStateAbbr;
	// . relative to the other aliases, used for computing the best/
	//   default state that contains the alias
	// . right now santa fe is just set down to 99 so that "sf" maps
	//   to "san francisco" by default
	int32_t m_pop;
};
|
|
|
|
// . city alias table, one AliasDesc per row:
//   { short form , full city name , adm1 , most popular state abbr , pop }
// . an alias may be listed more than once ("sf"). the row with the lower
//   pop (santa fe, 99) is the one that is not the default.
// . cities that were stubbed out without any alias yet: Mesa, Tulsa,
//   Raleigh, Aurora, Toledo, Riverside, Newark, Plano, Glendale, Lincoln,
//   Henderson, Chandler, Greensboro, Scottsdale, Orlando, Garland,
//   Hialeah, Laredo, Lubbock, Reno, Akron, Durham, Modesto, Fremont,
//   Shreveport, Arlington. "ftw" for Fort Wayne is also disabled.
static AliasDesc s_cityList[] = {
	{"ny",              "new york city",           "ny","ny",1000},
	{"nyc",             "new york city",           "ny","ny",1000},
	{"n y c",           "new york city",           "ny","ny",1000},
	{"la",              "los angeles",             "ca","ca",1000},
	{"lax",             "los angeles",             "ca","ca",1000},
	{"chi",             "chicago",                 "il","il",1000},
	{"hou",             "houston",                 "tx","tx",1000},
	{"phx",             "phoenix",                 "az","az",1000},
	{"phoex",           "phoenix",                 "az","az",1000},
	{"phi",             "philadelphia",            "pa","pa",1000},
	{"sa",              "san antonio",             "tx","tx",1000},
	{"sd",              "san diego",               "ca","ca",1000},
	{"dal",             "dallas",                  "tx","tx",1000},
	{"sj",              "san jose",                "ca","ca",1000},
	{"det",             "detroit",                 "mi","mi",1000},
	{"jax",             "jacksonville",            "fl","fl",1000},
	{"j-ville",         "jacksonville",            "fl","fl",1000},
	{"indy",            "indianapolis",            "in","in",1000},
	{"sf",              "san francisco",           "ca","ca",1000},
	{"san fran",        "san francisco",           "ca","ca",1000},
	// lower pop so "sf" defaults to san francisco
	{"sf",              "santa fe",                "nm","ca",99},
	{"cols",            "columbus",                "oh","oh",1000},
	{"colo",            "columbus",                "oh","oh",1000},
	{"atx",             "austin",                  "tx","tx",1000},
	{"mem",             "memphis",                 "tn","tn",1000},
	{"fw",              "fort worth",              "tx","tx",1000},
	{"ft worth",        "fort worth",              "tx","tx",1000},
	{"balto",           "baltimore",               "md","md",1000},
	{"clt",             "charlotte",               "nc","nc",1000},

	{"ept",             "El Paso",                 "tx","tx",1000},
	{"elp",             "El Paso",                 "tx","tx",1000}, // airport
	{"bos",             "Boston",                  "ma","ma",1000}, // airport
	{"sea",             "Seattle",                 "wa","wa",1000},
	{"mil",             "Milwaukee",               "wi","wi",1000},
	{"milw",            "Milwaukee",               "wi","wi",1000},
	{"mke",             "Milwaukee",               "wi","wi",1000},
	{"den",             "Denver",                  "co","co",1000},
	{"denv",            "Denver",                  "co","co",1000},
	{"lv",              "Las Vegas",               "nv","nv",1000}, // postal
	{"las",             "Las Vegas",               "nv","nv",1000}, // airport
	{"nash",            "Nashville",               "tn","tn",1000},
	{"nashv",           "Nashville",               "tn","tn",1000},
	{"bna",             "Nashville",               "tn","tn",1000},
	{"okc",             "Oklahoma City",           "ok","ok",1000},
	{"pdx",             "Portland",                "or","or",1000},
	{"port",            "Portland",                "or","or",1000},
	{"tuc",             "Tucson",                  "az","az",1000},
	{"tucs",            "Tucson",                  "az","az",1000},
	{"abq",             "Albuquerque",             "nm","nm",1000},
	{"alb",             "Albuquerque",             "nm","nm",1000},
	{"albq",            "Albuquerque",             "nm","nm",1000},
	{"q-town",          "Albuquerque",             "nm","nm",1000},
	{"atl",             "Atlanta",                 "ga","ga",1000},
	{"lbc",             "Long Beach",              "ca","ca",1000},
	{"lb",              "Long Beach",              "ca","ca",1000},
	{"frs",             "Fresno",                  "ca","ca",1000},
	{"sacto",           "Sacramento",              "ca","ca",1000},
	{"smf",             "Sacramento",              "ca","ca",1000}, // airport
	{"kc",              "Kansas City",             "ks","ks",1000},
	{"cle",             "Cleveland",               "oh","oh",1000},
	{"cleve",           "Cleveland",               "oh","oh",1000},
	{"vab",             "Virginia Beach",          "va","va",1000},
	{"oma",             "Omaha",                   "ne","ne",1000},
	{"mi",              "Miami",                   "fl","fl",1000},
	{"oak",             "Oakland",                 "ca","ca",1000},
	{"hon",             "Honolulu",                "hi","hi",1000},
	{"hnl",             "Honolulu",                "hi","hi",1000},
	{"hono",            "Honolulu",                "hi","hi",1000},
	{"mpls",            "Minneapolis",             "mn","mn",1000},
	// NOTE(review): "anc" mapped to Arlington, va looks odd. confirm.
	{"anc",             "Arlington",               "va","va",1000},
	{"wh",              "Wichita",                 "ks","ks",1000},
	{"stl",             "Saint Louis",             "mo","mo",1000},
	{"st louis",        "saint louis",             "mo","mo",1000},
	{"sna",             "Santa Ana",               "ca","ca",1000},
	// anaheim orange county
	{"aoc",             "Anaheim",                 "ca","ca",1000},
	{"tpa",             "Tampa",                   "fl","fl",1000},
	{"cinti",           "Cincinnati",              "oh","oh",1000},
	{"cincy",           "Cincinnati",              "oh","oh",1000},
	{"pitt",            "Pittsburgh",              "pa","pa",1000},
	{"pit",             "Pittsburgh",              "pa","pa",1000},
	{"pgh",             "Pittsburgh",              "pa","pa",1000},
	{"pitts",           "Pittsburgh",              "pa","pa",1000},
	{"bfd",             "Bakersfield",             "ca","ca",1000},
	{"sto",             "Stockton",                "ca","ca",1000},
	{"cctx",            "Corpus Christi",          "tx","tx",1000},
	{"cor chr",         "Corpus Christi",          "tx","tx",1000},
	{"anch",            "Anchorage",               "ak","ak",1000},
	{"buff",            "Buffalo",                 "ny","ny",1000},
	{"stpaul",          "Saint Paul",              "mn","mn",1000},
	{"st paul",         "Saint Paul",              "mn","mn",1000},
	{"fwa",             "Fort Wayne",              "in","in",1000}, // airport
	{"ft wayne",        "Fort Wayne",              "in","in",1000}, // airport
	{"st petersburg",   "saint petersburg",        "fl","fl",1000},
	{"jc",              "Jersey City",             "nj","nj",1000},
	{"br",              "Baton Rouge",             "la","la",1000},
	{"bham",            "Birmingham",              "al","al",1000},
	{"b ham",           "Birmingham",              "al","al",1000},
	{"nflk",            "Norfolk",                 "va","va",1000},
	{"madsn",           "Madison",                 "wi","wi",1000},
	{"no",              "New Orleans",             "la","la",1000},
	{"north hempstead", "Town of North Hempstead", "ny","ny",1000},
	{"n hempstead",     "Town of North Hempstead", "ny","ny",1000},
	{"n hemp",          "Town of North Hempstead", "ny","ny",1000},
	{"north hemp",      "Town of North Hempstead", "ny","ny",1000},
	{"chesp",           "Chesapeake",              "va","va",1000},
	{"cv",              "Chula Vista",             "ca","ca",1000},
	{"roch",            "Rochester",               "ny","ny",1000},
	{"mont",            "Montgomery",              "al","al",1000}
};
|
|
|
|
|
|
bool addCity ( uint64_t ch64 ,
|
|
char *adm1 ,
|
|
int32_t pop ,
|
|
HashTableX *maxPops ) {
|
|
|
|
// see if already in the table
|
|
CityDesc *cdp = (CityDesc *)g_cities.getValue(&ch64);
|
|
|
|
//
|
|
// if contending with another state that has this
|
|
// same city name, check his city pop
|
|
//
|
|
// get the last max popularity for this state
|
|
int32_t *v=(int32_t *)maxPops->getValue(&ch64);
|
|
// save it into "lastPop" in case *v changes
|
|
int32_t lastPop = -1;
|
|
if ( v ) lastPop = *v;
|
|
// update pop with ours if bigger
|
|
if ( v && pop > *v ) *v = pop;
|
|
|
|
uint64_t adm1Bits = getAdm1Bits ( adm1 );
|
|
if ( ! adm1Bits ) { char *xx=NULL;*xx=0; }
|
|
|
|
// if there, or it in
|
|
if ( cdp ) cdp->m_adm1Bits |= adm1Bits;
|
|
|
|
//if ( ch64==2443313629685134902LL && adm1Bits==2147483648 ) {
|
|
// log("hey");
|
|
//}
|
|
|
|
// get our state
|
|
StateDesc *sd = getStateDesc ( adm1 );
|
|
// get our state index
|
|
int32_t stateIndex = sd - s_states;
|
|
|
|
// update most popular state index?
|
|
if ( cdp && pop > lastPop ) {
|
|
// change it to our state
|
|
cdp->m_mostPopularState = stateIndex;
|
|
return true;
|
|
}
|
|
|
|
// already there? then skip
|
|
if ( cdp ) return true;
|
|
|
|
// otherwise, add the pop for the first time
|
|
maxPops->addKey(&ch64,&pop);
|
|
|
|
// now this is CityDesc
|
|
CityDesc cd;
|
|
cd.m_adm1Bits = adm1Bits;
|
|
cd.m_mostPopularState = stateIndex;
|
|
|
|
// otherwise, just add it
|
|
g_cities.addKey ( &ch64 , &cd ) ; // adm1Bits );
|
|
return true;
|
|
}
|
|
|
|
// . ch64 is the 64bit hash of the original city name
|
|
// . "alias" is the alias name o fthe city
|
|
// . adm1Str is the state it is in
|
|
bool addAlias ( char *alias ,
|
|
char *adm1Str ,
|
|
uint64_t ch64 ,
|
|
int32_t pop ,
|
|
HashTableX *maxPops ) {
|
|
// sanity check
|
|
if ( is_upper_a(adm1Str[0]) ) { char *xx=NULL;*xx=0; }
|
|
if ( is_upper_a(adm1Str[1]) ) { char *xx=NULL;*xx=0; }
|
|
// get "hash" of state
|
|
uint32_t adm1Hash32 = (uint32_t)(*(uint16_t *)adm1Str);
|
|
// get hash of city name alias
|
|
uint64_t ah = getWordXorHash ( alias );
|
|
// nothing?
|
|
if ( ! ah ) return true;
|
|
// debug point
|
|
if ( !strcmp(alias,"sf") )
|
|
log("hey");
|
|
// get the bits
|
|
uint64_t adm1Bits = getAdm1Bits ( adm1Str );
|
|
// if already in g_cities for this state, do not add as alias!
|
|
CityDesc *test = (CityDesc *) g_cities.getValue(&ah);
|
|
if ( test && (test->m_adm1Bits & adm1Bits ) ) {
|
|
|
|
|
|
// no! strange... how is this happening...
|
|
//log("strange");
|
|
return true;
|
|
}
|
|
// hash city name alias and adm1 together
|
|
uint32_t aliasStateHash = hash32h ( (uint32_t)ah , adm1Hash32 );
|
|
// now that maps to the proper cityId32
|
|
uint32_t cid32 = getCityId32 ( ch64 , adm1Str ) ;
|
|
// must be a proper city name
|
|
CityDesc *cd = (CityDesc *)g_cities.getValue(&ch64);
|
|
if ( ! cd ) { char *xx=NULL;*xx=0; }
|
|
// make sure the city we are an alias for is in our state!
|
|
if ( !(cd->m_adm1Bits & adm1Bits) ) { char *xx=NULL;*xx=0; }
|
|
// add to alias table
|
|
if (!g_aliases.addKey (&aliasStateHash,&cid32)){char*xx=NULL;*xx=0;}
|
|
// sanity check -- verify the cityId works out
|
|
if ( ! g_timeZones.isInTable(&cid32) ) { char *xx=NULL;*xx=0;}
|
|
// then add to city table
|
|
addCity ( ah , adm1Str , pop , maxPops );
|
|
return true;
|
|
}
|
|
|
|
bool initPlaceDescTable ( ) {
|
|
|
|
// sanity check
|
|
if ( s_init ) { char *xx=NULL;*xx=0; }
|
|
|
|
// bail if not indexing events
|
|
//if ( ! g_conf.m_indexEventsOnly ) return true;
|
|
return true;
|
|
|
|
// . make this table
|
|
// . has words that can be lower case in a place name
|
|
//s_lc.set ( 8 , 0 , 0 , s_lcbuf , 2000 , false , 0 ,"plnametbl");
|
|
// stock the table (StopWords.cpp function)
|
|
if ( ! initWordTable ( &s_lc , s_lcWords ,
|
|
//sizeof(s_lcWords),
|
|
"plnametbl")){
|
|
char *xx=NULL;*xx=0; }
|
|
|
|
// we are init now
|
|
s_init = true;
|
|
|
|
// init indicator table
|
|
g_indicators.set ( 6 , // keySize
|
|
sizeof(IndDesc) , // dataSize
|
|
0 , // initial # slots
|
|
NULL , // initial buf
|
|
0 , // initial buf size
|
|
false , // allowDup keys?
|
|
0 , // niceness
|
|
"indictbl" );
|
|
|
|
// load indicator table
|
|
//bool loadedIndicators = false;
|
|
/*
|
|
if ( g_indicators.load ( g_hostdb.m_dir , "indicators.dat" ) ) {
|
|
loadedIndicators = true;
|
|
int64_t h = hash64 ( "highway" , 7 );
|
|
// test the indicators
|
|
if ( g_indicators.getSlot ( &h ) < 0 ){char *xx=NULL;*xx=0; }
|
|
// test the indicators
|
|
h = hash64Lower_a ( "N" , 1 );
|
|
if ( g_indicators.getSlot ( &h ) < 0 ){char *xx=NULL;*xx=0; }
|
|
}
|
|
*/
|
|
// fix it
|
|
//loadedIndicators = true;
|
|
|
|
// keep these separate so we do not have to recompute any time we
|
|
// add or subtract to/from this list
|
|
addIndicator ( "airport" , IND_NAME , 1.0 );
|
|
addIndicator ( "airstrip" , IND_NAME , 1.0 );
|
|
addIndicator ( "area" , IND_NAME , 1.0 );
|
|
addIndicator ( "arena" , IND_NAME , 1.0 );
|
|
addIndicator ( "arroyo" , IND_NAME , 1.0 );
|
|
addIndicator ( "bank" , IND_NAME , 1.0 );
|
|
addIndicator ( "banks" , IND_NAME , 1.0 );
|
|
addIndicator ( "bar" , IND_NAME , 1.0 );
|
|
addIndicator ( "pub" , IND_NAME , 1.0 );
|
|
addIndicator ( "brewpub" , IND_NAME , 1.0 );
|
|
addIndicator ( "atrium" , IND_NAME , 1.0 );
|
|
addIndicator ( "base" , IND_NAME , 1.0 );
|
|
addIndicator ( "basin" , IND_NAME , 1.0 );
|
|
addIndicator ( "bay" , IND_NAME , 1.0 );
|
|
addIndicator ( "beach" , IND_NAME , 1.0 );
|
|
addIndicator ( "bluff" , IND_NAME , 1.0 );
|
|
addIndicator ( "bog" , IND_NAME , 1.0 );
|
|
addIndicator ( "boundary" , IND_NAME , 1.0 );
|
|
addIndicator ( "branch" , IND_NAME , 1.0 );
|
|
addIndicator ( "bridge" , IND_NAME , 1.0 );
|
|
addIndicator ( "brook" , IND_NAME , 1.0 );
|
|
addIndicator ( "building" , IND_NAME , 1.0 );
|
|
addIndicator ( "bunker" , IND_NAME , 1.0 );
|
|
addIndicator ( "burro" , IND_NAME , 1.0 );
|
|
addIndicator ( "butte" , IND_NAME , 1.0 );
|
|
addIndicator ( "cabin" , IND_NAME , 1.0 );
|
|
addIndicator ( "camp" , IND_NAME , 1.0 );
|
|
addIndicator ( "campground" , IND_NAME , 1.0 );
|
|
addIndicator ( "campgrounds" , IND_NAME , 1.0 );
|
|
addIndicator ( "campus" , IND_NAME , 1.0 );
|
|
addIndicator ( "canal" , IND_NAME , 1.0 );
|
|
addIndicator ( "canyon" , IND_NAME , 1.0 );
|
|
addIndicator ( "casa" , IND_NAME , 1.0 );
|
|
addIndicator ( "castle" , IND_NAME , 1.0 );
|
|
addIndicator ( "cathedral" , IND_NAME , 1.0 );
|
|
addIndicator ( "cave" , IND_NAME , 1.0 );
|
|
addIndicator ( "cemetery" , IND_NAME , 1.0 );
|
|
addIndicator ( "center" , IND_NAME , 1.0 );
|
|
addIndicator ( "centre" , IND_NAME , 1.0 );
|
|
// "channel 13 news"?
|
|
//addIndicator ( "channel" , IND_NAME , 1.0 );
|
|
addIndicator ( "chapel" , IND_NAME , 1.0 );
|
|
addIndicator ( "church" , IND_NAME , 1.0 );
|
|
// "bible study circle"
|
|
//addIndicator ( "circle" , IND_NAME , 1.0 );
|
|
addIndicator ( "cliffs" , IND_NAME , 1.0 );
|
|
addIndicator ( "clinic" , IND_NAME , 1.0 );
|
|
addIndicator ( "college" , IND_NAME , 1.0 );
|
|
addIndicator ( "company" , IND_NAME , 1.0 );
|
|
addIndicator ( "complex" , IND_NAME , 1.0 );
|
|
addIndicator ( "corner" , IND_NAME , 1.0 );
|
|
addIndicator ( "cottage" , IND_NAME , 1.0 );
|
|
addIndicator ( "course" , IND_NAME , 1.0 ); // golf
|
|
addIndicator ( "courthouse" , IND_NAME , 1.0 );
|
|
addIndicator ( "courtyard" , IND_NAME , 1.0 );
|
|
addIndicator ( "cove" , IND_NAME , 1.0 );
|
|
addIndicator ( "creek" , IND_NAME , 1.0 );
|
|
addIndicator ( "dam" , IND_NAME , 1.0 );
|
|
addIndicator ( "den" , IND_NAME , 1.0 );
|
|
addIndicator ( "department" , IND_NAME , 1.0 );
|
|
addIndicator ( "depot" , IND_NAME , 1.0 );
|
|
addIndicator ( "dome" , IND_NAME , 1.0 );
|
|
addIndicator ( "downs" , IND_NAME , 1.0 );
|
|
addIndicator ( "fair" , IND_NAME , 1.0 );
|
|
addIndicator ( "fairgrounds" , IND_NAME , 1.0 );
|
|
addIndicator ( "fairground" , IND_NAME , 1.0 );
|
|
addIndicator ( "falls" , IND_NAME , 1.0 );
|
|
addIndicator ( "farm" , IND_NAME , 1.0 );
|
|
addIndicator ( "farms" , IND_NAME , 1.0 );
|
|
addIndicator ( "field" , IND_NAME , 1.0 );
|
|
addIndicator ( "fields" , IND_NAME , 1.0 );
|
|
addIndicator ( "flat" , IND_NAME , 1.0 );
|
|
addIndicator ( "flats" , IND_NAME , 1.0 );
|
|
addIndicator ( "forest" , IND_NAME , 1.0 );
|
|
addIndicator ( "fort" , IND_NAME , 1.0 );
|
|
addIndicator ( "fountain" , IND_NAME , 1.0 );
|
|
addIndicator ( "garden" , IND_NAME , 1.0 );
|
|
addIndicator ( "gardens" , IND_NAME , 1.0 );
|
|
addIndicator ( "gate" , IND_NAME , 1.0 );
|
|
addIndicator ( "glacier" , IND_NAME , 1.0 );
|
|
addIndicator ( "graveyard" , IND_NAME , 1.0 );
|
|
addIndicator ( "gulch" , IND_NAME , 1.0 );
|
|
addIndicator ( "gully" , IND_NAME , 1.0 );
|
|
addIndicator ( "hacienda" , IND_NAME , 1.0 );
|
|
addIndicator ( "hall" , IND_NAME , 1.0 );
|
|
addIndicator ( "halls" , IND_NAME , 1.0 );
|
|
addIndicator ( "harbor" , IND_NAME , 1.0 );
|
|
addIndicator ( "harbour" , IND_NAME , 1.0 );
|
|
addIndicator ( "hatchery" , IND_NAME , 1.0 );
|
|
addIndicator ( "headquarters" , IND_NAME , 1.0 );
|
|
addIndicator ( "heights" , IND_NAME , 1.0 );
|
|
addIndicator ( "heliport" , IND_NAME , 1.0 );
|
|
addIndicator ( "hill" , IND_NAME , 1.0 );
|
|
addIndicator ( "hillside" , IND_NAME , 1.0 );
|
|
addIndicator ( "hilton" , IND_NAME , 1.0 );
|
|
addIndicator ( "historical" , IND_NAME , 1.0 );
|
|
addIndicator ( "historic" , IND_NAME , 1.0 );
|
|
addIndicator ( "holy" , IND_NAME , 1.0 );
|
|
addIndicator ( "home" , IND_NAME , 1.0 );
|
|
addIndicator ( "homestead" , IND_NAME , 1.0 );
|
|
addIndicator ( "horn" , IND_NAME , 1.0 );
|
|
addIndicator ( "hospital" , IND_NAME , 1.0 );
|
|
addIndicator ( "hotel" , IND_NAME , 1.0 );
|
|
addIndicator ( "house" , IND_NAME , 1.0 );
|
|
addIndicator ( "howard" , IND_NAME , 1.0 ); // johnson's
|
|
addIndicator ( "inlet" , IND_NAME , 1.0 );
|
|
addIndicator ( "inn" , IND_NAME , 1.0 );
|
|
addIndicator ( "institute" , IND_NAME , 1.0 );
|
|
addIndicator ( "international" , IND_NAME , 1.0 );
|
|
addIndicator ( "isla" , IND_NAME , 1.0 );
|
|
addIndicator ( "island" , IND_NAME , 1.0 );
|
|
addIndicator ( "isle" , IND_NAME , 1.0 );
|
|
addIndicator ( "islet" , IND_NAME , 1.0 );
|
|
addIndicator ( "junction" , IND_NAME , 1.0 );
|
|
addIndicator ( "knoll" , IND_NAME , 1.0 );
|
|
addIndicator ( "lagoon" , IND_NAME , 1.0 );
|
|
addIndicator ( "laguna" , IND_NAME , 1.0 );
|
|
addIndicator ( "lake" , IND_NAME , 1.0 );
|
|
addIndicator ( "landing" , IND_NAME , 1.0 );
|
|
addIndicator ( "ledge" , IND_NAME , 1.0 );
|
|
addIndicator ( "lighthouse" , IND_NAME , 1.0 );
|
|
addIndicator ( "lodge" , IND_NAME , 1.0 );
|
|
addIndicator ( "lookout" , IND_NAME , 1.0 );
|
|
addIndicator ( "mall" , IND_NAME , 1.0 ); // added
|
|
addIndicator ( "manor" , IND_NAME , 1.0 );
|
|
addIndicator ( "marina" , IND_NAME , 1.0 );
|
|
addIndicator ( "meadow" , IND_NAME , 1.0 );
|
|
addIndicator ( "mine" , IND_NAME , 1.0 );
|
|
addIndicator ( "mines" , IND_NAME , 1.0 );
|
|
addIndicator ( "monument" , IND_NAME , 1.0 );
|
|
addIndicator ( "motel" , IND_NAME , 1.0 );
|
|
addIndicator ( "museum" , IND_NAME , 1.0 );
|
|
addIndicator ( "office" , IND_NAME , 1.0 );
|
|
addIndicator ( "outlet" , IND_NAME , 1.0 );
|
|
addIndicator ( "palace" , IND_NAME , 1.0 );
|
|
addIndicator ( "park" , IND_NAME , 1.0 );
|
|
addIndicator ( "peaks" , IND_NAME , 1.0 );
|
|
addIndicator ( "peninsula" , IND_NAME , 1.0 );
|
|
addIndicator ( "pit" , IND_NAME , 1.0 );
|
|
|
|
addIndicator ( "place" , IND_STREET , 1.0 ); // leroy place
|
|
addIndicator ( "pl" , IND_STREET , 1.0 ); // place
|
|
|
|
addIndicator ( "plains" , IND_NAME , 1.0 );
|
|
addIndicator ( "plant" , IND_NAME , 1.0 );
|
|
addIndicator ( "plantation" , IND_NAME , 1.0 );
|
|
addIndicator ( "plateau" , IND_NAME , 1.0 );
|
|
addIndicator ( "playa" , IND_NAME , 1.0 );
|
|
addIndicator ( "plaza" , IND_NAME , 1.0 );
|
|
addIndicator ( "point" , IND_NAME , 1.0 );
|
|
addIndicator ( "pointe" , IND_NAME , 1.0 );
|
|
addIndicator ( "pond" , IND_NAME , 1.0 );
|
|
addIndicator ( "port" , IND_NAME , 1.0 );
|
|
addIndicator ( "ramada" , IND_NAME , 1.0 );
|
|
addIndicator ( "ranch" , IND_NAME , 1.0 );
|
|
addIndicator ( "rancho" , IND_NAME , 1.0 );
|
|
addIndicator ( "range" , IND_NAME , 1.0 );
|
|
addIndicator ( "reef" , IND_NAME , 1.0 );
|
|
addIndicator ( "refure" , IND_NAME , 1.0 );
|
|
addIndicator ( "reserve" , IND_NAME , 1.0 );
|
|
addIndicator ( "reservoir" , IND_NAME , 1.0 );
|
|
addIndicator ( "residence" , IND_NAME , 1.0 );
|
|
addIndicator ( "resort" , IND_NAME , 1.0 );
|
|
//addIndicator ( "rio" , IND_NAME , 1.0 );
|
|
//addIndicator ( "river" , IND_NAME , 1.0 );
|
|
//addIndicator ( "riverside" , IND_NAME , 1.0 );
|
|
//addIndicator ( "riverview" , IND_NAME , 1.0 );
|
|
// was getting "rock bands"
|
|
//addIndicator ( "rock" , IND_NAME , 1.0 );
|
|
addIndicator ( "sands" , IND_NAME , 1.0 ); // added
|
|
addIndicator ( "sawmill" , IND_NAME , 1.0 );
|
|
addIndicator ( "school" , IND_NAME , 1.0 );
|
|
// try to fix hadcolon algo for
|
|
// The+Webb+Schools:+Calendars+...
|
|
addIndicator ( "schools" , IND_NAME , 1.0 );
|
|
addIndicator ( "schoolhouse" , IND_NAME , 1.0 );
|
|
addIndicator ( "shore" , IND_NAME , 1.0 );
|
|
addIndicator ( "spa" , IND_NAME , 1.0 );
|
|
addIndicator ( "spring" , IND_NAME , 1.0 );
|
|
addIndicator ( "springs" , IND_NAME , 1.0 );
|
|
addIndicator ( "stadium" , IND_NAME , 1.0 );
|
|
addIndicator ( "station" , IND_NAME , 1.0 );
|
|
addIndicator ( "strip" , IND_NAME , 1.0 );
|
|
addIndicator ( "suites" , IND_NAME , 1.0 );
|
|
addIndicator ( "temple" , IND_NAME , 1.0 );
|
|
addIndicator ( "terrace" , IND_NAME , 1.0 );
|
|
addIndicator ( "tower" , IND_NAME , 1.0 );
|
|
//addIndicator ( "trail" , IND_NAME , 1.0 );
|
|
addIndicator ( "travelodge" , IND_NAME , 1.0 );
|
|
addIndicator ( "triangle" , IND_NAME , 1.0 );
|
|
addIndicator ( "tunnel" , IND_NAME , 1.0 );
|
|
addIndicator ( "university" , IND_NAME , 1.0 );
|
|
//addIndicator ( "valley" , IND_NAME , 1.0 );
|
|
addIndicator ( "wall" , IND_NAME , 1.0 );
|
|
addIndicator ( "ward" , IND_NAME , 1.0 );
|
|
addIndicator ( "waterhole" , IND_NAME , 1.0 );
|
|
addIndicator ( "waters" , IND_NAME , 1.0 );
|
|
addIndicator ( "well" , IND_NAME , 1.0 );
|
|
addIndicator ( "wells" , IND_NAME , 1.0 );
|
|
addIndicator ( "wilderness" , IND_NAME , 1.0 );
|
|
addIndicator ( "windmill" , IND_NAME , 1.0 );
|
|
addIndicator ( "woodland" , IND_NAME , 1.0 );
|
|
addIndicator ( "woods" , IND_NAME , 1.0 );
|
|
|
|
|
|
// good stuff i added
|
|
// some from http://www.geonames.org/export/codes.html
|
|
addIndicator ( "gallery" , IND_NAME , 1.0 );
|
|
addIndicator ( "theater" , IND_NAME , 1.0 );
|
|
addIndicator ( "theatre" , IND_NAME , 1.0 );
|
|
addIndicator ( "playhouse" , IND_NAME , 1.0 );
|
|
addIndicator ( "saloon" , IND_NAME , 1.0 );
|
|
addIndicator ( "nightclub" , IND_NAME , 1.0 );
|
|
addIndicator ( "lounge" , IND_NAME , 1.0 );
|
|
addIndicator ( "ultralounge" , IND_NAME , 1.0 );
|
|
addIndicator ( "brewery" , IND_NAME , 1.0 );
|
|
addIndicator ( "chophouse" , IND_NAME , 1.0 );
|
|
addIndicator ( "tavern" , IND_NAME , 1.0 );
|
|
addIndicator ( "company" , IND_NAME , 1.0 );
|
|
addIndicator ( "rotisserie" , IND_NAME , 1.0 );
|
|
addIndicator ( "bistro" , IND_NAME , 1.0 );
|
|
addIndicator ( "parlor" , IND_NAME , 1.0 );
|
|
addIndicator ( "studio" , IND_NAME , 1.0 );
|
|
addIndicator ( "studios" , IND_NAME , 1.0 );
|
|
// albuquerque publishing co., short for "company"
|
|
addIndicator ( "co" , IND_NAME , 0.9 );
|
|
addIndicator ( "bureau" , IND_NAME , 1.0 );
|
|
addIndicator ( "estates" , IND_NAME , 1.0 );
|
|
addIndicator ( "dockyard" , IND_NAME , 1.0 );
|
|
addIndicator ( "gym" , IND_NAME , 1.0 );
|
|
addIndicator ( "synagogue" , IND_NAME , 1.0 );
|
|
addIndicator ( "shrine" , IND_NAME , 1.0 );
|
|
addIndicator ( "mosque" , IND_NAME , 1.0 );
|
|
addIndicator ( "store" , IND_NAME , 1.0 );
|
|
addIndicator ( "mercantile" , IND_NAME , 1.0 );
|
|
addIndicator ( "mart" , IND_NAME , 1.0 );
|
|
addIndicator ( "amphitheatre" , IND_NAME , 1.0 );
|
|
addIndicator ( "kitchen" , IND_NAME , 1.0 );
|
|
addIndicator ( "casino" , IND_NAME , 1.0 );
|
|
addIndicator ( "diner" , IND_NAME , 1.0 );
|
|
addIndicator ( "eatery" , IND_NAME , 1.0 );
|
|
addIndicator ( "shop" , IND_NAME , 1.0 );
|
|
addIndicator ( "inc" , IND_NAME , 1.0 ); // incorporated
|
|
addIndicator ( "incorporated" , IND_NAME , 1.0 );
|
|
addIndicator ( "corporation" , IND_NAME , 1.0 );
|
|
addIndicator ( "limited" , IND_NAME , 1.0 );
|
|
addIndicator ( "llc" , IND_NAME , 1.0 );
|
|
addIndicator ( "foundation" , IND_NAME , 1.0 );
|
|
addIndicator ( "warehouse" , IND_NAME , 1.0 );
|
|
addIndicator ( "roadhouse" , IND_NAME , 1.0 );
|
|
addIndicator ( "foods" , IND_NAME , 1.0 );
|
|
addIndicator ( "cantina" , IND_NAME , 1.0 );
|
|
addIndicator ( "steakhouse" , IND_NAME , 1.0 );
|
|
addIndicator ( "smokehouse" , IND_NAME , 1.0 );
|
|
addIndicator ( "deli" , IND_NAME , 1.0 );
|
|
addIndicator ( "enterprises" , IND_NAME , 1.0 );
|
|
addIndicator ( "repair" , IND_NAME , 1.0 );
|
|
addIndicator ( "service" , IND_NAME , 1.0 );
|
|
addIndicator ( "services" , IND_NAME , 1.0 );
|
|
addIndicator ( "systems" , IND_NAME , 1.0 );
|
|
addIndicator ( "salon" , IND_NAME , 1.0 );
|
|
addIndicator ( "boutique" , IND_NAME , 1.0 );
|
|
addIndicator ( "preschool" , IND_NAME , 1.0 );
|
|
addIndicator ( "galleries" , IND_NAME , 1.0 );
|
|
addIndicator ( "bakery" , IND_NAME , 1.0 );
|
|
addIndicator ( "factory" , IND_NAME , 1.0 );
|
|
addIndicator ( "llp" , IND_NAME , 1.0 );
|
|
addIndicator ( "attorney" , IND_NAME , 1.0 );
|
|
addIndicator ( "association" , IND_NAME , 1.0 );
|
|
addIndicator ( "solutions" , IND_NAME , 1.0 );
|
|
addIndicator ( "facility" , IND_NAME , 1.0 );
|
|
addIndicator ( "cannery" , IND_NAME , 1.0 );
|
|
addIndicator ( "mill" , IND_NAME , 1.0 );
|
|
addIndicator ( "quarry" , IND_NAME , 1.0 );
|
|
addIndicator ( "monastery" , IND_NAME , 1.0 );
|
|
addIndicator ( "observatory" , IND_NAME , 1.0 );
|
|
addIndicator ( "nursery" , IND_NAME , 1.0 );
|
|
addIndicator ( "pagoda" , IND_NAME , 1.0 );
|
|
addIndicator ( "pier" , IND_NAME , 1.0 );
|
|
addIndicator ( "prison" , IND_NAME , 1.0 );
|
|
addIndicator ( "post" , IND_NAME , 1.0 );
|
|
addIndicator ( "ruin" , IND_NAME , 1.0 );
|
|
addIndicator ( "ruins" , IND_NAME , 1.0 );
|
|
addIndicator ( "storehouse" , IND_NAME , 1.0 );
|
|
addIndicator ( "square" , IND_NAME , 1.0 );
|
|
addIndicator ( "tomb" , IND_NAME , 1.0 );
|
|
addIndicator ( "wharf" , IND_NAME , 1.0 );
|
|
addIndicator ( "zoo" , IND_NAME , 1.0 );
|
|
addIndicator ( "mesa" , IND_NAME , 1.0 );
|
|
addIndicator ( "pass" , IND_NAME , 1.0 );
|
|
addIndicator ( "passage" , IND_NAME , 1.0 );
|
|
addIndicator ( "peak" , IND_NAME , 1.0 );
|
|
addIndicator ( "vineyard" , IND_NAME , 1.0 );
|
|
addIndicator ( "grove" , IND_NAME , 1.0 );
|
|
//addIndicator ( "" , IND_NAME , 1.0 );
|
|
|
|
|
|
|
|
// maple street dance space
|
|
addIndicator ( "space" , IND_NAME , 1.0 );
|
|
addIndicator ( "library" , IND_NAME , 1.0 );
|
|
addIndicator ( "school" , IND_NAME , 1.0 );
|
|
addIndicator ( "church" , IND_NAME , 1.0 );
|
|
addIndicator ( "park" , IND_NAME , 1.0 );
|
|
addIndicator ( "house" , IND_NAME , 1.0 );
|
|
// markets are sometimes more of events than place names
|
|
addIndicator ( "market" , IND_NAME , 0.5 );
|
|
addIndicator ( "marketplace" , IND_NAME , 0.75 );
|
|
addIndicator ( "university" , IND_NAME , 1.0 );
|
|
addIndicator ( "center" , IND_NAME , 1.0 );
|
|
addIndicator ( "restaurant" , IND_NAME , 1.0 );
|
|
//addIndicator ( "bar" , IND_NAME , 1.0 );
|
|
addIndicator ( "grill" , IND_NAME , 1.0 );
|
|
addIndicator ( "grille" , IND_NAME , 1.0 );
|
|
addIndicator ( "cafe" , IND_NAME , 1.0 );
|
|
addIndicator ( "cabana" , IND_NAME , 1.0 );
|
|
addIndicator ( "shack" , IND_NAME , 1.0 );
|
|
addIndicator ( "shoppe" , IND_NAME , 1.0 );
|
|
addIndicator ( "collesium" , IND_NAME , 1.0 );
|
|
addIndicator ( "colliseum" , IND_NAME , 1.0 );
|
|
addIndicator ( "pavilion" , IND_NAME , 1.0 );
|
|
// cafe with accent mark
|
|
char tmp[64];
|
|
sprintf(tmp,"caf"); tmp[3]=0xc3; tmp[4]=0xa9; tmp[5]=0;
|
|
addIndicator ( tmp , IND_NAME , 1.0 );
|
|
|
|
// Less effective place name indicators
|
|
addIndicator ( "club" , IND_NAME , 0.5 );
|
|
|
|
|
|
|
|
// . now add some more indicators to g_cities hash table
|
|
// . TODO: get these in other languages. use wikipedia page!
|
|
addIndicator ( "suite" , IND_SUITE , 1.0 );
|
|
addIndicator ( "ste" , IND_SUITE , 1.0 );
|
|
addIndicator ( "room" , IND_SUITE , 1.0 );
|
|
addIndicator ( "pier" , IND_SUITE , 1.0 );
|
|
addIndicator ( "department" , IND_SUITE , 0.5 );
|
|
addIndicator ( "rm" , IND_SUITE , 1.0 );
|
|
addIndicator ( "floor" , IND_SUITE , 1.0 );
|
|
addIndicator ( "bldg" , IND_SUITE , 1.0 );
|
|
addIndicator ( "bld" , IND_SUITE , 1.0 );
|
|
addIndicator ( "building" , IND_SUITE , 1.0 );
|
|
addIndicator ( "apartment" , IND_SUITE , 1.0 );
|
|
addIndicator ( "apt" , IND_SUITE , 1.0 );
|
|
addIndicator ( "po" , IND_SUITE , 1.0 );
|
|
addIndicator ( "pobox" , IND_SUITE , 1.0 );
|
|
//addIndicator("p.o. box" , IND_SUITE , 1.0 );
|
|
addIndicator ( "box" , IND_SUITE , 1.0 );
|
|
addIndicator ( "postbus" , IND_SUITE , 1.0 ); // european
|
|
addIndicator ( "post" , IND_SUITE , 1.0 ); // european
|
|
addIndicator ( "bus" , IND_SUITE , 1.0 ); // european
|
|
addIndicator ( "private" , IND_SUITE , 1.0 ); // australia
|
|
addIndicator ( "box" , IND_SUITE , 1.0 ); // australia
|
|
|
|
// TODO: get these in other languages. use wikipedia page!
|
|
addIndicator ( "north" , IND_DIR , 1.0 );
|
|
addIndicator ( "east" , IND_DIR , 1.0 );
|
|
addIndicator ( "south" , IND_DIR , 1.0 );
|
|
addIndicator ( "west" , IND_DIR , 1.0 );
|
|
|
|
addIndicator ( "northeast" , IND_DIR , 1.0 );
|
|
addIndicator ( "northwest" , IND_DIR , 1.0 );
|
|
addIndicator ( "southeast" , IND_DIR , 1.0 );
|
|
addIndicator ( "southwest" , IND_DIR , 1.0 );
|
|
|
|
addIndicator ( "north" , IND_DIR , 1.0 );
|
|
addIndicator ( "east" , IND_DIR , 1.0 );
|
|
addIndicator ( "south" , IND_DIR , 1.0 );
|
|
addIndicator ( "west" , IND_DIR , 1.0 );
|
|
|
|
addIndicator ( "n" , IND_DIR , 1.0 );
|
|
addIndicator ( "s" , IND_DIR , 1.0 );
|
|
addIndicator ( "e" , IND_DIR , 1.0 );
|
|
addIndicator ( "w" , IND_DIR , 1.0 );
|
|
addIndicator ( "ne" , IND_DIR , 1.0 );
|
|
addIndicator ( "nw" , IND_DIR , 1.0 );
|
|
addIndicator ( "se" , IND_DIR , 1.0 );
|
|
addIndicator ( "sw" , IND_DIR , 1.0 );
|
|
|
|
// TODO: get in other languages
|
|
addIndicator ( "highway" , IND_STREET , 1.0 );
|
|
addIndicator ( "hghway" , IND_STREET , 1.0 );
|
|
addIndicator ( "hiway" , IND_STREET , 1.0 );
|
|
addIndicator ( "hway" , IND_STREET , 1.0 );
|
|
addIndicator ( "hwy" , IND_STREET , 1.0 );
|
|
|
|
// county road
|
|
//addIndicator ( "cr" , IND_STREET , 1.0 );
|
|
// state route
|
|
//addIndicator ( "route" , IND_STREET , 1.0 );
|
|
|
|
addIndicator ( "avenue" , IND_STREET , 1.0 );
|
|
addIndicator ( "ave" , IND_STREET , 1.0 );
|
|
addIndicator ( "drive" , IND_STREET , 1.0 );
|
|
addIndicator ( "dr" , IND_STREET , 1.0 );
|
|
addIndicator ( "ln" , IND_STREET , 1.0 );
|
|
addIndicator ( "lane" , IND_STREET , 1.0 );
|
|
addIndicator ( "blvd" , IND_STREET , 1.0 );
|
|
addIndicator ( "boulevard" , IND_STREET , 1.0 );
|
|
addIndicator ( "street" , IND_STREET , 1.0 );
|
|
addIndicator ( "st" , IND_STREET , 1.0 );
|
|
addIndicator ( "circle" , IND_STREET , 1.0 );
|
|
addIndicator ( "place" , IND_STREET , 1.0 );
|
|
addIndicator ( "parkway" , IND_STREET , 1.0 );
|
|
addIndicator ( "pkway" , IND_STREET , 1.0 );
|
|
addIndicator ( "pkwy" , IND_STREET , 1.0 );
|
|
addIndicator ( "straße", IND_STREET , 1.0 ); //!test this!
|
|
addIndicator ( "strasse" , IND_STREET , 1.0 );
|
|
addIndicator ( "sr" , IND_STREET , 1.0 ); // state route
|
|
|
|
addIndicator ( "trail" , IND_STREET , 1.0 );
|
|
// 80 mosby's run
|
|
addIndicator ( "run" , IND_STREET , 1.0 );
|
|
addIndicator ( "entrada" , IND_STREET , 1.0 );
|
|
|
|
// these were taken from http://en.wikipedia.org/wiki/Street_name
|
|
addIndicator ( "Autobahn" , IND_STREET , 1.0 );
|
|
addIndicator ( "Auto-estrada" , IND_STREET , 1.0 );
|
|
addIndicator ( "Autoroute" , IND_STREET , 1.0 );
|
|
addIndicator ( "Autostrada" , IND_STREET , 1.0 );
|
|
addIndicator ( "Autostrasse" , IND_STREET , 1.0 );
|
|
addIndicator ( "Byway" , IND_STREET , 1.0 );
|
|
addIndicator ( "Expressway" , IND_STREET , 1.0 );
|
|
addIndicator ( "Freeway" , IND_STREET , 1.0 );
|
|
addIndicator ( "Motorway" , IND_STREET , 1.0 );
|
|
addIndicator ( "Pike" , IND_STREET , 1.0 );
|
|
addIndicator ( "Avenue" , IND_STREET , 1.0 );
|
|
addIndicator ( "Boulevard" , IND_STREET , 1.0 );
|
|
addIndicator ( "Road" , IND_STREET , 1.0 );
|
|
addIndicator ( "rd" , IND_STREET , 1.0 );
|
|
addIndicator ( "Street" , IND_STREET , 1.0 );
|
|
|
|
addIndicator ( "Alley" , IND_STREET , 1.0 );
|
|
addIndicator ( "Bay" , IND_STREET , 1.0 );
|
|
addIndicator ( "Drive" , IND_STREET , 1.0 );
|
|
addIndicator ( "Fairway" , IND_STREET , 1.0 );
|
|
addIndicator ( "Gardens" , IND_STREET , 1.0 );
|
|
addIndicator ( "Gate" , IND_STREET , 1.0 );
|
|
addIndicator ( "Grove" , IND_STREET , 1.0 );
|
|
addIndicator ( "Heights" , IND_STREET , 1.0 );
|
|
addIndicator ( "Highlands" , IND_STREET , 1.0 );
|
|
addIndicator ( "Knoll" , IND_STREET , 1.0 );
|
|
addIndicator ( "Lane" , IND_STREET , 1.0 );
|
|
addIndicator ( "Manor" , IND_STREET , 1.0 );
|
|
addIndicator ( "Mews" , IND_STREET , 1.0 );
|
|
addIndicator ( "Passage" , IND_STREET , 1.0 );
|
|
addIndicator ( "Pathway" , IND_STREET , 1.0 );
|
|
addIndicator ( "Place" , IND_STREET , 1.0 );
|
|
addIndicator ( "Row" , IND_STREET , 1.0 );
|
|
addIndicator ( "Terrace" , IND_STREET , 1.0 );
|
|
addIndicator ( "Trail" , IND_STREET , 1.0 );
|
|
addIndicator ( "View" , IND_STREET , 1.0 );
|
|
addIndicator ( "Way" , IND_STREET , 1.0 );
|
|
|
|
addIndicator ( "Close" , IND_STREET , 1.0 );
|
|
addIndicator ( "Court" , IND_STREET , 1.0 );
|
|
addIndicator ( "Cove" , IND_STREET , 1.0 );
|
|
addIndicator ( "Croft" , IND_STREET , 1.0 );
|
|
addIndicator ( "Garth" , IND_STREET , 1.0 );
|
|
addIndicator ( "Green" , IND_STREET , 1.0 );
|
|
addIndicator ( "Lawn" , IND_STREET , 1.0 );
|
|
addIndicator ( "Nook" , IND_STREET , 1.0 );
|
|
addIndicator ( "Place" , IND_STREET , 1.0 );
|
|
|
|
addIndicator ( "Circle" , IND_STREET , 1.0 );
|
|
addIndicator ( "Crescent" , IND_STREET , 1.0 );
|
|
addIndicator ( "Loop" , IND_STREET , 1.0 );
|
|
addIndicator ( "Lp" , IND_STREET , 1.0 ); // abbreviation for loop
|
|
addIndicator ( "Oval" , IND_STREET , 1.0 );
|
|
addIndicator ( "Quadrant" , IND_STREET , 1.0 );
|
|
addIndicator ( "Square" , IND_STREET , 1.0 );
|
|
|
|
addIndicator ( "Canyon" , IND_STREET , 1.0 );
|
|
addIndicator ( "Causeway" , IND_STREET , 1.0 );
|
|
addIndicator ( "Grade" , IND_STREET , 1.0 );
|
|
addIndicator ( "Hill" , IND_STREET , 1.0 );
|
|
addIndicator ( "Mount" , IND_STREET , 1.0 );
|
|
addIndicator ( "Parkway" , IND_STREET , 1.0 );
|
|
addIndicator ( "Rise" , IND_STREET , 1.0 );
|
|
addIndicator ( "Vale" , IND_STREET , 1.0 );
|
|
|
|
addIndicator ( "Approach" , IND_STREET , 1.0 );
|
|
addIndicator ( "Bypass" , IND_STREET , 1.0 );
|
|
addIndicator ( "Esplanade" , IND_STREET , 1.0 );
|
|
addIndicator ( "Frontage road" , IND_STREET , 1.0 );
|
|
addIndicator ( "Parade" , IND_STREET , 1.0 );
|
|
addIndicator ( "Park" , IND_STREET , 1.0 );
|
|
addIndicator ( "Plaza" , IND_STREET , 1.0 );
|
|
addIndicator ( "Promenade" , IND_STREET , 1.0 );
|
|
addIndicator ( "Quay" , IND_STREET , 1.0 );
|
|
addIndicator ( "Stravenue" , IND_STREET , 1.0 );
|
|
// was matching intersection "8k run and walk"
|
|
//addIndicator ( "Walk" , IND_STREET , 1.0 );
|
|
// italy?
|
|
addIndicator ( "via" , IND_STREET , 1.0 );
|
|
|
|
|
|
// try to load places.dat. the new junk first
|
|
if ( ! loadPlaces ( ) ) return false;
|
|
|
|
// we do zips separate now! use wordId as the key
|
|
if ( ! g_zips.set ( 8,sizeof(ZipDesc),0,NULL,0,true,0,"tbl-zipcodes")){
|
|
char *xx=NULL;*xx=0; }
|
|
|
|
// zip codes reference city strings stored in this buffer
|
|
char *cityBuf = NULL;
|
|
int32_t cityBufSize = 0;
|
|
// load zip code table
|
|
bool loadedZips = false;
|
|
if ( g_zips.load ( g_hostdb.m_dir,"zips.dat",&cityBuf,&cityBufSize)) {
|
|
// sanity check
|
|
//if ( g_zips.m_numSlotsUsed != 89471 ) { char*xx=NULL;*xx=0;}
|
|
if ( g_zips.m_numSlotsUsed != 43595 ) { char*xx=NULL;*xx=0;}
|
|
loadedZips = true;
|
|
int64_t h = hash64 ( "87109" , 5 );
|
|
// test the zips table
|
|
if ( g_zips.getSlot ( &h ) < 0 ){char *xx=NULL;*xx=0; }
|
|
// . assign it
|
|
// . ZipDesc::m_cityOffset reference this buffer
|
|
g_cityBuf = cityBuf;
|
|
g_cityBufSize = cityBufSize;
|
|
}
|
|
|
|
// . quickly set the states
|
|
// . map each name of a state to its index into s_states[] array
|
|
g_states.set ( 8 , 4 , 256 , NULL , 0 , false , 0 ,"adm1tbl");
|
|
int32_t size = sizeof(s_states);
|
|
// item count
|
|
int32_t n = (int32_t)size/ sizeof(StateDesc);
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
// get it
|
|
StateDesc *sd = &s_states[i];
|
|
// get hash of abbr
|
|
int64_t h = hash64n ( sd->m_adm1 );
|
|
// make the value
|
|
//int32_t val = 0;
|
|
// shift up
|
|
//val <<= 8;
|
|
// or in the position
|
|
//val |= i;
|
|
// no dups
|
|
if ( g_states.isInTable ( &h ) ) { char *xx=NULL;*xx=0; }
|
|
// store it
|
|
if ( ! g_states.addKey ( &h , &sd ) ) { char*xx=NULL;*xx=0; }
|
|
// stop if done
|
|
if ( ! sd->m_name1 ) continue;
|
|
// then the second name
|
|
h = getWordXorHash ( sd->m_name1 );
|
|
// must be there
|
|
if ( ! h ) { char *xx=NULL;*xx=0; }
|
|
// flag it
|
|
//val = 1;
|
|
// shift up
|
|
//val <<= 8;
|
|
// or in the position
|
|
//val |= i;
|
|
// no dups
|
|
if ( g_states.isInTable ( &h ) ) { char *xx=NULL;*xx=0; }
|
|
// store it
|
|
if ( ! g_states.addKey ( &h , &sd ) ) { char*xx=NULL;*xx=0; }
|
|
// and the second name
|
|
if ( ! sd->m_name2 ) continue;
|
|
// then the second name
|
|
h = getWordXorHash ( sd->m_name2 );
|
|
// must be there
|
|
if ( ! h ) { char *xx=NULL;*xx=0; }
|
|
// flag it as second name
|
|
//val = 2;
|
|
// shift up
|
|
//val <<= 8;
|
|
// or in the position
|
|
//val |= i;
|
|
// no dups
|
|
if ( g_states.isInTable ( &h ) ) { char *xx=NULL;*xx=0; }
|
|
// store it
|
|
if ( ! g_states.addKey ( &h , &sd ) ) { char*xx=NULL;*xx=0; }
|
|
}
|
|
|
|
// . timezone table
|
|
// . hash of city and adm1 is the key
|
|
// . maps to a one byte timezone offset, usually negative
|
|
g_timeZones.set ( 4 ,
|
|
sizeof(CityStateDesc),// 1 byte date timezone offset
|
|
0 ,
|
|
NULL ,
|
|
0 ,
|
|
false , // dups?
|
|
0 , // niceness
|
|
"tbl-tzs" );
|
|
|
|
|
|
if ( loadedZips && !g_timeZones.load(g_hostdb.m_dir,"timezones.dat")){
|
|
log("places: failed to load timezones.dat");
|
|
loadedZips = false;
|
|
}
|
|
|
|
int32_t vv = 185747;
|
|
if ( g_timeZones.m_numSlotsUsed && g_timeZones.m_numSlotsUsed!=vv){
|
|
log("places: bad timezones.dat file %"INT32" != %"INT32"",
|
|
g_timeZones.m_numSlotsUsed,vv);
|
|
return false;
|
|
}
|
|
// sanity
|
|
if ( g_timeZones.m_numSlotsUsed ) {
|
|
char udst;
|
|
char tzoff;
|
|
tzoff = getTimeZone2 ( "houston", "tx", &udst );
|
|
if ( tzoff == UNKNOWN_TIMEZONE ) { char *xx=NULL;*xx=0; }
|
|
if ( tzoff != -5 ) { char *xx=NULL;*xx=0; }
|
|
tzoff = getTimeZone2 ( "woods hole", "ma", &udst );
|
|
if ( tzoff == UNKNOWN_TIMEZONE ) { char *xx=NULL;*xx=0; }
|
|
tzoff = getTimeZone2 ( "albuquerque", "nm", &udst );
|
|
if ( tzoff == UNKNOWN_TIMEZONE ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
|
|
|
|
// map a cityHash/state of an aliased city name to a normalized cityId
|
|
if ( ! g_aliases.set(4,4,128,NULL,0,false,0,"aliastab") )
|
|
return false;
|
|
|
|
// load the aliases
|
|
if ( loadedZips && g_aliases.load ( g_hostdb.m_dir , "aliases.dat")){
|
|
// match this
|
|
int32_t na = 11663;//11462;
|
|
// sanity check
|
|
if ( g_aliases.m_numSlotsUsed != na){char*xx=NULL;*xx=0;}
|
|
}
|
|
|
|
// . init the hash table
|
|
// . use an 8-byte hash for the key
|
|
// . xor the wids together for quick lookups
|
|
// . all subphrases that include the first word of the place name will
|
|
// be hashed, that way we know if we should hash further
|
|
// . also, we should allow dups!
|
|
// . use a 6 byte key (truncated wordId) to use up less space!
|
|
g_cities.set ( 8 , // keySize
|
|
sizeof(CityDesc) , // adm1 bit vector + mostpopcity
|
|
0 , // initial # slots
|
|
NULL , // initial buf
|
|
0 , // initial buf size
|
|
true , // allowDup keys?
|
|
0 , // niceness
|
|
"tbl-places" );
|
|
|
|
// try to load the binary hash table first
|
|
if ( loadedZips && g_cities.load ( g_hostdb.m_dir , "cities.dat" ) ) {
|
|
// sanity check
|
|
int32_t nc = 123347; // 123141;
|
|
if ( g_cities.m_numSlotsUsed != nc){char*xx=NULL;*xx=0;}
|
|
// another test
|
|
char *str;
|
|
//char *str = "nm";
|
|
//str = "madrid";
|
|
//int64_t h = hash64 (str,gbstrlen(str));
|
|
int64_t h = 0;
|
|
//h = hash64 ("santa",5);
|
|
//h ^= hash64 ("n",1);
|
|
h = hash64n ("jemez");
|
|
|
|
h <<= 1;
|
|
//h ^= hash64 ("fe",2);
|
|
//h ^= hash64 ("m",1);
|
|
h ^= hash64n("springs");
|
|
|
|
//str = "santa fe";
|
|
//str = "n.m.";
|
|
str = "jemez springs";
|
|
|
|
//h = hash64 ( "abq",3);
|
|
//str = "abq";
|
|
|
|
//h = hash64 ( "alb",3);
|
|
//str = "alb";
|
|
|
|
//str = "albuquerque";
|
|
//h = hash64 ( str,gbstrlen(str) );
|
|
|
|
str = "new york";
|
|
h = getWordXorHash ( str );
|
|
|
|
|
|
// make sure we got madrid nm
|
|
//int32_t slot = g_cities.getSlot ( &h );
|
|
|
|
//if ( slot < 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
CityDesc *cd = (CityDesc *)g_cities.getValue(&h);
|
|
if ( ! cd ) { char *xx=NULL;*xx=0; }
|
|
|
|
uint64_t abits = getAdm1Bits ( "ny" );
|
|
if ( ! ( cd->m_adm1Bits & abits ) ) { char *xx=NULL;*xx=0;}
|
|
|
|
// check city ids
|
|
int64_t abqh1 = getWordXorHash("abq");
|
|
int64_t abqh2 = getWordXorHash("albuquerque");
|
|
uint32_t cid1 = getCityId32(abqh1,"nm");
|
|
uint32_t cid2 = getCityId32(abqh2,"nm");
|
|
if ( cid1 != cid2 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// get nm
|
|
int64_t hnm = getWordXorHash("new mexico");
|
|
// get state descriptor
|
|
int32_t pos = getStateOffset ( &hnm );
|
|
// sanity
|
|
if ( pos < 0 ) { char *xx=NULL;*xx=0; }
|
|
// make bit mask
|
|
uint64_t mask = 1LL << pos;
|
|
// and in nm
|
|
if ( ! ((cd->m_adm1Bits) & mask) ) { char *xx=NULL;*xx=0;}
|
|
/*
|
|
// a nested loop
|
|
for ( ; slot >= 0 ; slot = g_cities.getNextSlot(slot,&h)) {
|
|
// get the place
|
|
pd = (PlaceDesc *)g_cities.getValueFromSlot(slot);
|
|
|
|
// map to alias?
|
|
if ( pd->m_bits & PLF_ALIAS )
|
|
pd=(PlaceDesc *)g_cities.getValueFromSlot(pd->getSlot());
|
|
|
|
if ( ! is_ascii(pd->m_adm1[0]) ||
|
|
! is_ascii(pd->m_adm1[1]) ) {
|
|
char *xx=NULL;*xx=0; }
|
|
// print it
|
|
log("places: h=%s adm1=%c%c ctry=%s",
|
|
str,
|
|
pd->m_adm1[0],
|
|
pd->m_adm1[1],
|
|
g_countryCode.getName(pd->m_crid-1));
|
|
}
|
|
*/
|
|
// now hash for zip code
|
|
//h = hash64Lower_a("BC",2);
|
|
//int64_t h1 = hash64("n",1);
|
|
//int64_t h2 = hash64("m",1);
|
|
//int64_t h3 = (h1<<1LL) ^ h2;
|
|
|
|
char *zstr = "87102";
|
|
h = hash64 ( zstr,gbstrlen(zstr));
|
|
//h = hash64 ("78404",5);
|
|
//slot = g_cities.getSlot ( &h );
|
|
int32_t slot = g_zips.getSlot ( &h );
|
|
|
|
//char *city="Corpus Christi";
|
|
char *city="Albuquerque";
|
|
int64_t ch = hash64Lower_utf8(city,gbstrlen(city));
|
|
//int32_t ch = (int32_t)(th64&0xffffffff);
|
|
log("places: %s hash = %"UINT64"",city,ch);
|
|
// a nested loop
|
|
for ( ; slot >= 0 ; slot = g_zips.getNextSlot(slot,&h)) {
|
|
// get the place
|
|
ZipDesc *zd;
|
|
zd = (ZipDesc *)g_zips.getValueFromSlot(slot);
|
|
// convert adm1 bit to adm1 code
|
|
StateDesc *sd = getStateDescFromBits(zd->m_adm1Bits);
|
|
// must be there
|
|
if ( ! sd ) { char *xx=NULL;*xx=0; }
|
|
//if(!is_ascii(zd->m_adm1[0]) ) {char *xx=NULL;*xx=0;}
|
|
// print it
|
|
log("places: h=%s cityhash=%"UINT64" adm1=%s "//adm1=%c%c "
|
|
"pd=0x%"PTRFMT"",
|
|
zstr,
|
|
zd->m_cityHash,
|
|
sd->m_name1,
|
|
//zd->m_adm1[0],
|
|
//zd->m_adm1[1],
|
|
//g_countryCode.getName(zd->m_crid-1),
|
|
(PTRTYPE)zd);
|
|
if ( zd->m_cityHash != ch ) { char*xx=NULL;*xx=0; }
|
|
}
|
|
// exit until we get "nm" and "bc" for british columbia!!!
|
|
//log("hey hey!!!!!!!!!!!!!!!!! fix me you");
|
|
//exit(-1);
|
|
// otherwise, we passed
|
|
//if ( loadedIndicators ) return true;
|
|
return true;
|
|
//loadedCities = true;
|
|
}
|
|
|
|
// let them know that we are creating it
|
|
logf(LOG_INFO,"places: creating cities.dat");
|
|
|
|
g_cities.reset();
|
|
g_zips.reset();
|
|
g_timeZones.reset();
|
|
g_aliases.reset();
|
|
//g_states.reset();
|
|
|
|
// init with 8M slots
|
|
//g_cities.set ( 6,sizeof(PlaceDesc),6950000,NULL,0,true,0);
|
|
// 1M since doing USA only now. now cities.dat is only 12MB not 100MB
|
|
// uses 731k slots
|
|
//g_cities.set ( 8,sizeof(PlaceDesc),100000,NULL,0,true,0,"placestbl");
|
|
|
|
// this now maps just a city to the state/adm1 bit vector of the states
|
|
// it is in... AND the one byte timezone offset
|
|
g_cities.set ( 8,sizeof(CityDesc),100000,NULL,0,false,0,"placestbl");
|
|
|
|
// we do zips separate now! use wordId as the key (89k used)
|
|
if ( ! g_zips.set ( 8,sizeof(ZipDesc),10000,NULL,0,true,0,"zipstbl")) {
|
|
char *xx=NULL;*xx=0; }
|
|
|
|
if (!g_timeZones.set(4,sizeof(CityStateDesc),100000,NULL,0,false,0,
|
|
"tbl99")){ char *xx=NULL;*xx=0;}
|
|
|
|
// map a cityHash/state of an aliased city name to a normalized cityId
|
|
if ( ! g_aliases.set(4,4,128,NULL,0,false,0,"aliastab") )
|
|
return false;
|
|
|
|
|
|
// keep track of max population for each city name and the state
|
|
// in which that max population occurs
|
|
HashTableX maxPops;
|
|
maxPops.set (8,4,100000,NULL,0,false,0,"poptbl");
|
|
|
|
|
|
//////////////////////////////////////////////////////////////
|
|
//
|
|
// LOAD THE allCountries.txt file
|
|
//
|
|
//////////////////////////////////////////////////////////////
|
|
|
|
// geonameid : integer id of record in geonames database
|
|
// name : name of geographical point (utf8) varchar(200)
|
|
// asciiname : name of geographical point in plain ascii
|
|
// characters, varchar(200)
|
|
// alternatenames : alternatenames, comma separated varchar(4000)
|
|
// (varchar(5000) for SQL Server)
|
|
// latitude : latitude in decimal degrees (wgs84)
|
|
// longitude : longitude in decimal degrees (wgs84)
|
|
// feature class : see http://www.geonames.org/export/codes.html,
|
|
// char(1)
|
|
// feature code : see http://www.geonames.org/export/codes.html,
|
|
// varchar(10)
|
|
// country code : ISO-3166 2-letter country code, 2 characters
|
|
// cc2 : alternate country codes, comma separated,
|
|
// ISO-3166 2-letter country code, 60 characters
|
|
// admin1 code : fipscode (subject to change to iso code),
|
|
// isocode for the us and ch, see file
|
|
// admin1Codes.txt for display names of this code;
|
|
// varchar(20)
|
|
// admin2 code : code for the second administrative division, a
|
|
// county in the US, see file admin2Codes.txt;
|
|
// varchar(80)
|
|
// admin3 code : code for third level administrative division,
|
|
// varchar(20)
|
|
// admin4 code : code for fourth level administrative division,
|
|
// varchar(20)
|
|
// population : bigint (4 byte int)
|
|
// elevation : in meters, integer
|
|
// gtopo30 : average elevation of 30'x30' (ca 900mx900m)
|
|
// area in meters, integer
|
|
// timezone : the timezone id (see file timeZone.txt)
|
|
// modification date : date of last modification in yyyy-MM-dd format
|
|
|
|
|
|
// . make the filename to open
|
|
// . downloaded from http://geonames.org/allCountries.zip ?
|
|
// . sample line =
|
|
// 3038840 Serrat de Ventader Serrat de Ventader 42.4833333
|
|
// 1.4333333 T MT AD 00
|
|
char ff[1024];
|
|
sprintf ( ff , "%sallCountries.txt", g_hostdb.m_dir );
|
|
// places.txt is just the United States
|
|
//sprintf ( ff , "%splaces.txt", g_hostdb.m_dir );
|
|
logf(LOG_INFO,"places: reading %s",ff);
|
|
FILE *fd = fopen ( ff, "r" );
|
|
if ( ! fd )
|
|
return log("places: failed to open %s: %s",ff,strerror(errno));
|
|
|
|
|
|
|
|
// count how many times we see each word for purposes of establishing
|
|
// the most common indicators of a place. i.e. "center", "square",...
|
|
//HashTableX ct;
|
|
// init with 8M places too
|
|
//ct.set ( 8 , 4 , 9300000,NULL,0,false,0 ,"addrcmmn");
|
|
|
|
// similar to "ct" but we incorporate latitude/longitude to restrict
|
|
// voting in order to remove "local words", like Edisto!
|
|
//HashTableX gvt;
|
|
//gvt.set ( 8 , 0 , 30000 ,NULL,0,false,0,"addrgvt" );
|
|
|
|
HashTableX popTable;
|
|
popTable.set ( 4,4,30000,NULL,0,false,0,"poptab");
|
|
|
|
|
|
int32_t badEntry = 0;
|
|
|
|
int32_t line = 0;
|
|
|
|
//int32_t MAX = 0;
|
|
|
|
// . go through the places in allCountries.txt
|
|
// . format described in /gb/geo/geonames/readme.txt
|
|
char buf[10000];
|
|
// for debugging
|
|
char *dbuf = buf;
|
|
|
|
//char topBuf[1000000];
|
|
//char *topBufPtr = topBuf;
|
|
// map a wid to a string ptr with this table, "st"
|
|
HashTableX st;
|
|
st.set ( 8 , 4 , 30000 , NULL,0,false,0 ,"addrst");
|
|
|
|
while ( fgets ( buf , 10000 , fd ) ) {
|
|
// tmp debug for postalCodes.txt
|
|
//break;
|
|
// length of line, including the terminating \n
|
|
int32_t wlen = gbstrlen(buf) ;
|
|
// sanity check
|
|
if ( wlen >= 9000 ) { char *xx=NULL;*xx=0; }
|
|
// skip if empty
|
|
if ( wlen <= 0 ) continue;
|
|
// null terminate it, instead of \n
|
|
buf[wlen-1]='\0';
|
|
|
|
// debug point
|
|
//char *poo = strstr(buf,"Town of North Hempstead" ); if (poo)
|
|
// log("hey");
|
|
|
|
// log it
|
|
if ( (line % 10000) == 0 )
|
|
log(LOG_INFO,"places: read line #%"INT32" out of "
|
|
"6,900,574 (%"INT32" places added)",line,
|
|
g_cities.m_numSlotsUsed);
|
|
line++;
|
|
|
|
// country id
|
|
uint8_t crid = 0;
|
|
// country code
|
|
char cc[3];
|
|
cc[0] = 0;
|
|
cc[1] = 0;
|
|
// admin1code
|
|
char a1[2];
|
|
// admin2code
|
|
//char a2[2];
|
|
// reset
|
|
a1[0] = a1[1] = 0;
|
|
// descriptive bits
|
|
//pbits_t bits = 0;
|
|
// place type
|
|
placetype_t ptype = 0;
|
|
// official name of the place
|
|
char *name = NULL;
|
|
// the ascii version
|
|
char *ascii = NULL;
|
|
// comma-separated abbreviations and alternative names
|
|
char *alt = NULL;
|
|
// stop after this char ptr
|
|
char *stop = NULL;
|
|
double latitude = 0.0;
|
|
double longitude = 0.0;
|
|
// population of the city/place
|
|
int32_t pop = 0;
|
|
// count tabs
|
|
int32_t tabs = 0;
|
|
// point to the beginning of the line
|
|
char *p = buf;
|
|
char tzoff = 0;
|
|
char useDST; // daylight savings time
|
|
// debug point
|
|
//if ( strncmp(buf,"2241297\t", 8) ==0 )
|
|
//if ( strncmp(buf,"3856157\t", 8) ==0 )
|
|
// log("gotit");
|
|
|
|
// parse out the tab delimited things from the line
|
|
for ( ; *p ; p++ ) {
|
|
// skip if no tab
|
|
if ( *p != '\t' ) continue;
|
|
// count tabs
|
|
tabs++;
|
|
// point "s" to right after the tab
|
|
char *s = p + 1;
|
|
// done?
|
|
if ( ! *s ) break;
|
|
// after first tab is the official place name
|
|
if ( tabs == 1 ) name = s;
|
|
// then the name in ascii
|
|
if ( tabs == 2 ) ascii = s;
|
|
// then comma-separated list of alternative names
|
|
if ( tabs == 3 ) alt = s;
|
|
// the latitude
|
|
if ( tabs == 4 ) {
|
|
// a stopping point for "alt"
|
|
stop = s;
|
|
// get it
|
|
latitude = atof(s);
|
|
}
|
|
// the longitude
|
|
if ( tabs == 5 ) {
|
|
// get it
|
|
longitude = atof(s);
|
|
}
|
|
// . the category of place is after the 6th tab
|
|
// . the specific type of place is after the 7th tab
|
|
// . see http://www.geonames.org/export/codes.html
|
|
// . to save mem, only hash certain types...
|
|
if ( tabs == 7 ) {
|
|
// this is usually a state in the U.S.
|
|
if ( ! strncmp(s,"ADM1",4) )
|
|
ptype = 0;//PT_STATE;
|
|
// this is usually a county in the U.S.
|
|
else if ( ! strncmp(s,"ADM2",4) )
|
|
ptype = 0;//PT_ADM2;
|
|
// third-level administrative division
|
|
else if ( ! strncmp(s,"ADM3",4) )
|
|
ptype = 0;//PT_ADM3;
|
|
// fourth-level administrative division
|
|
else if ( ! strncmp(s,"ADM4",4) )
|
|
ptype = 0;//PT_ADM4;
|
|
// populated place = city
|
|
else if ( ! strncmp(s,"PPL" ,3) )
|
|
ptype = PT_CITY;
|
|
// town of, township, etc.
|
|
// town of north hempstead
|
|
// . crap! this gets a different san jose!
|
|
else if ( ! strncmp(s,"ADMD" ,4) )
|
|
ptype = PT_CITY;
|
|
// locality
|
|
else if ( ! strncmp(s,"LCTY" ,4) )
|
|
ptype = PT_CITY;
|
|
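// section of independent political entity (PCLIX)
// NOTE(review): only the first 4 chars ("PCLI") are compared here, so
// this also matches plain "PCLI" and the PT_COUNTRY branch further down
// is never reached -- confirm whether that is intended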
|
|
else if ( ! strncmp(s,"PCLIX" ,4) )
|
|
ptype = PT_CITY;
|
|
else if ( ! strncmp(s,"P\t" ,2) )
|
|
ptype = PT_CITY;
|
|
// independent political entity = country
|
|
else if ( ! strncmp(s,"PCLI",4) )
|
|
ptype = PT_COUNTRY;
|
|
// allow schools (popular meeting place)
|
|
else if ( ! strncmp(s,"SCH",3) )
|
|
ptype = 0;//PT_SCH;
|
|
// and parks (popular meeting place)
|
|
else if ( ! strncmp(s,"PRK",3) )
|
|
ptype = 0;//PT_PRK;
|
|
}
|
|
// . country code (two letters)
|
|
// . sometimes things like a gulf of aden has no
|
|
// associated country code!
|
|
if ( tabs == 8 && s[0] != '\t' ) {
|
|
cc[0] = to_lower_a(s[0]);
|
|
cc[1] = to_lower_a(s[1]);
|
|
cc[2] = 0;
|
|
crid = getCountryId ( cc );
|
|
// sanity check
|
|
if ( s[2]!='\t'&&s[2]) { char *xx=NULL;*xx=0;}
|
|
continue;
|
|
}
|
|
// alternate country code (two letters)
|
|
if ( tabs == 9 && ! crid && s[0] != '\t' ) {
|
|
cc[0] = to_lower_a(s[0]);
|
|
cc[1] = to_lower_a(s[1]);
|
|
cc[2] = 0;
|
|
crid = getCountryId ( cc );
|
|
}
|
|
|
|
// . admin1 code (two letters)
|
|
// . readme.txt says varchar(20) but
|
|
// /gb/geo/admin1Codes.txt seems to say 2 chars
|
|
// . actually i have seen 3 letter ones... but they
|
|
// if truncated to two chars would be unique in their
|
|
// respective country. i.e. GB.ENG, GB.NIR, ...
|
|
// . BUT for GR.ESYE11 through GR.ESYE14, ... just use
|
|
// the last two chars!
|
|
if ( tabs == 10 ) {
|
|
// usually these 2 chars are digits!
|
|
a1[0] = to_lower_a(s[0]);
|
|
a1[1] = to_lower_a(s[1]);
|
|
// panic!
|
|
if ( s[2] == '\t' ) continue;
|
|
// watch out for GReece
|
|
if ( cc[0] != 'g' ) continue;
|
|
if ( cc[1] != 'r' ) continue;
|
|
// and its "states" (admin1 codes)
|
|
if ( a1[0] != 'e' ) continue;
|
|
if ( a1[1] != 's' ) continue;
|
|
// use the last two for this guy!
|
|
s += 4;
|
|
if ( ! is_digit(s[0]) ) continue;
|
|
if ( ! is_digit(s[1]) ) continue;
|
|
a1[0] = s[0];
|
|
a1[1] = s[1];
|
|
}
|
|
// pop is timezone - 3
|
|
if ( tabs == 14 ) {
|
|
// get it
|
|
pop = atol(s);
|
|
}
|
|
// timezone
|
|
if ( tabs == 17 ) {
|
|
char *tzname = p + 1;
|
|
// assume we use daylight saving time
|
|
useDST = 1;
|
|
// assume not found
|
|
tzoff = 0;
|
|
// find the end, a tab i guess or whitespace
|
|
char *e = tzname;
|
|
for ( ; *e && ! is_wspace_a(*e) ; e++ );
|
|
// temp null term
|
|
char saved = *e;
|
|
*e = '\0';
|
|
// convert to timezone offset
|
|
if ( ! strcmp(tzname,"America/Chicago") )
|
|
tzoff = -6;
|
|
else if ( ! strcmp(tzname,"America/Anchorage"))
|
|
tzoff = -9;
|
|
else if ( ! strcmp(tzname,"America/Indiana/Knox"))
|
|
tzoff = -5;
|
|
else if ( ! strcmp(tzname,"America/Kentucky/Monticello"))
|
|
tzoff = -5;
|
|
else if ( ! strcmp(tzname,"America/Boise"))
|
|
tzoff = -7;
|
|
else if ( ! strcmp(tzname,"America/Indiana/Indianapolis"))
|
|
tzoff = -5;
|
|
else if ( ! strcmp(tzname,"America/Indiana/Marengo"))
|
|
tzoff = -5;
|
|
else if ( ! strcmp(tzname,"America/Indiana/Petersburg"))
|
|
tzoff = -6;
|
|
else if ( ! strcmp(tzname,"America/Indiana/Tell_City"))
|
|
tzoff = -6;
|
|
|
|
else if ( ! strcmp(tzname,"America/Indiana/Vevay"))
|
|
tzoff = -5;
|
|
else if ( ! strcmp(tzname,"America/Indiana/Vincennes"))
|
|
tzoff = -5;
|
|
else if ( ! strcmp(tzname,"America/Indiana/Winamac"))
|
|
tzoff = -5;
|
|
else if ( ! strcmp(tzname,"America/Juneau"))
|
|
tzoff = -9;
|
|
else if ( ! strcmp(tzname,"America/Kentucky/Louisville"))
|
|
tzoff = -5;
|
|
else if ( ! strcmp(tzname,"America/Menominee"))
|
|
tzoff = -6;
|
|
else if ( ! strcmp(tzname,"America/Nome"))
|
|
tzoff = -9;
|
|
else if ( ! strcmp(tzname,"America/North_Dakota/Center"))
|
|
tzoff = -6;
|
|
else if ( ! strcmp(tzname,"America/North_Dakota/New_Salem"))
|
|
tzoff = -6;
|
|
else if ( ! strcmp(tzname,"America/Shiprock"))
|
|
tzoff = -7;
|
|
else if ( ! strcmp(tzname,"America/Yakutat"))
|
|
// could not find this - guessing
|
|
tzoff = -9;
|
|
|
|
else if ( ! strcmp(tzname,"America/Detroit"))
|
|
tzoff = -5;
|
|
else if ( !strcmp(tzname,"America/St_Thomas")){
|
|
tzoff = -4;
|
|
useDST = 0;
|
|
}
|
|
else if ( ! strcmp(tzname,"Pacific/Kwajalein"))
|
|
tzoff = -12;
|
|
|
|
|
|
else if ( ! strcmp(tzname,"America/Adak"))
|
|
tzoff = -10;
|
|
else if ( ! strcmp(tzname,"America/Phoenix")){
|
|
tzoff = -7; useDST = 0; }
|
|
else if ( ! strcmp(tzname,"America/Denver"))
|
|
tzoff = -7;
|
|
else if (!strcmp(tzname,"America/Los_Angeles"))
|
|
tzoff = -8;
|
|
else if ( ! strcmp(tzname,"America/New_York"))
|
|
tzoff = -5;
|
|
else if ( ! strcmp(tzname,"Pacific/Honolulu")){
|
|
tzoff = -10; useDST = 0; }
|
|
// amchitka in alaskan aleutian islands...
|
|
else if ( ! tzname[0] )
|
|
tzoff = 0;
|
|
else {
|
|
char *xx=NULL;*xx=0; }
|
|
// restore
|
|
*e = saved;
|
|
}
|
|
}
|
|
|
|
// break point
|
|
//if ( name && strncasecmp(name,"Madrid\t",7)==0 )
|
|
// log("hey");
|
|
|
|
// skip if not a place we are interested in
|
|
//if ( ! bits )
|
|
// continue;
|
|
|
|
if ( ! crid ) {
|
|
badEntry++;
|
|
log("places: bad country for "
|
|
"for %s",dbuf);
|
|
continue;
|
|
}
|
|
|
|
// must have all 4 things here:
|
|
if ( !a1[0] || ! name ) {
|
|
//log("places: %s does not have country of adm1",name);
|
|
badEntry++;
|
|
continue;
|
|
}
|
|
|
|
// skip all NON-USA places now that we are specializing
|
|
// no, now we had facebook events from all over, if they
|
|
// have a lat/lon! yeah, so let foreign cities through...
|
|
//if ( crid != CRID_US )continue;
|
|
|
|
// only store cities for now
|
|
if ( ! ptype ) continue;
|
|
// sanity check
|
|
if ( ! is_ascii(a1[0]) || ! is_ascii(a1[1]) ) {
|
|
//log("places: bad %s",name);
|
|
badEntry++;
|
|
continue;
|
|
}
|
|
// what is this???? i see "00"
|
|
if ( is_digit(a1[0]) ) continue;
|
|
|
|
uint64_t h_washington = hash64n ("washington");
|
|
uint64_t h_dc = hash64n ("dc");
|
|
uint64_t h_d = hash64n ("d");
|
|
uint64_t h_c = hash64n ("c");
|
|
uint64_t h_wdc = h_washington;
|
|
h_wdc <<= 1;
|
|
h_wdc ^= h_dc;
|
|
uint64_t h_wdc2 = h_washington;
|
|
h_wdc2 <<= 1;
|
|
h_wdc2 ^= h_d;
|
|
h_wdc2 <<= 1;
|
|
h_wdc2 ^= h_c;
|
|
|
|
// set nameEnd/asciiEnd/altEnd
|
|
char *nameEnd = name;
|
|
for (;nameEnd;nameEnd++)
|
|
if(*nameEnd ==','||*nameEnd=='\t'||!*nameEnd ) break;
|
|
char *asciiEnd = ascii;
|
|
for (;asciiEnd;asciiEnd++)
|
|
if(*asciiEnd ==','||*asciiEnd=='\t'||!*asciiEnd)break;
|
|
char *altEnd = alt;
|
|
for ( ; altEnd ; altEnd++ )
|
|
if (*altEnd==','||*altEnd=='\t'||!*altEnd) break;
|
|
// null terms
|
|
*nameEnd = '\0';
|
|
*asciiEnd = '\0';
|
|
*altEnd = '\0';
|
|
|
|
// ok, now we need to grab the place id in the file and
|
|
// use that to reference the alt names table we hashed up
|
|
// top. because that includes the language code of the
|
|
// altname!!!
|
|
// then we need to make a string like
|
|
// cs.en.nb.nn.sk=Egypt,fy.nl=Egypte,fi=Egypti
|
|
// and store that into a buffer for each place. then the
|
|
// city desc needs to references that buffer. we also hash
|
|
// every alt name to point to the same CityDesc or CountryDesc
|
|
// or StateDesc whichever type of place it is...
|
|
//
|
|
// MDW LEFT OFF HERE
|
|
|
|
uint64_t h = getWordXorHash ( name );
|
|
|
|
// hashes we added, to dedup
|
|
//HashTableX dt;
|
|
//char buf[10000];
|
|
//dt.set ( 6,0,100,buf,10000,false,0);
|
|
|
|
// do not add "washington, dc" as a city, treat
|
|
// dc as a state!!
|
|
if ( h == h_wdc )
|
|
continue;
|
|
if ( h == h_wdc2 )
|
|
continue;
|
|
// no dups!
|
|
//if ( dt.isInTable(&h ) ) continue;
|
|
// add it
|
|
//if ( ! dt.addKey(&h) ) { char *xx=NULL;*xx=0; }
|
|
|
|
// normalize this
|
|
char adm1[3];
|
|
adm1[0] = to_lower_a(a1[0]);
|
|
adm1[1] = to_lower_a(a1[1]);
|
|
adm1[2] = 0;
|
|
|
|
// use this now
|
|
uint32_t cid32 = (uint32_t)getCityId32(h,a1);
|
|
|
|
// we add 100 to the timeZoneOffset to indicate it
|
|
// does not use DST
|
|
//if ( useDST == 0 ) tzoff += 100;
|
|
|
|
// already in there?
|
|
int32_t slot = g_timeZones.getSlot ( &cid32 );
|
|
if ( slot >= 0 ) {
|
|
CityStateDesc *csd ;
|
|
csd = (CityStateDesc *)g_timeZones.
|
|
getValueFromSlot(slot);
|
|
char tv = csd->m_timeZoneOffset;
|
|
if ( tv != tzoff ) {
|
|
log("places: bad city timezone "
|
|
"csh=%"UINT32" z: %s",
|
|
(uint32_t)cid32,
|
|
name);
|
|
//char *xx=NULL;*xx=0; }
|
|
}
|
|
// get the pop from this
|
|
int32_t cpop = *(int32_t *)popTable.getValue ( &cid32 );
|
|
// if already in there, and this has more pop,
|
|
// then use it!
|
|
if ( pop > cpop ) {
|
|
csd->m_latitude = latitude;
|
|
csd->m_longitude = longitude;
|
|
popTable.addKey ( &cid32, &pop );
|
|
}
|
|
}
|
|
// timezone table maps city/state pair to a tzoffset
|
|
else {
|
|
// for each city/state pair we must store its
|
|
// lat/lon now too
|
|
CityStateDesc csd;
|
|
csd.m_timeZoneOffset = tzoff;
|
|
csd.m_useDST = useDST;
|
|
csd.m_latitude = latitude;
|
|
csd.m_longitude = longitude;
|
|
g_timeZones.addKey ( &cid32 , &csd );
|
|
popTable.addKey ( &cid32, &pop );
|
|
}
|
|
|
|
|
|
// add city name to the temporary hashtable of CityDescriptors.
|
|
// later we will serialize it into g_cityDescBuf and make
|
|
// the g_city hash table map ptrs into that. i think
|
|
// we can save it in cities.dat because HashTableX provides
|
|
// the mechanism for that.
|
|
addCity ( h , adm1 , pop , &maxPops );
|
|
|
|
// if the ascii hash is different, add as alias
|
|
addAlias ( ascii, adm1, h,pop, &maxPops );
|
|
// and the alt hash
|
|
addAlias ( alt, adm1, h,pop, &maxPops );
|
|
|
|
// now add the alternate names of this city
|
|
// as aliases, not just to g_cities, but also to
|
|
// g_aliases
|
|
int32_t len = gbstrlen(name);
|
|
if ( strncmp(name,"Township of ",12) == 0 )
|
|
addAlias ( name + 12,adm1,h,pop,&maxPops);
|
|
if ( strncmp(name,"Town of ",8) == 0 )
|
|
addAlias ( name + 8 ,adm1,h,pop,&maxPops );
|
|
if ( strncmp(name,"City of ",7) == 0 )
|
|
addAlias ( name + 7 ,adm1,h,pop,&maxPops );
|
|
if ( strncmp(ascii,"Township of ",12) == 0 )
|
|
addAlias ( ascii + 12,adm1,h,pop,&maxPops);
|
|
if ( strncmp(ascii,"Town of ",8) == 0 )
|
|
addAlias ( ascii + 8 ,adm1,h,pop,&maxPops );
|
|
if ( strncmp(ascii,"City of ",7) == 0 )
|
|
addAlias ( ascii + 7 ,adm1,h,pop,&maxPops );
|
|
// "New York City" equals "New York"
|
|
char *tail = name+len-5;
|
|
if ( len >=6 && strncmp(tail," City",5)==0) {
|
|
*tail = '\0';
|
|
addAlias ( name ,adm1,h,pop,&maxPops );
|
|
*tail = ' ';
|
|
}
|
|
tail = ascii+len-5;
|
|
if ( len >=6 && strncmp(tail," City",5)==0) {
|
|
*tail = '\0';
|
|
addAlias ( ascii ,adm1,h,pop,&maxPops );
|
|
*tail = ' ';
|
|
}
|
|
}
|
|
|
|
/*
|
|
// now scan each city in g_cities and set their CF_SINGLE_STATE
|
|
// flag if they only have one state
|
|
for ( int32_t i = 0 ; i < g_cities.m_numSlots ; i++ ) {
|
|
// skip empty slots
|
|
if ( ! g_cities.m_flags[i] ) continue;
|
|
// get the data value
|
|
uint64_t *bv = (uint64_t *)g_cities.getValueFromSlot(i);
|
|
// count bits on
|
|
int32_t nb = getNumBitsOn(*bv);
|
|
// sanity check
|
|
if ( nb == 0 ) { char *xx=NULL;*xx=0; }
|
|
// if only 1 set this flag
|
|
if ( nb == 1 ) *bv |= CF_UNIQUE;
|
|
}
|
|
*/
|
|
|
|
// close that file
|
|
fclose(fd);
|
|
|
|
//logf(LOG_INFO,"places: allCountries.txt had %"INT32" bad entries.",
|
|
logf(LOG_INFO,"places: places.txt had %"INT32" bad entries.",
|
|
badEntry);
|
|
|
|
// reset for this file
|
|
badEntry = 0;
|
|
|
|
//////////////////////////////////////////////////////////////
|
|
//
|
|
// LOAD THE postalCodes.txt file
|
|
//
|
|
//////////////////////////////////////////////////////////////
|
|
|
|
//country code :iso country code, 2 characters
|
|
//postal code :varchar(10)
|
|
//place name :varchar(180)
|
|
//admin name1 :1. order subdivision (state) varchar(100)
|
|
//admin code1 :1. order subdivision (state) varchar(20)
|
|
//admin name2 :2. order subdivision (county/province) varchar(100
|
|
//admin code2 :2. order subdivision (county/province) varchar(20)
|
|
//admin name3 :3. order subdivision (community) varchar(100)
|
|
//latitude :estimated latitude (wgs84)
|
|
//longitude :estimated longitude (wgs84)
|
|
//accuracy :accuracy of lat/lng from 1=estimated to 6=centroid
|
|
|
|
//
|
|
// crap canadian state abbreviations are not in allCountries.txt
|
|
// so use the "admin code1" in the postalCodes.txt file!
|
|
//
|
|
|
|
// . now read in the zip codes
|
|
// . make the filename to open
|
|
sprintf ( ff , "%spostalCodes.txt", g_hostdb.m_dir );
|
|
logf(LOG_INFO,"places: reading %s",ff);
|
|
fd = fopen ( ff, "r" );
|
|
if ( ! fd )
|
|
return log("places: failed to open %s: %s",ff,strerror(errno));
|
|
|
|
// make the city buf
|
|
SafeBuf sb;
|
|
|
|
line = 0;
|
|
|
|
// . go through the postal codes in postalCodes.txt
|
|
// . format described in /gb/geo/geonames/readme.txt
|
|
while ( fgets ( buf , 10000 , fd ) ) {
|
|
// length of line, including the terminating \n
|
|
int32_t wlen = gbstrlen(buf) ;
|
|
// sanity check
|
|
if ( wlen >= 9000 ) { char *xx=NULL;*xx=0; }
|
|
// skip if empty
|
|
if ( wlen <= 0 ) continue;
|
|
// null terminate it, instead of \n
|
|
buf[wlen-1]='\0';
|
|
|
|
// log it
|
|
if ( (line % 10000) == 0 )
|
|
log(LOG_INFO,"places: read postal line #%"INT32" out of "
|
|
"848,226 (%"INT32" places added)",line,
|
|
g_cities.m_numSlotsUsed);
|
|
line++;
|
|
|
|
// country id
|
|
uint8_t crid = 0;
|
|
// admin1code
|
|
char a1[2];
|
|
// reset
|
|
a1[0] = a1[1] = 0;
|
|
|
|
// count tabs
|
|
int32_t tabs = 0;
|
|
// point to the beginning of the line
|
|
char *p = buf;
|
|
// isolate the zip code
|
|
char *zip = NULL;
|
|
char *cityName = NULL;
|
|
char *a1name = NULL;
|
|
char *a2name = NULL;
|
|
//char *zipEnd = NULL;
|
|
// parse out the tab delimited things from the line
|
|
for ( ; *p ; p++ ) {
|
|
// a temp var
|
|
char *s = p;
|
|
// put country code here
|
|
char cc[3];
|
|
// first is country code
|
|
if ( p == buf ) {
|
|
cc[0] = to_lower_a(s[0]);
|
|
cc[1] = to_lower_a(s[1]);
|
|
cc[2] = 0;
|
|
// sanity check
|
|
if ( s[2] != '\t' ) { char *xx=NULL;*xx=0;}
|
|
// to id
|
|
crid = getCountryId ( cc );
|
|
// must be valid
|
|
//if ( ! crid ) { char *xx=NULL;*xx=0; }
|
|
// there is a "gg" in there!
|
|
if ( ! crid ) break;
|
|
continue;
|
|
}
|
|
// skip if no tab
|
|
if ( *p != '\t' ) continue;
|
|
// count tabs
|
|
tabs++;
|
|
// after first tab is the POSTAL CODE
|
|
if ( tabs == 1 ) {
|
|
zip = p + 1;
|
|
continue;
|
|
}
|
|
if ( tabs == 2 ) {
|
|
// terminate zip for Words::set() below
|
|
*p = '\0';
|
|
cityName = p + 1;
|
|
continue;
|
|
}
|
|
if ( tabs == 3 ) {
|
|
// terminate for cityName
|
|
*p = '\0';
|
|
a1name = p + 1;
|
|
continue;
|
|
}
|
|
// . after 4th tab is admin code1
|
|
// . admin1 code (two letters)
|
|
// . readme.txt says varchar(20) but
|
|
// /gb/geo/admin1Codes.txt seems to say 2 chars
|
|
// . actually i have seen 3 letter ones... but they
|
|
// if truncated to two chars would be unique in their
|
|
// respective country. i.e. GB.ENG, GB.NIR, ...
|
|
// . BUT for GR.ESYE11 through GR.ESYE14, ... just use
|
|
// the last two chars!
|
|
if ( tabs == 4 ) {
|
|
// terminate for a1name
|
|
*p = '\0';
|
|
// usually these 2 chars are digits!
|
|
a1[0] = to_lower_a(p[1]);
|
|
a1[1] = to_lower_a(p[2]);
|
|
// one letter province/state code?
|
|
if ( p[2] == '\t' ) {
|
|
a1[1] = 0;
|
|
continue;
|
|
}
|
|
// panic!
|
|
if ( p[3] == '\t' ) continue;
|
|
// watch out for GReece
|
|
if ( cc[0] != 'g' ) continue;
|
|
if ( cc[1] != 'r' ) continue;
|
|
// and its "states" (admin1 codes)
|
|
if ( a1[0] != 'e' ) continue;
|
|
if ( a1[1] != 's' ) continue;
|
|
// use the last two for this guy!
|
|
s += 4;
|
|
if ( ! is_digit(s[0]) ) continue;
|
|
if ( ! is_digit(s[1]) ) continue;
|
|
a1[0] = s[0];
|
|
a1[1] = s[1];
|
|
}
|
|
if ( tabs == 5 ) {
|
|
// terminate for cityName
|
|
//*p = '\0';
|
|
a2name = p + 1;
|
|
continue;
|
|
}
|
|
if ( tabs == 6 ) {
|
|
// terminate for a2name
|
|
*p = '\0';
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// if we got an illegit adm1 code try convert the admin 1 name
|
|
bool legit = true;
|
|
if ( !a1[0] )
|
|
legit = false;
|
|
if ( !is_ascii(a1[0]) )
|
|
legit = false;
|
|
if ( !is_ascii(a1[1]) )
|
|
legit = false;
|
|
// empty is NULL
|
|
if ( a1name && ! *a1name ) a1name = NULL;
|
|
if ( a2name && ! *a2name ) a2name = NULL;
|
|
if ( cityName && ! *cityName ) cityName = NULL;
|
|
//if ( is_ascii(a1[0])&&is_ascii(a1[1])&&is_ascii(a1[2]) )
|
|
// legit = false;
|
|
// do we got this?
|
|
//if ( ! legit && ! a1name ) continue;
|
|
// not a chance to save ourselves if no adm1 name given
|
|
if ( ! legit && ! a1name && ! a2name && ! cityName ) {
|
|
badEntry++;
|
|
continue;
|
|
}
|
|
|
|
// now we must have a valid a1name because as we have found
|
|
// the adm1 code in postalCodes.txt does not always correspond
|
|
// to those in allCountries.txt. like "british columbia" is
|
|
// "02" in allCountries.txt and "bc" in postalCodes.txt.
|
|
if ( ! a1name ) {
|
|
badEntry++;
|
|
continue;
|
|
}
|
|
|
|
//
|
|
// skip all NON-USA places now that we are specializing
|
|
//
|
|
if ( crid != CRID_US )
|
|
continue;
|
|
|
|
/*
|
|
// try to convert it
|
|
PlaceDesc *tpd ;
|
|
int32_t ss;
|
|
int64_t th;
|
|
int64_t *twids;
|
|
Words tw;
|
|
|
|
// make a city hash that would match Place::m_hash
|
|
//int64_t cityHash = hashStringXor ( cityName );
|
|
|
|
//int64_t tmpHash ;
|
|
//tmpHash = hash64Lower_utf8 ( cityName , gbstrlen(cityName) ) ;
|
|
//int32_t cityHash = (int32_t)(ch & 0xffffffff);
|
|
//if ( strncmp(cityName,"Budlake",7)==0 )
|
|
// log("hey");
|
|
if ( ! legit ) {
|
|
char *use = NULL;
|
|
if ( ! use ) use = a2name;
|
|
if ( ! use ) use = a1name;
|
|
if ( ! use ) use = cityName;
|
|
if ( ! use ) { char *xx=NULL;*xx=0; }
|
|
// hash each alnum word in there
|
|
redo:
|
|
if ( ! use ) { char *xx=NULL;*xx=0; }
|
|
// hash the name
|
|
int64_t uh = hashStringXor ( use );
|
|
// see if we got it
|
|
City *c = (City *) g_cities.getValue ( &uh );
|
|
// set adm1 i guess
|
|
if ( c ) {
|
|
legit = true;
|
|
adm1Bits = c->m_adm1Bits;
|
|
}
|
|
// a nested loop
|
|
for ( ; ss >= 0 ; ss = g_cities.getNextSlot(ss,&th)) {
|
|
// get the place
|
|
tpd=(PlaceDesc *)g_cities.getValueFromSlot(ss);
|
|
// must be our ctry
|
|
if ( tpd->m_crid != crid ) continue;
|
|
// got it
|
|
a1[0] = tpd->m_adm1[0];
|
|
a1[1] = tpd->m_adm1[1];
|
|
legit = true;
|
|
break;
|
|
}
|
|
// if still not found, try the other
|
|
if ( ! legit && use && use == a2name && a1name ) {
|
|
use = a1name;
|
|
goto redo;
|
|
}
|
|
if ( ! legit && use && use == a1name && cityName ) {
|
|
use = cityName;
|
|
goto redo;
|
|
}
|
|
}
|
|
*/
|
|
|
|
static int32_t s_printed = 0;
|
|
// sanity check
|
|
if ( ! legit ) {
|
|
if ( ++s_printed < 100 )
|
|
log("places: bad adm1 for "
|
|
"zip=\"%s\" cityName=\"%s\" "
|
|
"adm1Name=\"%s\" adm2Name=\"%s\"",
|
|
zip, cityName,a1name,a2name);
|
|
badEntry++;
|
|
continue;
|
|
}
|
|
|
|
// the two-letter adm1 in postalCodes.txt sometimes differs
|
|
// from those in allCountries.txt. like, for example,
|
|
// British Columbia has adm1 code of "02" in allCountries.txt
|
|
// but it is "bc" in postalCodes.txt.
|
|
// so let's hash the full adm1 name in postalCodes.txt in order
|
|
// to get the proper adm1 from allCountries.txt.
|
|
|
|
if ( ! a1name ) continue;
|
|
// hash the proper name of the adm1
|
|
int64_t HH = getWordXorHash ( a1name );
|
|
// skip if empty
|
|
if ( HH == 0 ) continue;
|
|
// now get state
|
|
int32_t pos = getStateOffset ( &HH );
|
|
// skip if could not match it to an adm1 in allCountries.txt
|
|
// by the full name of the adm1
|
|
if ( pos < 0 ) { char *xx=NULL;*xx=0; }//continue;
|
|
|
|
// set it
|
|
ZipDesc zd;
|
|
//zd.m_crid = crid;
|
|
// set the state's bit. each state has its own unique bit
|
|
zd.m_adm1Bits = 1LL << pos;
|
|
zd.m_adm1[0] = a1[0];
|
|
zd.m_adm1[1] = a1[1];
|
|
zd.m_cityHash = getWordXorHash ( cityName );
|
|
// centroid lat/lon now
|
|
zd.m_latitude = 999.0;
|
|
zd.m_longitude = 999.0;
|
|
|
|
// sanity check
|
|
if ( ! zd.m_cityHash ) { char *xx=NULL;*xx=0; }
|
|
|
|
// offset to current position
|
|
int32_t cityOffset = sb.length();
|
|
// store it
|
|
int32_t cityNameLen = gbstrlen(cityName);
|
|
sb.safeMemcpy ( cityName , cityNameLen );
|
|
sb.safeMemcpy ( "\0", 1 ); // null terminate
|
|
// update zd
|
|
zd.m_cityOffset = cityOffset;
|
|
|
|
int64_t zh = getWordXorHash ( zip );
|
|
// skip if bad
|
|
if ( ! zh ) { badEntry++; continue; }
|
|
|
|
// sanity check
|
|
//if ( g_zips.isInTable ( &zh ) ) {
|
|
// // both willowbrook,Il and hinsdale,IL have the
|
|
// // same zip code!
|
|
// //char *xx=NULL;*xx=0; }
|
|
// continue;
|
|
//}
|
|
// debug point
|
|
//if ( zh == 70799779105646092LL )
|
|
// log("hey");
|
|
|
|
if ( ! g_zips.addKey ( &zh , &zd ) ) return false;
|
|
|
|
}
|
|
// close that file
|
|
fclose(fd);
|
|
|
|
//
|
|
// now open zipcode.csv and add the lat/lon of each zip code
|
|
// from http://www.boutell.com/zipcodes/zipcode.zip
|
|
//
|
|
sprintf ( ff , "%szipcode.csv", g_hostdb.m_dir );
|
|
logf(LOG_INFO,"places: reading %s",ff);
|
|
fd = fopen ( ff, "r" );
|
|
if ( ! fd )
|
|
return log("places: failed to open %s: %s",ff,strerror(errno));
|
|
line = 0;
|
|
// go through the zipcodes in zipcode.csv, one per line
|
|
while ( fgets ( buf , 10000 , fd ) ) {
|
|
// length of line, including the terminating \n
|
|
int32_t wlen = gbstrlen(buf) ;
|
|
// sanity check
|
|
if ( wlen >= 9000 ) { char *xx=NULL;*xx=0; }
|
|
// skip if empty
|
|
if ( wlen <= 0 ) continue;
|
|
// null terminate it, instead of \n
|
|
buf[wlen-1]='\0';
|
|
// log it
|
|
if ( (line % 10000) == 0 )
|
|
log(LOG_INFO,"places: read line #%"INT32"",line);
|
|
line++;
|
|
// for debug
|
|
char *p = buf;
|
|
// lat is after 7th quote, lon is after 9th quote
|
|
int32_t qcount = 0;
|
|
float latitude = 999.0;
|
|
float longitude = 999.0;
|
|
char *zip = NULL;
|
|
for ( ; *p ; p++ ) {
|
|
if ( *p == '\"' ) qcount++;
|
|
else continue;
|
|
if ( qcount == 1 ) zip = p+1;
|
|
if ( qcount == 7 ) latitude = atof (p+1);
|
|
if ( qcount == 9 ) longitude = atof (p+1);
|
|
}
|
|
if ( ! zip ) continue;
|
|
// must be numeric (disregard line 1 that has "zip")
|
|
if ( ! is_digit(zip[0]) ) continue;
|
|
// null term
|
|
if ( zip[6] != '\"' ) zip[6] = '\0';
|
|
else { char *xx=NULL;*xx=0; }
|
|
// look it up
|
|
int64_t zh = getWordXorHash ( zip );
|
|
// skip if bad
|
|
ZipDesc *zd = (ZipDesc *)g_zips.getValue ( &zh );
|
|
// must be there
|
|
if ( ! zd ) {
|
|
logf(LOG_INFO,"places: could not find zip %s",zip);
|
|
continue;
|
|
}
|
|
// set it
|
|
zd->m_latitude = latitude;
|
|
zd->m_longitude = longitude;
|
|
}
|
|
fclose(fd);
|
|
|
|
//
|
|
// scan all zips and make sure all have lat/lon
|
|
//
|
|
int32_t missed = 0;
|
|
for ( int32_t i = 0 ; i < g_zips.m_numSlotsUsed ; i++ ) {
|
|
// skip if empty bucket
|
|
if ( ! g_zips.m_flags[i] ) continue;
|
|
// get it
|
|
ZipDesc *zd = (ZipDesc *)g_zips.getValueFromSlot(i);
|
|
// check it
|
|
if ( zd->m_latitude == 999.0 ||
|
|
zd->m_longitude == 999.0 )
|
|
missed++;
|
|
}
|
|
logf(LOG_INFO,"places: missed lat/lon for %"INT32" zipcodes",missed);
|
|
|
|
|
|
logf(LOG_INFO,"places: postalCodes.txt had %"INT32" bad entries.",
|
|
badEntry);
|
|
|
|
/*
|
|
// convert the indicator count table into g_indicators for IND_NAME
|
|
// and add them into g_indicators now
|
|
for ( int32_t i = 0 ; i < ct.m_numSlots ; i++ ) {
|
|
// skip if empty
|
|
if ( ct.m_flags[i] == 0 ) continue;
|
|
// this is a count table
|
|
int32_t count = *(int32_t *)ct.getValueFromSlot ( i );
|
|
// skip if not popular
|
|
if ( count < MIN_POP_COUNT ) continue;
|
|
// skip for now
|
|
continue;
|
|
// make into score
|
|
//float boost = 1.0 + (9.0 * (float)count / (float)MAX);
|
|
//float boost = 1.00;
|
|
// increment for every count
|
|
//for ( int32_t j = 10 ; j < count ; j++ )
|
|
// boost *= 1.002;
|
|
// limit it to 1.5 for now...
|
|
//if ( boost > 1.5 ) boost = 1.5;
|
|
// get wid
|
|
//int64_t *wid = (int64_t *)ct.getKey ( i );
|
|
// . add it
|
|
// . use a boost of just 0.25 for now
|
|
//if(! addIndicator ( *wid , IND_NAME , 0.25 ) ) // boost ) )
|
|
// return log("places: failed to make indicators.");
|
|
// debug
|
|
//char *str = *(char **)st.getValue ( wid );
|
|
// show it
|
|
//logf (LOG_DEBUG,"events: top place %s boost=%.02f",
|
|
// str,boost);
|
|
}
|
|
*/
|
|
|
|
//////////////////////////////////////////////////////////////
|
|
//
|
|
// add the aliases
|
|
//
|
|
//////////////////////////////////////////////////////////////
|
|
|
|
logf(LOG_INFO,"places: making aliases.dat");
|
|
|
|
// . abbreviations for popular cities
|
|
// . now we use the s_cityList array
|
|
int32_t ncl = (int32_t)sizeof(s_cityList)/ sizeof(AliasDesc);
|
|
for ( int32_t i = 0 ; i < ncl ; i++ ) {
|
|
char *s1 = s_cityList[i].m_s1;
|
|
char *s2 = s_cityList[i].m_s2;
|
|
// use this now
|
|
uint64_t h1 = getWordXorHash(s1);
|
|
uint64_t h2 = getWordXorHash(s2);
|
|
// skip if the same
|
|
if ( h1 == h2 ) continue;
|
|
// sanity check
|
|
if ( h1 == 0 ) { char *xx=NULL;*xx=0; }
|
|
if ( h2 == 0 ) { char *xx=NULL;*xx=0; }
|
|
// get it
|
|
CityDesc *cdp2 = (CityDesc *)g_cities.getValue ( &h2 );
|
|
// must be there
|
|
if ( ! cdp2 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// . add it as an alias for h2
|
|
// . will add to g_aliases table which maps our
|
|
// cityHash and adm1Str to the normalized cityHash
|
|
// . also adds to g_cities which maps a normalized city
|
|
// hash to a bit vector of states that contain a city
|
|
// by that name
|
|
addAlias ( s1 , s_cityList[i].m_adm1,h2,
|
|
s_cityList[i].m_pop,&maxPops);
|
|
|
|
// you know addAlias() now adds this junk to g_cities...!
|
|
/*
|
|
// get our special cdp
|
|
CityDesc *cdp1 = (CityDesc *)g_cities.getValue ( &h1 );
|
|
// if not there, add one
|
|
if ( ! cdp1 ) {
|
|
// make CityDesc to add
|
|
CityDesc cd;
|
|
// . we choose most pop state for this alias
|
|
// . so "SF" has two entries in s_cityList and the
|
|
// "mostPopState" is "ca" for both
|
|
char *ss = s_cityList[i].m_mostPopStateAbbr;
|
|
// get this
|
|
StateDesc *tsd = getStateDesc(ss);
|
|
// convert to index
|
|
int32_t si = tsd - &s_states[0];
|
|
// sanity
|
|
if ( si < 0 ) { char *xx=NULL;*xx=0; }
|
|
// store it
|
|
cd.m_mostPopularState = si;
|
|
// and the bits indicating states we are in
|
|
cd.m_adm1Bits = cdp2->m_adm1Bits;
|
|
if ( ! g_cities.addKey(&h1,&cd) ){ char*xx=NULL;*xx=0;}
|
|
// flag it as an alias so getCityId32() knows to
|
|
// look it up special...
|
|
//cd.m_adm1Bits |= 0x8000000000000000LL;
|
|
continue;
|
|
}
|
|
// then update bits
|
|
cdp1->m_adm1Bits |= cdp2->m_adm1Bits;
|
|
*/
|
|
}
|
|
|
|
|
|
|
|
// save it
|
|
logf(LOG_INFO,"places: saving timezones.dat");
|
|
|
|
if ( ! g_timeZones.save ( g_hostdb.m_dir , "timezones.dat" ) )
|
|
return log("places: failed to save timezones.dat");
|
|
|
|
// save it
|
|
logf(LOG_INFO,"places: saving cities.dat");
|
|
|
|
if ( ! g_cities.save ( g_hostdb.m_dir , "cities.dat" ) )
|
|
return log("places: failed to save cities.dat");
|
|
|
|
logf(LOG_INFO,"places: saving aliases.dat");
|
|
|
|
if ( ! g_aliases.save ( g_hostdb.m_dir , "aliases.dat" ) )
|
|
return log("places: failed to save aliases.dat");
|
|
|
|
logf(LOG_INFO,"places: saving zips.dat");
|
|
|
|
char *tbuf = sb.getBufStart();
|
|
int32_t tbufSize = sb.length();
|
|
if ( ! g_zips.save ( g_hostdb.m_dir , "zips.dat",tbuf,tbufSize ) )
|
|
return log("places: failed to save zips.dat");
|
|
|
|
// let this memlose
|
|
g_cityBuf = tbuf;
|
|
g_cityBufSize = tbufSize;
|
|
// do not let "sb" free it
|
|
//sb.m_buf = NULL;
|
|
sb.detachBuf();
|
|
|
|
//if ( ! g_indicators.save ( g_hostdb.m_dir, "indicators.dat" ) )
|
|
// return log("places: failed to save indicators.dat");
|
|
|
|
|
|
//////////////////////////////////////////////////////////////
|
|
//
|
|
// LOAD THE planet-090421.osm file to get street names
|
|
//
|
|
//////////////////////////////////////////////////////////////
|
|
|
|
/*
|
|
// init indicator table
|
|
g_streets.set ( 7 , // keySize
|
|
0 ,
|
|
0 , // initial # slots
|
|
NULL , // initial buf
|
|
0 , // initial buf size
|
|
false , // allowDup keys?
|
|
0 ); // niceness
|
|
|
|
// load inidcator table
|
|
if ( g_streets.load ( g_hostdb.m_dir , "streetnames.dat" ) )
|
|
return true;
|
|
|
|
// . open the unholy planet-090421.osm file to create streetnames.dat
|
|
// . see http://wiki.openstreetmap.org/wiki/Data_Primitives to
|
|
// explain a bit about this xml file
|
|
// . http://wiki.openstreetmap.org/wiki/Map_Features
|
|
// . http://wiki.openstreetmap.org/wiki/Develop
|
|
// . http://code.google.com/apis/maps/documentation/examples/
|
|
sprintf ( ff , "%splanet-090421.osm", g_hostdb.m_dir );
|
|
logf(LOG_INFO,"places: reading %s",ff);
|
|
FILE *fd = fopen ( ff, "r" );
|
|
if ( ! fd )
|
|
return log("places: failed to open %s: %s",ff,strerror(errno));
|
|
*/
|
|
|
|
|
|
return true;
|
|
}
|
|
|
|
// . "boost" is how much to boost the Place's score by if it has this indicator
|
|
bool addIndicator ( char *s , char bit , float indScore ) {
|
|
// hash it
|
|
int64_t h = hash64Lower_utf8 ( s , gbstrlen(s) );
|
|
return addIndicator ( h , bit , indScore );
|
|
}
|
|
|
|
bool addIndicator ( int64_t h , char bit , float indScore ) {
|
|
// plaza is two types of indicator, street and name
|
|
IndDesc *pid = (IndDesc *)g_indicators.getValue (&h);
|
|
// if there, augment the bits
|
|
if ( pid ) {
|
|
pid->m_bit |= bit;
|
|
return true;
|
|
}
|
|
// add in some indicators of our own
|
|
IndDesc id;
|
|
// set bit, should only be one
|
|
id.m_bit = bit;
|
|
id.m_indScore = indScore;
|
|
// add it. should gbmemcpy "pd"
|
|
return g_indicators.addKey ( &h , &id ) ;
|
|
}
|
|
|
|
// "baseScore" should be event id
|
|
bool Address::hash ( int32_t baseScore ,
|
|
HashTableX *dt ,
|
|
uint32_t date ,
|
|
Words *words ,
|
|
Phrases *phrases ,
|
|
SafeBuf *pbuf ,
|
|
HashTableX *wts ,
|
|
SafeBuf *wbuf ,
|
|
int32_t version ,
|
|
int32_t niceness ) {
|
|
return true;
|
|
}
|
|
|
|
|
|
// . returns false and sets g_errno on error
|
|
bool Addresses::hashForPlacedb ( int64_t docId ,
|
|
int32_t siteHash32 ,
|
|
int32_t ip ,
|
|
HashTableX *dt ) {
|
|
|
|
// sanity check
|
|
if ( dt->m_ds != 512 ) { char *xx=NULL;*xx=0; }
|
|
if ( dt->m_ks != 16 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// ensure we allow dups because some streets are repeated on
|
|
// the page, but with different place names. see
|
|
// http://www.zvents.com/albuquerque-nm/venues/show/11865-kimo-theatre
|
|
//if ( ! dt->m_allowDups ) { char *xx=NULL;*xx=0; }
|
|
|
|
// now create the meta rdb list
|
|
for ( int32_t i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( dt->m_niceness );
|
|
// get it
|
|
Address *a = (Address *)m_am.getPtr(i);
|
|
// skip if lat/lon
|
|
if ( a->m_flags3 & AF2_LATLON ) continue;
|
|
// is it good?
|
|
bool good = false;
|
|
// being inlined is awesome
|
|
if ( a->m_flags & AF_INLINED ) good = true;
|
|
// if the street is verified, add the whole thing too!
|
|
// even if the street num and place name are not verified.
|
|
if ( a->m_flags & AF_VERIFIED_STREET ) good = true;
|
|
// sometimes a street can exist in two cities or states
|
|
if ( a->m_flags & AF_AMBIGUOUS ) good = false;
|
|
// do not add addresses that have no street per se
|
|
if ( a->m_street->m_flags2 & PLF2_IS_NAME ) good = false;
|
|
// no intersections
|
|
if ( a->m_street->m_flags2 & PLF2_INTERSECTION ) good = false;
|
|
// . skip if not good
|
|
// . we no longer add non-inlined addresses cuz those are
|
|
// not as accurate. many pages have the street address
|
|
// too far from the city and state, and we use one from the
|
|
// tag and it ain't right.
|
|
// . THE TAVERN ~
|
|
// 4007 Menaul NE ~
|
|
// Between Washington and Carlisle ~
|
|
// 87110 ~
|
|
// with the tag:
|
|
// New Mexico Music Commission;;PO Box 1450;Santa Fe(nm);...
|
|
// caused it to get "Santa Fe" as the city
|
|
if ( ! good ) continue;
|
|
// not if amibiguous
|
|
//if ( a->m_flags & AF_AMBIGUOUS ) good = false;
|
|
// . skip if no zip
|
|
// . hmmm, a lot seem to be missing zip, so forget about it
|
|
//if ( ! a->m_zip ) continue;
|
|
// seraialize into "buf"
|
|
char buf[513];
|
|
// reset it to all 0s
|
|
memset ( buf , 0 , 513 );
|
|
// convert to semicolon format
|
|
int32_t size = a->serialize ( buf , 511 , NULL , false , false);
|
|
// skip on error, probably > 511 bytes!
|
|
if ( size < 0 ) continue;
|
|
// make the key for this address
|
|
key128_t k = a->makePlacedbKey ( m_docId , false,false );
|
|
// store it for getNamedbData() to use
|
|
if ( a->m_placedbKey != k ) { char *xx=NULL; *xx=0; }
|
|
|
|
// if key already added, skip. assume the first one is better.
|
|
// www.zvents.com/albuquerque-nm/venues/show/11865-kimo-theatre
|
|
// has two different place names for Kimo Theater street addr
|
|
|
|
// will add the entire 512 bytes of buffer to this hash table
|
|
// so it is really up to XmlDoc::addTable128() to fix that
|
|
// when it creates the corresponding meta list. it will need
|
|
// to shrink that list
|
|
if ( ! dt->isInTable (&k) &&
|
|
! dt->addKey ( (char *)&k , buf ) ) return false;
|
|
|
|
// now if the name is verified, then use the hash of the
|
|
// name in place of the street hash
|
|
if ( a->m_flags & AF_VERIFIED_PLACE_NAME_1 ) {
|
|
// use that
|
|
key128_t k2 = a->makePlacedbKey ( m_docId,true,false);
|
|
// add again
|
|
if ( ! dt->addKey ( (char *)&k2 , buf ) ) return false;
|
|
}
|
|
// same with place name 2
|
|
if ( a->m_flags & AF_VERIFIED_PLACE_NAME_2 ) {
|
|
// use that
|
|
key128_t k2 = a->makePlacedbKey ( m_docId,false,true);
|
|
// add again
|
|
if ( ! dt->addKey ( (char *)&k2 , buf ) ) return false;
|
|
}
|
|
|
|
// . skip if not a venue location for this venue website
|
|
if ( ! ( a->m_flags & AF_VENUE_DEFAULT ) ) continue;
|
|
// . do not do this now... key formation is setting del bit!
|
|
continue;
|
|
// . we do not really use this right now...
|
|
// . add the address of the website itself!! a venue website!!
|
|
// . use siteHash32 as the top key
|
|
// . make the key
|
|
key128_t k3;
|
|
k3.n0 = 0LL;
|
|
k3.n1 = 0LL;
|
|
k3.n1 = siteHash32;
|
|
k3.n1 <<= 32;
|
|
k3.n0 = (docId<<1);
|
|
// add it
|
|
if ( ! dt->addKey((char *)&k3,buf)) return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
|
|
#include "Placedb.h"
|
|
|
|
// . H = 48 bit hash of (streetname,ctryId,adm1,city)
|
|
// N = 16 bit hash of streetnum
|
|
// . placedb key format:
|
|
// H (48 bits) | N (16 bits) |docId(38bits) | delbit(1)
|
|
// . data = serialized address ( see setFromStr() function)
|
|
// . "streetname" should exclude any indicators
|
|
// . we determine the group responsible for this key by the 64 bit hash (H)
|
|
// alone... see Hostdb::getGroupId()
|
|
key128_t Address::makePlacedbKey (int64_t docId,bool useName1,bool useName2){
|
|
|
|
// the key we are setting
|
|
key128_t k;
|
|
// sanity check, must be 8 bits or less
|
|
//if ( m_adm1->m_crid > 255 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// sanity
|
|
if ( m_cityId32 == 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// save for sanity check. mask it to 25 bits
|
|
int32_t snh = m_street->m_streetNumHash & 0x01ffffff;
|
|
|
|
// add in street name (not including indicators)
|
|
int64_t h = m_street->m_hash;
|
|
// . use place name 1 instead of street name?
|
|
// . we use this for when "Tingley Colesium" is given and no street!
|
|
if ( useName1 || useName2 ) {
|
|
// use the name hash in place of the street hash!!! HACK
|
|
if ( useName1 ) h = m_name1->m_hash;
|
|
if ( useName2 ) h = m_name2->m_hash;
|
|
// anytime we use a name as the street hash we have to
|
|
// xor in this to prevent a place name from matching
|
|
// a street name (see above)
|
|
h ^= 0x123456;
|
|
// and incorporate the street hash into the snh so that
|
|
// sendBackAddress() function's life is easier
|
|
snh ^= m_street->m_hash;
|
|
// mask it
|
|
snh &= 0x01ffffff;
|
|
}
|
|
|
|
// country id
|
|
//h = hash64 ( (int64_t)m_adm1.m_crid , h );
|
|
// adm1
|
|
// get the two-letter state abbreviation code (nm = new mexico)
|
|
char *adm1Str = NULL;
|
|
if ( m_adm1 ) adm1Str = m_adm1->m_adm1;
|
|
else if ( m_zip ) adm1Str = m_zip->m_adm1;
|
|
// unique cities like "Albuquerque" imply a state
|
|
//else if ( m_city && m_city->m_adm1[0] ) adm1Str = m_city->m_adm1;
|
|
else { char *xx=NULL;*xx=0; }
|
|
h = hash64 ( (int64_t)(*(uint16_t *)adm1Str) , h );
|
|
// city
|
|
int64_t cityHash = 0LL;
|
|
if ( m_city ) cityHash = m_city->m_hash;
|
|
else if ( m_zip ) cityHash = m_zip->m_cityHash;
|
|
else { char *xx=NULL;*xx=0; }
|
|
// use the *city id* to deal with aliases of the same city
|
|
uint64_t cid64 = (uint64_t)getCityId32 ( cityHash , adm1Str );
|
|
// incorporate that into "h"
|
|
h = hash64 ( cid64 , h );
|
|
// store that in most significant int64_t
|
|
k.n1 = h;
|
|
|
|
// street hash
|
|
int64_t n0 = snh;
|
|
// shift up for docid
|
|
n0 <<= 38;
|
|
// sanity
|
|
if ( (int32_t)NUMDOCIDBITS != 38 ) { char *xx=NULL;*xx=0; }
|
|
// put that in
|
|
n0 |= docId;
|
|
// empty bit for del bit
|
|
n0 <<= 1;
|
|
// set the del bit to indicate a positive key
|
|
n0 |= 0x01;
|
|
// set
|
|
k.n0 = n0;
|
|
|
|
// sanity checks
|
|
if ( g_placedb.getBigHash (&k) != h ) { char *xx=NULL;*xx=0; }
|
|
if ( g_placedb.getStreetNumHash(&k) != snh ) { char *xx=NULL;*xx=0; }
|
|
if ( g_placedb.getDocId (&k) != docId ) { char *xx=NULL;*xx=0; }
|
|
// return
|
|
return k;
|
|
}
|
|
|
|
/*
|
|
// similar to Address::serialize()
|
|
int64_t Address::makeAddressVotingTableKey ( ) {
|
|
|
|
int64_t h = 0LL;
|
|
Place *d = NULL;
|
|
|
|
// incorporate place name into the hash
|
|
d = &m_name1;
|
|
if ( d->m_str ) h = hash64 ( d->m_str , d->m_strlen , h );
|
|
|
|
// and secondary name
|
|
d = &m_name2;
|
|
if ( d->m_str ) h = hash64 ( d->m_str , d->m_strlen , h );
|
|
|
|
// incorporate suite into the hash
|
|
d = &m_suite;
|
|
if ( d->m_str ) h = hash64 ( d->m_str , d->m_strlen , h );
|
|
|
|
// incorporate street into the hash
|
|
d = &m_street;
|
|
if ( d->m_str ) h = hash64 ( d->m_str , d->m_strlen , h );
|
|
|
|
// incorporate city into the hash
|
|
d = &m_city;
|
|
if ( d->m_str ) h = hash64 ( d->m_str , d->m_strlen , h );
|
|
// adm1 of the city
|
|
if ( d->m_str ) h = hash64 ( d->m_adm1 , 2 , h );
|
|
|
|
// incorporate zip into the hash
|
|
d = &m_zip;
|
|
if ( d->m_str ) h = hash64 ( d->m_str , d->m_strlen , h );
|
|
// adm1 as well
|
|
if ( d->m_str ) h = hash64 ( d->m_adm1 , 2 , h );
|
|
|
|
// incorporate adm1 into the hash
|
|
d = &m_adm1;
|
|
if ( d->m_str ) h = hash64 ( d->m_str , d->m_strlen , h );
|
|
// adm1 as well
|
|
if ( d->m_str ) h = hash64 ( d->m_adm1 , 2 , h );
|
|
|
|
// incorporate adm2 into the hash
|
|
//d = &m_adm2;
|
|
//if ( d->m_str ) h = hash64 ( d->m_str , d->m_strlen , h );
|
|
// adm1 as well
|
|
//if ( d->m_str ) h = hash64 ( d->m_adm1 , 2 , h );
|
|
|
|
// incorporate ctry into the hash
|
|
d = &m_ctry;
|
|
if ( d->m_str ) h = hash64 ( d->m_str , d->m_strlen , h );
|
|
|
|
return h;
|
|
}
|
|
*/
|
|
|
|
///////////////////////////////////////////////
|
|
//
|
|
// Msg2c : for verifying all the places/addresses
|
|
//
|
|
///////////////////////////////////////////////
|
|
|
|
// . start out with nothing launched and nothing received
// . the multicasts are not constructed here; launchRequests() does that
//   the first time it needs one and then sets m_initializedInUse
Msg2c::Msg2c() {
	m_initializedInUse = false;
	m_requests         = 0;
	m_replies          = 0;
}
|
|
|
|
#include "Process.h"
|
|
|
|
// . we must not be destroyed while replies are still outstanding, so
//   core if that happens, unless the whole process is exiting
// . then release the multicasts via reset()
Msg2c::~Msg2c () {
	// launched more than we got back?
	bool awaiting = ( m_replies != m_requests );
	// that is only tolerated on shutdown
	if ( awaiting && ! g_process.m_exiting ) { char *xx=NULL;*xx=0; }
	reset();
}
|
|
|
|
void Msg2c::reset() {
|
|
m_replies = 0;
|
|
// all done if never initialized the multicasts
|
|
if ( ! m_initializedInUse ) return;
|
|
// int16_tcut
|
|
int32_t max = (int32_t)MAX_ADDR_REQUESTS_OUT;
|
|
// call DEstructors on multicasts
|
|
for ( int32_t i = 0 ; i < max ; i++ ) {
|
|
QUICKPOLL(m_niceness);
|
|
m_mcasts[i].destructor();
|
|
}
|
|
}
|
|
|
|
|
|
// . sets Address::m_verified to 1 if verified
|
|
// . returns false if blocked
|
|
// . returns true and sets g_errno on error
|
|
// . and also sets the "avt" address verification table which we serialize
|
|
// into the TitleRec for re-parsing purposes later on, so we consistently
|
|
// re-parse
|
|
bool Msg2c::verifyAddresses ( Addresses *aa ,
|
|
//char *coll ,
|
|
collnum_t collnum ,
|
|
int32_t domHash32 ,
|
|
int32_t ip ,
|
|
int32_t niceness ,
|
|
void *state ,
|
|
void (* callback)(void *state ) ) {
|
|
|
|
m_niceness = niceness;
|
|
m_addresses = aa;
|
|
m_collnum = collnum;
|
|
m_domHash32 = domHash32;
|
|
m_ip = ip;
|
|
m_callback = callback;
|
|
m_state = state;
|
|
// reset
|
|
m_errno = 0;
|
|
m_requests = 0;
|
|
m_replies = 0;
|
|
m_doneLaunching = false;
|
|
|
|
// reset address ptr
|
|
m_i = 0;
|
|
|
|
// all done if no addresses!
|
|
if ( m_addresses->m_am.getNumPtrs() == 0 ) return true;
|
|
|
|
// sanity check
|
|
if ( aa->m_sb.length() != 0 ) { char *xx=NULL; *xx=0; }
|
|
|
|
// . launch the requests
|
|
// . returns false if we are waiting for replies to come in
|
|
if ( ! launchRequests() ) return false;
|
|
// fill the the m_sb buf with all replies
|
|
//allDone();
|
|
// did not block and all replies are in
|
|
return true;
|
|
}
|
|
|
|
// keep tabs on total out
|
|
static int32_t s_totalOut = 0;
|
|
|
|
// . returns false if blocked, true otherwise
|
|
// . sets g_errno and returns true on error
|
|
bool Msg2c::launchRequests ( ) {
|
|
// clear it
|
|
g_errno = 0;
|
|
// how many max can be out?
|
|
int32_t maxOut = (int32_t)MAX_ADDR_REQUESTS_OUT;
|
|
// but be careful
|
|
if ( s_totalOut >= 200 ) maxOut = 1;
|
|
// we are only built for one at a time since request buffer is static
|
|
//if ( (int32_t)MAX_ADDR_REQUESTS != 1 ) { char *xx=NULL;*xx=0; }
|
|
loop:
|
|
// all done?
|
|
if ( m_i == m_addresses->m_am.getNumPtrs() )
|
|
m_doneLaunching = true;
|
|
// return true if nothing to launch
|
|
if ( m_doneLaunching )
|
|
return (m_requests == m_replies);
|
|
// don't bother if already got an error
|
|
if ( m_errno )
|
|
return (m_requests == m_replies);
|
|
// limit max to 5ish
|
|
if (m_requests-m_replies >= maxOut ) // MAX_ADDR_REQUESTS_OUT)
|
|
return (m_requests==m_replies);
|
|
// . limit total requests for better performance
|
|
// . www.vinarium-usa.com does like 500,000 lookups. it would take
|
|
// like 30 seconds on a single test server. limiting to 50,000
|
|
// lookups it still takes 10 seconds on titan.
|
|
// . this limit doesn't affect any other pages in urls.txt - 11/18/11
|
|
if ( m_requests > 50000 ) {
|
|
if ( m_requests == m_replies )
|
|
log("addr: limiting msg2c requests to 50000 for %s",
|
|
m_addresses->m_url->m_url);
|
|
return (m_requests==m_replies);
|
|
}
|
|
// take a breath
|
|
QUICKPOLL(m_niceness);
|
|
|
|
Address *a = (Address *)m_addresses->m_am.getPtr(m_i);
|
|
// skip it
|
|
m_i++;
|
|
// assume not verified
|
|
a->m_replyFlags = 0;
|
|
|
|
// . skip if it is like "call for location"
|
|
// . no no no this is messing up "at the filling station" for
|
|
// http://www.zvents.com/albuquerque-nm/events/show/
|
|
// 88688960-sea-the-invalid-mariner
|
|
//if ( a->m_street->m_flags2 & PLF2_AFTER_AT ) {
|
|
// // might be done
|
|
// if ( m_i == m_addresses->m_na ) m_doneLaunching = true;
|
|
// // try the next one
|
|
// goto loop;
|
|
//}
|
|
|
|
// max size of request
|
|
//int32_t max = 1024;
|
|
// request is startKey,endKey,pihash,niceness,coll
|
|
//char *requestBuf = a->m_requestBuf;
|
|
|
|
// prepare to get a request buf if we haven't already
|
|
if ( ! m_initializedInUse ) {
|
|
int32_t max = (int32_t)MAX_ADDR_REQUESTS_OUT;
|
|
memset(m_inUse,0,max);
|
|
// call constructors on multicasts
|
|
for ( int32_t i = 0 ; i < max ; i++ ) {
|
|
QUICKPOLL(m_niceness);
|
|
m_mcasts[i].constructor();
|
|
}
|
|
// do not repeat
|
|
m_initializedInUse = true;
|
|
}
|
|
// get a request buf, assume none (-1)
|
|
int32_t reqBufNum = -1;
|
|
// scan what we got
|
|
for ( int32_t i = 0 ; i < MAX_ADDR_REQUESTS_OUT ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if in use
|
|
if ( m_inUse[i] ) continue;
|
|
// and let caller know which one
|
|
reqBufNum = i;
|
|
// and stop
|
|
break;
|
|
}
|
|
// panic! how did this happen?
|
|
if ( reqBufNum == -1 ) { char *xx=NULL;*xx=0; }
|
|
// claim it
|
|
m_inUse[reqBufNum] = 1;
|
|
// point to the junk
|
|
char *requestBuf = m_bigBuf[reqBufNum];
|
|
// store requestbuf # we did get
|
|
a->m_reqBufNum = reqBufNum;
|
|
// and store addr # (subtract one since we increment m_i above)
|
|
a->m_addrNum = m_i - 1;
|
|
// point to this
|
|
Multicast *m = &m_mcasts[reqBufNum];
|
|
|
|
|
|
// store it
|
|
char *p = requestBuf;
|
|
// store placedbKey
|
|
*(key128_t *)p = a->m_placedbKey; p += sizeof(key128_t);
|
|
// site hash
|
|
*(int32_t *)p = m_domHash32; p += 4;
|
|
*(int32_t *)p = m_ip ; p += 4;
|
|
// niceness, 1 byte
|
|
*(char *)p = m_niceness; p += 1;
|
|
// is the street really a place name in disguise? ("Tingley Colesium")
|
|
char isName = ( a->m_street->m_flags2 & PLF2_IS_NAME ) ;
|
|
*(char *)p = isName ; p += 1;
|
|
// collection
|
|
//int32_t collSize = gbstrlen(m_coll) + 1;
|
|
//gbmemcpy ( p , m_coll , collSize );
|
|
//p += collSize;
|
|
*(collnum_t *)p = m_collnum;
|
|
p += sizeof(collnum_t);
|
|
// end of it
|
|
char *pend = requestBuf + REQBUFSIZE; // s_requestBuf + max;
|
|
// . then the address string, semicolon separated, null terminated
|
|
// . like ";;5815 Wyoming Blvd NE;Albuquerque;nm;87109;;..."(see below)
|
|
// . returns -1 and sets g_errno on error
|
|
// . returns # of bytes written, including null terminator
|
|
int32_t written = a->serialize ( p , pend - p , NULL , false , false );
|
|
// error?
|
|
if ( written == -1 ) {
|
|
m_errno = g_errno;
|
|
// unclaim
|
|
m_inUse[reqBufNum] = 0;
|
|
return (m_requests == m_replies);
|
|
}
|
|
// update our ptr
|
|
p += written;
|
|
// must be there
|
|
if ( written == 0 ) { char *xx=NULL;*xx=0; }
|
|
// ensure null terminated
|
|
if ( p[-1] != '\0' ) { char *xx=NULL;*xx=0; }
|
|
|
|
// size of it
|
|
int32_t requestSize = p - requestBuf;
|
|
// sanity check for breach
|
|
if ( requestSize > REQBUFSIZE ) { char *xx=NULL;*xx=0; }
|
|
|
|
// . get group to handle it
|
|
// . each group is responsible for a specific streetname/ctry/city/adm1
|
|
// . Hostdb.cpp::getGroupId()
|
|
//uint32_t gid = getGroupId(RDB_PLACEDB,(char *)&a->m_placedbKey);
|
|
uint32_t shardNum;
|
|
shardNum = getShardNum (RDB_PLACEDB,(char *)&a->m_placedbKey);
|
|
|
|
// . pick a host within that group based on docid
|
|
// . base that on streetname hash i guess
|
|
// . but i would like to cache this using a biased cache
|
|
// . so we need to divide based on streetname hash
|
|
// . that is the most significant 16 bits of the placedb key
|
|
int32_t numHosts = g_hostdb.getNumHostsPerShard();
|
|
int32_t hostNum = a->m_street->m_hash % numHosts;
|
|
Host *group = g_hostdb.getShard ( shardNum );
|
|
// get host # "hostNum" in group "group" to send our request to
|
|
Host *h = &group [ hostNum ];
|
|
|
|
//int32_t addrNum = m_i - 1;
|
|
|
|
// launch it
|
|
//Multicast *m = &m_mcast;
|
|
// this returns false and sets g_errno on error
|
|
if ( ! m->send ( requestBuf ,
|
|
requestSize ,
|
|
0x2c , // msgType
|
|
false , // multicast own request?
|
|
shardNum, // gid ,
|
|
false , // send to whole group?
|
|
0 , // key for selecting host (not used)
|
|
this , // state
|
|
(void *)a , // state2
|
|
gotMsg2cReplyWrapper ,
|
|
180 , // total timeout
|
|
m_niceness ,
|
|
false , // realtime udp
|
|
h->m_hostId ,
|
|
NULL,//&a->m_replyFlags , // replyBuf
|
|
0,//1 , // replyBufMaxSize
|
|
false )) { // freeReplyBuf?
|
|
// note it
|
|
m_errno = g_errno;
|
|
// return false if we are waiting on replies
|
|
return (m_requests == m_replies);
|
|
}
|
|
|
|
// keep tabls
|
|
s_totalOut++;
|
|
// successfully launched
|
|
m_requests++;
|
|
// launch another
|
|
goto loop;
|
|
}
|
|
|
|
void gotMsg2cReplyWrapper ( void *state , void *state2 ) {
|
|
Msg2c *THIS = (Msg2c*)state;
|
|
// we got one
|
|
THIS->m_replies++;
|
|
// back
|
|
s_totalOut--;
|
|
|
|
// error?
|
|
if ( g_errno ) {
|
|
THIS->m_errno = g_errno;
|
|
log("addr: msg2c reply: %s",mstrerror(g_errno));
|
|
}
|
|
|
|
// cast this
|
|
Addresses *aa = THIS->m_addresses;
|
|
|
|
// point to the address we were working for
|
|
Address *a = (Address *)state2;
|
|
// what address # was it matching?
|
|
int32_t addrNum = a->m_addrNum;
|
|
// and the reply buffer num for making available again
|
|
int32_t reqBufNum = a->m_reqBufNum;
|
|
// sanity
|
|
if ( reqBufNum<0 || reqBufNum>=MAX_ADDR_REQUESTS_OUT ) {
|
|
char *xx=NULL; *xx=0; }
|
|
// make it available again
|
|
THIS->m_inUse[reqBufNum] = 0;
|
|
|
|
// test it
|
|
Multicast *m = &THIS->m_mcasts[reqBufNum];
|
|
int32_t replySize , replyMaxSize; bool freeIt;
|
|
char *r = m->getBestReply (&replySize,&replyMaxSize,&freeIt);
|
|
|
|
// store reply into our cache
|
|
if ( ! g_errno && ! aa->addToReplyBuf (r,replySize,addrNum)){
|
|
// sanity check
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
// set this
|
|
THIS->m_errno = g_errno;
|
|
}
|
|
|
|
// free that memory to stop the mem leak
|
|
mfree ( r , replyMaxSize , "umsg2c" );
|
|
|
|
// test it
|
|
//if ( r && replySize != 1 ) { char *xx=NULL; *xx=0; }
|
|
// show it
|
|
//log("addr: got reply=%"INT32" replyaddr=0x%"XINT32"",(int32_t)*r,(int32_t)r);
|
|
// launchGetRequests() returns false if still waiting for replies...
|
|
if ( ! THIS->launchRequests() ) return;
|
|
// set g_errno for the callback
|
|
if ( THIS->m_errno ) g_errno = THIS->m_errno;
|
|
// fill the table
|
|
//THIS->allDone ( );
|
|
// otherwise, call callback
|
|
THIS->m_callback ( THIS->m_state );
|
|
}
|
|
|
|
// we then call Addresses::updateAddresses() to modify our m_addresses[]
|
|
// array with these replies!
|
|
bool Addresses::addToReplyBuf ( char *reply , int32_t replySize , int32_t addrNum ) {
|
|
// if nothing found in placedb lookup we get a 0 byte reply
|
|
if ( replySize == 0 ) return true;
|
|
// sanity
|
|
if ( addrNum < 0 || addrNum >= m_am.getNumPtrs()){char *xx=NULL;*xx=0;}
|
|
// if no room, make it 1.5 times bigger
|
|
if ( m_sb.m_length + replySize+4+4 > m_sb.m_capacity &&
|
|
! m_sb.reserve ( (int32_t)(m_sb.m_capacity * 1.5 + 1000 ) ) ) {
|
|
log("addr: addtoreplybuf: %s",mstrerror(g_errno));
|
|
return false;
|
|
}
|
|
// store the address # this reply is for
|
|
if ( ! m_sb.pushLong ( addrNum ) ) return false;
|
|
// then reply stuff
|
|
if ( ! m_sb.pushLong ( replySize ) ) return false;
|
|
if ( ! m_sb.safeMemcpy ( reply , replySize ) ) return false;
|
|
return true;
|
|
}
|
|
|
|
class State2c {
|
|
public:
|
|
UdpSlot *m_slot;
|
|
Msg5 m_msg5;
|
|
int32_t m_votesForStreet;
|
|
int32_t m_votesForStreetNum;
|
|
int32_t m_votesForPlaceName1;
|
|
int32_t m_votesForPlaceName2;
|
|
RdbList m_list;
|
|
int32_t m_domHash32;
|
|
int32_t m_ip;
|
|
key128_t m_placedbKey;
|
|
int32_t m_niceness;
|
|
// is the street really a place name in disguise? (Tingley Colesium)
|
|
char m_isName;
|
|
// point to the serialize Address (semicolon separated, null term'd)
|
|
char *m_addrStr;
|
|
};
|
|
|
|
// . udp handler for a 2c request: look up a street (or place name) in placedb
// . request layout, as parsed below:
//     key128_t   placedb key
//     int32_t    domain hash of the requester
//     int32_t    ip of the requester
//     char       niceness (skipped, we use slot->m_niceness instead)
//     char       isName (is the "street" really a place name?)
//     collnum_t  collection number
//     char[]     \0-terminated, semicolon separated address string
// . reads every placedb record that shares the key's upper n1 bits (the low
//   16 bits of n1 and all of n0 are wildcarded) and hands the list to
//   gotList2c(), which sends the reply
// . "nicenessWTF" is not used
void handleRequest2c ( UdpSlot *slot , int32_t nicenessWTF ) {
	// get the request
	char    *request     = slot->m_readBuf;
	int32_t  requestSize = slot->m_readBufSize;

	// . overflow protection for corrupt requests
	// . we read every fixed-size field below unconditionally, so the
	//   whole header plus at least the \0 of the address string must
	//   be present
	int32_t headerSize =
		(int32_t)sizeof(key128_t) + // placedb key
		4 +                         // domHash32
		4 +                         // ip
		1 +                         // niceness
		1 +                         // isName
		(int32_t)sizeof(collnum_t); // collnum
	bool valid = ( request && requestSize >= headerSize + 1 );
	// . the address string is scanned as a C string by gotList2c() and
	//   sendBackAddress(), so it must terminate inside the read buffer
	if ( valid ) {
		char *s    = request + headerSize;
		char *send = request + requestSize;
		while ( s < send && *s ) s++;
		if ( s >= send ) valid = false;
	}
	if ( ! valid ) {
		g_errno = EBUFTOOSMALL;
		g_udpServer.sendErrorReply ( slot , g_errno );
		return;
	}

	// parse the request
	char *p = request;

	// do the lookup on disk (hopefully in cache or ssd!)
	// make a new state, it holds the Msg5
	State2c *st;
	try { st = new (State2c); }
	catch ( ... ) {
		g_errno = ENOMEM;
		log("msg2c: new(%"INT32"): %s", (int32_t)sizeof(State2c), mstrerror(g_errno));
		return g_udpServer.sendErrorReply ( slot, g_errno );
	}
	mnew ( st , sizeof(State2c) , "hndl2c" );

	// save slot for sending reply
	st->m_slot = slot;

	// extract placedb key from request
	st->m_placedbKey = *(key128_t *)p; p += sizeof(key128_t);
	// get key range
	key128_t startKey = st->m_placedbKey;
	key128_t endKey   = st->m_placedbKey;
	// sanity check
	if ( startKey.n1 == 0LL ) { char *xx=NULL;*xx=0; }
	if ( endKey.n1   == 0LL ) { char *xx=NULL;*xx=0; }
	// now we also mask out the street num hash
	startKey.n1 &= 0xffffffffffff0000LL;
	// and or that in for the endKey
	endKey.n1   |= 0x000000000000ffffLL;
	// mask out all but n1
	startKey.n0  = 0x0000000000000000LL;
	// or in lower bits for the endKey
	endKey  .n0  = 0xffffffffffffffffLL;

	// domhash and ip of the requester
	st->m_domHash32 = *(int32_t *)p; p += 4;
	st->m_ip        = *(int32_t *)p; p += 4;
	// . skip the niceness byte in the request
	// . using it was messing up our niceness conversion algo, so we
	//   take the slot's niceness instead
	p++;
	int32_t niceness = slot->m_niceness;
	// is the street really a place name in disguise? (Tingley Coliseum)
	st->m_isName = *(char *)p; p++;
	// save it
	st->m_niceness = niceness;
	// get collection number
	collnum_t collnum = *(collnum_t *)p;
	p += sizeof(collnum_t);
	// the address string, semicolon separated, NULL terminated. this
	// points into the slot's read buffer.
	st->m_addrStr = p;

	// . get from msg5, return if it blocked
	// . will probably not block since in the disk page cache a lot
	if ( ! st->m_msg5.getList ( RDB_PLACEDB       ,
				    collnum           ,
				    &st->m_list       ,
				    (char *)&startKey ,
				    (char *)&endKey   ,
				    100000            , // minRecSizes
				    true              , // include tree?
				    false             , // addtocache?
				    0                 , // maxcacheage
				    0                 , // startfilenum
				    -1                , // numFiles
				    st                ,
				    gotList2c         ,
				    niceness          ,
				    true              ))// do err correction?
		return;
	// it did not block...
	gotList2c( st , NULL , NULL );
}
|
|
|
|
void gotList2c ( void *state , RdbList *xxx , Msg5 *yyy ) {
|
|
// cast our state class
|
|
State2c *st = (State2c *)state;
|
|
// get this
|
|
UdpSlot *slot = st->m_slot;
|
|
// return right away if error getting the rec
|
|
if ( g_errno ) {
|
|
// loop back up here on error below as well
|
|
hadError:
|
|
// all done with this
|
|
mdelete ( st , sizeof(State2c),"msg2cfr");
|
|
delete (st);
|
|
g_udpServer.sendErrorReply ( slot,g_errno );
|
|
return;
|
|
}
|
|
// assume not good
|
|
st->m_votesForStreet = 0;
|
|
st->m_votesForStreetNum = 0;
|
|
st->m_votesForPlaceName1 = 0;
|
|
st->m_votesForPlaceName2 = 0;
|
|
|
|
// if request was looking up a *place name* and not a street
|
|
// then we do some different logic
|
|
if ( st->m_isName ) {
|
|
// caller needs a street address for the place
|
|
sendBackAddress ( st );
|
|
return;
|
|
}
|
|
|
|
// get our street num hash
|
|
key128_t *pk = &st->m_placedbKey;
|
|
int64_t myBigHash = g_placedb.getBigHash(pk);
|
|
int32_t myStreetNumHash = g_placedb.getStreetNumHash(pk);
|
|
|
|
// point to the place name
|
|
char *pn1 = st->m_addrStr;
|
|
// get the first semicolon
|
|
char *semi1 = pn1;
|
|
// scan for it
|
|
for ( ; *semi1 && *semi1 !=';' ; semi1++ );
|
|
// NULL term
|
|
*semi1 = '\0';
|
|
// skip leading "the"
|
|
if ( ! strncasecmp ( pn1, "the ", 4) ) pn1 += 4;
|
|
// get niceness
|
|
int32_t niceness = st->m_niceness;
|
|
// make a vector of "int32_ts" from the place name
|
|
int32_t myvbuf1[50];
|
|
int32_t mynv1 = makeSimpleWordVector ( pn1 , myvbuf1 , 50*4,niceness ) ;
|
|
if ( mynv1 == -1 ) goto hadError;
|
|
|
|
// do the same for the second name
|
|
char *pn2 = semi1 + 1;
|
|
// skip for it
|
|
char *semi2 = pn2;
|
|
// scan for it
|
|
for ( ; *semi2 && *semi2 !=';' ; semi2++ );
|
|
// NULL term
|
|
*semi2 = '\0';
|
|
// skip leading "the"
|
|
if ( ! strncasecmp ( pn2, "the ", 4) ) pn2 += 4;
|
|
// make vector of secondary place name
|
|
int32_t myvbuf2[50];
|
|
int32_t mynv2 = makeSimpleWordVector ( pn2 , myvbuf2 , 50*4,niceness ) ;
|
|
if ( mynv2 == -1 ) goto hadError;
|
|
|
|
//log("build: matching %s",pn1);
|
|
|
|
// each placedb record's place name in the list is hashed and
|
|
// stored in this table so we can accumulate votes. "voting table"
|
|
HashTableX vt;
|
|
char vtableBuf[5000];
|
|
vt.set(4,4,128,vtableBuf,5000,false,niceness,"addrvt");
|
|
|
|
// and likewise each hash has a ptr to the original string
|
|
// of the place name
|
|
HashTableX ptrTable;
|
|
char ptrBuf[5000];
|
|
ptrTable.set(4,4,128,ptrBuf,5000,false,niceness,"addptr");
|
|
|
|
// how much reply buf to allocate? need at least one byte for
|
|
// the original one byte reply of flags...
|
|
// now we also store the best lat and lon which are the two doubles,
|
|
// and the 4 bytes before for the # of votes for that lat/lon
|
|
int32_t need = 1 + 4 + sizeof(double)*2;
|
|
|
|
// int16_tcut
|
|
RdbList *list = &st->m_list;
|
|
|
|
while ( ! list->isExhausted() ) {
|
|
// breathe
|
|
QUICKPOLL ( st->m_niceness );
|
|
// get it
|
|
char *data = list->getCurrentData();
|
|
// get the key
|
|
key128_t k; list->getCurrentKey(&k);
|
|
// skip it
|
|
list->skipCurrentRecord();
|
|
// cast it
|
|
Address a2;
|
|
//Place places2[10];
|
|
//int32_t np2 = 0;
|
|
PlaceMem pm;
|
|
char tmpbuf[7024];
|
|
pm.init ( 5000 ,10,10,tmpbuf,7024,0 );
|
|
// set "a"
|
|
setFromStr ( &a2, data, 0 , &pm ,st->m_niceness );
|
|
// must not be same site as us for better voting accuracy
|
|
if ( a2.m_domHash32 == st->m_domHash32 ) continue;
|
|
// and different ip from us, for better voting accuracy
|
|
if ( iptop(a2.m_ip) == iptop(st->m_ip) ) continue;
|
|
// valid ip sanity check
|
|
if ( a2.m_ip == 0 || a2.m_ip==-1 ) { char *xx=NULL; *xx=0; }
|
|
|
|
// sanity check
|
|
if (g_placedb.getBigHash(&k)!=myBigHash) {char*xx=NULL;*xx=0;}
|
|
|
|
// ok, now we have verfied the street for sure
|
|
st->m_votesForStreet++;
|
|
|
|
// get the street num hash of that record
|
|
int32_t snh = g_placedb.getStreetNumHash ( &k );
|
|
// . does it match our street number?
|
|
// . i.e. the "15110" in "15110 Wyoming blvd"
|
|
if ( snh != myStreetNumHash ) continue;
|
|
// yes, another match
|
|
st->m_votesForStreetNum++;
|
|
|
|
//
|
|
// build a vector for each of the two place names
|
|
//
|
|
|
|
// get place name
|
|
pn1 = data;
|
|
// get semi
|
|
semi1 = pn1;
|
|
// scan for it
|
|
for ( ; *semi1 && *semi1 !=';' ; semi1++ );
|
|
// NULL term
|
|
*semi1 = '\0';
|
|
// skip leading "the"
|
|
if ( ! strncasecmp ( pn1, "the ", 4) ) pn1 += 4;
|
|
// make its place name into a vector
|
|
int32_t vbuf1[50];
|
|
int32_t nvbuf1 ;
|
|
nvbuf1 = makeSimpleWordVector(pn1,vbuf1,50*4,st->m_niceness);
|
|
if ( nvbuf1 == -1 )
|
|
goto hadError;
|
|
// do the same for the second name
|
|
pn2 = semi1 + 1;
|
|
// skip for it
|
|
semi2 = pn2;
|
|
// scan for it
|
|
for ( ; *semi2 && *semi2 !=';' ; semi2++ );
|
|
// NULL term
|
|
*semi2 = '\0';
|
|
// skip leading "the"
|
|
if ( ! strncasecmp ( pn2, "the ", 4) ) pn2 += 4;
|
|
// make vector of secondary place name
|
|
int32_t vbuf2[50];
|
|
int32_t nvbuf2;
|
|
nvbuf2 = makeSimpleWordVector (pn2,vbuf2,50*4,st->m_niceness);
|
|
if ( nvbuf2 == -1)
|
|
goto hadError;
|
|
// undo
|
|
*semi1 = ';';
|
|
*semi2 = ';';
|
|
|
|
//log("build: matching %s vs %s",pn1,pn2);
|
|
|
|
// ok, compare the two vectors
|
|
float sim1 = computeSimilarity ( myvbuf1 ,
|
|
vbuf1 ,
|
|
NULL ,
|
|
NULL ,
|
|
NULL ,
|
|
st->m_niceness );
|
|
|
|
float sim2 = computeSimilarity ( myvbuf2 ,
|
|
vbuf2 ,
|
|
NULL ,
|
|
NULL ,
|
|
NULL ,
|
|
st->m_niceness );
|
|
|
|
// compare the secondary to primary, and vice versa
|
|
float sim3 = computeSimilarity ( myvbuf1 ,
|
|
vbuf2 ,
|
|
NULL ,
|
|
NULL ,
|
|
NULL ,
|
|
st->m_niceness );
|
|
|
|
float sim4 = computeSimilarity ( myvbuf2 ,
|
|
vbuf1 ,
|
|
NULL ,
|
|
NULL ,
|
|
NULL ,
|
|
st->m_niceness );
|
|
|
|
//
|
|
// now we also hash each word in each place name and
|
|
// store those two hashes into a table so we can score
|
|
// each place name of each placedb record. this allows us
|
|
// to ultimately set Address::m_placedbName1 and 2.
|
|
//
|
|
int32_t h1 = hash32 ( (char *)vbuf1 , nvbuf1 * 4 , 0 );
|
|
int32_t h2 = hash32 ( (char *)vbuf2 , nvbuf2 * 4 , 0 );
|
|
|
|
// . update max buf if its a new string
|
|
// . include one byte for the \0
|
|
// . include 4 bytes for preceeding score
|
|
if ( h1 && ! vt.isInTable(&h1) ) {
|
|
// update what we allocate
|
|
need+=gbstrlen(pn1)+5;
|
|
// add to ptr table
|
|
if ( ! ptrTable.addKey ( &h1 , &pn1 ) ) goto hadError;
|
|
}
|
|
|
|
if ( h2 && h2!=h1 && ! vt.isInTable(&h2) ) {
|
|
// update what we allocate
|
|
need+=gbstrlen(pn2)+5;
|
|
// add to ptr table
|
|
if (! ptrTable.addKey ( &h2 , &pn2 ) ) goto hadError;
|
|
}
|
|
|
|
// add to voting table
|
|
if ( h1 && ! vt.addTerm32 ( &h1 ) ) goto hadError;
|
|
if ( h2 && h2 != h1 && ! vt.addTerm32 ( &h2 ) ) goto hadError;
|
|
|
|
// break here for now to figure it out!
|
|
//char *xx=NULL;*xx=0;
|
|
|
|
//log("build: matching sim=%.02f for %s vs %s",sim,pn1,pn2);
|
|
|
|
// skip this guy if not a match
|
|
if ( sim1 < 85.0 &&
|
|
sim2 < 85.0 &&
|
|
sim3 < 85.0 &&
|
|
sim4 < 85.0 ) continue;
|
|
|
|
// 85%+ is good enough to be a vote for
|
|
if ( sim1 >= 85.0 ) st->m_votesForPlaceName1++;
|
|
if ( sim2 >= 85.0 ) st->m_votesForPlaceName2++;
|
|
if ( sim3 >= 85.0 ) st->m_votesForPlaceName1++;
|
|
if ( sim4 >= 85.0 ) st->m_votesForPlaceName2++;
|
|
|
|
// that is good enough
|
|
break;
|
|
}
|
|
|
|
// set the reply
|
|
char *reply = NULL;
|
|
if ( need < TMPBUFSIZE ) reply = slot->m_tmpBuf;
|
|
else reply = (char *)mmalloc ( need , "repbuf" );
|
|
if ( ! reply ) goto hadError;
|
|
char *rend = reply + need;
|
|
|
|
// reply is either 1 or 0
|
|
//char *reply = slot->m_tmpBuf;
|
|
// clear it
|
|
uint8_t flags = 0;
|
|
// use flags
|
|
if ( st->m_votesForStreet ) flags |= AF_VERIFIED_STREET;
|
|
if ( st->m_votesForStreetNum ) flags |= AF_VERIFIED_STREET_NUM;
|
|
if ( st->m_votesForPlaceName1 ) flags |= AF_VERIFIED_PLACE_NAME_1;
|
|
if ( st->m_votesForPlaceName2 ) flags |= AF_VERIFIED_PLACE_NAME_2;
|
|
// sanity checks
|
|
if ( (flags & AF_VERIFIED_STREET_NUM) &&
|
|
!(flags & AF_VERIFIED_STREET ) ) { char *xx=NULL;*xx=0; }
|
|
if ( (flags & AF_VERIFIED_PLACE_NAME_1) &&
|
|
!(flags & AF_VERIFIED_STREET_NUM ) ) { char *xx=NULL;*xx=0; }
|
|
if ( (flags & AF_VERIFIED_PLACE_NAME_2) &&
|
|
!(flags & AF_VERIFIED_STREET_NUM ) ) { char *xx=NULL;*xx=0; }
|
|
|
|
// point to reply buffer after that first byte
|
|
char *rptr = reply ;
|
|
|
|
// now scan these placedb recs to find the most agreed upon lat/lon
|
|
// so that we do not trust the one on our page necessarily
|
|
double lat;
|
|
double lon;
|
|
int32_t numVotes;
|
|
// need the street number hash so we only get lat/lon coords from
|
|
// addresses with the same street number as well as street
|
|
if ( ! getBestLatLon ( list , &lat, &lon , &numVotes, niceness ,
|
|
myStreetNumHash ) )
|
|
goto hadError;
|
|
// add that in
|
|
*(int32_t *)rptr = numVotes; rptr += 4;
|
|
*(double *)rptr = lat; rptr += sizeof(double);
|
|
*(double *)rptr = lon; rptr += sizeof(double);
|
|
|
|
// then the 1 byte flag
|
|
*rptr = flags; rptr++;
|
|
|
|
// . now we store all the alternative place names and their vote count,
|
|
// as int32_t as it was 2 or more. so scan the score table to find
|
|
// the hashes of the winners, then lookup the hashes of the winners
|
|
// in the ptr table, ptrTable, to get the string to send back.
|
|
// . we set Address::m_placedbNames to this string above when we
|
|
// process this reply
|
|
for ( int32_t i = 0 ; i < vt.m_numSlots ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(niceness);
|
|
// skip emptyies
|
|
if ( vt.isEmpty(i) ) continue;
|
|
// get score
|
|
int32_t score = vt.getScoreFromSlot ( i );
|
|
// skip if too small
|
|
if ( score <= 1 ) continue;
|
|
// get key
|
|
int32_t key = *(int32_t *)vt.getKeyFromSlot ( i );
|
|
// grab string
|
|
char *str = *(char **)ptrTable.getValue ( &key );
|
|
// must be there
|
|
if ( ! str ) { char *xx=NULL;*xx=0; }
|
|
// skip if empty string... was it just "the "???
|
|
if ( ! *str ) continue;
|
|
// store score first
|
|
*(int32_t *)rptr = score;
|
|
// skip it
|
|
rptr += 4;
|
|
// get length
|
|
int32_t len = gbstrlen(str);
|
|
// store in reply buf, include \0
|
|
gbmemcpy ( rptr , str , len + 1 );
|
|
// skip over
|
|
rptr += len + 1;
|
|
// sanity check
|
|
if ( rptr > rend ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
// the reply size may be less than what we allocated
|
|
int32_t replySize = rptr - reply;
|
|
|
|
// set it
|
|
//if ( st->m_votes ) *reply = 1;
|
|
//else *reply = 0;
|
|
// all done with this
|
|
mdelete ( st , sizeof(State2c),"msg2cfr");
|
|
delete (st);
|
|
// send the 1 byte reply
|
|
g_udpServer.sendReply_ass(reply,replySize,reply,need,slot);
|
|
}
|
|
|
|
// the msg2c request was asking for the address of a possible place name,
|
|
// like "Tingley Coliseum", so this sends back the address
|
|
// . the request was for a place name rather than a street, so pick the
//   placedb record whose street num hash got the most votes in st->m_list
//   and send its serialized address back
// . on a tie we prefer a record that appears to carry a lat/lon
// . reply layout: int32_t numVotes, double lat, double lon (the most agreed
//   upon lat/lon for the winning street num hash, from getBestLatLon()),
//   then the winner's \0-terminated address string
// . sends an empty reply if the list had no records
// . always frees "st" and sends a reply or error reply on st->m_slot
void sendBackAddress ( State2c *st ) {

	// shortcut
	RdbList *list = &st->m_list;
	// winning street address
	char    *winner    = NULL;
	int32_t  winnerSnh = 0;
	// and max count
	int32_t  max       = 0;
	// get this
	UdpSlot *slot = st->m_slot;
	// set myBigHash for comparing
	key128_t *pk = &st->m_placedbKey;
	int64_t myBigHash = g_placedb.getBigHash(pk);

	// set up a little voting table
	char vbuf[30000];
	HashTableX vt;
	vt.set ( 4 , 4 , 100 ,vbuf,30000,false,0 ,"addrvt");

	while ( ! list->isExhausted() ) {
		// breathe
		QUICKPOLL ( st->m_niceness );
		// get it
		char *data = list->getCurrentData();
		// get the key
		key128_t k; list->getCurrentKey(&k);
		// skip it
		list->skipCurrentRecord();
		// sanity check
		if (g_placedb.getBigHash(&k)!=myBigHash) {char*xx=NULL;*xx=0;}

		// now his key's street hash was replaced with his placename1
		// hash, and (TODO) his street num hash was made to include
		// his actual street name hash, so we can use this to make sure
		// everyone agrees on the same street address

		// get the street num hash of that record
		int32_t snh = g_placedb.getStreetNumHash ( &k );
		// add a vote for it
		if ( ! vt.addTerm32 ( &snh ) ) {
			// all done with this
			mdelete ( st , sizeof(State2c),"msg2cfr");
			delete (st);
			g_udpServer.sendErrorReply ( slot,g_errno );
			return;
		}

		// . does this guy have a latitude/longitude in him?
		// . skip over 10 semicolons to see, but never scan past
		//   the end of the string
		char *pp = data;
		int32_t scount = 0;
		for ( ; *pp && scount < 10 ; pp++ )
			if ( *pp == ';' ) scount++;
		// bad? a record this short is corrupt
		if ( scount < 5 ) { char *xx=NULL;*xx=0; }
		// . check it out. if the string ended early there is no
		//   lat/lon, and pp[1] would be out of bounds
		bool hasLatLon = ( scount == 10 && *pp && pp[1] != ';' );
		// get his count
		int32_t score = vt.getScore32 ( &snh );
		// new max?
		if ( score < max ) continue;
		// on tie, pref if has lat/lon
		if ( score == max && ! hasLatLon ) continue;
		// . remember the new max. without this every record beat
		//   the initial max of 0 and the last record always won.
		max = score;
		// point to winning address then
		winner = data;
		// set this for loop below
		winnerSnh = snh;
	}

	//log("placedb: input=%s output=%s",st->m_addrStr,winner);

	// if no winner, send empty reply
	if ( ! winner ) {
		mdelete ( st , sizeof(State2c),"msg2cfr");
		delete (st);
		g_udpServer.sendReply_ass(NULL,0,NULL,0,slot);
		return;
	}

	// get the most agreed upon lat/lon among the winner's records.
	// this rescans the list from the top.
	double  bestLat;
	double  bestLon;
	int32_t numVotes;
	if ( ! getBestLatLon ( list,
			       &bestLat,
			       &bestLon,
			       &numVotes,
			       st->m_niceness,
			       winnerSnh ) ) {
		// all done with this
		mdelete ( st , sizeof(State2c),"msg2cfr");
		delete (st);
		g_udpServer.sendErrorReply ( slot,g_errno );
		return;
	}

	// . CAREFUL: the winner is referencing into st->m_list so we can
	//   not free "st" until the winner has been copied into the reply

	int32_t wlen = gbstrlen(winner);
	// how can this be?
	if ( wlen <= 1 ) { char *xx=NULL;*xx=0; }
	// send winner back. add in extra for lat/lon
	int32_t need = wlen + 48;
	// use the slot's tmp buf to hold the reply if we can
	char *reply = slot->m_tmpBuf;
	// make buf if we need to
	if ( need > TMPBUFSIZE )
		reply = (char *)mmalloc ( need , "msg2creply");
	// return error on error
	if ( ! reply ) {
		mdelete ( st , sizeof(State2c),"msg2cfr");
		delete (st);
		g_udpServer.sendErrorReply ( slot,g_errno );
		return;
	}
	// now store here
	char *p = reply;
	*(int32_t *)p = numVotes; p += 4;
	*(double  *)p = bestLat ; p += sizeof(double);
	*(double  *)p = bestLon ; p += sizeof(double);
	// how much to copy, include \0
	int32_t bytes = wlen + 1;
	// copy the whole winning address string
	gbmemcpy ( p , winner , bytes ); p += bytes;
	// how big is reply?
	int32_t replySize = p - reply;
	// sanity check
	if ( replySize > need ) { char *xx=NULL;*xx=0; }
	// free it last since winner points into it
	mdelete ( st , sizeof(State2c),"msg2cfr");
	delete (st);
	// send the winner back
	g_udpServer.sendReply_ass(reply,replySize,reply,need,slot);
}
|
|
|
|
// returns false and sets g_errno on error
|
|
// . scans "list" from the top and finds the lat/lon pair that the most
//   records agree on
// . if "winnerSnh" is non-zero only records whose key has that street num
//   hash are counted; a zero "winnerSnh" counts every record
// . records without both a latitude and a longitude are ignored
// . sets *bestLat/*bestLon to NO_LATITUDE/NO_LONGITUDE and *numVotes to 0
//   if no record qualified
// . returns false and sets g_errno on error
bool getBestLatLon ( RdbList *list ,
		     double  *bestLat ,
		     double  *bestLon ,
		     int32_t *numVotes ,
		     int32_t  niceness ,
		     int32_t  winnerSnh ) {
	// reset ptr, since the caller probably looped over the list already
	list->resetListPtr();
	// no best now
	int32_t bestScore = 0;
	*bestLat  = NO_LATITUDE;
	*bestLon  = NO_LONGITUDE;
	*numVotes = 0;
	// voting table for lat/lon. 8 byte key, 4 byte score.
	HashTableX gpsTable;
	char gbuf[1024];
	gpsTable.set ( 8 , 4 , 32 , gbuf , 1024 , false , niceness,"addrgps");
	// now loop again looking for the best lat/lon of the winning street
	while ( ! list->isExhausted() ) {
		// breathe
		QUICKPOLL ( niceness );
		// get it
		char *data = list->getCurrentData();
		// get the key
		key128_t k; list->getCurrentKey(&k);
		// skip it
		list->skipCurrentRecord();
		// get the street num hash of that record
		int32_t snh = g_placedb.getStreetNumHash ( &k );
		// skip if not winner
		if ( winnerSnh && snh != winnerSnh ) continue;
		// grab it from the string
		double lat;
		double lon;
		getLatLonFromStr ( data , &lat , &lon );
		// skip if either not there
		if ( lat == NO_LATITUDE  ) continue;
		if ( lon == NO_LONGITUDE ) continue;
		// sanity check, we hash the doubles as 64-bit ints below
		if ( sizeof(double) != 8 ) { char *xx=NULL;*xx=0; }
		// . get hash for them from the raw bits of each double
		// . the first line used to read "(int64_t *)" followed by a
		//   mis-decoded character instead of "&lat;"
		int64_t h1 = *(int64_t *)&lat;
		int64_t h2 = *(int64_t *)&lon;
		int64_t h  = (h1<<1) ^ h2;
		// add to table
		if ( ! gpsTable.addTerm ( &h ) )
			return false;
		// get score
		int32_t score = gpsTable.getScore ( &h );
		// skip if not best. on a tie the earlier pair stays.
		if ( score <= bestScore ) continue;
		// otherwise set it
		*bestLat  = lat;
		*bestLon  = lon;
		bestScore = score;
	}
	*numVotes = bestScore;
	return true;
}
|
|
|
|
uint8_t getCountryIdFromAddrStr ( char *addr ) {
|
|
char *p = addr;
|
|
int32_t scount = 0;
|
|
for ( ; scount < 7 ; p++ )
|
|
if ( *p == ';' ) scount++;
|
|
// empty? assume US then
|
|
if ( *p == ';' ) return CRID_US;
|
|
// map abbr to crid
|
|
uint8_t crid = getCountryId ( p );
|
|
return crid;
|
|
}
|
|
|
|
// . returns a ptr to the char right after the 11th semicolon of "data",
//   which is where the latitude field of a serialized address starts
// . if "data" has fewer than 11 semicolons this returns a ptr to its
//   terminating \0 rather than scanning past the end of the string
char *getLatLonPtrFromStr ( char *data ) {
	// now point to latitude,longitude
	// skip city,state,zip,something,hash,ip
	char *latitudePtr = data;
	int32_t scount = 0;
	for ( ; *latitudePtr && scount < 11 ; latitudePtr++ )
		if ( *latitudePtr == ';' ) scount++;
	// pts past that ';' (or at the \0 if the string was too short)
	return latitudePtr;
}
|
|
|
|
void getLatLonFromStr ( char *data, double *lat, double *lon ) {
|
|
// set lat lon
|
|
*lat = NO_LATITUDE;
|
|
*lon = NO_LONGITUDE;
|
|
// now point to latitude,longitude
|
|
// skip city,state,zip,something,hash,ip
|
|
char *latitudePtr = getLatLonPtrFromStr ( data );
|
|
// find end of it
|
|
char *latitudeEnd = latitudePtr;
|
|
// this may not be incremented at all if we have no latitude
|
|
for ( ; *latitudeEnd != ';' ; latitudeEnd++ );
|
|
// if we had something, then assign it
|
|
if ( *latitudePtr != ';' )
|
|
*lat = atod2(latitudePtr,latitudeEnd-latitudePtr);
|
|
// skip to l
|
|
char *longitudePtr = latitudeEnd + 1;
|
|
// need this now
|
|
//char *dataEnd = data + dataSize;
|
|
// this may not be incremented at all if we have no latitude
|
|
char *longitudeEnd = longitudePtr;
|
|
// this may not be incremented at all if we have no latitude
|
|
for ( ; *longitudeEnd && *longitudeEnd != ';' ; longitudeEnd++ );
|
|
// . this is the last item so it is already \0 terminated
|
|
// . sometimes is not \0 terminated because it is a sequence of
|
|
// replies serialized into our reply buffer, m_sb
|
|
if ( *longitudePtr && *longitudePtr != ';' )
|
|
*lon = atod2(longitudePtr,longitudeEnd-longitudePtr);
|
|
// sanity check
|
|
if ( *lon == 0.0 || *lat == 0.0 ) {
|
|
log("addr: bad 0.0 lon or lat");
|
|
*lat = NO_LATITUDE;
|
|
*lon = NO_LONGITUDE;
|
|
}
|
|
}
|
|
|
|
|
|
//
|
|
// used by Events.cpp and by Dates.cpp
|
|
//
|
|
|
|
// . sort comparator for an array of "Place *", see setFirstPlaceNums()
// . orders places by their starting word position, m_a
// . on a tie the place with m_address set comes first, because that
//   means it came from an inlined or verified address. if that does not
//   decide it, the place with m_alias set comes first.
// . two places that tie on all of that compare equal. we used to return
//   -1 for both argument orders when both had m_address (or both had
//   m_alias), which is not a consistent ordering for a sort.
int streetcmp ( const void *arg1 , const void *arg2 ) {
	// get the places
	Place *street1 = *(Place **)arg1;
	Place *street2 = *(Place **)arg2;
	// get word position
	int32_t a1 = street1->m_a;
	int32_t a2 = street2->m_a;
	// same position? then use the tie breakers
	if ( a1 == a2 ) {
		// prefer the one whose m_address is set
		bool hasAddr1 = ( street1->m_address != NULL );
		bool hasAddr2 = ( street2->m_address != NULL );
		if ( hasAddr1 != hasAddr2 ) return hasAddr1 ? -1 : 1;
		// then the one whose m_alias is set
		bool hasAlias1 = ( street1->m_alias != NULL );
		bool hasAlias2 = ( street2->m_alias != NULL );
		if ( hasAlias1 != hasAlias2 ) return hasAlias1 ? -1 : 1;
		// truly tied
		return 0;
	}
	// sanity check
	if ( a1 < 0 ) { char *xx=NULL;*xx=0; }
	if ( a2 < 0 ) { char *xx=NULL;*xx=0; }
	// compare. both are non-negative here so this can not overflow.
	return ( a1 - a2);
}
|
|
|
|
// . allow "store hours" addresses to telescope up without limit
|
|
// . only store streets now that have PLF2_AFTER_AT set, or are a street
|
|
// name like "404 John NE"
|
|
// . and store streets in addresses that have verified street, name1 or name2
|
|
// OR are inlined
|
|
// . returns false and sets g_errno on error
|
|
bool Addresses::setFirstPlaceNums ( ) {
|
|
|
|
// no double calls
|
|
//if ( m_sorted ) { char *xx=NULL;*xx=0; }
|
|
if ( m_sorted ) {
|
|
mfree ( m_sorted , m_sortedSize , "asortbuf");
|
|
m_sorted = NULL;
|
|
m_sortedValid = false;
|
|
}
|
|
|
|
//char sbuf[10000];
|
|
// set the sorted[] array which consists of addresses
|
|
// sorted by their street position, or in the
|
|
// case if PLF2_IS_NAME addresses, their place name 1 position
|
|
//Place **sorted = (Place **)sbuf;
|
|
// how much space do we need?
|
|
int32_t need = (m_am.getNumPtrs() + m_sm.getNumPtrs())* 4;
|
|
// alloc if we need to
|
|
m_sorted = (Place **)mmalloc(need,"getaddrtab");
|
|
if ( ! m_sorted ) return false;
|
|
m_sortedValid = true;
|
|
// store for freeing
|
|
m_sortedSize = need;
|
|
// reset count
|
|
m_numSorted = 0;
|
|
|
|
|
|
//////////////////////////////////
|
|
//
|
|
// add streets from m_streets[]
|
|
//
|
|
//////////////////////////////////
|
|
int32_t lasta1 = -1;
|
|
for ( int32_t i = 0 ; i < m_sm.getNumPtrs() ; i++ ) {
|
|
// give up control
|
|
QUICKPOLL(m_niceness);
|
|
// get streets #i
|
|
Place *street = (Place *)m_sm.getPtr(i);
|
|
// skip if po box. causes us to miss setting DF_STORE_HOURS
|
|
// for a date because there is a PO box as well as the
|
|
// bldg street address in the "store hours" section.
|
|
if ( street->m_flags2 & PLF2_IS_POBOX ) continue;
|
|
// is the street name really a place name?
|
|
bool isName = ( street->m_flags2 & PLF2_IS_NAME );
|
|
// assume not a good place
|
|
bool good = false;
|
|
// is our street really a place name
|
|
if ( street->m_flags2 & PLF2_AFTER_AT ) good = true;
|
|
// intersections are good
|
|
if ( street->m_flags2 & PLF2_INTERSECTION ) good = true;
|
|
// if it is a verified place name, allow it through too!
|
|
Address *aa = street->m_address;
|
|
if ( aa ) {
|
|
if ( aa->m_flags&AF_VERIFIED_PLACE_NAME_1) good = true;
|
|
if ( aa->m_flags&AF_VERIFIED_PLACE_NAME_2) good = true;
|
|
}
|
|
// . allow an aliases street name to be ok
|
|
// . helps fix zvents.com invalid mariner url even though
|
|
// "The Filling Station" is really after an at... but we
|
|
// were not picking that up before because of another bug
|
|
// which is now fixed.
|
|
if ( street->m_alias ) good = true; // afterAt = true;
|
|
// get the address or the alias, whichever is non-NULL, if any
|
|
Address *ax = aa;
|
|
if ( ! ax ) ax = street->m_alias;
|
|
// sometimes we re-nege on our lat lon address we added because
|
|
// it was ambiguous because their were multiple lat/lon pairs
|
|
// and we didn't know which one was right. we really should
|
|
// delete them i guess up there but i am not sure they were
|
|
// last on stack? this is for addresses that are like
|
|
// after at like "at Norquay" and they have a latlon only
|
|
// flag...
|
|
if ( ax && (ax->m_flags3 & AF2_LATLON) ) {
|
|
// make sure lat/lon is not AMBIG_LATITUDE
|
|
if ( ax->m_latitude == AMBIG_LATITUDE ||
|
|
ax->m_longitude == AMBIG_LONGITUDE ||
|
|
ax->m_latitude == NO_LATITUDE ||
|
|
ax->m_longitude == NO_LONGITUDE )
|
|
continue;
|
|
}
|
|
// is not a name, that's good!
|
|
if ( ! isName ) good = true;
|
|
// must have address or be after at OR it must be a
|
|
// street name like "400 John NE"
|
|
if ( ! good ) continue;
|
|
// skip if it is a place to buy tickets and not really
|
|
// an actual event place
|
|
//if ( street->m_flags2 & PLF2_TICKET_PLACE ) continue;
|
|
// do add po box addresses, the above loop will just
|
|
// disqualify the event if this is the best address for it!
|
|
//if ( street->m_flags2 & PLF2_IS_POBOX ) continue;
|
|
// get the street name word range
|
|
int32_t a1 = street->m_a;
|
|
int32_t b1 = street->m_b;
|
|
// sanity check
|
|
if ( a1 < 0 || b1 < 0 ) { char *xx=NULL;*xx=0; }
|
|
// stop dups
|
|
if ( a1 == lasta1 ) continue;
|
|
// update
|
|
lasta1 = a1;
|
|
// add it
|
|
m_sorted[m_numSorted++] = street;
|
|
}
|
|
|
|
|
|
// . now sort the array by the street/name word start number
|
|
// . i.e. sort streets by their position on the page
|
|
// . in case of ties prefers the street with m_address set, because
|
|
// that indicates it came from an inlined or verified address
|
|
gbqsort ( m_sorted , m_numSorted , 4 , streetcmp , m_niceness );
|
|
|
|
///////////////////////////////////////
|
|
//
|
|
// . remove duplicate places
|
|
// . fix "classes at Blue Tribe School. contact tammy.
|
|
// School 111 Maple SE Abq NM" for panjea.org.
|
|
// . basically an address can have a place name and a street
|
|
// and our streets array treats both kinds separately, so we
|
|
// have to detect if what we think is a different place name
|
|
// is really the place name of a street name here
|
|
//
|
|
///////////////////////////////////////
|
|
int32_t numSorted3 = 0;
|
|
for ( int32_t i = 0 ; i < m_numSorted - 1 ; i++ ) {
|
|
// give up control
|
|
QUICKPOLL(m_niceness);
|
|
// get address #i
|
|
Place *street = m_sorted[i];
|
|
// get next
|
|
Place *next = m_sorted[i+1];
|
|
// re-add "street"
|
|
bool add = false;
|
|
// we must eb after at
|
|
if ( ! ( street->m_flags2 & PLF2_AFTER_AT ) ) add = true;
|
|
// and he must be a regular street
|
|
if ( next->m_flags2 & PLF2_AFTER_AT ) add = true;
|
|
if ( next->m_flags2 & PLF2_IS_NAME ) add = true;
|
|
// and must be kinda close together
|
|
if ( next->m_alnumA - street->m_alnumA > 10 ) add = true;
|
|
// fix "Grants Middle Schoole ... 111 Easterday NE" for
|
|
// www.superpages.com/yellowpages/C-Junior%2BHigh%2B%2526%2BMiddle%2BSchools/S-NM/T-Albuquerque
|
|
// because we get two places for that one address.
|
|
// one place is "Grants Middle School" as a fake street place
|
|
// name, and the other is the address with the actual street
|
|
// which also incorporates the same "Grants Middle School" as
|
|
// its name... so stop that!
|
|
if ( next->m_address &&
|
|
next->m_address->m_name1 &&
|
|
next->m_address->m_name1->m_a == street->m_a )
|
|
add = false;
|
|
if ( next->m_address &&
|
|
next->m_address->m_name2 &&
|
|
next->m_address->m_name2->m_a == street->m_a )
|
|
add = false;
|
|
// ok, ignore us!
|
|
if ( ! add ) continue;
|
|
// re-add it
|
|
m_sorted[numSorted3++] = street;
|
|
}
|
|
// last one
|
|
if ( m_numSorted > 0 )
|
|
m_sorted[numSorted3++] = m_sorted[m_numSorted-1];
|
|
|
|
// replace with the smaller deduped number
|
|
m_numSorted = numSorted3;
|
|
|
|
// clear all in case of re-call
|
|
for ( int32_t i = 0 ; i < m_sections->m_numSections ; i++ ) {
|
|
QUICKPOLL ( m_niceness );
|
|
Section *sn = &m_sections->m_sections[i];
|
|
sn->m_firstPlaceNum = -1;
|
|
}
|
|
|
|
///////////////////////////////
|
|
//
|
|
// loop over streets in sorted[] and hash their sections
|
|
//
|
|
///////////////////////////////
|
|
int32_t lasta = -1;
|
|
for ( int32_t i = 0 ; i < m_numSorted ; i++ ) {
|
|
// give up control
|
|
QUICKPOLL(m_niceness);
|
|
// get address #i
|
|
Place *street = m_sorted[i];
|
|
// get word position, word #a
|
|
int32_t a = street->m_a;
|
|
if ( a == lasta ) continue;
|
|
lasta = a;
|
|
if ( a < 0 ) { char *xx=NULL;*xx=0; }
|
|
// get section
|
|
Section *sa = m_sections->m_sectionPtrs[a];
|
|
// telescope up
|
|
for ( ; sa ; sa = sa->m_parent ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// stop if already has one
|
|
if ( sa->m_firstPlaceNum >= 0 ) break;
|
|
// we are the first place contained by this section
|
|
sa->m_firstPlaceNum = i;
|
|
}
|
|
|
|
// dbug
|
|
/*
|
|
int32_t b = street->m_b;
|
|
SafeBuf tmp;
|
|
char *start = m_wptrs[a];
|
|
char *end = m_wptrs[b-1]+m_words->m_wordLens[b-1];
|
|
tmp.safeMemcpy(start,end-start);
|
|
tmp.pushChar(0);
|
|
Section **sp = m_sections->m_sectionPtrs;
|
|
int32_t sa = -1;
|
|
int32_t aa = -1;
|
|
if ( street->m_address ) sa = street->m_address->m_street->m_a;
|
|
if ( street->m_alias ) sa = street->m_alias->m_street->m_a;
|
|
log("dbug: (a=%"INT32",b=%"INT32") sec=%"XINT32" %s addr=%"INT32" alias=%"INT32" "
|
|
"url=%s",
|
|
a,b,
|
|
(int32_t)sp[a],
|
|
tmp.getBufStart() ,
|
|
(int32_t)sa,//street->m_address,
|
|
(int32_t)aa,//street->m_alias,
|
|
m_url->m_url);
|
|
*/
|
|
}
|
|
return true;
|
|
}
|
|
|
|
|
|
// . returns false and sets g_errno on error
|
|
// . "i" is the word position of "and" or "&"
|
|
bool Addresses::addIntersection ( int32_t i , int32_t alnumPos ) {
|
|
|
|
//if ( m_ns >= MAX_STREETS ) return true;
|
|
|
|
bool hadUpper = false;
|
|
|
|
//////////
|
|
//
|
|
// to the LEFT of the "and"
|
|
//
|
|
//////////
|
|
|
|
int32_t good1 = -1;
|
|
int32_t j1 = i;
|
|
int32_t numPos1 = -1;
|
|
int32_t lastBeforeNum1 = -1;
|
|
int32_t routePos1 = -1;
|
|
int32_t ap1 = alnumPos;
|
|
int32_t dirCount1 = 0;
|
|
int32_t wcount1 = 0;
|
|
int32_t icount1 = 0;
|
|
bool firstWord = true;
|
|
int64_t lastWid1 = 0LL;
|
|
bool explicit1 = false;
|
|
bool hadPage1 = false;
|
|
bool lastWasStreetInd = false;
|
|
bool badLeftStreetEnd = false;
|
|
|
|
// do not back up past this
|
|
int32_t minj = i - 14; if ( minj < 0 ) minj = 0;
|
|
// now back up to the left, see if that is a street
|
|
for ( int32_t j = i - 1 ; j >= minj ; j-- ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// count it
|
|
if ( m_wids[j] ) ap1--;
|
|
// between is a total killer!
|
|
if ( m_wids[j] == h_between ) return true;
|
|
if ( m_wids[j] == h_btwn ) return true;
|
|
if ( m_wids[j] == h_bet ) return true;
|
|
// try this out
|
|
if ( ! isInStreet( j ) ) break;
|
|
// if not alnum word, keep going
|
|
if ( ! m_wids[j] ) continue;
|
|
|
|
// detect "corner of the page"
|
|
if ( m_wids[j] == h_page ) hadPage1 = true;
|
|
|
|
if ( m_wids[j] == h_intersection && lastWid1 == h_of ) {
|
|
explicit1 = true;
|
|
// include "intersection of" so it is not in name
|
|
good1 = j;
|
|
break;
|
|
}
|
|
|
|
if ( m_wids[j] == h_corner && lastWid1 == h_of ) {
|
|
// ignore "corner of the page"
|
|
if ( hadPage1 ) return true;
|
|
explicit1 = true;
|
|
// include "corner of" so it is not in name
|
|
good1 = j;
|
|
break;
|
|
}
|
|
|
|
// save it
|
|
bool saved3 = lastWasStreetInd;
|
|
// reset this
|
|
lastWasStreetInd = false;
|
|
|
|
// first word we encounter must be a directional or
|
|
// street indicator
|
|
if ( firstWord ) {
|
|
firstWord = false;
|
|
IndDesc *id;
|
|
id=(IndDesc *)g_indicators.getValue(&m_wids[j]);
|
|
bool ok = false;
|
|
if ( id && (id->m_bit & IND_DIR ) &&
|
|
// must have space or comma before us to prevent
|
|
// "tom's and jerry's"
|
|
j>0 &&
|
|
( is_wspace_a(m_wptrs[j][-1]) ||
|
|
m_wptrs[j][-1]==',') ) {
|
|
ok = true;
|
|
dirCount1++;
|
|
icount1++;
|
|
}
|
|
if ( id && (id->m_bit & IND_STREET) ) {
|
|
lastWasStreetInd = true;
|
|
ok = true;
|
|
icount1++;
|
|
}
|
|
// "14th and W St. NW" for gwair.org
|
|
// "i-25 & hwy 301"
|
|
if ( is_digit(m_wptrs[j][0]) &&
|
|
// fix "21+ & I.D. Required" for groundkontrol.com
|
|
is_alnum_a(m_wptrs[j][m_wlens[j]-1]) )
|
|
ok = true;
|
|
// otherwise, stop on any other word
|
|
if ( ! ok ) {
|
|
badLeftStreetEnd = true;
|
|
//break;
|
|
}
|
|
}
|
|
|
|
bool isNum = false;
|
|
// this is good "4th and 5th"
|
|
if ( is_digit(m_wptrs[j][0]) && m_wlens[j] >= 3 &&
|
|
m_wptrs[j][m_wlens[j]-2] == 's' &&
|
|
m_wptrs[j][m_wlens[j]-1] == 't' )
|
|
good1 = j;
|
|
else if ( is_digit(m_wptrs[j][0]) && m_wlens[j] >= 3 &&
|
|
m_wptrs[j][m_wlens[j]-2] == 'n' &&
|
|
m_wptrs[j][m_wlens[j]-1] == 'd' )
|
|
good1 = j;
|
|
else if ( is_digit(m_wptrs[j][0]) && m_wlens[j] >= 3 &&
|
|
m_wptrs[j][m_wlens[j]-2] == 'r' &&
|
|
m_wptrs[j][m_wlens[j]-1] == 'd' )
|
|
good1 = j;
|
|
else if ( is_digit(m_wptrs[j][0]) && m_wlens[j] >= 3 &&
|
|
m_wptrs[j][m_wlens[j]-2] == 't' &&
|
|
m_wptrs[j][m_wlens[j]-1] == 'h' )
|
|
good1 = j;
|
|
// numbers not allowed unless after "route", etc.
|
|
else if ( m_words->isNum(j) ) {
|
|
numPos1 = j;
|
|
isNum = true;
|
|
}
|
|
|
|
// allow "79 st & shore rd" for
|
|
// www.nycgovparks.org/facilities/playgrounds
|
|
if ( isNum && saved3 ) good1 = j;
|
|
|
|
// record this
|
|
if ( numPos1 == -1 ) lastBeforeNum1 = j;
|
|
|
|
// this one too
|
|
if ( m_wids[j] == h_route ) routePos1 = j;
|
|
if ( m_wids[j] == h_rte ) routePos1 = j;
|
|
if ( m_wids[j] == h_rt ) routePos1 = j;
|
|
if ( m_wids[j] == h_hwy ) routePos1 = j;
|
|
if ( m_wids[j] == h_highway ) routePos1 = j;
|
|
if ( m_wids[j] == h_hiway ) routePos1 = j;
|
|
if ( m_wids[j] == h_road ) routePos1 = j;
|
|
if ( m_wids[j] == h_rd ) routePos1 = j;
|
|
// "Locatd on US 64 and New Mexico Highway X"
|
|
if ( m_wids[j] == h_us ) routePos1 = j;
|
|
if ( m_wids[j] == h_interstate ) routePos1 = j;
|
|
if ( m_wids[j] == h_i ) routePos1 = j;
|
|
|
|
// stop if word after the number is not a route
|
|
if ( ! isNum && numPos1 >= 0 && routePos1 == -1 )
|
|
break;
|
|
|
|
// save it
|
|
lastWid1 = m_wids[j];
|
|
|
|
// no mixing caps
|
|
if ( s_lc.isInTable ( &m_wids[j] ) ) continue;
|
|
// cap?
|
|
if ( is_upper_utf8(m_wptrs[j]) ) hadUpper = true;
|
|
// do not include a lower case guy
|
|
else if ( hadUpper && is_lower_utf8(m_wptrs[j]) )
|
|
break;
|
|
|
|
// count it
|
|
wcount1++;
|
|
|
|
// note it
|
|
j1 = j;
|
|
}
|
|
|
|
// scan to left looking for "corner of" etc
|
|
int32_t minsj = j1 - 10; if ( minsj < 0 ) minsj = 0;
|
|
bool hadOf = false;
|
|
for ( int32_t sj = j1 - 1 ; sj > minsj ; sj-- ) {
|
|
// skip tags etc
|
|
if ( ! m_wids[sj] ) continue;
|
|
// of is ok
|
|
if ( m_wids[sj] == h_of ) { hadOf = true; continue; }
|
|
// bad i fno of
|
|
if ( ! hadOf ) break;
|
|
// corner of intersection of
|
|
if ( m_wids[sj] != h_intersection &&
|
|
m_wids[sj] != h_corner )
|
|
break;
|
|
explicit1 = true;
|
|
break;
|
|
}
|
|
|
|
if ( badLeftStreetEnd && ! explicit1 ) return true;
|
|
|
|
// . return if only indicator in street name.
|
|
// . fixes "NE and NW parts of Metro Atlanta."
|
|
if ( ! explicit1 && dirCount1 == wcount1 ) return true;
|
|
|
|
// reset it to before the pure number if no "route" before number
|
|
if ( ! explicit1 && numPos1 >= 0 && routePos1 != numPos1 - 2 ) {
|
|
j1 = lastBeforeNum1;
|
|
// if negative give up!
|
|
if ( j1 < 0 ) return true;
|
|
}
|
|
// use good1 if we had that!
|
|
if ( good1 >= 0 && good1 < j1 )
|
|
j1 = good1;
|
|
|
|
// return if no street to the left
|
|
if ( j1 == i ) return true;
|
|
|
|
//////////
|
|
//
|
|
// to the right of the "and"
|
|
//
|
|
//////////
|
|
|
|
bool good2 = false;
|
|
int32_t icount2 = 0;
|
|
int32_t dirCount2 = 0;
|
|
int32_t wcount2 = 0;
|
|
int32_t j2 = i;
|
|
bool hadStreetInd = false;
|
|
bool hadDirInd = false;
|
|
int32_t numPos2 = -1;
|
|
int32_t lastBeforeNum2 = -1;
|
|
int32_t routePos2 = -1;
|
|
int32_t ap2 = alnumPos;
|
|
bool hadCornerDirInd2 = false;
|
|
bool firstWord2 = true;
|
|
|
|
// do not exceed this
|
|
int32_t maxj = i + 14; if ( maxj > m_nw ) maxj = m_nw;
|
|
// need a street to the right as well
|
|
for ( int32_t j = i + 1 ; j < maxj ; j++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// count it
|
|
if ( m_wids[j] ) ap2++;
|
|
// try this out
|
|
if ( ! isInStreet( j ) ) break;
|
|
// skip if not alnum at this point
|
|
if ( ! m_wids[j] ) continue;
|
|
|
|
bool savedFirstWord2 = firstWord2;
|
|
if ( firstWord2 ) firstWord2 = false;
|
|
|
|
// if we hit a street indicator, only a dir can follow
|
|
IndDesc *id=(IndDesc *)g_indicators.getValue(&m_wids[j]);
|
|
if ( id && (id->m_bit & IND_STREET) && ! savedFirstWord2 ) {
|
|
hadStreetInd = true;
|
|
icount2++;
|
|
good2 = true;
|
|
}
|
|
else if ( id && (id->m_bit & IND_DIR ) ) {
|
|
hadDirInd = true;
|
|
// fix "Central Ave SE and Richmond SE Albuquerque"
|
|
if ( m_wlens[j] == 2 )
|
|
hadCornerDirInd2 = true;
|
|
icount2++;
|
|
dirCount2++;
|
|
good2 = true;
|
|
}
|
|
else if ( hadStreetInd || hadCornerDirInd2 )
|
|
break;
|
|
|
|
// this is good "4th and 5th"
|
|
if ( is_digit(m_wptrs[j][0]) && m_wlens[j] >= 3 &&
|
|
m_wptrs[j][m_wlens[j]-2] == 's' &&
|
|
m_wptrs[j][m_wlens[j]-1] == 't' )
|
|
good2 = true;
|
|
else if ( is_digit(m_wptrs[j][0]) && m_wlens[j] >= 3 &&
|
|
m_wptrs[j][m_wlens[j]-2] == 'n' &&
|
|
m_wptrs[j][m_wlens[j]-1] == 'd' )
|
|
good2 = true;
|
|
else if ( is_digit(m_wptrs[j][0]) && m_wlens[j] >= 3 &&
|
|
m_wptrs[j][m_wlens[j]-2] == 'r' &&
|
|
m_wptrs[j][m_wlens[j]-1] == 'd' )
|
|
good2 = true;
|
|
else if ( is_digit(m_wptrs[j][0]) && m_wlens[j] >= 3 &&
|
|
m_wptrs[j][m_wlens[j]-2] == 't' &&
|
|
m_wptrs[j][m_wlens[j]-1] == 'h' )
|
|
good2 = true;
|
|
// numbers not allowed unless after "route", etc.
|
|
else if ( m_words->isNum(j) ) {
|
|
numPos2 = j;
|
|
// stop if had no route
|
|
if ( routePos2 == -1 ) break;
|
|
}
|
|
|
|
// fix for 14th and Curtis Denver CO
|
|
if ( cityAdm1Follows ( j ) ) {
|
|
good2 = true;
|
|
break;
|
|
}
|
|
|
|
// record this
|
|
if ( numPos2 == -1 ) lastBeforeNum2 = j;
|
|
|
|
// this one too
|
|
if ( m_wids[j] == h_route ) routePos2 = j;
|
|
if ( m_wids[j] == h_rte ) routePos2 = j;
|
|
if ( m_wids[j] == h_rt ) routePos2 = j;
|
|
if ( m_wids[j] == h_hwy ) routePos2 = j;
|
|
if ( m_wids[j] == h_highway ) routePos2 = j;
|
|
if ( m_wids[j] == h_hiway ) routePos2 = j;
|
|
if ( m_wids[j] == h_road ) routePos2 = j;
|
|
if ( m_wids[j] == h_rd ) routePos2 = j;
|
|
// "Locatd on US 64 and New Mexico Highway X"
|
|
if ( m_wids[j] == h_us ) routePos2 = j;
|
|
if ( m_wids[j] == h_interstate ) routePos2 = j;
|
|
if ( m_wids[j] == h_i ) routePos2 = j;
|
|
|
|
// no mixing caps
|
|
if ( s_lc.isInTable ( &m_wids[j] ) ) continue;
|
|
// cap?
|
|
if ( is_upper_utf8(m_wptrs[j]) ) hadUpper = true;
|
|
// do not include a lower case guy
|
|
else if ( hadUpper && is_lower_utf8(m_wptrs[j]) )
|
|
break;
|
|
|
|
// count it
|
|
wcount2++;
|
|
|
|
// note it
|
|
j2 = j;
|
|
}
|
|
|
|
// reset it to before the pure number if no "route" before number
|
|
if ( numPos2 >= 0 && routePos2 != numPos2 - 2 ) {
|
|
j2 = lastBeforeNum2;
|
|
// if negative give up!
|
|
if ( j2 < 0 ) return true;
|
|
}
|
|
|
|
// fix "First Nations North and South" and
|
|
// "Broadway South East and North East"
|
|
if ( ! explicit1 && wcount2 == dirCount2 ) return true;
|
|
|
|
// trim after the "route x"
|
|
if ( numPos2 == routePos2 + 2 )
|
|
j2 = numPos2;
|
|
|
|
// return if no street to the left
|
|
if ( j2 == i ) return true;
|
|
|
|
// these are indivative of good street names
|
|
if ( routePos2 >= 0 ) good2 = true;
|
|
|
|
// no need for street indicator on right street if we have
|
|
// "intersection of" or whatever to left of left street
|
|
|
|
|
|
// need to have a "good" street name in there
|
|
if ( ! explicit1 && ! good2 ) return true;
|
|
|
|
int32_t a = j1;
|
|
int32_t b = j2+1;
|
|
|
|
// . no starting/ending with stop word
|
|
// . i-25 is exception!
|
|
if ( m_wids[j1] != h_i && m_words->isStopWord(j1) ) return true;
|
|
if ( m_wids[j2] != h_i && m_words->isStopWord(j2) ) return true;
|
|
|
|
// count alnums from a to b
|
|
int32_t ac = 0;
|
|
for ( int32_t i = a ; i < b ; i++ )
|
|
if ( m_wids[i] ) ac++;
|
|
|
|
// add the INTERSECTION
|
|
Place *street = (Place *)m_sm.getMem(sizeof(Place));
|
|
if ( ! street ) return false;
|
|
street->m_a = a;
|
|
street->m_b = b;
|
|
street->m_alnumA = ap1;
|
|
street->m_alnumB = ap1 + ac; // ap2+1;
|
|
street->m_type = PT_STREET;
|
|
street->m_str = m_wptrs[j1];
|
|
street->m_strlen = m_wptrs[j2]-m_wptrs[j1]+m_wlens[j2];
|
|
//street->m_adm1[0] = 0;
|
|
//street->m_adm1[1] = 0;
|
|
//street->m_crid = 0;
|
|
street->m_flags2 = PLF2_INTERSECTION;
|
|
street->m_bits = 0;
|
|
street->m_address = NULL;
|
|
street->m_alias = NULL;
|
|
// set its m_hash member
|
|
setHashes ( street , m_words , m_niceness );
|
|
|
|
// prevent overlap with next street
|
|
//lastb = m_street->m_b;
|
|
// . need to know this for getting place name
|
|
// . place name must also be in upper case if
|
|
// the street is...
|
|
// . TODO: do we need this???? mdw
|
|
//if ( uc == 1 ) m_street->m_bits |= PLF_HAS_UPPER;
|
|
// set some bits
|
|
for ( int32_t k = a ; m_bits && k < b ; k++ )
|
|
m_bits->m_bits[k] |= D_IS_IN_STREET;
|
|
// point to next street
|
|
//m_ns++;
|
|
return true;
|
|
}
|
|
|
|
// . returns false and sets g_errno on error
|
|
// . sets *good to true when we have a completed street
|
|
bool Addresses::isInStreet ( int32_t j ) {
|
|
// we can never contain a tag
|
|
if ( m_tids[j] ) {
|
|
// skip if <sup>
|
|
if ( m_tids[j] == TAG_SUP ) return true;
|
|
if ( m_tids[j] == (TAG_SUP|BACKBIT) ) return true;
|
|
// . crap but micorosft front page has brs
|
|
// . "intersection of Interstate 405 and Sunset <br>Boulevard"
|
|
if ( m_tids[j] == TAG_BR ) return true;
|
|
// be a little more sensitive with this since it is easier
|
|
// to have false positives because we do not have a street
|
|
// number!
|
|
return false;
|
|
}
|
|
// are we punctuation?
|
|
if ( ! m_wids[j] ) {
|
|
// single space is ok
|
|
if (m_wptrs[j][0]==' '&&m_wlens[j]==1) return true;
|
|
// double space is ok
|
|
if (m_wptrs[j][0]==' '&&m_wptrs[j][1]==' '&& m_wlens[j]==2)
|
|
return true;
|
|
// period only after abbreviation
|
|
if ( m_wptrs[j][0] == '.' && j > 0 &&
|
|
isAbbr(m_wids[j-1])&&
|
|
m_wptrs[j][1] == ' ' && m_wlens[j]==2 )
|
|
return true;
|
|
// . period after a single letter as well
|
|
// . N. M.
|
|
if ( m_wptrs[j][0] == '.' && j > 0 &&
|
|
m_wlens[j-1]==1 &&
|
|
// fix "8. wall street"
|
|
!is_digit(m_wptrs[j-1][0]) &&
|
|
m_wptrs[j][1] == ' ' &&
|
|
m_wlens[j]==2 )
|
|
return true;
|
|
// N.M.
|
|
if ( m_wptrs[j][0] == '.' && j > 0 &&
|
|
// fix 1."5 miles west"
|
|
!is_digit(m_wptrs[j-1][0]) &&
|
|
m_wlens[j-1]==1 && m_wlens[j]==1 )
|
|
return true;
|
|
// quote: The Noyes House 2525 "N" Avenue
|
|
// National
|
|
if (m_wptrs[j][0]=='\"'&&m_wptrs[j][1]==' ' &&
|
|
m_wlens[j]==2&&
|
|
// 'closer to 37"' is not a street name!
|
|
!is_digit(m_wptrs[j-1][0]))
|
|
return true;
|
|
if (m_wptrs[j][0]==' ' &&m_wptrs[j][1]=='\"'&&
|
|
m_wlens[j]==2) return true;
|
|
// punct mark: st. michael's drive
|
|
if (m_wptrs[j][0]=='\''&&m_wlens[j]==1) return true;
|
|
// mosby's run: utf8 apostrophe
|
|
if (m_wlens[j]==3&&
|
|
m_wptrs[j][0]==-30 &&
|
|
m_wptrs[j][1]==-128 &&
|
|
m_wptrs[j][2]==-103 )
|
|
return true;
|
|
// village of los ranchos growers' market
|
|
if (m_wptrs[j][0]=='\''&&m_wptrs[j][1]==' '&&
|
|
m_wlens[j]==2) return true;
|
|
// hyphens usually bad, but x-y is ok.
|
|
if(m_wptrs[j][0]=='-'&&m_wlens[j]==1&&j>0&&j+1<m_nw&&
|
|
m_words->isAlpha(j-1)&&m_words->isAlpha(j+1))return true;
|
|
// i-25 is ok now too
|
|
if (m_wptrs[j][0]=='-'&&j>0&&m_wids[j-1]==h_i&&j+1<m_nw&&
|
|
is_digit(m_wptrs[j+1][0]) )
|
|
return true;
|
|
// fix "3650-A Hwy 528..."
|
|
//if(m_wptrs[j][0]=='-'&&m_wlens[j]==1&&j==i+1&&
|
|
// j+1<m_nw&&m_wlens[j+1]==1&&
|
|
// is_alpha_a(m_wptrs[j+1][0])) return true;
|
|
// "620-624 Central Ave SW." (El Rey)
|
|
//if ( hasRange &&j==i+1 ) return true;
|
|
// fix for 4909-15 Hawkins NE" for ceder.net
|
|
//if(j+1<m_nw&&
|
|
// m_wlens[j+1]==2&&is_digit(m_wptrs[j+1][0])&&
|
|
// m_wlens[j-1]>=4&&is_digit(m_wptrs[j-1][0]) ) {
|
|
// hasHyphenAddress = true;
|
|
// return true;
|
|
//}
|
|
// sequence of whitespace is ok
|
|
int32_t k; for(k=0;k<m_wlens[j];k++)
|
|
if(!is_wspace_a(m_wptrs[j][k])) break;
|
|
if(k==m_wlens[j]) return true;
|
|
// '/' is ok if part of a fraction!
|
|
//if( j == fractionj ) return true;
|
|
// . comma allowed only b4 directional indicatr
|
|
// . "131 Monroe St, NE"
|
|
// . no because we got a false positive:
|
|
// "1024 4th street, sw corner..."
|
|
// . ok, this is back again now! BUT... need
|
|
// to make sure a tag or city name follows it
|
|
// . crap, now we got
|
|
// "5305 Gibson, S.E. <b>Albuquerque ..."
|
|
if ( m_wptrs[j][0]!=',' ) return false;
|
|
if ( m_wptrs[j][1]!=' ' ) return false;
|
|
if ( j+3>= m_nw ) return false;
|
|
char gotDir = 0;
|
|
if ( m_wids[j+1] == h_ne ) gotDir = 2;
|
|
if ( m_wids[j+1] == h_nw ) gotDir = 2;
|
|
if ( m_wids[j+1] == h_se ) gotDir = 2;
|
|
if ( m_wids[j+1] == h_sw ) gotDir = 2;
|
|
if ( m_wids[j+1] == h_n&&m_wids[j+3]==h_e)gotDir=4;
|
|
if ( m_wids[j+1] == h_n&&m_wids[j+3]==h_w)gotDir=4;
|
|
if ( m_wids[j+1] == h_s&&m_wids[j+3]==h_e)gotDir=4;
|
|
if ( m_wids[j+1] == h_s&&m_wids[j+3]==h_w)gotDir=4;
|
|
if ( ! gotDir ) return false;
|
|
// its great if tag follows the dir indicator
|
|
if ( m_tids[j+gotDir] ) return true;
|
|
// or a punct then a tag
|
|
if ( m_tids[j+gotDir+1] ) return true;
|
|
// ok, a cap word must follow
|
|
if ( ! is_upper_utf8 (m_wptrs[j+gotDir+1])) return false;
|
|
// we are good
|
|
return true;
|
|
}
|
|
|
|
|
|
// skip dates, not allowed in there
|
|
if ( m_bits && (m_bits->m_bits[j] & D_IS_IN_DATE) )
|
|
return false;
|
|
|
|
// . otherwise we are alphanumeric
|
|
// . more than 10 is too many for a street
|
|
//if ( alnumsInPhrase++ >= 10 ) return false;
|
|
|
|
// stop at "at"
|
|
if ( m_wids[j] == h_at )
|
|
return false;
|
|
|
|
// stop at "and"
|
|
if ( m_wids[j] == h_and )
|
|
return false;
|
|
|
|
// stop at "between"
|
|
if ( m_wids[j] == h_between )
|
|
return false;
|
|
if ( m_wids[j] == h_btwn )
|
|
return false;
|
|
if ( m_wids[j] == h_bet )
|
|
return false;
|
|
|
|
// stop at "location"
|
|
if ( m_wids[j] == h_location )
|
|
return false;
|
|
|
|
// stop at "location"
|
|
if ( m_wids[j] == h_intersection )
|
|
return false;
|
|
|
|
int64_t postWid = 0LL;
|
|
int32_t maxj = j+15; if ( j > m_nw ) j = m_nw;
|
|
for ( int32_t pi = j + 1 ; pi < maxj ; pi++ ) {
|
|
if ( ! m_wids[pi] ) continue;
|
|
postWid = m_wids[pi];
|
|
break;
|
|
}
|
|
|
|
// skip if indicator
|
|
//IndDesc *id=(IndDesc *)g_indicators.getValue(&m_wids[j]);
|
|
//if ( id && (id->m_bit & IND_STREET) ) return true;
|
|
//if ( id && (id->m_bit & IND_DIR ) ) return true;
|
|
|
|
return true;
|
|
}
|
|
|
|
// . returns a 64-bit mask with a single bit set: the bit whose number is
//   the position of the state in the s_states[] array
// . "stateAbbr" is the two letter state abbreviation; only its first two
//   bytes are hashed, case insensitively
// . returns 0 if the abbreviation is not in g_states
uint64_t getAdm1Bits ( char *stateAbbr ) {
	uint64_t h64 = hash64Lower_a( stateAbbr , 2 );
	StateDesc **sdp = (StateDesc **)g_states.getValue(&h64);
	// this happens if we have a foreign latlon only address in the contact
	// address tags and we call setFromStr() on that. obviously
	// foreign states will not be in here! so allow this for now and
	// do not core!
	if ( ! sdp ) return 0;
	// get position in the s_states[] array
	int32_t pos = (int32_t)((*sdp) - s_states);
	// . that is the shifter
	// . shift an unsigned 1 so that bit #63 is well defined too
	return ((uint64_t)1) << pos;
}
|
|
|
|
// . search for all PCLI entries in /geo/allCountries.txt
|
|
// . grep out into countries.txt and process into countries.dat
|
|
// . remove
|
|
// "Kingdom of"
|
|
// "Republic of"
|
|
// "Democratic Republic of"
|
|
// "Oriental Republic of"
|
|
// "* Republic"
|
|
// "United Republic of"
|
|
// "Socialist Republic of"
|
|
// "Independent State of"
|
|
// "State of the" (Vatican City)
|
|
// "Federative Republic of"
|
|
|
|
/*
|
|
SafeBuf g_countryDescBuf;
|
|
|
|
// . g_countryDescBuf consists of a list of these
|
|
// . the hashtablex g_countryTable maps a country name word hash to
|
|
// a CountryDesc pointer
|
|
class CountryDesc {
|
|
public:
|
|
// country id in one byte
|
|
uint8_t m_crid;
|
|
// two letter, upper case countrycode includes \0
|
|
char *m_countryCode[3];
|
|
// country population, up to 4B
|
|
uint32_t m_population;
|
|
// centroid
|
|
float m_latitude;
|
|
float m_longitude;
|
|
// box radius i guess
|
|
float m_radius;
|
|
// . ptr into SafeBuf g_countryNameBuf
|
|
// . all the country names with their languages like:
|
|
// us-fi-nl=egypt,de-es=egypti,...
|
|
// . comma separated
|
|
// . \0 terminated
|
|
char m_nameBufPtr[];
|
|
// . get the name of the country in the designated language
|
|
// . langAbbr is the two letter lang abbreviation (en=english,etc.)
|
|
// . sometimes it can be 3 letters! nds, nrm, ... see
|
|
// /geo/geonames/iso-languagecodes.txt
|
|
// . sometimes there are names of the place with no associated language
|
|
// as well, so watch out for that
|
|
char *getCountryName ( char *langAbbr );
|
|
};
|
|
|
|
// . a huge string of all the countries and corresponding data
|
|
// . we parse this up into the g_countries table where each slot is a
|
|
// CountryDesc and CountryDesc::m_nameBufPtr references into g_countryData.
|
|
// . we need to know the language of each spelling of the country name
|
|
// so we can display that name if someone's browser says they only know
|
|
// Spanish or something, we'd say Estados instead of States or whatever.
|
|
// . well the alternateNames.txt file has the alternate names of each
|
|
// city or country or state and the language it is from, so use that...
|
|
// . make a name list like "en=Egypt" to indicate its called Egypt in english
|
|
// . cs.en.nb.nn.sk=Egypt,fy.nl=Egypte,fi=Egypti
|
|
char *g_countryData = "";
|
|
|
|
HashTableX g_countries;
|
|
|
|
bool setCountryTable ( ) {
|
|
return true;
|
|
}
|
|
|
|
// access g_countries table to find it
|
|
CountryDesc *getCountryDesc ( int64_t wid ) {
|
|
return NULL;
|
|
}
|
|
|
|
// two letter country code
|
|
CountryDesc *getCountryDesc ( char *countryCode ) {
|
|
int64_t wid = hash64Lower_a ( countryCode , 2 );
|
|
return getCountryDesc ( wid );
|
|
}
|
|
|
|
Place *getCountryPlace ( int32_t a , int32_t alnumPos , Words *words ) {
|
|
return NULL;
|
|
}
|
|
*/
|
|
|
|
// . look up the StateDesc for a two letter state abbreviation
// . only the first two bytes of "stateAbbr" are hashed (lower-cased)
// . returns NULL if g_states has no entry for it
StateDesc *getStateDesc ( char *stateAbbr ) {
	uint64_t key = hash64Lower_a( stateAbbr , 2 );
	StateDesc **slot = (StateDesc **)g_states.getValue(&key);
	return slot ? *slot : NULL;
}
|
|
|
|
// . returns the i'th entry of the s_states[] array
// . returns NULL if "i" is out of range, so we never breach the array
StateDesc *getStateDescByNum ( int32_t i ) {
	int32_t numStates =
		(int32_t)sizeof(s_states) / (int32_t)sizeof(StateDesc);
	if ( i < 0 || i >= numStates ) return NULL;
	return &s_states[i];
}
|
|
|
|
|
|
// . "h" points to the hash of a state name/abbreviation
// . returns the POSITION of that state in the s_states[] array
// . returns -1 if the hash is not in g_states
inline int32_t getStateOffset ( int64_t *h ) {
	StateDesc **slot = (StateDesc **)g_states.getValue(h);
	if ( slot ) return (int32_t)((*slot) - s_states);
	return -1;
}
|
|
|
|
// from hash of state
|
|
uint64_t getStateBitFromHash ( int64_t *h ) {
|
|
int32_t pos = getStateOffset ( h );
|
|
if ( pos < 0 ) return 0;
|
|
return (1LL << pos);
|
|
}
|
|
|
|
// . inverse of getAdm1Bits()/getStateBitFromHash(): given a mask with
//   exactly one bit set, return the s_states[] entry at that bit position
// . cores (deliberate NULL write) if "bit" matches no entry
StateDesc *getStateDescFromBits ( uint64_t bit ) {
	// . item count
	// . divide by the size of an ELEMENT, not of a pointer, otherwise
	//   we scan past the end of s_states[]
	int32_t n = (int32_t)(sizeof(s_states) / sizeof(s_states[0]));
	// a 64-bit mask can not describe more than 64 states, and
	// shifting by 64 or more is undefined
	if ( n > 64 ) n = 64;
	for ( int32_t i = 0 ; i < n ; i++ ) {
		// check bits
		if ( (((uint64_t)1)<<i) == bit ) return &s_states[i];
	}
	// sanity check
	char *xx=NULL;*xx=0;
	return NULL;
}
|
|
|
|
// . returns the m_adm1 string of the s_states[] entry whose position is
//   given by getBitPosLL() on the mask "bit"
// . no range check is done: the bit must map to a valid entry
char *getStateAbbr ( uint64_t bit ) {
	// use getBitPosLL() for speed rather than scanning
	int32_t idx = getBitPosLL((uint8_t *)&bit);
	StateDesc *sd = &s_states[idx];
	return sd->m_adm1;
}
|
|
|
|
// . like getWordXorHash() but for a string of length "slen" that is not
//   NULL terminated
// . temporarily writes a \0 into s[slen] and restores the byte afterwards,
//   so "s" must be writable and s[slen] must be a valid byte
int64_t getWordXorHash2 ( char *s , int32_t slen ) {
	char *endp  = s + slen;
	// remember the byte we are about to clobber
	char  saved = *endp;
	*endp = '\0';
	int64_t hash = getWordXorHash ( s );
	// put back
	*endp = saved;
	return hash;
}
|
|
|
|
// . breaks the NULL terminated string "s" into words and combines the
//   non-zero word ids into one hash
// . for each such word the hash is shifted left one bit, then xor'ed with
//   the word id, so word order matters
int64_t getWordXorHash ( char *s ) {
	Words words;
	words.set9 ( s , 0 );
	int64_t *ids = words.m_wordIds;
	int32_t  nw  = words.m_numWords;
	uint64_t hash = 0LL;
	for ( int32_t k = 0 ; k < nw ; k++ ) {
		// only words with a non-zero id take part
		if ( ids[k] )
			hash = ( hash << 1LL ) ^ (uint64_t)ids[k];
	}
	return hash;
}
|
|
|
|
|
|
#include "GeoIP.h"
|
|
#include "GeoIPCity.h"
|
|
|
|
// returns "p" itself, or the literal "N/A" if "p" is NULL, so the result
// can always be printed with %s
static const char * _mk_NA( const char * p ){
	if ( p ) return p;
	return "N/A";
}
|
|
|
|
// try "geolite city" free software
|
|
// mwells@titan:~/tmp2/GeoIP-1.4.6/apps$ geoiplookup -f GeoLiteCity.dat 67.16.94.2
|
|
// GeoIP City Edition, Rev 1: US, NM, Albuquerque, N/A, 35.102501, -106.611702, 505
|
|
// i guess i can just include that library in the gb source
|
|
// . i would say just trace the code and just grab the code we need
|
|
// and re-code into gb. BUT do indeed keep the GeoLiteCity.dat file
|
|
// that is only 28MB so we should load it up at start time
|
|
// . put our api code into here down below
|
|
bool getIPLocation ( int32_t ip ,
|
|
double *lat ,
|
|
double *lon ,
|
|
double *radius ,
|
|
char **city ,
|
|
char **state ,
|
|
char **ctry ,
|
|
char *buf ,
|
|
int32_t bufSize ) {
|
|
|
|
//static int s_i = 0;
|
|
|
|
// assume none
|
|
*city = NULL;
|
|
*state = NULL;
|
|
if ( ctry ) *ctry = NULL;
|
|
|
|
static GeoIP *s_gi = NULL;
|
|
|
|
char *sip = (char *)&ip;
|
|
// if ip is local use abq, nm
|
|
if ( sip[0]==10 ||
|
|
// 192.168.x.x is local
|
|
(sip[0]==(char)192 && sip[1]==(char)168) ||
|
|
// 127.0.0.1
|
|
ip==(int32_t)16777343 ) {
|
|
char *p = buf;
|
|
*city = p;
|
|
p += sprintf ( p , "Albuquerque" );
|
|
*p++ = '\0';
|
|
*state = p;
|
|
p += sprintf ( p , "NM" );
|
|
*p++ = '\0';
|
|
*ctry = p;
|
|
p += sprintf ( p , "US" );
|
|
// use this
|
|
*lat = 35.10438;
|
|
*lon = -106.6270;
|
|
return true;
|
|
}
|
|
|
|
|
|
if ( ! s_gi ) {
|
|
// make full pathc
|
|
char full[1024];
|
|
sprintf(full,"%s%s",g_hostdb.m_dir,"GeoLiteCity.dat");
|
|
s_gi = GeoIP_open(full, GEOIP_STANDARD);
|
|
if ( ! s_gi ) {
|
|
log("gb: could not open %s",full);
|
|
return false;
|
|
}
|
|
//s_i = GeoIP_database_edition(s_gi);
|
|
}
|
|
|
|
// geoiplookup(gi,hostname,i);
|
|
|
|
//char hostname[64];
|
|
//sprintf(hostname,"%s",iptoa(ip));
|
|
//geoiplookup(gi,hostname,i);
|
|
//uint32_t ipnum = GeoIP_lokupaddress(hostname);
|
|
|
|
// put in network byte order, host to network
|
|
int32_t ipnum = htonl ( ip );
|
|
// temp
|
|
//ipnum = ip;
|
|
|
|
GeoIPRecord *gir = GeoIP_record_by_ipnum(s_gi, ipnum);
|
|
|
|
// false if not found
|
|
if ( ! gir ) return false;
|
|
|
|
log("geoip: "
|
|
//"%s: %s, %s, %s, %s, %f, %f, %d",
|
|
"%s, %s, %s, %s, %f, %f, %d",
|
|
//GeoIPDBDescription[(uint32_t)s_gi->databaseType],
|
|
gir->country_code,
|
|
_mk_NA(gir->region),
|
|
_mk_NA(gir->city),
|
|
_mk_NA(gir->postal_code),
|
|
// %d
|
|
gir->latitude,
|
|
gir->longitude, //gir->metro_code,
|
|
gir->area_code);
|
|
|
|
// transfer
|
|
if ( lat ) *lat = gir->latitude;
|
|
if ( lon ) *lon = gir->longitude;
|
|
// express 20 miles in degrees... one degree is 69 miles
|
|
if ( radius ) *radius = 20.0 / 69.0;
|
|
|
|
// city and state
|
|
char *p = buf;
|
|
int32_t len ;
|
|
|
|
// bogus?
|
|
if ( ! gir->country_code ) return false;
|
|
|
|
if ( ctry ) *ctry = p;
|
|
|
|
//len = gbstrlen(gir->country_code);
|
|
//gbmemcpy ( p , gir->country_code , len + 1 );
|
|
p[0] = gir->country_code[0];
|
|
p[1] = gir->country_code[1];
|
|
p += 2;
|
|
*p++ = '\0';
|
|
|
|
*state = p;
|
|
len = 0;
|
|
if ( gir->region ) len = gbstrlen(gir->region);
|
|
// bogus?
|
|
if ( len == 0 ) return false;
|
|
//gbmemcpy ( p , gir->region , len + 1 );
|
|
// make it all lowercase so we don't core anywhere
|
|
int32_t written = to_lower_alnum_a(gir->region,len,p);
|
|
// sanity
|
|
if ( written != len ) { char *xx=NULL;*xx=0; }
|
|
// skip over what we stored
|
|
p += len ;
|
|
// null term
|
|
*p++ = '\0';
|
|
// get len
|
|
//int32_t plen = gbstrlen(p);
|
|
//p += len + 1;
|
|
|
|
*city = p;
|
|
len = 0;
|
|
if ( gir->city ) len = gbstrlen(gir->city);
|
|
// bogus?
|
|
if ( len == 0 ) return false;
|
|
gbmemcpy ( p , gir->city , len );
|
|
p += len;
|
|
*p++ = '\0';
|
|
|
|
// sanbity check
|
|
if ( p - buf > bufSize ) { char *xx=NULL;*xx=0; }
|
|
|
|
// free this junk too!
|
|
GeoIPRecord_delete ( gir );
|
|
|
|
//free ( gir );
|
|
|
|
//GeoIP_delete(gi);
|
|
|
|
return true;
|
|
}
|
|
|
|
// . looks up "cityId" in the g_timeZones table and copies the latitude
//   and longitude of its CityStateDesc into *lat and *lon
// . returns false, leaving *lat and *lon untouched, if not found
bool getLatLon ( uint32_t cityId , double *lat , double *lon ) {
	int32_t slot = g_timeZones.getSlot ( &cityId );
	if ( slot >= 0 ) {
		CityStateDesc *desc =
			(CityStateDesc *)g_timeZones.getValueFromSlot(slot);
		*lat = desc->m_latitude;
		*lon = desc->m_longitude;
		return true;
	}
	// not found
	return false;
}
|
|
|
|
// or numeric lat/lon
|
|
float getLatLonSpecial ( char *p ,
|
|
char *bufStart,
|
|
char *bufEnd ,
|
|
char *found ) {
|
|
// assume none
|
|
*found = 0;
|
|
// must start with digit
|
|
if ( ! is_digit(*p) ) return 0.0;
|
|
// set start
|
|
char *start = p;
|
|
// negative sign?
|
|
if ( p>bufStart && p[-1] == '-' ) start--;
|
|
// reset counts
|
|
int32_t digitCount = 0;
|
|
int32_t decimalCount = 0;
|
|
// do not scan so far
|
|
char *pmax = p + 20;
|
|
if ( pmax > bufEnd ) pmax = bufEnd;
|
|
// scan until no digit or period
|
|
for ( ; p < pmax ; p++ ) {
|
|
// count the digits
|
|
if ( is_digit(*p) ) {
|
|
digitCount++;
|
|
continue;
|
|
}
|
|
// decimal point is ok
|
|
if ( *p == '.' ) {
|
|
decimalCount++;
|
|
continue;
|
|
}
|
|
// stop on other crap
|
|
break;
|
|
}
|
|
// give up if less than 3 digits encountered
|
|
if ( digitCount < 3 ) return 0.0;
|
|
// some pages have no period in it
|
|
// and we just have to assume the first
|
|
// 3 digits are before the period. like for
|
|
// switchboard.com urls
|
|
if ( decimalCount >= 2 ) return 0.0;
|
|
// convert
|
|
double dval = atod2(start,p-start);
|
|
// fix switchboard.com stuff which has no decimal pt
|
|
if ( decimalCount == 0 ) {
|
|
// how many digits to left of decimal
|
|
int32_t left = 3;
|
|
// make a divisor
|
|
double ddd = 1;
|
|
for ( int32_t vv = 0 ; vv<digitCount-left; vv++)
|
|
ddd *= 10;
|
|
// fix it
|
|
dval /= ddd;
|
|
}
|
|
// bail if bad
|
|
if ( dval < -180.0 || dval > 180.0 ) return 0.0;
|
|
// in the usual decimal it is
|
|
// lat from 24.450000 to 60 (juneau alask) // 47.4666666
|
|
// lon from -71.083333 to -114.1333333
|
|
//char type = 0;
|
|
//if ( dval >= 24.45 && dval <= 60.0 ) type = 1; // lat
|
|
//else if ( dval >= -140.0 && dval <= -66.1 ) type = 2; // lon
|
|
//else log("query: lat/lon point not in our scope. fix!");
|
|
//if ( type == 0 ) return 0.0;
|
|
|
|
*found = 1;//type;
|
|
return dval;
|
|
}
|
|
|
|
// TEST SCRIPT:
|
|
// "where" strings that printTesterPage() feeds to getLatLonFromUserInput()
static char *s_tests[] = {
	// cities, with and without a state or country
	"sf", "sf ca", "sf nm",
	"ottawa ontario", "rio de janeiro", "mexico city",
	// pasadena texas is more popular than california!
	"pasadena",
	"berlin", "berlin, germany",
	"paris", "paris, tx", "paris, ky", "paris, france",
	"homestead", "key west", "santa fe", "san francisco",
	// countries and states
	"poland", "germany",
	"georgia", // the country!!
	"nm", "texas", "mass", "d.c.",
	"washington",// (should be the state)
	"washington d.c.", "washington dc",
	"kentucky", "mexico", "tokyo", "philippines",
	"usa", "united states of america",
	"georgia", // (should be the US state, not the country!)
	// zip codes
	"87109", "90210",
	// things that are not places by themselves
	"taste of germany",
	"kimo theater", // (venue name)
	"pleasant arena", // (venue name)
	"barton road"// (street name test)
};
|
|
|
|
|
|
|
|
|
|
// . prints an html table into "sb", four cells per row, with one cell for
//   each "where" string in s_tests[]
// . each cell shows a static google map centered on the most specific
//   lat/lon that getLatLonFromUserInput() found for the string, followed
//   by the string itself
// . always returns true
bool printTesterPage ( SafeBuf &sb ) {

	sb.safePrintf("<table>");

	int32_t n = sizeof(s_tests)/sizeof(char *);
	for ( int32_t i = 0 ; i < n ; i++ ) {
		// start a new row every four cells
		if ( i % 4 == 0 ) {
			if ( i > 0 )
				sb.safePrintf("</tr>");
			sb.safePrintf("<tr>");
		}
		sb.safePrintf("<td>");
		// print map
		int32_t width  = 200;
		int32_t height = 200;
		// . "radius" is read as well as written by
		//   getLatLonFromUserInput() so it must be initialized
		// . init the rest to "not found" too because
		//   getLatLonFromUserInput() can return false before it
		//   has stored anything into them
		float radius     = 0.0;
		char *where      = s_tests[i];
		float cityLat    = NO_LATITUDE;
		float cityLon    = NO_LONGITUDE;
		float stateLat   = NO_LATITUDE;
		float stateLon   = NO_LONGITUDE;
		float countryLat = NO_LATITUDE;
		float countryLon = NO_LONGITUDE;
		float zipLat     = NO_LATITUDE;
		float zipLon     = NO_LONGITUDE;
		float userLat    = NO_LATITUDE;
		float userLon    = NO_LONGITUDE;
		char  timeZone2  = 0;
		char  useDST     = 0;
		uint8_t ipCrid   = CRID_US;
		char  gbwhereBuf[512];
		int32_t gbwhereBufSize = 500;
		getLatLonFromUserInput ( &radius,
					 where ,
					 &cityLat ,
					 &cityLon ,
					 &stateLat,
					 &stateLon,
					 &countryLat,
					 &countryLon,
					 &zipLat ,
					 &zipLon ,
					 &userLat ,
					 &userLon ,
					 NULL,
					 NULL,
					 NULL,
					 &timeZone2 ,
					 &useDST,
					 ipCrid,
					 gbwhereBuf ,
					 gbwhereBufSize ) ;
		// . get most accurate lat/lon
		// . go from least to most specific so that the most
		//   specific one we have wins
		float lat = NO_LATITUDE;
		float lon = NO_LONGITUDE;
		int32_t zoom = 0; // world
		if ( countryLat != NO_LATITUDE && countryLon != NO_LONGITUDE) {
			lat = countryLat;
			lon = countryLon;
			zoom = 3; // country?
		}
		if ( stateLat != NO_LATITUDE && stateLon != NO_LONGITUDE ) {
			lat = stateLat;
			lon = stateLon;
			zoom = 5; // state?
		}
		if ( cityLat != NO_LATITUDE && cityLon != NO_LONGITUDE ) {
			lat = cityLat;
			lon = cityLon;
			zoom = 7; // city?
		}
		if ( zipLat != NO_LATITUDE && zipLon != NO_LONGITUDE ) {
			lat = zipLat;
			lon = zipLon;
			zoom = 8; // zip?
		}
		if ( userLat != NO_LATITUDE && userLon != NO_LONGITUDE ) {
			lat = userLat;
			lon = userLon;
			zoom = 8;
		}

		sb.safePrintf ( "<img src=\""
				"http://maps.google.com/maps/api/staticmap?"
				"size=%"INT32"x%"INT32"&maptype=roadmap&sensor=false" ,
				width, height );
		sb.safePrintf("&zoom=%"INT32""
			      "&markers="
			      "size:medium"
			      "%%7Ccolor:%s"
			      "%%7Clabel:%c" // letter
			      "%%7C%.07f" // lat
			      "%%2C%.07f" //lon
			      ,zoom
			      ,"red"
			      ,'A'
			      ,lat
			      ,lon );

		sb.safePrintf("\"><br>%s",s_tests[i]);
		sb.safePrintf("</td>");
	}

	sb.safePrintf("</tr></table>");

	return true;
}
|
|
|
|
|
|
//
|
|
// TODO: maybe just print out like 20 google maps for these on a page tester?
|
|
|
|
// . returns false if we could not identify a lat/lon from "where" string
|
|
// . returns false and sets g_errno on error
|
|
// . stores words NOT used for lat/lon determination into gbwhereBuf each
|
|
// word with a "gbwhere:" prefix so we can append gbwhereBuf to the query.
|
|
// . if input is just a state like new mexico, then uses gbwhere:"new mexico"
|
|
// otherwise it could be referring to a street called New Mexico Avenue...
|
|
// . you pass in the radius SearchInput::m_radius as "radius" and we may
|
|
// change it here! if its 0 and we find a lat/lon in the "where" string
|
|
// then we will change it to 100. if the *radius you pass in is non-zero
|
|
// we may change it to zero if we can't find a lat/lon...
|
|
bool getLatLonFromUserInput ( float *radius,
|
|
char *where ,
|
|
float *cityLat ,
|
|
float *cityLon ,
|
|
float *stateLat,
|
|
float *stateLon,
|
|
float *countryLat,
|
|
float *countryLon,
|
|
//double *radius ,
|
|
// . position of the user
|
|
// . we try to set these from the zipcode if ther
|
|
float *zipLat ,
|
|
float *zipLon ,
|
|
float *userLat ,
|
|
float *userLon ,
|
|
PlaceDesc **retCityDesc,
|
|
PlaceDesc **retStateDesc,
|
|
PlaceDesc **retCountryDesc,
|
|
char *timeZone2 ,
|
|
char *useDST,
|
|
// country of search based on ip (two letters)
|
|
uint8_t ipCrid,
|
|
char *gbwhereBuf ,
|
|
int32_t gbwhereBufSize ) {
|
|
|
|
// convert "where" string into a cityId32 so we can convert
|
|
// to a lat/lon by calling getLatLon(cityId)
|
|
|
|
g_errno = 0;
|
|
|
|
Words w;
|
|
if ( ! w.set3 ( where ) ) return false;
|
|
|
|
|
|
|
|
|
|
// express 20 miles in degrees... one degree is 69 miles
|
|
//*radius = 20.0 / 69.0;
|
|
|
|
// start at -1
|
|
int32_t alnumPos = -1;
|
|
|
|
//char *adm1Str = NULL;
|
|
|
|
int32_t cityA = -1;
|
|
int32_t cityB = -1;
|
|
int32_t stateA = -1;
|
|
int32_t stateB = -1;
|
|
int32_t zipA = -1;
|
|
int32_t zipB = -1;
|
|
int32_t countryA = -1;
|
|
int32_t countryB = -1;
|
|
|
|
int32_t cityAlnumA = -1;
|
|
int32_t cityAlnumB = -1;
|
|
int32_t stateAlnumA = -1;
|
|
int32_t stateAlnumB = -1;
|
|
int32_t zipAlnumA = -1;
|
|
int32_t zipAlnumB = -1;
|
|
int32_t countryAlnumA = -1;
|
|
int32_t countryAlnumB = -1;
|
|
|
|
int32_t finalCityA = -1;
|
|
int32_t finalCityB = -1;
|
|
int32_t finalStateA = -1;
|
|
int32_t finalStateB = -1;
|
|
int32_t finalCountryA = -1;
|
|
int32_t finalCountryB = -1;
|
|
int32_t finalZipA = -1;
|
|
int32_t finalZipB = -1;
|
|
|
|
// int16_tcuts
|
|
int64_t *wids = w.getWordIds();
|
|
char **wptrs = w.getWords();
|
|
int32_t *wlens = w.getWordLens();
|
|
|
|
// set lastWidPos
|
|
int64_t lastWidPos = w.m_numWords;
|
|
for ( int32_t i = 0 ; i < w.m_numWords ; i++ )
|
|
if ( wids[i] ) lastWidPos = i;
|
|
|
|
char *bufStart = where;
|
|
char *bufEnd = where + gbstrlen(where);
|
|
|
|
// reset all
|
|
*userLat = NO_LATITUDE;
|
|
*userLon = NO_LONGITUDE;
|
|
*cityLat = NO_LATITUDE;
|
|
*cityLon = NO_LONGITUDE;
|
|
*stateLat = NO_LATITUDE;
|
|
*stateLon = NO_LONGITUDE;
|
|
*countryLat = NO_LATITUDE;
|
|
*countryLon = NO_LONGITUDE;
|
|
*zipLat = NO_LATITUDE;
|
|
*zipLon = NO_LONGITUDE;
|
|
|
|
//int32_t totalAlnums = w.getNumAlnumWords ();
|
|
|
|
// for numeric entries like 58.xxxx -128.yyyy
|
|
bool hasLat = false;
|
|
bool hasLon = false;
|
|
|
|
int32_t ignoreUntil;
|
|
|
|
|
|
// do a initial loop looking for the country to use, otherwise,
|
|
// we'll assume ipcrid. once we establish a country it will be
|
|
// easier to know what state or city is being talked about.
|
|
alnumPos = -1;
|
|
ignoreUntil = -1;
|
|
PlaceDesc *finalCountryDesc = NULL;
|
|
for ( int32_t i = 0 ; i < w.m_numWords ; i++ ) {
|
|
// skip if punct
|
|
if ( ! wids[i] ) continue;
|
|
// alnum pos count
|
|
alnumPos++;
|
|
// fix "united states of america"
|
|
if ( i < ignoreUntil ) continue;
|
|
// country names are unique, so we can set this here
|
|
PlaceDesc *crd = NULL;
|
|
// get the last non-null country in the where box
|
|
getLongestPlaceName_new ( i,
|
|
0 , // alnumPos,
|
|
&w,
|
|
PDF_COUNTRY,
|
|
CRID_ANY,
|
|
NULL, // state abbr
|
|
NULL,//&countryHash64,
|
|
&countryAlnumA,
|
|
&countryAlnumB,
|
|
&countryA,
|
|
&countryB ,
|
|
&crd );
|
|
// record last one
|
|
if ( crd ) {
|
|
finalCountryDesc = crd;
|
|
finalCountryA = countryA;
|
|
finalCountryB = countryB;
|
|
ignoreUntil = countryB;
|
|
}
|
|
}
|
|
|
|
|
|
// assume country based on searcher's IP address
|
|
uint8_t crid = ipCrid;
|
|
// unless a country was specified in the wherebox, then use that
|
|
if ( finalCountryDesc ) crid = finalCountryDesc->m_crid;
|
|
|
|
// do a secondary loop looking for the state before the country
|
|
// or picking the last encountered state. ignore any country we
|
|
// might have found in the first loop. require state be in that
|
|
// country.
|
|
alnumPos = -1;
|
|
ignoreUntil = -1;
|
|
PlaceDesc *finalStateDesc = NULL;
|
|
for ( int32_t i = 0 ; i < w.m_numWords ; i++ ) {
|
|
// skip if punct
|
|
if ( ! wids[i] ) continue;
|
|
// alnum pos count
|
|
alnumPos++;
|
|
// skip if already in use by us
|
|
if ( i < ignoreUntil ) continue;
|
|
// skip if its like a lat/lon
|
|
if ( i+2<w.m_numWords &&
|
|
is_digit(wptrs[i][0]) &&
|
|
wptrs[i][wlens[i]] == '.' &&
|
|
is_digit(wptrs[i][wlens[i]+1]) )
|
|
continue;
|
|
// skip of country words
|
|
//if ( i >= finalCountryA && i < finalCountryB ) continue;
|
|
// country names are unique, so we can set this here
|
|
PlaceDesc *srd = NULL;
|
|
// use this country id (CRID_ANY = 0)
|
|
uint8_t useCrid = CRID_ANY;
|
|
// come back up here with a non-zero crid
|
|
redo:
|
|
// . don't use the countryid to fix "new mexico"...
|
|
// . picks the most popular in case of ties
|
|
getLongestPlaceName_new ( i,
|
|
alnumPos,
|
|
&w,
|
|
PDF_STATE,
|
|
useCrid,
|
|
NULL, // state abbr
|
|
NULL,//&stateHash64,
|
|
&stateAlnumA,
|
|
&stateAlnumB,
|
|
&stateA,
|
|
&stateB ,
|
|
&srd );
|
|
|
|
// if that does not overlap the country we had then
|
|
// re-do it using the country id!!!
|
|
if ( srd &&
|
|
useCrid == CRID_ANY &&
|
|
finalCountryDesc &&
|
|
stateB <= finalCountryA ) {
|
|
useCrid = finalCountryDesc->m_crid;
|
|
goto redo;
|
|
}
|
|
|
|
|
|
// if it is exact overlap and same country... prefer
|
|
// the state. try to fix 'georgia' which is a state and country
|
|
if ( srd &&
|
|
useCrid == CRID_ANY &&
|
|
finalCountryDesc &&
|
|
stateA == finalCountryA &&
|
|
stateB == finalCountryB &&
|
|
finalCountryDesc->m_crid == srd->m_crid &&
|
|
// if in 'mexico' searching for 'mexico' assume the
|
|
// state, and nuke the country...
|
|
ipCrid == srd->m_crid ) {
|
|
ignoreUntil = stateB;
|
|
finalCountryDesc = NULL;
|
|
finalCountryA = -1;
|
|
finalCountryB = -1;
|
|
crid = ipCrid;
|
|
}
|
|
|
|
// otherwise, if NOT in 'mexico' searching for 'mexico'
|
|
// assume the country, not the state in mexico
|
|
if ( srd &&
|
|
useCrid == CRID_ANY &&
|
|
finalCountryDesc &&
|
|
stateA == finalCountryA &&
|
|
stateB == finalCountryB &&
|
|
finalCountryDesc->m_crid == srd->m_crid &&
|
|
// if in 'mexico' searching for 'mexico' assume the
|
|
// state, and nuke the country...
|
|
ipCrid != srd->m_crid ) {
|
|
ignoreUntil = finalCountryB;
|
|
srd = NULL;
|
|
}
|
|
|
|
|
|
// if it is exact overlap and different countries,
|
|
// prefer one that is "crid", the same country as the user!
|
|
// try to fix 'georgia' which is a state and country...
|
|
// in the US we expect georgia the state.
|
|
if ( srd &&
|
|
useCrid == CRID_ANY &&
|
|
finalCountryDesc &&
|
|
stateA == finalCountryA &&
|
|
stateB == finalCountryB &&
|
|
finalCountryDesc->m_crid == ipCrid &&
|
|
srd->m_crid != ipCrid ) {
|
|
ignoreUntil = stateB;
|
|
srd = NULL;
|
|
}
|
|
|
|
// if the state is in the user's country but the country
|
|
// is not the user's country. kill the country descriptor.
|
|
// so 'georgia' in the US will match the state, not
|
|
// 'georgia' the country.
|
|
if ( srd &&
|
|
useCrid == CRID_ANY &&
|
|
finalCountryDesc &&
|
|
stateA == finalCountryA &&
|
|
stateB == finalCountryB &&
|
|
finalCountryDesc->m_crid != ipCrid &&
|
|
srd->m_crid == ipCrid ) {
|
|
ignoreUntil = stateB;
|
|
finalCountryDesc = NULL;
|
|
finalCountryA = -1;
|
|
finalCountryB = -1;
|
|
crid = ipCrid;
|
|
}
|
|
|
|
|
|
// if it does overlap the country, nuke the country then
|
|
// to fix 'new mexico' so country is not 'mexico'
|
|
if ( srd &&
|
|
useCrid == CRID_ANY &&
|
|
finalCountryDesc &&
|
|
stateB > finalCountryA ) {
|
|
finalCountryDesc = NULL;
|
|
finalCountryA = -1;
|
|
finalCountryB = -1;
|
|
crid = ipCrid;
|
|
}
|
|
|
|
// get the last non-null state
|
|
if ( srd ) {
|
|
finalStateDesc = srd;
|
|
finalStateA = stateA;
|
|
finalStateB = stateB;
|
|
ignoreUntil = stateB;
|
|
}
|
|
}
|
|
|
|
// do a third loop looking for the city. ignore any state or country
|
|
// we found in the first two loops. require city be in an state or
|
|
// country we found in the first two loops.
|
|
alnumPos = -1;
|
|
ignoreUntil = -1;
|
|
PlaceDesc *finalCityDesc = NULL;
|
|
for ( int32_t i = 0 ; i < w.m_numWords ; i++ ) {
|
|
// skip if punct
|
|
if ( ! wids[i] ) continue;
|
|
// alnum pos count
|
|
alnumPos++;
|
|
// skip if already in use by us
|
|
if ( i < ignoreUntil ) continue;
|
|
// skip of country words
|
|
// no, was hurting "mexico city" because "mexico" was
|
|
// our country and should have been the city!
|
|
//if ( i >= finalCountryA && i < finalCountryB ) continue;
|
|
// . skip over the state
|
|
// . no, for 'santa fe' it was a state, but we need
|
|
// to comment this line out to contest that.
|
|
//if ( i >= finalStateA && i < finalStateB ) continue;
|
|
// state abbr?
|
|
char *stateAbbr = NULL;
|
|
//if ( finalStateDesc ) stateAbbr = finalStateDesc->m_adm1;
|
|
redoCity:
|
|
// country names are unique, so we can set this here
|
|
PlaceDesc *crd1 = NULL;
|
|
// picks the most popular in case of ties
|
|
getLongestPlaceName_new ( i,
|
|
alnumPos,
|
|
&w,
|
|
PDF_CITY,
|
|
crid,
|
|
stateAbbr,
|
|
NULL,//&cityHash64,
|
|
&cityAlnumA,
|
|
&cityAlnumB,
|
|
&cityA,
|
|
&cityB ,
|
|
&crd1 );
|
|
|
|
// if none found, try not restricting to searcher's
|
|
// country then!!! should fix 'tokyo' since there is no
|
|
// 'tokyo' city in the US at all.
|
|
// crap, then this gets georgia this city in jamaica
|
|
PlaceDesc *crd2 = NULL;
|
|
int32_t city2A;
|
|
int32_t city2B;
|
|
int32_t city2AlnumA;
|
|
int32_t city2AlnumB;
|
|
getLongestPlaceName_new ( i,
|
|
alnumPos,
|
|
&w,
|
|
PDF_CITY,
|
|
CRID_ANY,//crid,
|
|
stateAbbr,
|
|
NULL,//&cityHash64,
|
|
&city2AlnumA,
|
|
&city2AlnumB,
|
|
&city2A,
|
|
&city2B ,
|
|
&crd2 );
|
|
|
|
// default to city in user's country
|
|
PlaceDesc *crd = crd1;
|
|
|
|
// use the worldly city if the local city name does not
|
|
// exist in the user's country.
|
|
if ( ! crd ) {
|
|
crd = crd2;
|
|
cityA = city2A;
|
|
cityB = city2B;
|
|
}
|
|
// if both existed, prefer the longer. if tied. prefer
|
|
// the local one even if its population might be smaller
|
|
if ( crd && crd2 && city2B > cityB ) {
|
|
crd = crd2;
|
|
cityA = city2A;
|
|
cityB = city2B;
|
|
}
|
|
|
|
// if city does NOT overlap the state re-do it using the
|
|
// stateAbbr. constrain to that state then...
|
|
if ( crd &&
|
|
! stateAbbr &&
|
|
finalStateDesc &&
|
|
cityB <= finalStateA ) {
|
|
stateAbbr = finalStateDesc->m_adm1;
|
|
goto redoCity;
|
|
}
|
|
|
|
// if it more than contains the country... nuke the country
|
|
// fixes "mexico city" where it thinks "mexico" is the country
|
|
if ( crd &&
|
|
! stateAbbr &&
|
|
finalCountryDesc &&
|
|
cityA == finalCountryA &&
|
|
cityB > finalCountryB ) {
|
|
ignoreUntil = cityB;
|
|
finalCountryDesc = NULL;
|
|
finalCountryA = -1;
|
|
finalCountryB = -1;
|
|
crid = ipCrid;
|
|
}
|
|
|
|
// do not intersect with country otherwise beyond this point
|
|
if ( i >= finalCountryA && i < finalCountryB ) continue;
|
|
|
|
// if it is exact overlap and same country... prefer state!
|
|
if ( crd &&
|
|
! stateAbbr &&
|
|
finalStateDesc &&
|
|
cityA == finalStateA &&
|
|
cityB == finalStateB &&
|
|
// the state must be in different country now to
|
|
// fix the 'kentucky' query so we do not get
|
|
// 'kentucky, arkansas'
|
|
finalStateDesc->m_crid == crd->m_crid ) {
|
|
ignoreUntil = cityB;
|
|
crd = NULL;
|
|
}
|
|
|
|
// if it equals the state, and we already had a finalCity
|
|
// then toss that city... it's most likely a city/state
|
|
// combo where the state is a city name somewhere as well!
|
|
// fixes 'ottawa, ontario' where it ontario is also a city
|
|
// in the US!
|
|
if ( crd &&
|
|
! stateAbbr &&
|
|
finalStateDesc &&
|
|
finalCityDesc &&
|
|
cityA == finalStateA ) {
|
|
ignoreUntil = finalStateB;
|
|
crd = NULL;
|
|
continue;
|
|
}
|
|
|
|
// if it is exacvt overlap and different countries,
|
|
// prefer one that is "crid"
|
|
if ( crd &&
|
|
! stateAbbr &&
|
|
finalStateDesc &&
|
|
cityA == finalStateA &&
|
|
cityB == finalStateB &&
|
|
// the state must be in different country now to
|
|
// fix the 'kentucky' query so we do not get
|
|
// 'kentucky, arkansas'
|
|
finalStateDesc->m_crid == crid ) {
|
|
ignoreUntil = cityB;
|
|
crd = NULL;
|
|
}
|
|
|
|
// if exact overlap and city is in the user's country,
|
|
// then prefer city and nuke state
|
|
if ( crd &&
|
|
! stateAbbr &&
|
|
finalStateDesc &&
|
|
cityA == finalStateA &&
|
|
cityB == finalStateB &&
|
|
// the state must be in different country now to
|
|
// fix the 'kentucky' query so we do not get
|
|
// 'kentucky, arkansas'
|
|
finalStateDesc->m_crid == crid ) {
|
|
ignoreUntil = cityB;
|
|
finalStateDesc = NULL;
|
|
finalStateA = -1;
|
|
finalStateB = -1;
|
|
}
|
|
|
|
|
|
|
|
// if it does overlap the state, nuke the state then
|
|
// to fix 'key west' query. it thought 'west' was a
|
|
// state in iceland!
|
|
if ( crd &&
|
|
! stateAbbr &&
|
|
finalStateDesc &&
|
|
cityB > finalStateA &&
|
|
// the state must be in different country now to
|
|
// fix the 'kentucky' query so we do not get
|
|
// 'kentucky, arkansas'
|
|
finalStateDesc->m_crid != crd->m_crid &&
|
|
// i added this so 'georgia' the city in jamaica
|
|
// did not beat out the state in the US...
|
|
crd->m_crid == crid ) {
|
|
ignoreUntil = cityB;
|
|
finalStateDesc = NULL;
|
|
finalStateA = -1;
|
|
finalStateB = -1;
|
|
}
|
|
|
|
// BUT kill the city if its the one in a different state
|
|
if ( crd &&
|
|
! stateAbbr &&
|
|
finalStateDesc &&
|
|
cityB > finalStateA &&
|
|
finalStateDesc->m_crid != crd->m_crid &&
|
|
finalStateDesc->m_crid == crid ) {
|
|
ignoreUntil = finalStateB;
|
|
crd = NULL;
|
|
}
|
|
|
|
|
|
// get the last non-null city
|
|
if ( crd ) {
|
|
finalCityDesc = crd;
|
|
finalCityA = cityA;
|
|
finalCityB = cityB;
|
|
ignoreUntil = cityB;
|
|
}
|
|
}
|
|
|
|
// and a 4th loop to get the zip code
|
|
alnumPos = -1;
|
|
for ( int32_t i = 0 ; i < w.m_numWords ; i++ ) {
|
|
// skip if punct
|
|
if ( ! wids[i] ) continue;
|
|
// alnum pos count
|
|
alnumPos++;
|
|
// skip of country words
|
|
if ( i >= finalCountryA && i < finalCountryB ) continue;
|
|
// skip over the state
|
|
if ( i >= finalStateA && i < finalStateB ) continue;
|
|
// skip over city
|
|
if ( i >= finalCityA && i < finalCityB ) continue;
|
|
// we must be in the US
|
|
//if ( crid != CRID_US ) continue;
|
|
// U.S. only for now
|
|
getZip_new ( i,
|
|
alnumPos,
|
|
&w,
|
|
NULL,//&zipHash64,
|
|
NULL,//&zipCityHash64,
|
|
NULL,//&zipStateHash64,
|
|
&zipAlnumA,
|
|
&zipAlnumB,
|
|
&zipA,
|
|
&zipB ,
|
|
zipLat,
|
|
zipLon);
|
|
// skip if none
|
|
if ( *zipLat != NO_LATITUDE ) {
|
|
// set these i guess
|
|
finalZipA = zipA;
|
|
finalZipB = zipB;
|
|
}
|
|
}
|
|
|
|
// loop for numeric lat/lon
|
|
alnumPos = -1;
|
|
ignoreUntil = -1;
|
|
for ( int32_t i = 0 ; i < w.m_numWords ; i++ ) {
|
|
// skip if punct
|
|
if ( ! wids[i] ) continue;
|
|
// ignore
|
|
if ( i < ignoreUntil ) continue;
|
|
// stop if we had any of the above though!
|
|
//if ( finalCityDesc ) break;
|
|
//if ( finalStateDesc ) break;
|
|
//if ( finalCountryDesc ) break;
|
|
//if ( zipA >= 0 ) break;
|
|
// alnum pos count
|
|
alnumPos++;
|
|
char found = 0;
|
|
float ret = getLatLonSpecial(wptrs[i],
|
|
bufStart,
|
|
bufEnd,
|
|
&found);
|
|
if ( found && !hasLat ) { // == 1
|
|
*userLat = ret;
|
|
ignoreUntil = i + 3;
|
|
// the next one should be the lon
|
|
hasLat = true;
|
|
continue;
|
|
}
|
|
if ( found && !hasLon ) { // == 2
|
|
*userLon = ret;
|
|
ignoreUntil = i + 3;
|
|
hasLon = true;
|
|
continue;
|
|
}
|
|
if ( found ) {
|
|
log("query: got extra lat/lon term! ignoring.");
|
|
ignoreUntil = i + 3;
|
|
continue;
|
|
}
|
|
// ok, a random gbwhere: term i guess
|
|
}
|
|
|
|
// if we had a lat/lon toss all else out. should fix location of
|
|
// "33.83660 -116.54670" which thought the 83660 was a french city.
|
|
if ( hasLat && hasLon ) {
|
|
finalCityDesc = NULL;
|
|
finalStateDesc = NULL;
|
|
finalCountryDesc = NULL;
|
|
// nuke other lons/lats too
|
|
*cityLat = NO_LATITUDE;
|
|
*cityLon = NO_LONGITUDE;
|
|
*stateLat = NO_LATITUDE;
|
|
*stateLon = NO_LONGITUDE;
|
|
*countryLat = NO_LATITUDE;
|
|
*countryLon = NO_LONGITUDE;
|
|
*zipLat = NO_LATITUDE;
|
|
*zipLon = NO_LONGITUDE;
|
|
}
|
|
|
|
|
|
/*
|
|
// . if we got a lat and a lon convert...
|
|
// . this was in pageevents.cpp
|
|
if ( hasLat && hasLon ) {
|
|
float distInMilesSquared;
|
|
PlaceDesc *pd;
|
|
pd = getNearestCity_new ( lat ,lon,0, &distInMilesSquared);
|
|
if ( distInMilesSquared < 1000 ) {
|
|
finalCityDesc = pd;
|
|
finalStateDesc =
|
|
*/
|
|
|
|
int32_t nw = w.getNumWords();
|
|
// was it just a city name by itself?
|
|
bool onlyCity = ( ( finalCityA == 0 || finalCityA == 1 ) &&
|
|
( finalCityB == nw || finalCityB == nw-1 ) );
|
|
|
|
// but if only a city and the city name is also a street indicator
|
|
// then cancel it? that way if they put 'avenue' in the where box
|
|
// they do not get 'avenue, maryland' city.
|
|
// they should! and this messed up 'homestead' in florida.
|
|
//if ( onlyCity && finalCityB == finalCityA+1 ) {
|
|
// IndDesc *id = (IndDesc *)g_indicators.getValue(&wids[
|
|
// finalCityA]);
|
|
// if ( id ) onlyCity = false;
|
|
//}
|
|
|
|
// . if only a city name, nuke it if no state
|
|
// . otherwise if we enter 'avenue' into the where box it thinks
|
|
// its "Avenue, Maryland"
|
|
// . but if the whole thing is just this city, then let it fly...
|
|
if ( finalCityDesc &&
|
|
// no country
|
|
! finalCountryDesc &&
|
|
// no state
|
|
! finalStateDesc &&
|
|
// not just city
|
|
! onlyCity &&
|
|
// no zip..
|
|
finalZipA < 0 ) {
|
|
// do not lookup lat/lon...
|
|
finalCityDesc = NULL;
|
|
// nuke these
|
|
finalCityA = -1;
|
|
finalCityB = -1;
|
|
finalStateA = -1;
|
|
finalStateB = -1;
|
|
}
|
|
|
|
// use userlat/lon to make the bounding box. this is usually the
|
|
// city centroid otherwise.
|
|
if ( *userLat != 999.0 && *userLon != 999.0 )
|
|
// uses getNearestCityId() ... need to update to use
|
|
// our new foreign cities...? really only need to add them
|
|
// if they do not use dst i guess...?
|
|
*timeZone2=getTimeZoneFromLatLon(*userLat, *userLon,0,useDST);
|
|
|
|
// this is true if we had a city with a lat/lon
|
|
//bool status = false;
|
|
|
|
if ( finalCityDesc ) {
|
|
// this is easy...
|
|
*timeZone2 = finalCityDesc->m_timeZoneOffset;
|
|
*useDST = false;
|
|
if ( finalCityDesc->m_flags & PDF_USE_DST ) *useDST = true;
|
|
//status = true;
|
|
*cityLat = finalCityDesc->m_lat;
|
|
*cityLon = finalCityDesc->m_lon;
|
|
}
|
|
|
|
if ( finalStateDesc ) {
|
|
*stateLat = finalStateDesc->m_lat;
|
|
*stateLon = finalStateDesc->m_lon;
|
|
}
|
|
|
|
if ( finalCountryDesc ) {
|
|
*countryLat = finalCountryDesc->m_lat;
|
|
*countryLon = finalCountryDesc->m_lon;
|
|
}
|
|
|
|
// did we get a lat/lon from the "where" string?
|
|
bool hasCentroid = false;
|
|
if ( *cityLat != NO_LATITUDE ) hasCentroid = true;
|
|
if ( *zipLat != NO_LATITUDE ) hasCentroid = true;
|
|
if ( *userLat != NO_LATITUDE ) hasCentroid = true;
|
|
// if we got a cityLat or zipLat or userLat and
|
|
// radius is zero then we gotta make it default to 100
|
|
if ( *radius == 0 && hasCentroid ) *radius = 100;
|
|
// if no centroid...
|
|
if ( *radius && ! hasCentroid ) *radius = 0;
|
|
// bitch if no centroid
|
|
if ( ! hasCentroid && w.m_numWords )
|
|
log("query: no centroid for location in wherebox");
|
|
|
|
|
|
if ( *userLat != NO_LATITUDE )
|
|
return true;
|
|
|
|
// reset
|
|
alnumPos = -1;
|
|
ignoreUntil = -1;
|
|
// set the gbwherebuf if provided
|
|
char *p = gbwhereBuf;
|
|
char *pend = p + gbwhereBufSize - 1; // room for \0
|
|
bool gotStuff = false;
|
|
bool firstOne = true;
|
|
for ( int32_t i = 0 ; p && i < w.m_numWords ; i++ ) {
|
|
// count it?
|
|
if ( wids[i] ) alnumPos++;
|
|
// skip punct
|
|
if ( ! wids[i] ) continue;
|
|
// skip if in middle of state or city name
|
|
if ( i < ignoreUntil ) continue;
|
|
// if we had a valid city/state/zip, do not include those
|
|
// in this buffer
|
|
if ( //status &&
|
|
(
|
|
//(i>= finalCountryA && i <finalCountryB ) ||
|
|
//(i>= finalStateA && i <finalStateB) ||
|
|
(i>= finalCityA && i <finalCityB ) ||
|
|
(i>= finalZipA && i < finalZipB ) ) )
|
|
continue;
|
|
// breach check
|
|
if ( p + 8 + wlens[i] + 2 >= pend ) break;
|
|
|
|
if ( ! firstOne ) *p++ = ' ';
|
|
firstOne = false;
|
|
|
|
// now do not break up a state name like 'new mexico' into
|
|
// 'gbwhere:new gbwhere:mexico' but rather do
|
|
// 'gbwhere:newmexico' because when we hash the gbwhere:
|
|
// terms we hash the state adm1 string as 'nm' and its synonym
|
|
// 'newmexico'
|
|
// we can't do this right now because when we index foreign
|
|
// events it is always by lat/lon and we do not know the
|
|
// state it is in necessarily...
|
|
//Place *ps = getStatePlace ( i , alnumPos , &w );
|
|
// only print field header if we got something
|
|
//if ( wids[i] ) gotStuff = true;
|
|
|
|
// . if this is a state name, condense it
|
|
// . TODO: what about 'new mexico avenue' will
|
|
// Address::hash() index 'nm' for that? i would think so
|
|
// if synonyms work right... TEST!
|
|
if ( finalStateDesc &&
|
|
//finalStateDesc->m_crid == CRID_US &&
|
|
i >= finalStateA &&
|
|
i < finalStateB ) {
|
|
// if we got a city ignore though!
|
|
if ( finalCityDesc ) continue;
|
|
// or zip...
|
|
if ( finalZipA >= 0 ) continue;
|
|
// mark it
|
|
gotStuff = true;
|
|
// use gbstate:
|
|
gbmemcpy ( p , "gbeventstatecode:", 17 );
|
|
p += 17;
|
|
// special treatment. a state abbr is always 2 chars
|
|
gbmemcpy ( p , finalStateDesc->m_adm1 , 2 );
|
|
p += 2;
|
|
// store the country as well for that state whether
|
|
// it was entered or not! because some states are
|
|
// reduced to their numeric code like "08" and
|
|
// many countries have that same code!
|
|
char *cc = getCountryCode(finalStateDesc->m_crid);
|
|
gbmemcpy ( p , " gbeventcountrycode:", 20 );
|
|
p += 20;
|
|
gbmemcpy ( p , cc , 2 );
|
|
p += 2;
|
|
// also set the timezone
|
|
*timeZone2 = finalStateDesc->m_timeZoneOffset;
|
|
// and useDST
|
|
*useDST = false;
|
|
if ( finalStateDesc->m_flags&PDF_USE_DST) *useDST=true;
|
|
// ignore until end of state words
|
|
ignoreUntil = finalStateB;
|
|
continue;
|
|
}
|
|
// . we cover foreign states using radius logic up above now
|
|
// . when we index a foreign event we do so using the lat/lon
|
|
// only since we do not support foreign addresses yet
|
|
// . therefore we do not index gbwhere:<adm1> for it...
|
|
// so we use the radius centroid logic above
|
|
// . we could fix this by using getNearestCityId() for
|
|
// the foreign events...
|
|
//else if ( finalStateDesc &&
|
|
// finalStateDesc->m_crid != CRID_US &&
|
|
// i >= finalStateA &&
|
|
// i < finalStateB ) {
|
|
//}
|
|
// same logic for countries
|
|
if ( finalCountryDesc &&
|
|
//finalCountryDesc->m_crid == CRID_US &&
|
|
i >= finalCountryA &&
|
|
i < finalCountryB ) {
|
|
// if we got a city ignore though!
|
|
if ( finalCityDesc ) continue;
|
|
// or zip...
|
|
if ( finalZipA >= 0 ) continue;
|
|
// mark it
|
|
gotStuff = true;
|
|
// special treatment. a country abbr is always 2 chars
|
|
char *cc = getCountryCode(finalCountryDesc->m_crid);
|
|
gbmemcpy ( p , "gbeventcountrycode:", 19 );
|
|
p += 19;
|
|
gbmemcpy ( p , cc , 2 );
|
|
p += 2;
|
|
ignoreUntil = finalCountryB;
|
|
continue;
|
|
}
|
|
// . we cover foreign countrys using radius logic up above now
|
|
// . when we index a foreign event we do so using the lat/lon
|
|
// only since we do not support foreign addresses yet
|
|
// . therefore we do not index gbwhere:<adm1> for it...
|
|
// so we use the radius centroid logic above
|
|
//else if ( finalCountryDesc &&
|
|
// finalCountryDesc->m_crid != CRID_US &&
|
|
// i >= finalCountryA &&
|
|
// i < finalCountryB ) {
|
|
//}
|
|
|
|
// mark it
|
|
gotStuff = true;
|
|
// field header
|
|
gbmemcpy ( p , "gbwhere:", 8 );
|
|
// advance
|
|
p += 8;
|
|
// otherwise store into buffer as is
|
|
gbmemcpy ( p , wptrs[i] , wlens[i] );
|
|
// advance ptr cursor
|
|
p += wlens[i];
|
|
}
|
|
|
|
// delete?
|
|
if ( ! gotStuff ) p = gbwhereBuf;
|
|
// null term if provided
|
|
if ( p ) *p = '\0';
|
|
|
|
// set these
|
|
if ( retCityDesc ) *retCityDesc = finalCityDesc;
|
|
if ( retStateDesc ) *retStateDesc = finalStateDesc;
|
|
if ( retCountryDesc ) *retCountryDesc = finalCountryDesc;
|
|
|
|
return true;//status;
|
|
}
|
|
|
|
// returns false if not found
|
|
bool getCityLatLonFromAddress ( Address *aa , double *lat , double *lon ) {
|
|
|
|
// assume none
|
|
*lat = NO_LATITUDE;
|
|
*lon = NO_LONGITUDE;
|
|
|
|
Place *city = aa->m_city;
|
|
Place *state = aa->m_adm1;
|
|
Place *zip = aa->m_zip;
|
|
|
|
// set these
|
|
uint64_t cityHash64 = 0;
|
|
char *adm1Str = NULL;
|
|
|
|
// set city/state from zip if necessary
|
|
if ( ! city && zip ) {
|
|
cityHash64 = zip->m_cityHash;
|
|
adm1Str = zip->m_adm1;
|
|
}
|
|
if ( city )
|
|
cityHash64 = city->m_cityHash;
|
|
if ( state )
|
|
adm1Str = state->m_adm1;
|
|
|
|
// both must be valid
|
|
if ( ! cityHash64 ) return false;
|
|
if ( ! adm1Str ) return false;
|
|
|
|
// combine the two hashes
|
|
uint32_t cid32 = (uint32_t)getCityId32(cityHash64,adm1Str);
|
|
|
|
// now get the lat lon
|
|
bool status = getLatLon ( cid32 , lat , lon );
|
|
|
|
return status;
|
|
}
|
|
|
|
// . "data" is a ';'-separated address string, something
//   like ";;5815 Wyoming Blvd NE;Albuquerque;87109;NM;;;" ???
// . the zip is taken to be the field that follows the 6th ';'
// . returns a ptr to the first char of that field and sets *zipLen to its
//   length, i.e. up to but not including the next ';' or the end of the
//   string. an empty field has a length of 0.
// . if "data" has fewer than six ';' we return a ptr to its terminating
//   \0 and set *zipLen to 0 rather than scanning off the end of the string
char *getZipPtrFromStr ( char *data , int32_t *zipLen ) {
	char *zipPtr = data;
	// skip over the first six fields
	for ( int32_t scount = 0 ; scount < 6 ; zipPtr++ ) {
		// string ended early, there is no zip field at all
		if ( ! *zipPtr ) {
			*zipLen = 0;
			return zipPtr;
		}
		if ( *zipPtr == ';' ) scount++;
	}
	// zipPtr now pts just past the 6th ';'. find the end of the field.
	char *end = zipPtr;
	while ( *end && *end != ';' ) end++;
	*zipLen = (int32_t)(end - zipPtr);
	return zipPtr;
}
|
|
|
|
bool getZipLatLon ( char *zip ,
|
|
int32_t zipLen ,
|
|
float *zipLat ,
|
|
float *zipLon ) {
|
|
// assume none
|
|
*zipLat = NO_LATITUDE;
|
|
*zipLon = NO_LONGITUDE;
|
|
// only 5 digits i guess
|
|
if ( zipLen != 5 ) return false;
|
|
// hash it
|
|
int64_t zh = getWordXorHash2(zip,zipLen);
|
|
// get it
|
|
ZipDesc *zd = (ZipDesc *)g_zips.getValue(&zh);
|
|
// mine it
|
|
if ( ! zd ) return false;
|
|
*zipLat = zd->m_latitude;
|
|
*zipLon = zd->m_longitude;
|
|
return true;
|
|
}
|
|
|
|
bool getZipLatLonFromStr ( char *addrStr ,
|
|
float *zipLat ,
|
|
float *zipLon ) {
|
|
int32_t zipLen;
|
|
char *zip = getZipPtrFromStr ( addrStr , &zipLen );
|
|
return getZipLatLon ( zip , zipLen , zipLat, zipLon );
|
|
}
|
|
|
|
// . look up the lat/lon of the zip code of address "aa"
// . returns false and sets *zipLat/*zipLon to NO_LATITUDE/NO_LONGITUDE if
//   the address has no zip or getZipLatLon() can not find it
bool getZipLatLonFromAddress ( Address *aa ,
			       float   *zipLat ,
			       float   *zipLon ) {
	// nothing found yet
	*zipLat = NO_LATITUDE;
	*zipLon = NO_LONGITUDE;
	// only addresses that have a zip can be looked up
	Place *zipPlace = aa->m_zip;
	if ( zipPlace )
		return getZipLatLon ( zipPlace->m_str    ,
				      zipPlace->m_strlen ,
				      zipLat ,
				      zipLon );
	return false;
}
|
|
|
|
|
|
// if you just want to call setStr() and have it use stack mem to
|
|
// store up to 10 places, then init the PlaceMem with this very quickly
|
|
void PlaceMem::init ( int32_t poolSize ,
|
|
int32_t initNumPoolPtrs ,
|
|
int32_t initNumPlacePtrs ,
|
|
char *stackMem ,
|
|
int32_t stackMemSize ,
|
|
int32_t niceness ) {
|
|
m_stack = stackMem;
|
|
m_stackSize = stackMemSize;
|
|
m_initNumPoolPtrs = initNumPoolPtrs;
|
|
m_initNumPlacePtrs = initNumPlacePtrs;
|
|
m_poolSize = poolSize;
|
|
m_numPlacePtrsAllocated = 0;
|
|
m_numPoolPtrsAllocated = 0;
|
|
m_numPoolsAllocated = 0;
|
|
m_numPlacePtrs = 0;
|
|
m_cursor = NULL;
|
|
m_cursorEnd = NULL;
|
|
m_cursorPoolNum = -1;
|
|
m_niceness = niceness;
|
|
}
|
|
|
|
// . returns NULL and sets g_errno on error
|
|
// . stores ptr to the returned mem in m_placePtrs[placeNum]
|
|
void *PlaceMem::getMem ( int32_t need ) {
|
|
// sanity
|
|
if ( need > m_poolSize ) { char *xx=NULL;*xx=0; }
|
|
top:
|
|
// return if we got it
|
|
if ( m_cursor && m_cursor + need <= m_cursorEnd ) {
|
|
// do we need to realloc m_placePtrs?
|
|
if ( m_numPlacePtrs + 1 > m_numPlacePtrsAllocated ) {
|
|
if ( m_stack ) { char *xx=NULL;*xx=0; }
|
|
int32_t oldSize =m_numPlacePtrsAllocated * 4;
|
|
int32_t newAlloc =m_numPlacePtrsAllocated + 2000;
|
|
if ( m_numPlacePtrsAllocated == 0 )
|
|
newAlloc = m_initNumPlacePtrs;
|
|
char **newPtrs = (char **)mmalloc(newAlloc*4,"pptbl");
|
|
if ( ! newPtrs ) return NULL;
|
|
for ( int32_t i = 0 ; i < m_numPlacePtrs ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
newPtrs[i] = m_placePtrs[i];
|
|
// to be safe to avoid bad mem writes
|
|
m_placePtrs[i] = NULL;
|
|
}
|
|
//gbmemcpy ( newPtrs, m_placePtrs , m_numPlacePtrs*4);
|
|
mfree ( m_placePtrs , oldSize , "pptbl");
|
|
m_placePtrs = newPtrs;
|
|
m_numPlacePtrsAllocated = newAlloc;
|
|
}
|
|
// store it
|
|
m_placePtrs[m_numPlacePtrs] = m_cursor;
|
|
// increment it
|
|
m_numPlacePtrs++;
|
|
// save cursor so we can return that
|
|
char *returnPtr = m_cursor;
|
|
// increment to next place (need = sizeof(Place) usually)
|
|
m_cursor += need;
|
|
// return the mem for them to use now
|
|
return (void *)returnPtr;
|
|
}
|
|
|
|
// try to use stack
|
|
if ( m_stack && m_numPoolPtrsAllocated == 0 ) {
|
|
// compute min size for stack...
|
|
int32_t need = 0;
|
|
need += m_initNumPoolPtrs * 4 ;
|
|
need += m_initNumPlacePtrs * 4 ;
|
|
need += m_poolSize;
|
|
// make sure stack size is big enough for what they want
|
|
if ( m_stackSize < need ) { char *xx=NULL;*xx=0;}
|
|
// parse it up
|
|
char *p = m_stack;
|
|
m_placePtrs = (char **)p;
|
|
p += m_initNumPlacePtrs * 4;
|
|
m_poolPtrs = (char **)p;
|
|
p += m_initNumPoolPtrs + 4;
|
|
m_poolPtrs[0] = p;
|
|
p += m_poolSize;
|
|
m_numPoolsAllocated = 1;
|
|
m_numPlacePtrsAllocated = m_initNumPlacePtrs;
|
|
m_numPoolPtrsAllocated = m_initNumPoolPtrs;
|
|
m_cursor = m_poolPtrs[0];
|
|
m_cursorEnd = m_cursor + m_poolSize;
|
|
m_cursorPoolNum = 0;
|
|
// give em that mem now i guess
|
|
goto top;
|
|
}
|
|
|
|
// always constrain to stack if provided to make things simple
|
|
if ( m_stack ) { char *xx=NULL;*xx=0; }
|
|
|
|
// add a new pool
|
|
if ( m_numPoolsAllocated + 1 > m_numPoolPtrsAllocated ) {
|
|
int32_t oldSize = m_numPoolPtrsAllocated * 4;
|
|
int32_t newAlloc = m_numPoolPtrsAllocated + 100;
|
|
if ( m_numPoolPtrsAllocated == 0 )
|
|
newAlloc = m_initNumPoolPtrs;
|
|
char **newPtrs = (char **)mmalloc(newAlloc*4,"pptbl2");
|
|
if ( ! newPtrs ) return NULL;
|
|
gbmemcpy ( newPtrs , m_poolPtrs , m_numPoolsAllocated*4 );
|
|
mfree ( m_poolPtrs , oldSize , "pptbl2");
|
|
m_poolPtrs = newPtrs;
|
|
m_numPoolPtrsAllocated = newAlloc;
|
|
}
|
|
|
|
// if we had called setNumPtrs() or rewind() the next pool might
|
|
// already be allocated, so if that is true, use it!
|
|
int32_t poolNum = m_cursorPoolNum + 1;
|
|
|
|
// sanity check
|
|
if ( poolNum > m_numPoolsAllocated ) { char *xx=NULL;*xx=0; }
|
|
// poolNum could be < m_numPoolsAllocated IF we did a rewind at
|
|
// somepoint so that m_cursorPoolNum was decreased in setNumPtrs().
|
|
// but we need to allocate a new pool if that was not the case.
|
|
if ( poolNum == m_numPoolsAllocated ) {
|
|
// make a new pool now
|
|
char *pool = (char *)mcalloc(m_poolSize,"pool3");
|
|
if ( ! pool ) return NULL;
|
|
m_poolPtrs [ m_numPoolsAllocated ] = pool;
|
|
m_numPoolsAllocated++;
|
|
}
|
|
|
|
// update cursor now
|
|
m_cursor = m_poolPtrs[poolNum];
|
|
m_cursorEnd = m_poolPtrs[poolNum] + m_poolSize;
|
|
m_cursorPoolNum = poolNum;
|
|
// sanity check
|
|
char *pool = m_poolPtrs[m_cursorPoolNum];
|
|
char *poolEnd = pool + m_poolSize;
|
|
if ( m_cursor < pool || m_cursor >= poolEnd ) { char *xx=NULL;*xx=0;}
|
|
|
|
// and re-try
|
|
goto top;
|
|
}
|
|
|
|
PlaceMem::PlaceMem() {
	// Begin with no stack buffer and nothing allocated so that the
	// reset() call made from the destructor has nothing to free.
	m_stack                 = NULL;
	m_poolPtrs              = NULL;
	m_placePtrs             = NULL;
	// counts of what has been allocated/handed out so far
	m_numPoolsAllocated     = 0;
	m_numPoolPtrsAllocated  = 0;
	m_numPlacePtrsAllocated = 0;
	m_numPlacePtrs          = 0;
	// niceness passed to QUICKPOLL() while freeing pools in reset()
	m_niceness              = 0;
}
|
|
|
|
PlaceMem::~PlaceMem() {
	// release any pools and pointer tables we still own
	this->reset();
}
|
|
|
|
void PlaceMem::reset ( ) {
|
|
// do not core
|
|
if ( m_stack ) return;
|
|
// free everything
|
|
for ( int32_t i = 0 ; i < m_numPoolsAllocated; i++ ) {
|
|
QUICKPOLL(m_niceness);
|
|
mfree( m_poolPtrs[i] , m_poolSize, "pool3");
|
|
m_poolPtrs[i] = NULL;
|
|
}
|
|
// free ptrs
|
|
if ( m_placePtrs )
|
|
mfree ( m_placePtrs, m_numPlacePtrsAllocated * 4,"plptrs");
|
|
if ( m_poolPtrs )
|
|
mfree ( m_poolPtrs , m_numPoolPtrsAllocated * 4,"poptrs");
|
|
m_placePtrs = NULL;
|
|
m_poolPtrs = NULL;
|
|
m_numPoolPtrsAllocated = 0;
|
|
m_numPlacePtrsAllocated = 0;
|
|
m_cursor = NULL;
|
|
m_numPlacePtrs = 0;
|
|
m_numPoolsAllocated = 0;
|
|
}
|
|
|
|
// . sometimes we remove the last X Places we added above when we realized
|
|
// something was bogus
|
|
// . pass in ptr to first Place ptr to be nuked
|
|
void PlaceMem::setNumPtrs ( int32_t newNumPtrs ) {
|
|
// return if no change requested
|
|
if ( newNumPtrs == m_numPlacePtrs ) return;
|
|
// sanity check
|
|
if ( newNumPtrs >= m_numPlacePtrs ) { char *xx=NULL;*xx=0;};
|
|
if ( newNumPtrs < 0 ) { char *xx=NULL;*xx=0;};
|
|
// set it back
|
|
m_cursor = m_placePtrs[newNumPtrs];
|
|
// back up the pool until we are in it
|
|
for ( ; m_cursorPoolNum >= 0 ; m_cursorPoolNum-- ) {
|
|
char *pool = m_poolPtrs[m_cursorPoolNum];
|
|
char *poolEnd = pool + m_poolSize;
|
|
if ( m_cursor >= pool && m_cursor < poolEnd ) {
|
|
m_cursorEnd = poolEnd;
|
|
break;
|
|
}
|
|
}
|
|
// this is weird
|
|
if ( m_cursorPoolNum < 0 ) { char *xx=NULL;*xx=0; }
|
|
// reset final
|
|
m_numPlacePtrs = newNumPtrs;
|
|
}
|
|
|
|
void resetAddressTables ( ) {
|
|
if ( s_latList ) mfree ( s_latList,s_latListSize,"latlist");
|
|
s_latList = NULL;
|
|
g_timeZones.reset();
|
|
g_zips.reset();
|
|
g_cities.reset();
|
|
g_indicators.reset();
|
|
g_aliases.reset();
|
|
g_states.reset();
|
|
s_lc.reset();
|
|
s_syn.reset();
|
|
s_jobTable.reset();
|
|
s_doyTable.reset();
|
|
g_nameTable.reset();
|
|
if ( g_pbuf ) mfree ( g_pbuf, g_pbufSize , "placbuf");
|
|
}
|
|
|
|
///////////////////////////////////////////////////
|
|
//
|
|
// NEW PLACES LOGIC
|
|
//
|
|
// Use this for the new functions:
|
|
|
|
// If user enters 'berlin': (try to get in country of m_ipCrid first)
|
|
// If user enters 'berlin, germany':
|
|
// PlaceDesc *getMostPopularCity_new ( uint64_t cityHash64,char crid)
|
|
// Algorithm: scan list of cities in that country and choose the most
|
|
// populated one in that country.
|
|
|
|
// If user enters 'berlin': (next, try to get most popular in world)
|
|
// PlaceDesc *getMostPopularCity_new ( uint64_t cityHash64 , 0 = crid );
|
|
|
|
// If user enters 'berlin, <adm1>' or 'cincinnati, ohio'.
|
|
// PlaceDesc *getCityInState_new ( uint64_t cityHash64,uint64_t stateHash64);
|
|
// Algorithm: get list of all places that are states with stateHash64, and
|
|
// record list as the two-letter state codes. Then scan the cities with
|
|
// cityHash64 and see which has one of the state codes in that list.
|
|
|
|
// If user enters 'germany' or 'republic of chad'
|
|
// PlaceDesc *getCountryPlace ( int32_t a, int32_t alnumPos, Words *w );
|
|
|
|
// need this
|
|
// PlaceDesc *getCountryDescFromId ( uint8_t crid );
|
|
|
|
// For getting the timezone from a lat/lon in a foreign country:
|
|
// PlaceDesc *getNearestCity_new ( float lat , float lon );
|
|
|
|
|
|
///////////////////////////////////////////////////
|
|
|
|
|
|
// . maps a hash of a word or phrase to a PlaceDesc ptr
|
|
// . dups are allowed - one key can map to multiple PlaceDescriptors
|
|
//HashTableX g_nameTable;
|
|
|
|
bool loadPlaces ( ) {
|
|
|
|
// map 64bit name hash to a place dec ptr. allowdups= true.
|
|
// niceness = 0.
|
|
g_nameTable.set ( 8 , // 64 bit key hash
|
|
4 , // placedec ptr
|
|
0 , // no initial slots
|
|
NULL , // no initial buf
|
|
0 , // zero initial buf size
|
|
true , // allow dups?
|
|
0 , // niceness
|
|
"nametab" );
|
|
|
|
|
|
if ( g_proxy.isProxy() ) return true;
|
|
|
|
// log it
|
|
log("places: loading places.dat");
|
|
|
|
// try to load from disk
|
|
if ( g_nameTable.load ( g_hostdb.m_dir ,
|
|
"places.dat" ,
|
|
&g_pbuf ,
|
|
&g_pbufSize ) ) {
|
|
// test it out
|
|
PlaceDesc *pd = getCity2_new ( "abq", "nm", CRID_US,0);
|
|
if ( ! pd ) { char *xx=NULL;*xx=0; }
|
|
// make sure "nm" brings up new mexico
|
|
pd = getState2_new ( "nm", CRID_US,0);
|
|
if ( ! pd ) { char *xx=NULL;*xx=0; }
|
|
// scan for integrity
|
|
pd = (PlaceDesc *)g_pbuf;
|
|
//PlaceDesc *pdend = (PlaceDesc *)(g_pbuf+g_pbufSize);
|
|
for ( ; ; pd++ ) {
|
|
// stop if we enter the name buf space
|
|
if ( ((char *)pd)[0] == 'u' &&
|
|
((char *)pd)[1] == 'n' &&
|
|
! strcmp((char *)pd,"unknown name" ) )
|
|
break;
|
|
// sanity
|
|
if ( pd->m_lat < -180.0 ) { char *xx=NULL;*xx=0; }
|
|
if ( pd->m_lat > 180.0 ) { char *xx=NULL;*xx=0; }
|
|
if ( pd->m_lon < -180.0 ) { char *xx=NULL;*xx=0; }
|
|
if ( pd->m_lon > 180.0 ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// error?
|
|
log("places: failed to load places.dat: %s",mstrerror(g_errno));
|
|
|
|
// try making it
|
|
return generatePlacesFile ( );
|
|
}
|
|
|
|
// used by PageEvents.cpp's getSiteMap() to list the most popular cities
|
|
PlaceDesc *getPlaceDescBuf () {
|
|
return (PlaceDesc *)g_pbuf;
|
|
}
|
|
|
|
bool generatePlacesFile ( ) {
|
|
|
|
log("places: generating places.dat file");
|
|
|
|
char buf[10000];
|
|
|
|
|
|
//
|
|
// MAKE TIMEZONE TABLE for referencing
|
|
//
|
|
// scan allCountries.txt
|
|
char pcmd[1024];
|
|
sprintf(pcmd,"cat %s/timeZones.txt",g_hostdb.m_dir);
|
|
FILE *pf = popen ( pcmd , "r" );
|
|
if ( ! pf ) {
|
|
g_errno = errno;
|
|
return log("places: could not open timeZones.txt");
|
|
}
|
|
class TZVal {
|
|
public:
|
|
char m_tzoff;
|
|
char m_useDST;
|
|
};
|
|
HashTableX tztab;
|
|
tztab.set ( 8 , sizeof(TZVal),0,NULL,0,false,0,"tztab");
|
|
// read in the lines
|
|
while ( fgets ( buf , 10000 , pf ) ) {
|
|
// null terminate it, instead of \n
|
|
buf[gbstrlen(buf)-1]='\0';
|
|
// parse it up. timezonestr\ttzoff1|tzoffdst
|
|
char timeZoneStr[64]; // Europe/Andorra
|
|
int32_t off1;
|
|
int32_t off2; // dst
|
|
sscanf ( buf ,
|
|
"%s\t" // timezone name
|
|
"%"INT32"\t" // off1
|
|
"%"INT32"" // off2
|
|
, timeZoneStr
|
|
, &off1
|
|
, &off2
|
|
);
|
|
// make a table
|
|
int64_t tzh64 = getWordXorHash ( timeZoneStr );
|
|
// make the value
|
|
TZVal tzval;
|
|
tzval.m_tzoff = off1;
|
|
if ( off1 != off2 ) tzval.m_useDST = 1;
|
|
else tzval.m_useDST = 0;
|
|
tztab.addKey ( &tzh64 , &tzval );
|
|
}
|
|
|
|
|
|
|
|
|
|
// . map a geoId to ptr to the PlaceDesc in the g_placeBuf
|
|
// . a temporary table really...
|
|
HashTableX places;
|
|
places.set ( 4, 4, 5000000 , NULL ,0 , false, 0,"gpht");
|
|
|
|
// official names of each place
|
|
SafeBuf nameBuf;
|
|
nameBuf.reserve ( 10*1024*1024 );
|
|
// this is actually required and we check for it to avoid
|
|
// overruning our PlaceDesc when we scan those. we need this
|
|
// to set "pdend" for the PlaceDesc scan because we concatenate
|
|
// the nameBuf to the end of the placeBuf. so basically
|
|
// places.dat holds those two conjoined buffers ...
|
|
nameBuf.safePrintf("unknown name");
|
|
nameBuf.pushChar('\0');
|
|
|
|
int32_t zero = 0;
|
|
|
|
// reserve 100MB
|
|
SafeBuf placeBuf;
|
|
placeBuf.reserve ( 100*1024*1024 );
|
|
|
|
HashTableX dedup;
|
|
dedup.set ( 8,4,100000,NULL,0,false,0,"pddptb");
|
|
|
|
// this will have to be remade
|
|
sprintf(pcmd,"unlink %s/citylatlist.dat",g_hostdb.m_dir);
|
|
system(pcmd);
|
|
|
|
// scan allCountries.txt
|
|
sprintf(pcmd,"cat %s/allCountries.txt",g_hostdb.m_dir);
|
|
pf = popen ( pcmd , "r" );
|
|
if ( ! pf ) { g_errno = errno; return false; }
|
|
|
|
// limit g_nameTable from getting too big! otherwise places.dat
|
|
// is 550MB on disk and in memory!!! with this is it 200MB.
|
|
// otherwise it grows to 32M slots...
|
|
g_nameTable.m_maxSlots = 8388608; // 1<<23
|
|
|
|
// read in the lines
|
|
while ( fgets ( buf , 10000 , pf ) ) {
|
|
// null terminate it, instead of \n
|
|
buf[gbstrlen(buf)-1]='\0';
|
|
// parse it up. id|name|lat|lon|abbr
|
|
/*
|
|
int32_t geoId;
|
|
char name[512];
|
|
float lat;
|
|
float lon;
|
|
char code [16];
|
|
char countryAbbr[32];
|
|
char stateAbbr[32];
|
|
int32_t population = 0;
|
|
char timeZoneStr[64]; // Europe/Andorra
|
|
*/
|
|
// convert all tabs to \0
|
|
char *p = buf;
|
|
for ( ; *p ; p++ ) if ( *p == '\t' ) *p = '\0';
|
|
// see /geo/geonames/index.html for format description
|
|
p = buf;
|
|
int32_t geoId = atol(p); p += strlen(p) + 1;
|
|
//if ( geoId == 1850147 )
|
|
// log("hey");
|
|
char *officialName = p; p += strlen(p) + 1; // official name
|
|
char *asciiName = p; p += strlen(p) + 1; // asciname
|
|
char *altNames = p; p += strlen(p)+1; // altnames
|
|
float lat;
|
|
// sometimes allCountries.txt leaves out "altNames" field!
|
|
// so detect if this field is a latitude or not...
|
|
bool hadAlpha = false;
|
|
bool hadDigit = false;
|
|
bool hadPeriod = false;
|
|
char *tmp = altNames;
|
|
for ( ; *tmp ; tmp++ ) {
|
|
if ( is_alpha_a(*tmp) ) hadAlpha = true;
|
|
if ( is_digit (*tmp) ) hadDigit = true;
|
|
if ( *tmp == '.' ) hadPeriod = true;
|
|
}
|
|
// need a digit and no alphas to be a latitude
|
|
bool isLat = false;
|
|
if ( hadDigit && ! hadAlpha && hadPeriod ) isLat = true;
|
|
if ( isLat ) {
|
|
lat = atof ( altNames );
|
|
}
|
|
else {
|
|
lat = atof(p);
|
|
p += strlen(p) + 1;
|
|
}
|
|
float lon = atof ( p ); p += strlen(p) + 1;
|
|
p += strlen(p) + 1; // code class
|
|
char *code = p; p += strlen(p)+1; // code type
|
|
char *countryAbbr = p; p += strlen(p)+1;
|
|
p += strlen(p)+1; // altCountry
|
|
char *stateAbbr = p; p += strlen(p)+1;
|
|
p += strlen(p)+1; // adm2
|
|
p += strlen(p)+1; // adm3
|
|
p += strlen(p)+1; // adm4
|
|
int32_t population = atol(p); p += strlen(p)+1;
|
|
p += strlen(p)+1; // elevation
|
|
p += strlen(p)+1; // avg elevation
|
|
char *timeZoneStr = p; p += strlen(p)+1;
|
|
p += strlen(p)+1; // moddate
|
|
|
|
// debug point
|
|
//if ( geoId == 5381396 )
|
|
// log("hey");
|
|
|
|
// skip if no timezone for now
|
|
if ( ! timeZoneStr[0] ) {
|
|
log("places: no timezone for geoid=%"INT32" name=%s",
|
|
geoId,officialName);
|
|
continue;
|
|
}
|
|
|
|
// reserve space
|
|
//placeBuf.reserve ( 1024 );
|
|
// not allowed to grow since we use dedup table now
|
|
if ( placeBuf.getAvail() < (int32_t)sizeof(PlaceDesc) ) {
|
|
char *xx=NULL;*xx=0;}
|
|
|
|
// make a new country desc
|
|
PlaceDesc *pd = (PlaceDesc *)placeBuf.getBuf();
|
|
//
|
|
// see http://www.geonames.org/export/codes.html
|
|
//
|
|
|
|
// exceptions:
|
|
// "122 Mile House" ...
|
|
if ( ! strncmp( code,"PPLL",4)) continue;
|
|
// a basic city
|
|
if ( ! strncmp( code,"PPL",3)) pd->m_flags = PDF_CITY;
|
|
// locality
|
|
else if ( ! strcmp ( code ,"LCTY")) pd->m_flags = PDF_CITY;
|
|
// . town of, township, town of north hempstead
|
|
// . crap! this gets a different san jose!
|
|
// . avoid "City of Cincinnati" etc.. crap
|
|
// . BUT allow town of north hempstead through (5129081)
|
|
else if ( ! strcmp ( code ,"ADMD") && geoId == 5129081 )
|
|
pd->m_flags = PDF_CITY;
|
|
// independent political entity
|
|
else if ( ! strcmp ( code,"PCLIX")) pd->m_flags = PDF_CITY;
|
|
// another city i guess
|
|
else if ( ! strcmp ( code , "P" ) ) pd->m_flags = PDF_CITY;
|
|
// states
|
|
else if ( ! strcmp ( code ,"ADM1")) pd->m_flags = PDF_STATE;
|
|
// countries
|
|
else if ( ! strcmp ( code ,"PCLI")) pd->m_flags = PDF_COUNTRY;
|
|
// otherwise, skip it!
|
|
else continue;
|
|
|
|
// . sanity
|
|
// . these were messing up our raw lat/lon processing
|
|
// in searchinput.cpp because we thought that a direct
|
|
// lat/lon in the wherebox was a city name because there was
|
|
// a city name that was "35", which was our latitude entered!
|
|
if ( pd->m_flags == PDF_CITY && is_digit(officialName[0]) ){
|
|
log("places: bad city name: %s",officialName);
|
|
continue;
|
|
}
|
|
|
|
// a bunch of cities do not have states...
|
|
//if ( pd->m_flags != PDF_COUNTRY &&
|
|
// ( ! stateAbbr[0] || ! stateAbbr[0] ) ) {
|
|
// log("hey %s",officialName);
|
|
// continue;
|
|
//}
|
|
|
|
// get country id
|
|
pd->m_crid = getCountryId ( countryAbbr );
|
|
// geoid for looking up in alternateNames.txt
|
|
//pd->m_geoId = geoId;
|
|
// lat and lon
|
|
pd->m_lat = lat;
|
|
pd->m_lon = lon;
|
|
pd->m_population = population;
|
|
// skip over it (not allowed to grow anymore!)
|
|
//placeBuf.advance ( sizeof(PlaceDesc) );
|
|
placeBuf.m_length += (int32_t)sizeof(PlaceDesc);
|
|
// . point to that. we'll store <adm1>,<name> in there now
|
|
// . we need to somehow append alternate names later
|
|
//pd->m_data = placeBuf.getBuf();
|
|
// store adm1 in m_data[]
|
|
pd->m_adm1[0] = to_lower_a(stateAbbr[0]);
|
|
pd->m_adm1[1] = to_lower_a(stateAbbr[1]);
|
|
// if greece... use last two
|
|
if ( to_lower_a(countryAbbr[0]) == 'g' &&
|
|
to_lower_a(countryAbbr[1]) == 'r' &&
|
|
pd->m_adm1[0] == 'e' &&
|
|
pd->m_adm1[1] == 's' &&
|
|
is_digit(stateAbbr[4]) &&
|
|
is_digit(stateAbbr[5]) ) {
|
|
// store the last two letter's for greece
|
|
pd->m_adm1[0] = to_lower_a(stateAbbr[4]);
|
|
pd->m_adm1[1] = to_lower_a(stateAbbr[5]);
|
|
}
|
|
// hash timezone string
|
|
uint64_t tzh64 = getWordXorHash ( timeZoneStr );
|
|
//look it up in our table made from /geo/geonames/timeZones.txt
|
|
TZVal *tzv = (TZVal *)tztab.getValue ( &tzh64 );
|
|
if ( ! tzv ) { char *xx=NULL;*xx=0 ;}
|
|
// from -12 to + 12 i guess
|
|
pd->m_timeZoneOffset = tzv->m_tzoff;
|
|
// now the daylightsavings time flag
|
|
if ( tzv->m_useDST ) pd->m_flags |= PDF_USE_DST;
|
|
// . add to table using the name as the key
|
|
// . i think this table is just for generation since
|
|
// we'll use the g_namesTable to map place names to
|
|
// the PlaceDesc.
|
|
places.addKey(&geoId,&pd);
|
|
// store OFFSETS in nametable
|
|
int32_t placeDescOffset = (char *)pd - placeBuf.getBufStart();
|
|
|
|
// we need to add the official name here because it's not
|
|
// always in alternateNames.txt...
|
|
uint64_t nh64a = getWordXorHash ( officialName );
|
|
uint64_t dedupKeya = nh64a ^ (uint32_t)placeDescOffset;
|
|
// skip if in there
|
|
if ( ! dedup.isInTable(&dedupKeya) ) {
|
|
// make this name's hash point to its PlaceDesc
|
|
if ( ! g_nameTable.addKey ( &nh64a, &placeDescOffset))
|
|
return false;
|
|
// do not add dup combos
|
|
dedup.addKey ( &dedupKeya , &zero );
|
|
}
|
|
|
|
// hmmm... we need nh64 to be ascii for adding to nameBuf...
|
|
uint64_t exactHash64 = hash64n ( officialName );
|
|
// also make this name's hash point to the
|
|
// name itself so we can convert a lat/lon into
|
|
// a place name, based on getNearestCity_new()
|
|
if ( ! dedup.isInTable ( &exactHash64 ) ) {
|
|
// nameBuf
|
|
int32_t nameOffset = nameBuf.length();
|
|
// store it
|
|
int32_t olen = gbstrlen(officialName);
|
|
nameBuf.safeMemcpy ( officialName , olen );
|
|
nameBuf.pushChar('\0');
|
|
// store offset
|
|
pd->m_officialNameOffset = nameOffset;
|
|
// do not repeat!
|
|
dedup.addKey ( &exactHash64 , &nameOffset );
|
|
}
|
|
else {
|
|
// i guess we already added this name before so
|
|
// point to where we added it
|
|
int32_t off = *(int32_t *)dedup.getValue ( &exactHash64 );
|
|
// use that then
|
|
pd->m_officialNameOffset = off;
|
|
}
|
|
|
|
//
|
|
// also add the ascii too, it seems a lot of times that
|
|
// is not given in the alternateNames.txt file either!!!!
|
|
//
|
|
uint64_t nh64b = getWordXorHash ( asciiName );
|
|
uint64_t dedupKeyb = nh64b ^ (uint32_t)placeDescOffset;
|
|
// skip if in there
|
|
if ( ! dedup.isInTable(&dedupKeyb) ) {
|
|
// make this name's hash point to its PlaceDesc
|
|
if ( ! g_nameTable.addKey ( &nh64b, &placeDescOffset))
|
|
return false;
|
|
// do not add dup combos
|
|
dedup.addKey ( &dedupKeyb , &zero );
|
|
}
|
|
|
|
|
|
// skip if not state
|
|
if ( ! ( pd->m_flags & PDF_STATE) ) continue;
|
|
// skip if is numeric for now... strange...
|
|
//if ( is_digit(stateAbbr[0]) ) continue;
|
|
if ( ! stateAbbr[0] ) continue;
|
|
// if we are a state, add our abbreviation here as well!
|
|
// does this convert to lowercase? yes... it should
|
|
uint64_t nh64c = getWordXorHash ( stateAbbr );
|
|
// make another dedupkey
|
|
uint64_t dedupKeyc = nh64c ^ (uint32_t)placeDescOffset;
|
|
// check that as well
|
|
if ( dedup.isInTable(&dedupKeyc) ) continue;
|
|
if ( ! g_nameTable.addKey ( &nh64c , &placeDescOffset ) )
|
|
return false;
|
|
// do not add dup combos
|
|
dedup.addKey ( &dedupKeyc , &zero );
|
|
}
|
|
// close the pipe
|
|
pclose(pf);
|
|
|
|
// . now scan in the alternateNames.txt
|
|
// . add to the hashtablex g_nameTable
|
|
// . key is word xor hash of the name
|
|
// . value is ptr to the PlaceDesc in placeBuf
|
|
// . allow dups since a single name can point to multiple unique places
|
|
sprintf(pcmd,"cat %s/alternateNames.txt",g_hostdb.m_dir);
|
|
pf = popen ( pcmd , "r" );
|
|
if ( ! pf ) { g_errno = errno; return false; }
|
|
|
|
// read in the lines
|
|
while ( fgets ( buf , 10000 , pf ) ) {
|
|
// null terminate it, instead of \n
|
|
buf[gbstrlen(buf)-1]='\0';
|
|
// convert all tabs to \0
|
|
char *p = buf;
|
|
for ( ; *p ; p++ ) if ( *p == '\t' ) *p = '\0';
|
|
// parse it up. id|name|lat|lon|abbr
|
|
p = buf;
|
|
p += strlen(p) + 1; // some number
|
|
int32_t geoId = atol(p); p += strlen(p) + 1;
|
|
p += strlen(p) + 1; // langIdStr
|
|
char *altName = p; p += strlen(p) + 1;
|
|
p += strlen(p) + 1; // is preferred name
|
|
p += strlen(p) + 1; // is int16_t ?name
|
|
// now hash up that name
|
|
uint64_t nh64d = getWordXorHash ( altName );
|
|
// find the place desc for it
|
|
PlaceDesc **ppd = (PlaceDesc **)places.getValue ( &geoId );
|
|
// this won't be there if its not a city,ctry,state, etc.
|
|
// or timezone was missing above
|
|
if ( ! ppd ) continue;
|
|
// cast it otherwise
|
|
PlaceDesc *pd = *ppd;
|
|
// store OFFSETS in nametable
|
|
int32_t placeDescOffset = (char *)pd - placeBuf.getBufStart();
|
|
// do not add dup combos
|
|
uint64_t dedupKeyd = nh64d ^ (uint32_t)placeDescOffset;
|
|
if ( dedup.isInTable ( &dedupKeyd ) ) continue;
|
|
// use that
|
|
if ( ! g_nameTable.addKey ( &nh64d , &placeDescOffset ) )
|
|
return false;
|
|
// do not add dup combos
|
|
dedup.addKey ( &dedupKeyd , &zero ) ;
|
|
}
|
|
pclose(pf);
|
|
|
|
// set this temporarily so getState_new() etc. works for now
|
|
g_pbuf = placeBuf.getBufStart();
|
|
|
|
// . add in state aliases for states in the US
|
|
// . "wash" = "washington" "ore = oregeon" etc.
|
|
int32_t n = (int32_t)sizeof(s_states)/ sizeof(StateDesc);
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
// get it
|
|
StateDesc *sd = &s_states[i];
|
|
// skip if none
|
|
if ( ! sd->m_name2 ) continue;
|
|
// get original name
|
|
uint64_t nh64 = getWordXorHash ( sd->m_name1 );
|
|
// get the PlaceDesc. this will scan all the matches and
|
|
// get the one that is a state in the US
|
|
PlaceDesc *pd = getState_new ( nh64 , CRID_US , 0 );
|
|
// must be there
|
|
if ( ! pd ) { char *xx=NULL;*xx=0; }
|
|
// make key (d.c. colo. n.m.)
|
|
uint64_t anh64 = getWordXorHash ( sd->m_name2 );
|
|
// store OFFSETS in nametable
|
|
int32_t offset = (char *)pd - placeBuf.getBufStart();
|
|
// add the alias
|
|
if ( ! g_nameTable.addKey ( &anh64 , &offset ) ) return false;
|
|
}
|
|
|
|
// add our CITY aliases i.e. "abq" or "nyc" for cities in the US
|
|
n = (int32_t)sizeof(s_cityList)/ sizeof(AliasDesc);
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
// get it
|
|
AliasDesc *ad = &s_cityList[i];
|
|
// get the PlaceDesc. this will scan all the matches and
|
|
// get the one that is a state in the US
|
|
PlaceDesc *pd = getCity2_new(ad->m_s2, ad->m_adm1 , CRID_US,0);
|
|
// must be there
|
|
if ( ! pd ) { char *xx=NULL;*xx=0; }
|
|
// make key (d.c. colo. n.m.)
|
|
uint64_t ach64 = getWordXorHash ( ad->m_s1 );
|
|
// store OFFSETS in nametable
|
|
int32_t offset = (char *)pd - placeBuf.getBufStart();
|
|
// add the alias
|
|
if ( ! g_nameTable.addKey ( &ach64 , &offset ) ) return false;
|
|
}
|
|
|
|
// size of placeBuf
|
|
int32_t placeBufLength = placeBuf.length();
|
|
// concatenate nameBuf to placeBuf for saving to disk
|
|
if ( ! placeBuf.cat ( nameBuf ) ) return false;
|
|
// adjust all PlaceDesc::m_officialNameOffset vars to compensate for
|
|
// this concatenation
|
|
PlaceDesc *pd = (PlaceDesc *)placeBuf.getBufStart();
|
|
PlaceDesc *pdend = (PlaceDesc *)(((char *)pd) + placeBufLength);
|
|
for ( ; pd < pdend ; pd++ )
|
|
pd->m_officialNameOffset += placeBufLength;
|
|
|
|
// test it out
|
|
PlaceDesc *pd2 = getCity2_new ( "abq", "nm", CRID_US,0);
|
|
if ( ! pd2 ) { char *xx=NULL;*xx=0; }
|
|
|
|
int64_t ph64 = getWordXorHash ( "Tokyo" );
|
|
pd2 = getMostPopularPlace_new ( ph64 ,CRID_ANY ,PDF_CITY,0 );
|
|
if ( ! pd2 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// pasadena texas is more popular than california!
|
|
ph64 = getWordXorHash ( "Pasadena" );
|
|
pd2 = getMostPopularPlace_new ( ph64 ,CRID_US ,PDF_CITY,0 );
|
|
//if ( pd2->m_population != 144618 ) { char *xx=NULL;*xx=0; }
|
|
if ( ! pd2 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// . now the g_nameTable points into the buffer of PlaceDesc, save it
|
|
// . HashTableX can save the buffer too now!
|
|
if ( ! g_nameTable.save ( g_hostdb.m_dir ,
|
|
"places.dat" ,
|
|
placeBuf.getBufStart() ,
|
|
placeBuf.length() ) )
|
|
return false;
|
|
|
|
// ok, try loading now
|
|
placeBuf.purge();
|
|
g_nameTable.reset();
|
|
|
|
log("places: loading generated table places.dat from disk");
|
|
|
|
return g_nameTable.load ( g_hostdb.m_dir , "places.dat" ,
|
|
&g_pbuf ,
|
|
&g_pbufSize );
|
|
}
|
|
|
|
// get the state in this country
|
|
PlaceDesc *getState_new ( uint64_t pd64 , uint8_t crid , int32_t niceness ) {
|
|
int32_t slot = g_nameTable.getSlot ( &pd64 );
|
|
// scan the slots
|
|
for ( ; slot >= 0 ; slot = g_nameTable.getNextSlot(slot,&pd64) ) {
|
|
// breathe
|
|
QUICKPOLL(niceness);
|
|
// get the placedesc
|
|
int32_t offset = *(int32_t *)g_nameTable.getValueFromSlot(slot);
|
|
PlaceDesc *pd = (PlaceDesc *)(g_pbuf + offset);
|
|
// skip if not a state
|
|
if ( ! (pd->m_flags & PDF_STATE ) ) continue;
|
|
// skip if not right country
|
|
if ( pd->m_crid != crid ) continue;
|
|
// we got it!
|
|
return pd;
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
// get the state in this country
|
|
PlaceDesc *getState2_new ( char *state , uint8_t crid , int32_t niceness ) {
|
|
uint64_t sh64 = getWordXorHash ( state );
|
|
return getState_new ( sh64, crid,niceness);
|
|
}
|
|
|
|
// . look up the city whose name hashes to "ch64" in the state given by the
//   two lower-case letters in "stateAbbr"
// . "crid" restricts the country unless it is CRID_ANY
// . returns the first matching PDF_CITY entry or NULL
// . cores if either of the first two chars of stateAbbr is not lower case
PlaceDesc *getCity_new ( uint64_t  ch64      ,
			 char     *stateAbbr ,
			 uint8_t   crid      ,
			 int32_t   niceness  ) {

	// sanity: m_adm1 is stored lower-cased so the abbr must be too
	for ( int32_t c = 0 ; c < 2 ; c++ ) {
		if ( ! is_lower_a(stateAbbr[c]) ) { char *xx=NULL;*xx=0; }
	}

	// walk every entry stored under this key
	int32_t slot = g_nameTable.getSlot ( &ch64 );
	while ( slot >= 0 ) {
		// breathe
		QUICKPOLL(niceness);
		// the table stores the PlaceDesc's offset into g_pbuf
		int32_t off = *(int32_t *)g_nameTable.getValueFromSlot(slot);
		PlaceDesc *cand = (PlaceDesc *)(g_pbuf + off);
		// must be a city, in the right country and the right state
		bool isCity    = ( cand->m_flags & PDF_CITY );
		bool countryOk = ( crid == CRID_ANY || cand->m_crid == crid );
		bool stateOk   = ( stateAbbr[0] == cand->m_adm1[0] &&
				   stateAbbr[1] == cand->m_adm1[1] );
		// we got it!
		if ( isCity && countryOk && stateOk ) return cand;
		// next entry with the same key
		slot = g_nameTable.getNextSlot(slot,&ch64);
	}
	return NULL;
}
|
|
|
|
// same as getCity_new() but takes the city name as a string and hashes it
// with getWordXorHash() first
PlaceDesc *getCity2_new ( char    *city      ,
			  char    *stateAbbr ,
			  uint8_t  crid      ,
			  int32_t  niceness  ) {
	uint64_t cityHash64 = getWordXorHash ( city );
	PlaceDesc *found = getCity_new ( cityHash64 , stateAbbr ,
					 crid , niceness );
	return found;
}
|
|
|
|
// . look up a city by name hash "ch64" that lies in a state whose name (or
//   alias/abbreviation) hashes to "stateHash64", e.g. 'cincinnati, ohio'
// . for every PDF_CITY entry under ch64 we scan the PDF_STATE entries under
//   stateHash64 and accept the city if one of those states is in the same
//   country and has the same two-char m_adm1 code
// . "crid" restricts the country unless it is CRID_ANY
// . returns the first such city or NULL
PlaceDesc *getCity3_new ( uint64_t ch64 ,
			  uint64_t stateHash64,
			  uint8_t  crid ,
			  int32_t  niceness ) {

	int32_t slot1 = g_nameTable.getSlot ( &ch64 );
	// scan the city candidates
	for ( ; slot1 >= 0 ; slot1 = g_nameTable.getNextSlot(slot1,&ch64) ) {
		// breathe
		QUICKPOLL(niceness);
		// get the placedesc
		int32_t offset1;
		offset1 = *(int32_t *)g_nameTable.getValueFromSlot(slot1);
		PlaceDesc *pd1 = (PlaceDesc *)(g_pbuf + offset1);
		// skip if not a city
		if ( ! (pd1->m_flags & PDF_CITY ) ) continue;
		// skip if not right country
		if ( crid != CRID_ANY && pd1->m_crid != crid ) continue;
		// see if we got a state that matches "stateHash64" and
		// "pd1->m_adm1"
		int32_t slot2 = g_nameTable.getSlot ( &stateHash64 );
		for ( ; slot2 >= 0 ;
		      slot2=g_nameTable.getNextSlot(slot2,&stateHash64)) {
			// breathe
			QUICKPOLL(niceness);
			// get the placedesc
			int32_t offset2;
			offset2 = *(int32_t *)
				g_nameTable.getValueFromSlot(slot2);
			PlaceDesc *pd2 = (PlaceDesc *)(g_pbuf + offset2);
			// . skip if not a state
			// . this used to test PDF_CITY, so the state
			//   records were the ones being skipped
			if ( ! (pd2->m_flags & PDF_STATE ) ) continue;
			// skip if not right country
			if ( crid != CRID_ANY && pd2->m_crid != crid)continue;
			// adm1 codes only mean something within one country,
			// which matters when crid is CRID_ANY
			if ( pd2->m_crid != pd1->m_crid ) continue;
			// matching abbr?
			if ( pd2->m_adm1[0] != pd1->m_adm1[0] ) continue;
			if ( pd2->m_adm1[1] != pd1->m_adm1[1] ) continue;
			// it's a match!
			return pd1;
		}
	}
	return NULL;
}
|
|
|
|
|
|
bool getLongestPlaceName_new ( int32_t a,
|
|
int32_t alnumPos,
|
|
Words *words,
|
|
// must match! PDF_CITY|STATE|COUNTRY
|
|
uint8_t placeType,
|
|
uint8_t crid,
|
|
char *stateAbbr,
|
|
uint64_t *placeHash64,
|
|
int32_t *placeAlnumA,
|
|
int32_t *placeAlnumB,
|
|
int32_t *placeA,
|
|
int32_t *placeB ,
|
|
// set to most popular match
|
|
PlaceDesc **pdp ) {
|
|
// assume none
|
|
if ( placeHash64 ) *placeHash64 = 0LL;
|
|
// init hash to zero
|
|
int64_t h = 0LL;
|
|
// max count
|
|
int32_t count = 0;
|
|
// record start
|
|
int32_t startAlnumPos = alnumPos;
|
|
// fix this
|
|
alnumPos--;
|
|
// for some filtering
|
|
static bool s_flag = false;
|
|
static int64_t h_university;
|
|
static int64_t h_of;
|
|
if ( ! s_flag ) {
|
|
s_flag = true;
|
|
h_university = hash64n("university");
|
|
h_of = hash64n("of");
|
|
}
|
|
// int16_tcut
|
|
int32_t nw = words->m_numWords;
|
|
int32_t wcount = 0;
|
|
// loop over words in [a,b)
|
|
for ( int32_t k = a ; k < nw ; k++ ) {
|
|
// or 15 words is good enough too!
|
|
if ( ++wcount >= 20 ) break;
|
|
// skip if not alnum
|
|
if ( ! words->isAlnum(k) ) continue;
|
|
// count it
|
|
alnumPos++;
|
|
// only up to 4 words in a place name
|
|
if ( ++count >= 5 ) break;
|
|
// get the hash of potential place name
|
|
int64_t wid = words->m_wordIds[k];
|
|
// int16_tcut
|
|
int32_t wlen = words->m_wordLens[k];
|
|
char *wptr = words->m_words[k];
|
|
// if it ended in apostrophe s then fix that
|
|
if ( wlen > 2 &&
|
|
wptr[wlen-2]=='\'' &&
|
|
to_lower_a(wptr[wlen-1]) == 's' )
|
|
// hash the word without the 's
|
|
wid = hash64Lower_utf8(wptr,wlen-2);
|
|
// mix it up
|
|
h <<= 1;
|
|
// hash it into our ongoing hash
|
|
h ^= wid;
|
|
// ignore "University" if "of" follows
|
|
if ( h == h_university &&
|
|
k + 2 < nw &&
|
|
words->m_wordIds[k+2] == h_of )
|
|
continue;
|
|
// get it. just get the most popular that matches
|
|
PlaceDesc *pd = getPlaceDesc ( h,placeType,crid,stateAbbr,0);
|
|
if ( ! pd ) continue;
|
|
// check for "county" (santa fe county is not a city name)
|
|
if ( k + 2 < nw && words->m_wordIds[k+2] == h_county ) {
|
|
// nuke it
|
|
if ( placeHash64 ) *placeHash64 = 0LL;
|
|
return true;
|
|
}
|
|
// int16_tcuts
|
|
//char **wptrs = words->getWords();
|
|
//int32_t *wlens = words->getWordLens();
|
|
// set the place
|
|
*placeA = a;
|
|
*placeB = k+1;
|
|
*placeAlnumA = startAlnumPos;
|
|
*placeAlnumB = alnumPos+1;
|
|
if ( placeHash64 ) *placeHash64 = h;
|
|
if ( pdp ) *pdp = pd;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// . placeType is like PDF_CITY or PDF_STATE or PDF_COUNTRY
|
|
// . return most popular i guess
|
|
PlaceDesc *getPlaceDesc ( uint64_t placeHash64 ,
|
|
uint8_t placeType ,
|
|
uint8_t crid,
|
|
char *stateAbbr,
|
|
int32_t niceness ) {
|
|
int32_t maxPop = -1;
|
|
PlaceDesc *best = NULL;
|
|
int32_t slot = g_nameTable.getSlot ( &placeHash64 );
|
|
// scan the slots
|
|
for ( ; slot >= 0 ; slot = g_nameTable.getNextSlot(slot,&placeHash64)){
|
|
// breathe
|
|
QUICKPOLL(niceness);
|
|
// get the placedesc
|
|
int32_t offset = *(int32_t *)g_nameTable.getValueFromSlot(slot);
|
|
PlaceDesc *pd = (PlaceDesc *)(g_pbuf + offset);
|
|
// skip if not the right type of place
|
|
if ( ! (pd->m_flags & placeType ) ) continue;
|
|
// crid too match?
|
|
if ( crid != CRID_ANY && pd->m_crid != crid ) continue;
|
|
// state match?
|
|
if ( stateAbbr && pd->m_adm1[0] != stateAbbr[0] ) continue;
|
|
if ( stateAbbr && pd->m_adm1[1] != stateAbbr[1] ) continue;
|
|
// get pop
|
|
if ( pd->m_population <= maxPop ) continue;
|
|
// otherwise, a new max
|
|
maxPop = pd->m_population;
|
|
// save it
|
|
best = pd;
|
|
}
|
|
return best;
|
|
}
|
|
|
|
bool getZip_new ( int32_t a ,
|
|
int32_t alnumPos ,
|
|
Words *words ,
|
|
uint64_t *zipHash64 ,
|
|
uint64_t *zipCityHash64 ,
|
|
uint64_t *zipStateHash64 ,
|
|
int32_t *zipAlnumA,
|
|
int32_t *zipAlnumB,
|
|
int32_t *zipA,
|
|
int32_t *zipB,
|
|
float *zipLat,
|
|
float *zipLon ) {
|
|
// assume none
|
|
if ( zipHash64 ) *zipHash64 = 0LL;
|
|
// must be a number
|
|
if ( ! is_digit(words->m_words[a][0]) ) return true;
|
|
// make hash
|
|
int64_t h = 0 ^ words->m_wordIds[a];
|
|
// check for zip code
|
|
int32_t slot = g_zips.getSlot(&h);
|
|
// skip if not
|
|
if ( slot < 0 ) return true;
|
|
// get the place
|
|
ZipDesc *zd =(ZipDesc *)g_zips.getValueFromSlot(slot);
|
|
// set state hash
|
|
if ( zipStateHash64 ) *zipStateHash64 = hash64(zd->m_adm1,2,0LL);
|
|
// and city hash
|
|
if ( zipCityHash64 ) *zipCityHash64 = zd->m_cityHash;
|
|
*zipA = a;
|
|
*zipB = a+1;
|
|
*zipAlnumA = alnumPos;
|
|
*zipAlnumB = alnumPos+1;
|
|
if ( zipHash64 ) *zipHash64 = h;
|
|
*zipLat = zd->m_latitude;
|
|
*zipLon = zd->m_longitude;
|
|
return true;
|
|
}
|
|
|
|
// . return the most populous place stored under "placeHash64" in
//   g_nameTable whose m_flags share a bit with "placeType", or NULL
// . crid restricts the country unless it is CRID_ANY
// . on a population tie the entry seen first wins
PlaceDesc *getMostPopularPlace_new ( int64_t placeHash64,
				     uint8_t crid ,
				     uint8_t placeType,
				     int32_t niceness ) {
	PlaceDesc *winner = NULL;
	int32_t    topPop = -1;
	// walk every entry stored under this key
	int32_t slot = g_nameTable.getSlot ( &placeHash64 );
	while ( slot >= 0 ) {
		// breathe
		QUICKPOLL(niceness);
		// the table stores the PlaceDesc's offset into g_pbuf
		int32_t off = *(int32_t *)g_nameTable.getValueFromSlot(slot);
		PlaceDesc *cand = (PlaceDesc *)(g_pbuf + off);
		// line up the next slot before filtering
		slot = g_nameTable.getNextSlot(slot,&placeHash64);
		// right type of place and right country?
		bool typeOk    = ( cand->m_flags & placeType );
		bool countryOk = ( crid == CRID_ANY || cand->m_crid == crid );
		if ( ! typeOk || ! countryOk ) continue;
		// only a strictly bigger population takes over
		if ( cand->m_population > topPop ) {
			topPop = cand->m_population;
			winner = cand;
		}
	}
	return winner;
}
|
|
|
|
//
|
|
// . the new getNearestCity_new() function
|
|
// . copied from getNearestCity() function above
|
|
//
|
|
|
|
//static int32_t *s_latList2 = NULL;
|
|
//static int32_t s_latListSize2 = 0;
|
|
//static int32_t s_ni2 = 0;
|
|
// . getNearestCity_new() reads this as an array of int32_t values, each the
//   offset of a city's PlaceDesc in g_pbuf
// . it b-steps over the array comparing m_lat, so the entries are presumably
//   ordered by latitude -- where the list gets filled is not shown here
static SafeBuf s_cityLatList;
|
|
|
|
|
|
// . we need a list of the city ids sorted by lat, and a list sorted by lon
|
|
// . then we do b-stepping on each list
|
|
// . bstep down to a 20 mile by 20 mile box
|
|
// . then intersect using a hashtable
|
|
// . if empty, then increase to 30 by 30 mile box, etc.
|
|
// . there are 123k US cities in cities.dat
|
|
// . these 2 lists should be about 2MB then
|
|
// . then lookup cityid in g_timezones to get timezone
|
|
PlaceDesc *getNearestCity_new ( float lat ,
|
|
float lon ,
|
|
int32_t niceness ,
|
|
float *distInMilesSquared ) {
|
|
|
|
// . radius is 10 miles, put miles into degrees
|
|
// . when it was 5 we did not get "Santa Fe" for an event, it
|
|
// thought it was in "Agua Fria"
|
|
float radius = 10.0 / 69.0;
|
|
PlaceDesc *pd = NULL;
|
|
// how many cities we got?
|
|
int32_t ni = s_cityLatList.length() / 4;
|
|
int32_t *latList = (int32_t *)s_cityLatList.getBufStart();
|
|
|
|
tryagain:
|
|
|
|
int32_t step = ni / 2;
|
|
// get lat boundaries using bstep
|
|
int32_t start = ni / 2;
|
|
// do the bstepping
|
|
for ( ; ; ) {
|
|
// get that city
|
|
int32_t cityOffset = latList[start];
|
|
// get PlaceDesc
|
|
pd = (PlaceDesc *)(g_pbuf + cityOffset);
|
|
// increase resolution for next round
|
|
step /= 2;
|
|
// step it down?
|
|
if ( lat < pd->m_lat ) start -= step;
|
|
// use " - radius" here as well to avoid infinite loop?
|
|
else if ( lat > pd->m_lat ) start += step;
|
|
// ok, we are in range, done
|
|
else break;
|
|
// avoid breaching!
|
|
if ( start < 0 ) { start = 0 ; break; }
|
|
if ( start >= ni ) { start = ni-1; break; }
|
|
// stop if we hit steps of 0
|
|
if ( step <= 0 ) break;
|
|
}
|
|
|
|
int32_t lata = start;
|
|
int32_t latb = start;
|
|
int32_t count = 0;
|
|
// TODO: do b-step on these too, takes like 3500 iterations for
|
|
// both of these loops
|
|
// adjust lata/latb until just out of range
|
|
for ( ; lata > 0 ; lata-- ) {
|
|
int32_t cityOffset = latList[lata];
|
|
pd = (PlaceDesc *)(g_pbuf + cityOffset);
|
|
if ( pd->m_lat < lat - radius ) break;
|
|
count++;
|
|
}
|
|
for ( ; latb < ni ; latb++ ) {
|
|
int32_t cityOffset = latList[latb];
|
|
pd = (PlaceDesc *)(g_pbuf + cityOffset);
|
|
if ( pd->m_lat > lat + radius ) break;
|
|
count++;
|
|
}
|
|
|
|
//
|
|
// first do a loop to get the absolutely closest place
|
|
// to this lat/lon regardless of population
|
|
//
|
|
float min1 = -1.0;
|
|
PlaceDesc *minpd1 = NULL;
|
|
// add in the lat cities
|
|
for ( int32_t i = lata ; i <= latb ; i++ ) {
|
|
// break?
|
|
if ( i >= ni ) break;
|
|
// breathe
|
|
QUICKPOLL(niceness);
|
|
// get that city
|
|
int32_t cityOffset = latList[i];
|
|
pd = (PlaceDesc *)(g_pbuf + cityOffset);
|
|
// sanity check
|
|
if ( cityOffset > g_pbufSize ) { char *xx=NULL;*xx=0; }
|
|
if ( cityOffset < 0 ) { char *xx=NULL;*xx=0; }
|
|
// just compute distance
|
|
float latDiff = pd->m_lat - lat;
|
|
float lonDiff = pd->m_lon - lon;
|
|
// add up
|
|
float dist = latDiff*latDiff + lonDiff*lonDiff;
|
|
// min?
|
|
if ( dist > min1 && minpd1 ) continue;
|
|
// set it
|
|
min1 = dist;
|
|
minpd1 = pd;
|
|
}
|
|
|
|
|
|
//
|
|
// then do a second loop to find the closest place, taking population
|
|
// into account, but also keeping the state/country the same
|
|
// as in "minpd1"
|
|
//
|
|
float min2 = -1.0;
|
|
PlaceDesc *minpd2 = NULL;
|
|
// add in the lat cities
|
|
for ( int32_t i = lata ; i <= latb ; i++ ) {
|
|
// break?
|
|
if ( i >= ni ) break;
|
|
// breathe
|
|
QUICKPOLL(niceness);
|
|
// get that city
|
|
int32_t cityOffset = latList[i];
|
|
pd = (PlaceDesc *)(g_pbuf + cityOffset);
|
|
// just compute distance
|
|
float latDiff = pd->m_lat - lat;
|
|
float lonDiff = pd->m_lon - lon;
|
|
// convert into miles
|
|
latDiff *= 69;
|
|
lonDiff *= 69;
|
|
// must match that of minpd1's state and country
|
|
if ( pd->m_adm1[0] != minpd1->m_adm1[0] ) continue;
|
|
if ( pd->m_adm1[1] != minpd1->m_adm1[1] ) continue;
|
|
if ( pd->m_crid != minpd1->m_crid ) continue;
|
|
// but consider the radius of the city to be up to 10 miles
|
|
// for a population of 1M people...
|
|
// one degree is 69.0 miles
|
|
float pop = pd->m_population;
|
|
// restrict to 500k people
|
|
if ( pop > 500000.0 ) pop = 500000.0;
|
|
// compute the city radius, can be up to 33*33 miles
|
|
float cityRadiusSquared = (1000.0 * pop) / 500000.0;
|
|
// square that
|
|
//float cityRadiusSquared = cityRadius * cityRadius;
|
|
// add up
|
|
float dist = latDiff*latDiff + lonDiff*lonDiff;
|
|
// subtract
|
|
dist -= cityRadiusSquared;
|
|
// DEBUG
|
|
//if ( dist < 200 )
|
|
// log("places: city=%s dist=%.01f rad=%.01f",
|
|
// pd->getOfficialName(),dist,cityRadiusSquared);
|
|
// min?
|
|
if ( dist > min2 && minpd2 ) continue;
|
|
// set it
|
|
min2 = dist;
|
|
minpd2 = pd;
|
|
}
|
|
|
|
// must have one
|
|
if ( ! minpd2 ) {
|
|
// note it
|
|
log("addr: what the hell.");
|
|
// increase stripe width
|
|
radius += 10.0;
|
|
// try again
|
|
goto tryagain;
|
|
}
|
|
|
|
// debug point -- undo this later
|
|
//if ( ! strcmp(minpd2->getOfficialName(),"Agua Fria") )
|
|
// log("hey");
|
|
|
|
|
|
if ( distInMilesSquared ) *distInMilesSquared = min2;
|
|
|
|
// return that then
|
|
return minpd2;
|
|
}
|
|
|
|
|
|
int latcmp_new ( const void *arg1 , const void *arg2 ) {
|
|
int32_t off1 = *(int32_t *)arg1;
|
|
int32_t off2 = *(int32_t *)arg2;
|
|
// get the addresses
|
|
PlaceDesc *cd1;
|
|
PlaceDesc *cd2;
|
|
cd1 = (PlaceDesc *)(g_pbuf + off1);
|
|
cd2 = (PlaceDesc *)(g_pbuf + off2);
|
|
// simple compare
|
|
if ( cd1->m_lat < cd2->m_lat ) return -1;
|
|
if ( cd1->m_lat > cd2->m_lat ) return 1;
|
|
return 0;
|
|
}
|
|
|
|
bool testCityList ( ) {
|
|
PlaceDesc *pd;
|
|
char *name;
|
|
|
|
pd = getNearestCity_new ( 35.596035,-106.052246,0,NULL);
|
|
if ( ! pd ) { char *xx=NULL;*xx=0; }
|
|
name = pd->m_officialNameOffset + g_pbuf;
|
|
if ( strcmp ( name , "Santa Fe" ) ) { char *xx=NULL;*xx=0; }
|
|
|
|
// try this. make sure this is albuquerque
|
|
pd = getNearestCity_new ( 35.08449 ,-106.6511,0,NULL);
|
|
if ( ! pd ) { char *xx=NULL;*xx=0; }
|
|
name = pd->m_officialNameOffset + g_pbuf;
|
|
if ( strcmp ( name , "Albuquerque" ) ) { char *xx=NULL;*xx=0; }
|
|
|
|
return true;
|
|
}
|
|
|
|
// . our data is used by getNearestCityId
|
|
// . about 123k cities, sort them by lat in one list, lon in the other
|
|
// . 4 bytes per entry, we are talking 1.2MB for both lists
|
|
bool initCityLists_new ( ) {
|
|
|
|
// bail if not indexing events
|
|
//if ( ! g_conf.m_indexEventsOnly ) return true;
|
|
return true;
|
|
|
|
log ("places: loading citylatlist.dat");
|
|
|
|
// first try to load the list of city offsets into g_pbuf
|
|
// which are pre-sorted
|
|
if ( s_cityLatList.fillFromFile(g_hostdb.m_dir,"citylatlist.dat")>=1) {
|
|
// test it out right quick
|
|
testCityList();
|
|
return true;
|
|
}
|
|
|
|
// scan the buffer of placeDescriptors
|
|
PlaceDesc *pd = (PlaceDesc *) g_pbuf;
|
|
PlaceDesc *pdend ;//= (PlaceDesc *)(g_pbuf + g_pbufSize);
|
|
|
|
// find the real end of it!
|
|
for ( pdend = pd ; ; pdend++ ) {
|
|
// stop if we enter the name buf space
|
|
if ( ((char *)pdend)[0] == 'u' &&
|
|
((char *)pdend)[1] == 'n' &&
|
|
! strcmp((char *)pdend,"unknown name" ) )
|
|
break;
|
|
}
|
|
|
|
// count how many cities we got
|
|
int32_t cityCount = 0;
|
|
for ( ; pd < pdend ; pd++ )
|
|
if ( pd->m_flags & PDF_CITY ) cityCount++;
|
|
|
|
// . alloc for the "ptrs" which will really be offsets into g_pbuf
|
|
// . use offsets so we can save/load to/from disk easily
|
|
int32_t need = cityCount * 4;
|
|
// alloc it
|
|
if ( ! s_cityLatList.reserve ( need ) ) return false;
|
|
// point into it so we can fill it up
|
|
int32_t *latList = (int32_t *)s_cityLatList.getBufStart();
|
|
int32_t nc = 0;
|
|
|
|
pd = (PlaceDesc *)g_pbuf;
|
|
// scan the cities again
|
|
for ( ; pd < pdend ; pd++ ) {
|
|
// skip if not city
|
|
if ( ! (pd->m_flags & PDF_CITY ) ) continue;
|
|
// get offset
|
|
int32_t cityOffset = ((char *)pd) - g_pbuf;
|
|
// add to the list
|
|
latList[nc++] = cityOffset;
|
|
}
|
|
// sanity
|
|
if ( cityCount != nc ) { char *xx=NULL;*xx=0; }
|
|
// now sort each list
|
|
gbqsort ( latList , nc , 4 , latcmp_new , 0 );
|
|
|
|
// update length
|
|
s_cityLatList.m_length = nc * 4;
|
|
|
|
// test it out right quick
|
|
testCityList();
|
|
|
|
log ("places: saving citylatlist.dat");
|
|
// save it
|
|
s_cityLatList.saveToFile(g_hostdb.m_dir,"citylatlist.dat");
|
|
|
|
return true;
|
|
}
|
|
|