// See docs in language.h
|
|
|
|
#include "gb-include.h"
|
|
#include "LanguageIdentifier.h"
|
|
#include "LangList.h"
|
|
#include "geo_ip_table.h"
|
|
#include "Speller.h"
|
|
#include "CountryCode.h"
|
|
#include "ValidPointer.h"
|
|
#include "Categories.h"
|
|
#include "Linkdb.h"
|
|
|
|
LanguageIdentifier g_langId;
|
|
|
|
/// List of TLDs that should not be used for language detection.
/// NULL terminated.
///
/// Sadly, .de seems to be about half German pages and about half
/// English as well, so we cannot use it to distinguish language.
/// Likewise .at, .nl, .no, .vn, .ro, .gr, .th, .pl and .gs all have
/// some English pages.
///
/// (Pretty soon it will be faster to have a list of domains that
/// WILL work instead of domains that won't.)
///
|
|
// Outlinks whose TLD appears here are ignored when voting on a page's
// language. The trailing NULL lets callers walk the list without a count.
static char *ambiguousTLDs[] = {
	// generic TLDs
	"info", "com", "org", "net", "mil",
	// country-code TLDs treated as unreliable language signals
	"de", "at", "tv", "nl", "no", "ws", "vn",
	"ro", "ru", "gr", "th", "pl", "gs",
	// terminator
	NULL
};
|
|
|
|
// Topic name for each language id; the array index is the language id.
// Slot 5 holds a placeholder string because no language uses that id.
const uint8_t *langToTopic[] = {
	(uint8_t*)"Unknown",             //  0
	(uint8_t*)"English",             //  1
	(uint8_t*)"Français",            //  2
	(uint8_t*)"Español",             //  3
	(uint8_t*)"Russian",             //  4
	(uint8_t*)"There Is No 5!",      //  5
	(uint8_t*)"Japanese",            //  6
	(uint8_t*)"Chinese_Traditional", //  7
	(uint8_t*)"Chinese_Simplified",  //  8
	(uint8_t*)"Korean",              //  9
	(uint8_t*)"Deutsch",             // 10
	(uint8_t*)"Nederlands",          // 11
	(uint8_t*)"Italiano",            // 12
	(uint8_t*)"Suomi",               // 13
	(uint8_t*)"Svenska",             // 14
	(uint8_t*)"Norsk",               // 15
	(uint8_t*)"Português",           // 16
	(uint8_t*)"Vietnamese",          // 17
	(uint8_t*)"Arabic",              // 18
	(uint8_t*)"Hebrew",              // 19
	(uint8_t*)"Bahasa_Indonesia",    // 20
	(uint8_t*)"Greek",               // 21
	(uint8_t*)"Thai",                // 22
	(uint8_t*)"Hindi",               // 23
	(uint8_t*)"Bangla",              // 24
	(uint8_t*)"Polska",              // 25
	(uint8_t*)"Tagalog"              // 26
};
|
|
|
|
/// Find a language tag in a DOCTYPE element.
///
/// This looks more complex than it is: find the second quote mark,
/// back up to the nearest preceding slash, and return the character
/// after that slash, which should be the language identifier.
///
/// @param content pointer to the DOCTYPE tag text
///
/// @return pointer to the language tag, or NULL if there is no quoted
///         string or no slash to back up to
///
|
|
static char * FindLanguageIndex(char *content) {
	if(!content)
		return(NULL);

	// opening quote of the first quoted string
	char *quoteOpen = strchr(content, '"');
	if(!quoteOpen)
		return(NULL);

	// its matching closing quote
	char *quoteClose = strchr(quoteOpen + 1, '"');
	if(!quoteClose)
		return(NULL);

	// Walk backwards from the closing quote to the last '/' inside the
	// quoted string. Starting on the quote itself, not on the character
	// after it, means a string that ends right at the closing quote
	// still works, and a '/' outside the quotes is never mistaken for
	// the separator in front of the language tag.
	for(char *p = quoteClose - 1; p > quoteOpen; p--) {
		if(*p == '/')
			return(p + 1);
	}
	return(NULL);
}
|
|
|
|
/// Copy a language tag, lower-casing it.
///
/// Copying stops at the closing quote or end of string.
/// Always NUL terminates dst on success.
///
/// @param dst the destination
/// @param src the source (returned from FindLanguageIndex())
/// @param maxSize size of dst in bytes, including the terminating NUL
///                (so at most maxSize - 1 characters are copied)
///
/// @return true on successful copy, false on bad arguments
///
|
|
static bool copyLangTag(char *dst, char *src, int maxSize) {
	if(!dst || !src || maxSize < 1)
		return(false);

	int len = 0;
	// Test the room BEFORE writing so that even maxSize == 1 leaves
	// space for the terminator; at most maxSize - 1 characters are copied.
	while ( *src && *src != '"' && len + 1 < maxSize ) {
		// tolower() needs an unsigned char value; a plain char may
		// be negative for bytes >= 0x80
		*dst++ = (char)tolower((unsigned char)*src++);
		len++;
	}
	*dst = 0;
	return(true);
}
|
|
|
|
|
|
// The constructor has no work to do.
LanguageIdentifier::LanguageIdentifier() {
}
|
|
|
|
inline bool LanguageIdentifier::isAmbiguousTLD(char *tld, int len) {
|
|
int x;
|
|
for(x = 0; ambiguousTLDs[x]; x++) {
|
|
if(!strncmp(tld, ambiguousTLDs[x],
|
|
maxOf(len, gbstrlen(ambiguousTLDs[x]))))
|
|
return(true);
|
|
}
|
|
return(false);
|
|
}
|
|
|
|
uint8_t getLanguageFromAbbr2 ( char *str , int32_t len ) {
|
|
// truncate
|
|
if ( len > 5 ) len = 5;
|
|
// copy it and check it
|
|
char lang[6];
|
|
for ( int32_t j = 0 ; j < len ; j++ )
|
|
lang[j] = to_lower_a(str[j]);
|
|
lang[len]='\0';
|
|
return getLanguageFromAbbr(lang);
|
|
}
|
|
|
|
// Guess the language from explicit declarations in the markup. For each
// node, in document order:
//   - a <meta> whose "name" starts with Content-Language, language or
//     Content_Language, or failing that whose "http-equiv" starts with
//     Language, has its "content" attribute looked up;
//   - an <html>, <body> or <head> tag has its "lang" attribute looked up.
// The first lookup that yields something other than langUnknown wins.
uint8_t LanguageIdentifier::guessLanguageFromTag(Xml *xml) {
	if(!xml) return(langUnknown);

	uint8_t guess = langUnknown;
	int32_t attrLen = 0;
	char *attr;

	for(int32_t n = 0; n < xml->getNumNodes(); n++) {
		int tagId = xml->getNodeId(n);

		if(tagId == TAG_META) {
			attr = (char *) xml->getString(n, "name", &attrLen);
			bool declaresLang = attr &&
				(strncasecmp(attr, "Content-Language",16) == 0 ||
				 strncasecmp(attr, "language",8) == 0 ||
				 strncasecmp(attr, "Content_Language",16) == 0);
			// the http-equiv form is only consulted when the
			// name form did not match
			if(!declaresLang) {
				attr = (char *) xml->getString(n, "http-equiv", &attrLen);
				declaresLang = attr && strncasecmp(attr, "Language", 8) == 0;
			}
			if(declaresLang) {
				attr = (char *) xml->getString(n, "content", &attrLen);
				guess = getLanguageFromAbbr2(attr,attrLen);
				if(guess != langUnknown) return(guess);
			}
		}

		// only these tags are checked for a lang attribute
		bool mayCarryLang = tagId == TAG_HTML ||
				    tagId == TAG_BODY ||
				    tagId == TAG_HEAD;
		if(!mayCarryLang) continue;

		attr = (char *) xml->getString(n, "lang", &attrLen);
		guess = getLanguageFromAbbr2(attr,attrLen);
		if(guess != langUnknown) return(guess);
	}
	return(guess);
}
|
|
|
|
// Guess the page language from the TLDs of its outlinks. Each of the
// first 100 links whose TLD is not in ambiguousTLDs votes for every
// language id (1..31) that g_langList accepts for that TLD. A language
// is returned only if it leads the runner-up by more than 5 votes.
// Pages with fewer than 15 links yield langUnknown.
uint8_t LanguageIdentifier::guessLanguageFromOutlinks(Links *links) {
	if(!links) return(langUnknown);

	// too few outlinks to draw a conclusion from
	if(links->getNumLinks() < 15) return(langUnknown);

	char host[MAX_URL_LEN];
	int32_t langs[32];
	memset(langs, 0, sizeof(langs));

	// trim to only 100 links to prevent spinning on some large pages
	for(int lc = 0; lc < links->getNumLinks() && lc < 100; lc++) {
		char *url = links->getLink(lc);
		if(!url) continue;
		int32_t urlLen = links->getLinkLen(lc);
		if(urlLen <= 0) continue;
		char *urlEnd = url + urlLen;

		// Skip the scheme by finding "://" instead of assuming
		// that every link starts with the 7 bytes of "http://".
		char *src = url;
		for(char *s = url; s + 2 < urlEnd && *s && *s != '/'; s++) {
			if(s[0] == ':' && s[1] == '/' && s[2] == '/') {
				src = s + 3;
				break;
			}
		}

		// Copy the host name. The copy is bounded by the link
		// length and by the buffer, and stops at the path or port.
		char *dst = host;
		char *dstEnd = host + sizeof(host) - 1;
		while(src < urlEnd && dst < dstEnd &&
		      *src && *src != '/' && *src != ':')
			*dst++ = *src++;
		*dst = '\0';

		char *tld = strrchr(host, '.');
		if(!tld) continue;
		// skip to tld
		tld++;

		// only bother if not a common TLD
		int len = gbstrlen(tld);
		if(isAmbiguousTLD(tld, len)) continue;
		for(uint8_t l = 1; l < 32; l++) {
			if(g_langList.isLangValidForTld(tld, len, l))
				langs[l]++;
		}
	}

	// Look for a clear winner, tracking the true runner-up count.
	// langUnknown (slot 0) is not considered, it reduces hits.
	int best = 0;
	int second = 0;
	uint8_t bestLang = langUnknown;
	for(uint8_t l = 1; l < 32; l++) {
		int count = langs[l];
		if(count > best) {
			second = best;
			best = count;
			bestLang = l;
		}
		else if(count > second) {
			// also covers a tie with the leader, which then
			// leaves no margin at all
			second = count;
		}
	}

	// 1st place must beat 2nd place by more than 5
	if(best - second > 5) return(bestLang);
	return(langUnknown);
}
|
|
|
|
uint8_t LanguageIdentifier::guessLanguageFromTld(char *linktext) {
|
|
#if 0
|
|
// This is not a good check of language
|
|
int len = 0;
|
|
char *cp;
|
|
|
|
if(!linktext) return(langUnknown);
|
|
|
|
// skip http://
|
|
cp = linktext + 7;
|
|
|
|
// if no slash, start at the end of the link
|
|
if(!(cp = strchr(cp, '/')))
|
|
cp = linktext + (gbstrlen(linktext) - 1);
|
|
|
|
// find last dot
|
|
while(*cp && cp > linktext && *cp != '.') {
|
|
cp--;
|
|
len++;
|
|
}
|
|
|
|
// skip '.'
|
|
len--; cp++;
|
|
|
|
if(len != 2) return(langUnknown);
|
|
#endif // 0
|
|
|
|
return(langUnknown);
|
|
|
|
}
|
|
|
|
// Guess the page language from the languages of its inlinkers. Each
// inlink with a known language whose m_ip differs from ip in the masked
// 16 bits casts one vote. A language is returned only when there are at
// least 7 such votes and the leader has more than half of
// getNumGoodInlinks(); otherwise langUnknown.
uint8_t LanguageIdentifier::guessLanguageFromInlinks(LinkInfo *linkInfo, int32_t ip) {
	if(!linkInfo) return(langUnknown);
	if(linkInfo->getNumGoodInlinks() < 7) return(langUnknown);

	// 32-bit counters: the loop below is not capped, and a uint8_t
	// counter would wrap once a language collects 256 votes
	int32_t counts[32];
	memset(counts, 0, sizeof(counts));
	int hits = 0;

	for (Inlink*k=NULL;(k=linkInfo->getNextInlink(k)); ) {
		int32_t id = k->m_language;
		// sanity check, we are still getting bad lang ids!!
		if ( id < 0 || id >= 32 ) {
			log("build: Got bad lang id of %" INT32 ". how can this "
			    "happen?",id);
			continue;
		}
		// don't count langUnknown pages, it reduces hits
		if ( ! id ) continue;
		// skip if not from a different enough IP
		if((k->m_ip&0x0000ffff)==(ip&0x0000ffff) )
			continue;
		// otherwise count it
		counts[id]++;
		hits++;
	}
	if(hits < 7) return(langUnknown);

	// pick the leader; on equal counts the higher language id wins
	int32_t best = 0;
	uint8_t bestLang = langUnknown;
	for(int32_t x = 1; x < 32; x++) {
		if(counts[x] >= best) {
			best = counts[x];
			bestLang = (uint8_t)x;
		}
	}

	// Need better than 50%
	if(best > (linkInfo->getNumGoodInlinks() / 2))
		return(bestLang);
	return(langUnknown);
}
|
|
|
|
// Guess the language from the first DOCTYPE node that contains a
// language tag, e.g. the "EN" in "-//W3C//DTD HTML 4.01//EN". The tag
// is lower-cased and looked up with getLanguageFromAbbr(). Returns
// langUnknown when there is no such DOCTYPE or the tag is not a known
// abbreviation.
uint8_t LanguageIdentifier::guessLanguageFromDoctype(Xml *xml, char *content) {
	// content itself is not read below, but a NULL content still means
	// "nothing to inspect"; xml is dereferenced, so guard it as well
	if(!xml || !content) return(langUnknown);

	char lang[6];

	for(int32_t i = 0; i < xml->getNumNodes(); i++) {
		// skip if not DOCTYPE
		if ( xml->getNodeId(i) != TAG_DOCTYPE ) continue;
		// locate the language tag inside the DOCTYPE text
		char *str = FindLanguageIndex(xml->getNode(i));
		if(!str) continue;
		// only the first DOCTYPE with a tag is consulted
		uint8_t rvDoc = langUnknown;
		if(copyLangTag(lang, str, 5))
			rvDoc = getLanguageFromAbbr(lang);
		return(rvDoc);
	}
	return(langUnknown);
}
|
|
|
|
/// Skip whitespace in a string.
///
/// Whitespace is space, tab, CR and LF.
///
/// @param str the string
///
/// @return pointer to the next character that is not whitespace (which
///         may be the terminating NUL), or NULL if str is NULL
///
|
|
static char *skipwhite(char *str) {
	// NULL in, NULL out
	if(!str) return(str);
	// strspn() measures the leading run of space, tab, LF and CR
	return(str + strspn(str, " \t\n\r"));
}
|
|
|
|
/// Skip over a 'word' in a string.
///
/// Skips everything up to the next space, tab, CR or LF.
///
/// @param str the string to search
///
/// @return pointer to the next whitespace character or to the
///         terminating NUL, or NULL if str is NULL
///
|
|
static char *skipword(char *str) {
	// NULL in, NULL out
	if(!str) return(str);
	// strcspn() measures the leading run that has no space, tab, LF or CR
	return(str + strcspn(str, " \t\n\r"));
}
|
|
|
|
uint8_t LanguageIdentifier::guessLanguageFromUserAgent(char *str) {
|
|
// Mozilla/5.0 (X11; U; Linux i686;
|
|
// en-US; rv:1.8.1.4) Gecko/20070531 Firefox/2.0.0.4
|
|
uint8_t lang = langUnknown;
|
|
while(*str) {
|
|
if(!(str = skipwhite(str)))
|
|
return(langUnknown);
|
|
if((lang = getLanguageFromUserAgent(str)) != langUnknown)
|
|
return(lang);
|
|
if(!(str = skipword(str)))
|
|
return(langUnknown);
|
|
}
|
|
return(langUnknown);
|
|
}
|
|
|
|
// non-recursive bisect search
|
|
// Map an IP address to a two-letter code. The bisect search over aGeoIP
// is compiled out, so as built this always returns "zz" and ignores
// every argument. In the disabled code "ob" marks an address outside the
// table and "zz" marks a failed search.
char *LanguageIdentifier::findGeoIP(uint32_t address, uint32_t max,
				    uint32_t min, uint32_t *ldepth) {
#if 0
	uint32_t limit = max;
	register uint32_t median;

	if(aGeoIP[0].firstAddr > address || aGeoIP[max].lastAddr < address) {
		return("ob");
	}

	do {
		// extra debugging steps
		if(ldepth) {
			*ldepth += 1;
			if(*ldepth > limit) {
				log(LOG_INFO, "build: findGeoIP(): depth exceeded limit.\n");
				return("zz");
			}
		}

		median = (max+min)/2;

		// check if narrowed all the way
		if(median == max || median == min) {
			break;
		}

		// bisect down?
		if(aGeoIP[median].firstAddr > address) {
			max = median;
			continue;
		}

		// bisect up?
		if(aGeoIP[median].lastAddr < address) {
			min = median;
			continue;
		}

		// in range, pop out
		break;

	} while(max > min);

	if(aGeoIP[median].firstAddr <= address && aGeoIP[median].lastAddr >= address)
		return(aGeoIP[median].cCode);

#endif // 0
	return("zz");
}
|
|
|
|
uint8_t LanguageIdentifier::guessLanguageFromIP(uint32_t address) {
|
|
return langUnknown; // temp change
|
|
uint32_t ldepth = 0;
|
|
char *code = findGeoIP(address, geoIPNumRows - 1, 0, &ldepth);
|
|
if(!code) return(langUnknown);
|
|
if(code[0] == 'z' && code[1] == 'z') return(langUnknown);
|
|
if(code[0] == 'o' && code[1] == 'b') return(langUnknown);
|
|
|
|
// return unknown for some ambiguous results
|
|
if(code[0] == 'e' && code[1] == 'u') return(langUnknown);
|
|
if(code[0] == 'c' && code[1] == 'a') return(langUnknown);
|
|
|
|
return(getLanguageFromCountryCode(code));
|
|
}
|
|
|
|
uint8_t LanguageIdentifier::guessLanguageFromDMOZ(char *addr) {
|
|
return(g_categories->findLanguage(addr));
|
|
}
|
|
|
|
// Guess the language of a query from its terms via g_langList.lookup().
// A one-term query takes that term's language. Otherwise the first
// language shared by two consecutive terms wins, which is not as good
// as a frequency count but much faster. Returns langUnknown when
// nothing qualifies.
uint8_t LanguageIdentifier::guessLanguageFromQuery(Query *q) {
	if(!q) return(langUnknown);

	uint8_t lang;
	int32_t numTerms = q->getNumTerms();

	if(numTerms == 1) {
		// terms are indexed from 0, so the only term is term 0
		if(g_langList.lookup(q->getTermId(0), &lang))
			return(lang);
		return(langUnknown);
	}

	// language of the previous term; 255 means "none yet"
	const uint8_t noLang = 255;
	uint8_t last = noLang;
	for(int32_t i = 0; i < numTerms; i++) {
		// a term with no known language breaks the run
		if(!g_langList.lookup(q->getTermId(i), &lang) ||
		   lang == langUnknown) {
			last = noLang;
			continue;
		}
		if(lang == last)
			return(lang);
		// remember it so that the next term can match it
		last = lang;
	}
	return(langUnknown);
}
|
|
|
|
uint8_t LanguageIdentifier::getBestLanguage(char** method,
|
|
Url* url,
|
|
Xml* xml,
|
|
Links* links,
|
|
LinkInfo* linkInfo,
|
|
char* content) {
|
|
uint8_t langEnum;
|
|
// Let the site tell us what language it's in
|
|
langEnum = g_langId.guessLanguageFromTag(xml);
|
|
*method = "Tag";
|
|
|
|
if(langEnum != langUnknown) return langEnum;
|
|
|
|
// Get the language from a DMOZ category
|
|
// Accurate, but low hit rate
|
|
langEnum = g_langId.guessLanguageFromDMOZ(url->getUrl());
|
|
*method = "DMOZ";
|
|
if(langEnum != langUnknown) return langEnum;
|
|
|
|
|
|
// Guess from the TLD
|
|
uint8_t possibleLanguage = g_langId.guessLanguageFromTld(url->getUrl());
|
|
if(possibleLanguage) langEnum = possibleLanguage;
|
|
*method = "TLD";
|
|
if(langEnum != langUnknown) return langEnum;
|
|
|
|
// m_newDoc->getLinks() can return a bad address
|
|
// Guess from the outlinks
|
|
langEnum = g_langId.guessLanguageFromOutlinks(links);
|
|
*method = "Outlinks";
|
|
if(langEnum != langUnknown) return langEnum;
|
|
// m_newDoc->getLinks() can return a bad address
|
|
|
|
// Guess from the inlinks
|
|
// langEnum = g_langId.guessLanguageFromInlinks(linkInfo);
|
|
// *method = "Inlinks";
|
|
if(langEnum != langUnknown) return langEnum;
|
|
|
|
// Word frequency count
|
|
langEnum = xml->getLanguage();
|
|
*method = "Freq";
|
|
if(langEnum != langUnknown) return langEnum;
|
|
|
|
// Let the doctype tell us what language it's in
|
|
langEnum = g_langId.guessLanguageFromDoctype(xml, content);
|
|
*method = "Doctype";
|
|
|
|
return langEnum;
|
|
}
|
|
|
|
|
|
|
|
// Pick the strongest languages out of a per-language count vector.
//
// langCount  : MAX_LANGUAGES counters, indexed by language id. It is
//              modified: the langUnknown slot and every selected slot
//              are zeroed.
// langIds    : receives the selected language ids, strongest first
// langScores : receives each selection's share of the known-language
//              total, as a percentage
// tagVecSize : capacity of langIds / langScores
//
// Selection stops at the first language with a zero count or with a
// count below half of the strongest one. Returns the number of entries
// written.
uint8_t LanguageIdentifier::getBestLangsFromVec(char* langCount,
						int32_t *langIds ,
						uint8_t *langScores ,
						int32_t tagVecSize) {
	int32_t total = 0;
	for(int32_t j = 0; j < MAX_LANGUAGES; j++)
		total += langCount[j];

	// nothing counted, or nothing but unknowns
	if(total == 0 || langCount[langUnknown] == total)
		return 0;

	// dont store unknown language
	total -= langCount[langUnknown];
	langCount[langUnknown] = 0;

	uint8_t stored = 0;
	int32_t topCount = -1;
	for(int32_t slot = 0; slot < tagVecSize; slot++) {
		// find the largest remaining count; first one wins ties
		int32_t winner = 0;
		int32_t winnerCount = 0;
		for(int32_t j = 0; j < MAX_LANGUAGES; j++) {
			if(langCount[j] > winnerCount) {
				winnerCount = langCount[j];
				winner = j;
			}
		}
		if(slot == 0) topCount = winnerCount;

		// if none found or this one is less than half of the
		// strongest then quit
		if(winnerCount == 0 || winnerCount < (topCount/2)) break;

		langIds [slot] = winner;
		langScores[slot] = (uint8_t)((winnerCount * 100.0) / total);
		// take it out of the running for the next slot
		langCount[winner] = 0;
		stored++;
	}
	return stored;
}
|
|
|
|
|
|
uint8_t LanguageIdentifier::findLangFromDMOZTopic(char *topic) {
|
|
int x;
|
|
for(x = 0; x < (int)(sizeof(langToTopic)/sizeof(uint8_t *)); x++) {
|
|
if ( ! langToTopic[x] ) continue;
|
|
if(!strncasecmp((char*)langToTopic[x], topic,
|
|
gbstrlen((char *)langToTopic[x])))
|
|
return(x);
|
|
}
|
|
return(langUnknown);
|
|
}
|
|
|
|
uint8_t LanguageIdentifier::guessGBLanguageFromUrl(char *url) {
|
|
if(!url) return(langUnknown);
|
|
uint8_t lang;
|
|
if((lang = guessLanguageFromUrl(url)) != langUnknown)
|
|
return(lang);
|
|
char code[6];
|
|
char *cp = url;
|
|
memset(code, 0, 6);
|
|
for(int x = 0; x < 6; x++) {
|
|
if((cp[x] < 'a' || cp[x] > 'z') &&
|
|
(cp[x] < 'A' || cp[x] > 'Z') &&
|
|
cp[x] != '_' && cp[x] != '-')
|
|
break;
|
|
code[x] = cp[x];
|
|
}
|
|
return(getLanguageFromCountryCode(code));
|
|
}
|
|
|
|
// A boundary is any 7-bit character that is not an ASCII letter or
// digit; this includes NUL. Bytes of 128 and above are never boundaries.
static inline bool s_checkCharIsBoundary(uint8_t x) {
	if(x >= 128) return(false);
	bool isDigit = (x >= '0' && x <= '9');
	bool isUpper = (x >= 'A' && x <= 'Z');
	bool isLower = (x >= 'a' && x <= 'z');
	return(!(isDigit || isUpper || isLower));
}
|
|
|
|
static inline bool s_isRightBoundedAbbr(char *pointer, uint8_t l) {
|
|
if(s_checkCharIsBoundary(*(pointer + 2)))
|
|
return(true);
|
|
if((*(pointer + 3) == '-' || *(pointer + 3) == '_') &&
|
|
s_checkCharIsBoundary(*(pointer + 5)))
|
|
return(true);
|
|
return(false);
|
|
}
|
|
|
|
static inline bool s_isRightBoundedLanguageWord(char *pointer, uint8_t l) {
|
|
if(s_checkCharIsBoundary(*(pointer + gbstrlen(getNativeLanguageString(l)))))
|
|
return(true);
|
|
if(s_checkCharIsBoundary(*(pointer + gbstrlen(getLanguageString(l)))))
|
|
return(true);
|
|
return(false);
|
|
}
|
|
|
|
uint8_t s_lookForLanguageParam(char *url) {
|
|
char *cp = url;
|
|
uint8_t l;
|
|
// Try to find lan= or lang= or language=
|
|
while(cp && *cp && (cp = strstr(cp, "lan"))) {
|
|
if(!s_checkCharIsBoundary(*(cp - 1))) {
|
|
cp++;
|
|
continue;
|
|
}
|
|
if(!strncmp(cp, "lan=", 4)) cp += 4;
|
|
else if(!strncmp(cp, "lang=", 5)) cp += 5;
|
|
else if(!strncmp(cp, "language=", 9)) cp += 9;
|
|
|
|
if((l = getLanguageFromName((uint8_t*)cp)) &&
|
|
s_isRightBoundedLanguageWord(cp, l))
|
|
return(l);
|
|
|
|
if((l = getLanguageFromAbbrN(cp)) &&
|
|
s_isRightBoundedAbbr(cp, l))
|
|
return(l);
|
|
cp++;
|
|
}
|
|
// Try to find l=
|
|
cp = url;
|
|
while(cp && *cp && (cp = strstr(cp, "l="))) {
|
|
if(!s_checkCharIsBoundary(*(cp - 1))) {
|
|
cp++;
|
|
continue;
|
|
}
|
|
|
|
if((l = getLanguageFromName((uint8_t*)cp)) &&
|
|
s_isRightBoundedLanguageWord(cp, l))
|
|
return(l);
|
|
|
|
if((l = getLanguageFromAbbrN(cp)) &&
|
|
s_isRightBoundedAbbr(cp, l))
|
|
return(l);
|
|
cp++;
|
|
}
|
|
return(0);
|
|
}
|
|
|
|
uint8_t s_lookForLanguagePrefix(char *url) {
|
|
char *cp = url;
|
|
uint8_t l = 0;
|
|
// Look for a prefix on the url
|
|
// Do not add a postfix or TLD detector,
|
|
// they are not good indications at all.
|
|
if(!strncmp(url, "http://", 7)) cp = url + 7;
|
|
else cp = url;
|
|
|
|
if((l = getLanguageFromAbbrN(cp)) &&
|
|
s_isRightBoundedAbbr(cp, l))
|
|
return(l);
|
|
|
|
// Lookup, and see if it's on a word boundary
|
|
if((l = getLanguageFromName((uint8_t*)cp)) &&
|
|
s_isRightBoundedLanguageWord(cp, l))
|
|
return(l);
|
|
return(0);
|
|
}
|
|
|
|
|
|
uint8_t LanguageIdentifier::guessLanguageFromUrl(char *url) {
|
|
int len = 0;
|
|
char *cp = url;
|
|
char code[3];
|
|
uint8_t l = 0;
|
|
|
|
if(!url) return(langUnknown);
|
|
|
|
// Look for a parameter that would indicate the language
|
|
if((l = s_lookForLanguageParam(url))) return(l);
|
|
|
|
// Look for a prefix that would indicate the language
|
|
if((l = s_lookForLanguagePrefix(url))) return(l);
|
|
|
|
// if no slash, start at the end of the link
|
|
if(!(cp = strchr(url, '/')))
|
|
cp = url + (gbstrlen(url) - 1);
|
|
|
|
// find last dot
|
|
while(*cp && cp > url && *cp != '.') {
|
|
cp--;
|
|
len++;
|
|
}
|
|
|
|
// No dot?
|
|
if(cp <= url) return(langUnknown);
|
|
|
|
// skip '.'
|
|
len--; cp++;
|
|
|
|
code[0] = cp[0];
|
|
code[1] = cp[1];
|
|
code[2] = 0;
|
|
|
|
return(getLanguageFromCountryCode(code));
|
|
}
|
|
|
|
// Return the index of the strictly largest value in list, or 0 when the
// largest value occurs more than once, the list is empty, or list is
// NULL.
static inline int s_findMaxInList(int *list, int numItems) {
	if(!list) return(0);

	int best = INT_MIN;
	int bestAt = 0;
	// set when the most recent leader merely equalled the previous one;
	// starts set so that an empty list reports 0
	int tied = 1;
	for(int i = 0; i < numItems; i++) {
		int v = list[i];
		if(v < best) continue;
		tied = (v == best);
		best = v;
		bestAt = i;
	}
	return(tied ? 0 : bestAt);
}
|
|
|
|
// Guess the language by term frequency. For each of the first pageLimit
// nodes that is text of at least 2 bytes, g_speller.getPhraseRecord()
// is parsed as tab-separated "language<TAB>score" pairs, and the
// language with the unique highest score gets one vote (slot 0 when
// there is no unique winner). The language with the most votes is
// returned unless it only tied an earlier leader, in which case the
// result is langUnknown.
uint8_t LanguageIdentifier::guessLanguageFreqCount(Xml *xml,
						   int pageLimit /* = 512 */) {
	if(!xml) return(langUnknown);

	int votes[MAX_LANGUAGES];
	int scores[MAX_LANGUAGES];
	int limit = xml->getNumNodes();

	if(pageLimit < limit) limit = pageLimit;

	memset(votes, 0, sizeof(int) * MAX_LANGUAGES);

	// Do term frequency count
	for(int x = 0; x < limit; x++) {
		if(xml->isTag(x) || xml->getNodeLen((int32_t)x) < 2) continue;
		char *cp = g_speller.getPhraseRecord(xml->getNode((int32_t)x),
						     xml->getNodeLen((int32_t)x));
		if(!cp) continue;
		memset(scores, 0, sizeof(int) * MAX_LANGUAGES);
		while(*cp) {
			// skip leading whitespace
			while(*cp == ' ' || *cp == '\t') cp++;
			// get language
			int l = atoi(cp);
			// skip to next delimiter
			while(*cp && *cp != '\t') cp++;
			// A record that ends before its score is truncated;
			// stop here instead of stepping over the terminator.
			if(!*cp) break;
			// skip over tab
			cp++;
			// get score, ignoring ids that do not fit scores[]
			int score = atoi(cp);
			if(l >= 0 && l < MAX_LANGUAGES)
				scores[l] = score;
			// skip to next delimiter
			while(*cp && *cp != '\t') cp++;
		}
		votes[s_findMaxInList(scores, MAX_LANGUAGES)]++;
	}

	// Find max. oldmax is the count that the final leader replaced,
	// so it equals max exactly when the leader only tied.
	int max = 0;
	int maxidx = 0;
	int oldmax = 0;
	for(int x = 0; x < MAX_LANGUAGES; x++) {
		if(votes[x] < max) continue;
		oldmax = max;
		max = votes[x];
		maxidx = x;
	}

	if(max == 0) maxidx = 0;

	// Note the winner
	if(oldmax <= 0 || max > oldmax)
		return maxidx;
	return langUnknown;
}
|
|
|
|
uint8_t LanguageIdentifier::guessCountryTLD(const char *url) {
|
|
uint8_t country = 0;
|
|
char code[3];
|
|
code[0] = code[1] = code [2] = 0;
|
|
|
|
// check for prefix
|
|
if(url[9] == '.') {
|
|
code[0] = url[7];
|
|
code[1] = url[8];
|
|
code[2] = 0;
|
|
country = g_countryCode.getIndexOfAbbr(code);
|
|
if(country) return(country);
|
|
}
|
|
|
|
// Check for two letter TLD
|
|
const char *cp = strchr(url+7, ':');
|
|
if(!cp)
|
|
cp = strchr(url+7, '/');
|
|
if(cp && *(cp -3) == '.') {
|
|
cp -= 2;
|
|
code[0] = cp[0];
|
|
code[1] = cp[1];
|
|
code[2] = 0;
|
|
country = g_countryCode.getIndexOfAbbr(code);
|
|
if(country) return(country);
|
|
}
|
|
return(country);
|
|
}
|
|
|
|
uint8_t LanguageIdentifier::guessCountryIP(uint32_t ip) {
|
|
// Lookup IP address
|
|
uint8_t country = 0;
|
|
char *codep = findGeoIP(ip, geoIPNumRows - 1, 0);
|
|
if(!codep) return(0);
|
|
country = g_countryCode.getIndexOfAbbr(codep);
|
|
return(country);
|
|
}
|
|
|
|
// Length of the word at the start of str: the run of characters up to
// the first space, ';', tab, LF, CR, '.', ',' or the end of the string.
static int s_wordLen(char *str) {
	return((int)strcspn(str, " ;\t\n\r.,"));
}
|
|
|
|
static bool s_isLangTag(char *str) {
|
|
int len = s_wordLen(str);
|
|
if(len == 2) return(true);
|
|
if(len != 5) return(false);
|
|
if(str[2] == '_' || str[2] == '-') return(true);
|
|
return(false);
|
|
}
|
|
|
|
static uint8_t s_getCountryFromSpec(char *str) {
|
|
char code[6];
|
|
memset(code, 0,6);
|
|
gbmemcpy(code, str, s_wordLen(str));
|
|
for(int x = 0; x < 6; x++)
|
|
if(code[x] > 'A' && code[x] < 'Z') code[x] -= ('A' - 'a');
|
|
if(code[2] == '_' || code[2] == '-')
|
|
return g_countryCode.getIndexOfAbbr(&code[3]);
|
|
return g_countryCode.getIndexOfAbbr(code);
|
|
}
|
|
|
|
uint8_t LanguageIdentifier::guessCountryFromUserAgent(char *ua) {
|
|
if(!ua) return(0);
|
|
uint8_t country = 0;
|
|
while(*ua) {
|
|
if(!(ua = skipwhite(ua)))
|
|
return(0);
|
|
if(s_isLangTag(ua) &&
|
|
(country = s_getCountryFromSpec(ua)) != 0)
|
|
return(country);
|
|
if(!(ua = skipword(ua)))
|
|
return(0);
|
|
}
|
|
return(0);
|
|
}
|
|
|