186 lines
5.0 KiB
C++
186 lines
5.0 KiB
C++
/// \file language.c \brief Language detection utility routines.
|
|
///
|
|
/// Contains the main utility function, guessLanguage(), and all
|
|
/// the support routines for detecting the language of a web page.
|
|
///
|
|
/// 2007 May 24 09:02:52
|
|
/// $ID$
|
|
/// $Author: John Nanney$
|
|
/// $Workfile$
|
|
/// $Log$
|
|
///
|
|
|
|
// using a different macro because there's already a Language.h
|
|
#ifndef LANGUAGEIDENTIFIER_H
|
|
#define LANGUAGEIDENTIFIER_H
|
|
|
|
#include "gb-include.h"
|
|
#include "Xml.h"
|
|
#include "Linkdb.h"
|
|
//#include "LinkInfo.h"
|
|
#include "Query.h"
|
|
|
|
/// Contains methods of language identification by various means.
|
|
class LanguageIdentifier {
|
|
public:
|
|
/// Constructor, does very little.
|
|
LanguageIdentifier();
|
|
|
|
/// Destructor, does very little.
|
|
~LanguageIdentifier() { return; }
|
|
|
|
/// Get the language from the page's lang="" tag.
|
|
///
|
|
/// Looks for a lang="x" property in the HTML, BODY, or HEAD
|
|
/// tag. Returns the first match. This is usually a very
|
|
/// accurate guess of the language, since the author of the
|
|
/// page went through all the trouble to make sure it was
|
|
/// in there.
|
|
///
|
|
/// @param xml the page's xml object
|
|
///
|
|
/// @return the language, or langUnknown
|
|
///
|
|
uint8_t guessLanguageFromTag(Xml *xml);
|
|
|
|
/// Guess the language from the TLDs of outlinks found in the page.
|
|
///
|
|
/// TLDs which are ambiguous like .com are skipped.
|
|
///
|
|
/// @param links a list of links
|
|
///
|
|
/// @return the language, or langUnknown
|
|
///
|
|
uint8_t guessLanguageFromOutlinks(Links *links);
|
|
|
|
/// Guess the language from the page's TLD.
|
|
///
|
|
/// @param linktext the ascii URL
|
|
///
|
|
/// @return the language, or langUnknown
|
|
///
|
|
uint8_t guessLanguageFromTld(char *linktext);
|
|
|
|
/// Guess the language from the languages of the inlinks.
|
|
///
|
|
/// @param linkInfo
|
|
///
|
|
/// @return the language, or langUnknown
|
|
///
|
|
uint8_t guessLanguageFromInlinks(LinkInfo *linkInfo, int32_t ip);
|
|
|
|
/// Determine whether a given TLD is suitable for language detection.
|
|
/// @param tld the TLD in ascii
|
|
/// @param len the length of tld
|
|
/// @return true if suitable, false if not
|
|
///
|
|
inline bool isAmbiguousTLD(char *tld, int len);
|
|
|
|
/// Return the greater of two ints.
|
|
inline int maxOf(int a, int b) {
|
|
if(b > a) return(b);
|
|
return(a);
|
|
}
|
|
|
|
/// Guesses language from the DOCTYPE string present in many pages.
|
|
///
|
|
/// @param xml the page's xml object
|
|
/// @param content the page's content, for finding the doctype
|
|
///
|
|
/// @return the language, or langUnknown
|
|
///
|
|
uint8_t guessLanguageFromDoctype(Xml *xml, char *content);
|
|
|
|
/// Guess a language from a tag in the user agent string.
|
|
///
|
|
/// @param str the user agent string
|
|
///
|
|
/// @return the language, or langUknown
|
|
///
|
|
uint8_t guessLanguageFromUserAgent(char *str);
|
|
/// Find a language for a given IP address.
|
|
///
|
|
/// @param address the address
|
|
///
|
|
/// @return the language ID, or langUnknown
|
|
///
|
|
uint8_t guessLanguageFromIP(uint32_t address);
|
|
|
|
/// Find a country code for a given IP address.
|
|
///
|
|
/// This probably should not be called from user code,
|
|
/// but it may be useful in some situations.
|
|
///
|
|
/// @param address the IP address as an unsigned int
|
|
/// @param max optional, highest entry to search
|
|
/// @param min optional, lowest entry to search
|
|
/// @param *ldepth to be filled in by the highest depth of recursion
|
|
///
|
|
/// @return country code on success, or NULL (or "zz") on failure
|
|
///
|
|
char *findGeoIP(uint32_t address, uint32_t max,
|
|
uint32_t min = 0, uint32_t *ldepth = NULL);
|
|
|
|
/// Find an address in DMOZ for the language.
|
|
///
|
|
/// Looks up the page address in the category language tables.
|
|
///
|
|
/// @param addr the page address
|
|
///
|
|
/// @return language, or langUnknown if not found
|
|
///
|
|
uint8_t guessLanguageFromDMOZ(char *addr);
|
|
|
|
/// Guess the query language from the query terms.
|
|
///
|
|
/// This algorithm looks for two consecutive terms with the
|
|
/// same language.
|
|
///
|
|
/// @param q the query object
|
|
///
|
|
/// @return the language, or langUnknown
|
|
///
|
|
uint8_t guessLanguageFromQuery(Query *q);
|
|
|
|
/// Find a language from DMOZ topic.
|
|
///
|
|
/// The function name is a bit misleading, we expect
|
|
/// the country from the Top/World/X node.
|
|
///
|
|
/// @param topic the country name
|
|
///
|
|
/// @return the language, or langUnknown
|
|
///
|
|
uint8_t findLangFromDMOZTopic(char *topic);
|
|
|
|
|
|
uint8_t getBestLanguage(char** method,
|
|
Url* url,
|
|
Xml* xml,
|
|
Links* links,
|
|
LinkInfo* linkInfo,
|
|
char* content);
|
|
|
|
uint8_t getBestLangsFromVec(char* langCount,
|
|
//SiteType* typeVec,
|
|
int32_t *langIds ,
|
|
uint8_t *langScores ,
|
|
int32_t tagVecSize);
|
|
|
|
uint8_t guessGBLanguageFromUrl(char *url);
|
|
uint8_t guessLanguageFromUrl(char *url);
|
|
|
|
uint8_t guessLanguageFreqCount(Xml *xml,
|
|
int pageLimit /* = 512 */);
|
|
|
|
uint8_t guessCountryTLD(const char *url);
|
|
uint8_t guessCountryIP(uint32_t ip);
|
|
|
|
uint8_t guessCountryFromUserAgent(char *ua);
|
|
};
|
|
|
|
extern class LanguageIdentifier g_langId;
|
|
extern const uint8_t *langToTopic[];
|
|
|
|
#endif // LANGUAGEIDENTIFIER_H
|