mirror of
https://github.com/privacore/open-source-search-engine.git
synced 2025-07-14 02:36:06 -04:00
Merge branch 'master' into tokenizer
@@ -164,7 +164,7 @@ bool Msg20::getSummary ( Msg20Request *req ) {
	// we might be getting inlinks for a spider request
	// so make sure timeout is infinite for that...
	const int32_t timeout = (req->m_niceness==0)
	const int64_t timeout = (req->m_niceness==0)
	                        ? multicast_msg20_summary_timeout
	                        : multicast_infinite_send_timeout;
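Aside on the one-line int32_t to int64_t change above: a timeout wide enough to hold the "infinite" sentinel needs 64 bits. A minimal standalone sketch, assuming (hypothetically) that multicast_infinite_send_timeout is a millisecond value larger than INT32_MAX; the constant name is from the hunk, the value below is invented for illustration:

#include <cstdint>
#include <cstdio>

// hypothetical stand-in value; the real constant lives elsewhere in the GB tree
static const int64_t multicast_infinite_send_timeout = 10LL * 365 * 24 * 3600 * 1000; // ~10 years in ms

int main() {
	int32_t t32 = (int32_t)multicast_infinite_send_timeout; // truncates: value exceeds INT32_MAX
	int64_t t64 = multicast_infinite_send_timeout;          // preserved intact
	printf("int32_t: %d  int64_t: %lld\n", t32, (long long)t64);
	return 0;
}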
@@ -311,12 +311,11 @@ bool SiteGetter::gotSiteList ( ) {
		log("site: sitegetter gotList: %s",mstrerror(g_errno));
		// mark it so caller knows
		m_errno = g_errno;
		// so try again without increasing m_pathDepth
		// i've seen a host return EBADRDBID for some reason
		// and put host #0 in an infinite log spam loop so stop it
		if ( g_errno != EBADRDBID ) m_tryAgain = true;

		// let UdpServer do the retries for error scenario
		return true;
	}

	// how many urls at this path depth?
	int32_t count = ( m_list.getListSize() - 6 ) / 6;
	// if we do not have enough to qualify this as a subsite path depth
43 Summary.cpp
@@ -124,6 +124,7 @@ bool Summary::setSummaryFromTags( Xml *xml, unsigned maxSummaryLen, const char *

	// itemprop = "description"
	if ( xml->getTagContent("itemprop", "description", m_summary, MAX_SUMMARY_LEN, minSummaryLen, maxSummaryLen, &m_summaryLen) ) {
		maybeRemoveHtmlFormatting();
		if ( verifySummary( titleBuf, titleBufLen ) ) {
			m_isSetFromTags = true;
@@ -136,6 +137,7 @@ bool Summary::setSummaryFromTags( Xml *xml, unsigned maxSummaryLen, const char *

	// meta property = "og:description"
	if ( xml->getTagContent("property", "og:description", m_summary, MAX_SUMMARY_LEN, minSummaryLen, maxSummaryLen, &m_summaryLen, true, TAG_META ) ) {
		maybeRemoveHtmlFormatting();
		if ( verifySummary( titleBuf, titleBufLen ) ) {
			m_isSetFromTags = true;
@@ -148,6 +150,7 @@ bool Summary::setSummaryFromTags( Xml *xml, unsigned maxSummaryLen, const char *

	// meta name = "description"
	if ( xml->getTagContent("name", "description", m_summary, MAX_SUMMARY_LEN, minSummaryLen, maxSummaryLen, &m_summaryLen, true, TAG_META ) ) {
		maybeRemoveHtmlFormatting();
		if ( verifySummary( titleBuf, titleBufLen ) ) {
			m_isSetFromTags = true;
@@ -162,6 +165,7 @@ bool Summary::setSummaryFromTags( Xml *xml, unsigned maxSummaryLen, const char *

	if ( xml->getTagContent("property", "description", m_summary, MAX_SUMMARY_LEN, minSummaryLen, maxSummaryLen, &m_summaryLen, true, TAG_META ) ) {
		if ( verifySummary( titleBuf, titleBufLen ) ) {
			m_isSetFromTags = true;
			maybeRemoveHtmlFormatting();

			logDebug(g_conf.m_logDebugSummary, "sum: generated from meta property description. summary='%.*s'", m_summaryLen, m_summary);
			logTrace(g_conf.m_logTraceSummary, "END. Generated from meta property description. Returning true");
@@ -1153,3 +1157,42 @@ bool Summary::getDefaultSummary(const Xml *xml, const TokenizerResult *tr, const
	logTrace(g_conf.m_logTraceSummary, "END. Returning true");
	return true;
}


void Summary::maybeRemoveHtmlFormatting() {
	//Some websites have junk in their meta tags, e.g. <br> in the meta description.
	//We don't fix all cases as that could hurt correctly written pages about how to write proper html. But
	//if they don't mention "html", "tag" or "element" then we remove the most common offenders br/b/i/p.
	//When changing this function consider keeping it in sync with XmlDoc_Indexing.cpp:possiblyDecodeHtmlEntitiesAgain()
	if(memmem(m_summary,m_summaryLen,"html",4)==0 &&
	   memmem(m_summary,m_summaryLen,"HTML",4)==0 &&
	   memmem(m_summary,m_summaryLen,"tag",3)==0 &&
	   memmem(m_summary,m_summaryLen,"Tag",3)==0 &&
	   memmem(m_summary,m_summaryLen,"element",7)==0 &&
	   memmem(m_summary,m_summaryLen,"Element",7)==0)
	{
		for(int i=0; i<m_summaryLen; ) {
			char *p = (char*)memchr(m_summary+i,'<',m_summaryLen-i);
			if(!p)
				break;
			i = p-m_summary;
			if(i+4<m_summaryLen) {
				if(memcmp(p,"<br>",4)==0) {
					memmove(m_summary+i,m_summary+i+4,m_summaryLen-i-4);
					m_summaryLen -= 4;
				} else if(memcmp(p,"<b>",3)==0) {
					memmove(m_summary+i,m_summary+i+3,m_summaryLen-i-3);
					m_summaryLen -= 3;
				} else if(memcmp(p,"<i>",3)==0) {
					memmove(m_summary+i,m_summary+i+3,m_summaryLen-i-3);
					m_summaryLen -= 3;
				} else if(memcmp(p,"<p>",3)==0) {
					memmove(m_summary+i,m_summary+i+3,m_summaryLen-i-3);
					m_summaryLen -= 3;
				} else
					i++;
			} else
				break;
		}
	}
}
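The new maybeRemoveHtmlFormatting() above strips a fixed set of literal tags in place. Below is a standalone sketch of the same removal loop (hypothetical driver and sample summary, not from the tree) showing the intended behavior, including that closing tags such as </b> are deliberately left alone:

#include <cstring>
#include <cstdio>

int main() {
	// junk meta description of the kind the comment above describes
	char summary[] = "Great offers!<br>Buy <b>now</b> <i>today</i>";
	int len = (int)strlen(summary);
	const char *tags[] = { "<br>", "<b>", "<i>", "<p>" };
	for (int i = 0; i < len; ) {
		char *p = (char*)memchr(summary + i, '<', len - i);
		if (!p)
			break;
		i = (int)(p - summary);
		bool removed = false;
		for (const char *t : tags) {
			int tl = (int)strlen(t);
			if (i + tl <= len && memcmp(p, t, tl) == 0) {
				memmove(summary + i, summary + i + tl, len - i - tl);
				len -= tl;
				removed = true;
				break;
			}
		}
		if (!removed)
			i++; // not one of the known offenders (e.g. a closing tag); skip past '<'
	}
	printf("%.*s\n", len, summary); // prints: Great offers!Buy now</b> today</i>
	return 0;
}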
@@ -63,6 +63,8 @@ private:
	int64_t getBestWindow (const Matches *matches, int32_t mn, int32_t *lasta, int32_t *besta, int32_t *bestb,
	                       char *gotIt, char *retired, int32_t maxExcerptLen );

	void maybeRemoveHtmlFormatting();

	// null terminate and store the summary here.
	char m_summary[ MAX_SUMMARY_LEN ];
	int32_t m_summaryLen;
@@ -21,6 +21,56 @@
#endif


static void possiblyDecodeHtmlEntitiesAgain(const char **s, int32_t *len, SafeBuf *sb, bool also_remove_certain_html_elements) {
	//some documents have incorrectly encoded html entities twice. Example:
	//correct:   <meta name="foo" content="Boa">
	//incorrect: <meta name="foo" content="&amp;#66;oa">
	//If it seems likely that this has happened then we decode the entities again, put the result in 'sb' and update '*s' and '*len'

	//Due to the (il)logic of GB the correct form is decoded, while the incorrect form is still raw, needing double decoding

	//require an "&amp;" followed by a second semicolon
	const char *amppos = (const char*)memmem(*s,*len, "&amp;", 5);
	if((amppos && memchr(amppos+5, ';', *len-(amppos-*s)-5)!=NULL) ||
	   (memmem(*s,*len,"&lt;",4)!=NULL && memmem(*s,*len,"&gt;",4)!=NULL)) {
		//shortest entity is 4 chars (&lt;), longest utf8 encoding of a codepoint is 4 + a bit
		StackBuf<1024> tmpBuf;
		if(!tmpBuf.reserve(*len + *len/2 + 4))
			return;
		if(!sb->reserve(*len + *len/2 + 4))
			return;

		int32_t tmpLen = htmlDecode(tmpBuf.getBufStart(), *s,*len, false);

		int32_t newlen = htmlDecode(sb->getBufStart(), tmpBuf.getBufStart(), tmpLen, false);

		sb->setLength(newlen);

		//Furthermore, some websites have junk in their meta tags, e.g. <br> in the meta description.
		//We don't fix all cases as that could hurt correctly written pages about how to write proper html. But
		//if they don't mention "html", "tag" or "element" then we remove the most common offenders br/b/i/p.
		//When changing this function consider keeping it in sync with Summary::maybeRemoveHtmlFormatting()
		if(also_remove_certain_html_elements) {
			if(memmem(sb->getBufStart(),sb->length(),"html",4)==0 &&
			   memmem(sb->getBufStart(),sb->length(),"HTML",4)==0 &&
			   memmem(sb->getBufStart(),sb->length(),"tag",3)==0 &&
			   memmem(sb->getBufStart(),sb->length(),"Tag",3)==0 &&
			   memmem(sb->getBufStart(),sb->length(),"element",7)==0 &&
			   memmem(sb->getBufStart(),sb->length(),"Element",7)==0)
			{
				sb->safeReplace2("<br>",4,"",0,0);
				sb->safeReplace2("<b>",3,"",0,0);
				sb->safeReplace2("<i>",3,"",0,0);
				sb->safeReplace2("<p>",3,"",0,0);
			}
		}
		*s = sb->getBufStart();
		*len = sb->length();
	}
}

// a ptr to HashInfo is passed to hashString() and hashWords()
class HashInfo {
public:
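The double-decoding heuristic in possiblyDecodeHtmlEntitiesAgain() above is easier to see with a concrete run. The decoder below is a deliberately tiny stand-in (handling only &amp;, &lt;, &gt; and numeric &#NN;), not GB's htmlDecode(); decodeOnce and the sample string are invented, and the sketch just shows why a second pass recovers the "&amp;#66;oa" example from the comment:

#include <string>
#include <cstdio>
#include <cstdlib>

// hypothetical minimal entity decoder: one pass over the input
static std::string decodeOnce(const std::string &in) {
	std::string out;
	for (size_t i = 0; i < in.size(); ) {
		if (in[i] == '&') {
			if (in.compare(i, 5, "&amp;") == 0) { out += '&'; i += 5; continue; }
			if (in.compare(i, 4, "&lt;")  == 0) { out += '<'; i += 4; continue; }
			if (in.compare(i, 4, "&gt;")  == 0) { out += '>'; i += 4; continue; }
			if (i + 2 < in.size() && in[i+1] == '#') {
				size_t semi = in.find(';', i + 2);
				if (semi != std::string::npos) {
					out += (char)atoi(in.substr(i + 2, semi - i - 2).c_str());
					i = semi + 1;
					continue;
				}
			}
		}
		out += in[i++];
	}
	return out;
}

int main() {
	std::string raw   = "&amp;#66;oa";     // doubly-encoded "Boa"
	std::string once  = decodeOnce(raw);   // "&#66;oa" - still encoded after one pass
	std::string twice = decodeOnce(once);  // "Boa"
	printf("once: %s  twice: %s\n", once.c_str(), twice.c_str());
	return 0;
}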
@@ -466,10 +516,6 @@ bool XmlDoc::hashMetaTags ( HashTableX *tt ) {

	setStatus ( "hashing meta tags" );

	// assume it's empty
	char buf [ 32*1024 ];
	int32_t bufLen = 32*1024 - 1;
	buf[0] = '\0';
	int32_t n = m_xml.getNumNodes();
	XmlNode *nodes = m_xml.getNodes();
@@ -481,36 +527,15 @@ bool XmlDoc::hashMetaTags ( HashTableX *tt ) {

	// find the first meta summary node
	for ( int32_t i = 0 ; i < n ; i++ ) {
		// continue if not a meta tag
		if ( nodes[i].m_nodeId != TAG_META ) continue;
		//we are only interested in meta tags
		if(nodes[i].m_nodeId != TAG_META)
			continue;
		// only get content for <meta name=..> not <meta http-equiv=..>
		int32_t tagLen;
		char *tag = m_xml.getString ( i , "name" , &tagLen );
		char tagLower[128];
		int32_t j ;
		int32_t code;
		// skip if empty
		const char *tag = m_xml.getString(i, "name", &tagLen);
		// skip if error/empty
		if ( ! tag || tagLen <= 0 ) continue;
		// make tag name lower case and do not allow bad chars
		if ( tagLen > 126 ) tagLen = 126 ;
		to_lower3_a ( tag , tagLen , tagLower );
		for ( j = 0 ; j < tagLen ; j++ ) {
			// bail if it has unacceptable chars
			if ( ! is_alnum_a ( tag[j] ) &&
			     tag[j] != '-' &&
			     tag[j] != '_' &&
			     tag[j] != '.' ) break;
			// convert to lower
			tagLower[j] = to_lower_a ( tag[j] );
		}
		// skip this meta if it had unacceptable chars
		if ( j < tagLen ) continue;
		// is it recognized?
		code = getFieldCode ( tag , tagLen );

		// . do not allow reserved tag names
		// . title,url,suburl,
		if ( code != FIELD_GENERIC ) continue;
		// this is now reserved
		// do not hash keyword, keywords, description, or summary metas
		// because that is done in hashRange() below based on the
@@ -540,48 +565,18 @@ bool XmlDoc::hashMetaTags ( HashTableX *tt ) {

		// get the content
		int32_t len;
		char *s = m_xml.getString ( i , "content" , &len );
		const char *s = m_xml.getString ( i , "content" , &len );
		if ( ! s || len <= 0 ) continue;
		// . ensure not too big for our buffer (keep room for a \0)
		// . TODO: this is wrong, should be len+1 > bufLen,
		//   but can't fix w/o resetting the index (COME BACK HERE
		//   and see where we index meta tags besides this place!!!)
		//   remove those other places, except... what about keywords
		//   and description?
		if ( len+1 >= bufLen ) {
			//len = bufLen - 1;
			// assume no punct to break on!
			len = 0;
			// only cut off at punctuation
			char *p = s;
			char *pend = s + len;
			char *last = NULL;
			int32_t size ;
			for ( ; p < pend ; p += size ) {
				// get the utf8 char size
				size = getUtf8CharSize(*p);
				// skip if 2+ bytes
				if ( size > 1 ) continue;
				// skip if not punct
				if ( is_alnum_a(*p) ) continue;
				// mark it
				last = p;
			}
			if ( last ) len = last - s;
			// this old way was faster...:
			//while ( len > 0 && is_alnum(s[len-1]) ) len--;
		}
		// convert html entities to their chars
		len = saftenTags ( buf , bufLen , s , len );
		// NULL terminate the buffer
		buf[len] = '\0';

		StackBuf<1024> doubleDecodedContent;
		possiblyDecodeHtmlEntitiesAgain(&s, &len, &doubleDecodedContent, true);

		// Now index the wanted meta tags as normal text without prefix so they
		// are used in user searches automatically.
		hi.m_prefix = NULL;

		// desc is NULL, prefix will be used as desc
		bool status = hashString ( buf,len,&hi );
		bool status = hashString ( s,len, &hi );

		// bail on error, g_errno should be set
		if ( ! status ) return false;
@@ -1377,11 +1372,22 @@ bool XmlDoc::hashTitle ( HashTableX *tt ) {

	//FIXME: assumption: title tokens are the phase-1 tokens and the tokens are in contiguous memory
	//FIXME: also grab the alternative tokens from phase 2 in the title part
	if ( ! hashString(a, i, &hi) ) return false;

	//clean indexing:
	//	if ( ! hashString(a, i, &hi) ) return false;
	//but due to bad webmasters:

	const char *title = m_tokenizerResult[a].token_start;
	const char *titleEnd = m_tokenizerResult[i].token_end();
	int32_t titleLen = titleEnd - title;
	StackBuf<1024> doubleDecodedContent;
	possiblyDecodeHtmlEntitiesAgain(&title, &titleLen, &doubleDecodedContent, false);

	if ( ! hashString(title, titleLen, &hi)) return false;

	// now hash without the title: prefix
	hi.m_prefix = NULL;
	if ( ! hashString(a, i, &hi) ) return false;
	if ( ! hashString(title, titleLen, &hi)) return false;

	return true;
}
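The hashTitle() rework above swaps the token-span hash for a double-decoded byte-range hash. A hypothetical illustration of what the extra decode pass buys at indexing time; the title is invented and the crude alnum-run tokenizer below is not GB's actual tokenizer:

#include <cstdio>
#include <cctype>

// print the alnum runs a naive tokenizer would index from a title
static void printTokens(const char *s) {
	for (const char *p = s; *p; ) {
		if (isalnum((unsigned char)*p)) {
			const char *start = p;
			while (*p && isalnum((unsigned char)*p)) p++;
			printf("[%.*s] ", (int)(p - start), start);
		} else {
			p++;
		}
	}
	printf("\n");
}

int main() {
	printTokens("Fish &amp; Chips"); // decoded only once: a spurious [amp] token gets indexed
	printTokens("Fish & Chips");     // decoded twice: just [Fish] [Chips]
	return 0;
}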
@@ -1486,11 +1492,14 @@ bool XmlDoc::hashMetaSummary ( HashTableX *tt ) {

	setStatus ( "hashing meta summary" );

	StackBuf<1024> doubleDecodedContent;

	// hash the meta summary tag
	//char buf [ 2048 + 2 ];
	//int32_t len = m_xml.getMetaContent ( buf , 2048 , "summary" , 7 );
	int32_t mslen;
	const char *ms = getMetaSummary ( &mslen );
	possiblyDecodeHtmlEntitiesAgain(&ms, &mslen, &doubleDecodedContent, true);

	// update hash parms
	HashInfo hi;
@@ -1505,7 +1514,8 @@ bool XmlDoc::hashMetaSummary ( HashTableX *tt ) {

	//len = m_xml.getMetaContent ( buf , 2048 , "description" , 11 );
	int32_t mdlen;
	char *md = getMetaDescription ( &mdlen );
	const char *md = getMetaDescription ( &mdlen );
	possiblyDecodeHtmlEntitiesAgain(&md, &mdlen, &doubleDecodedContent, true);

	// update hashing parms
	hi.m_desc = "meta desc";