2013-08-02 13:12:24 -07:00
# include "gb-include.h"
2015-12-01 12:38:51 +01:00
# include "Query.h"
2013-08-02 13:12:24 -07:00
# include "Title.h"
# include "Words.h"
# include "Sections.h"
# include "Pops.h"
# include "Pos.h"
2016-02-18 14:45:48 +01:00
# include "Matches.h"
2013-08-02 13:12:24 -07:00
# include "HashTable.h"
2016-02-18 14:45:48 +01:00
# include "HttpMime.h"
# include "Linkdb.h"
2016-06-20 12:30:26 +02:00
# include "Process.h"
2016-03-08 11:10:50 +01:00
# ifdef _VALGRIND_
# include <valgrind/memcheck.h>
# endif
2013-08-02 13:12:24 -07:00
// test urls
// http://www.thehindu.com/2009/01/05/stories/2009010555661000.htm
// http://xbox360.ign.com/objects/142/14260912.html
// http://www.scmp.com/portal/site/SCMP/menuitem.2c913216495213d5df646910cba0a0a0?vgnextoid=edeb63a0191ae110VgnVCM100000360a0a0aRCRD&vgnextfmt=teaser&ss=Markets&s=Business
// http://www.legacy.com/shelbystar/Obituaries.asp?Page=LifeStory&PersonId=122245831
// http://web.me.com/bluestocking_bb/The_Bluestocking_Guide/Book_Reviews/Entries/2009/1/6_Hamlet.html
// http://larvatusprodeo.net/2009/01/07/partisanship-politics-and-participation/
// http://content-uk.cricinfo.com/ausvrsa2008_09/engine/current/match/351682.html
// www4.gsb.columbia.edu/cbs-directory/detail/6335554/Schoenberg
// http://www.washingtonpost.com/wp-dyn/content/article/2008/10/29/AR2008102901960.html
// http://www.w3.org/2008/12/wcag20-pressrelease.html
// http://www.usnews.com/articles/business/best-careers/2008/12/11/best-careers-2009-librarian.html
// http://www.verysmartbrothas.com/2008/12/09/
// http://www.slashgear.com/new-palm-nova-handset-to-have-touchscreen-and-qwerty-keyboard-0428710/
// still bad
// http://66.231.188.171:8500/search?k3j=668866&c=main&n=20&ldays=1&q=url%3Ahttp%3A%2F%2Fmichellemalkin.com%2F2008%2F12%2F29%2Fgag-worthy%2F selects
// "gag-worthy" instead of
// "Gag-worthy: Bipartisan indignance over .Barack the Magic Negro. parody"
// http://www.1800pocketpc.com/2009/01/09/web-video-downloader-00160-download-videos-from-youtube-on-your-pocket-pc.html : need to fix the numbers in the
// path somehow so similarity is higher
// Construct a Title in the "no title yet" state. reset() writes exactly the
// members this constructor has to initialise (empty title text, zero length,
// no <title> tag word range), so delegate to it rather than repeat the list.
Title::Title() {
	reset();
}
// The destructor performs no work of its own.
Title::~Title() {}
void Title : : reset ( ) {
2016-01-15 15:52:17 +01:00
m_title [ 0 ] = ' \0 ' ;
m_titleLen = 0 ;
2013-08-02 13:12:24 -07:00
m_titleTagStart = - 1 ;
m_titleTagEnd = - 1 ;
}
2016-02-25 13:43:37 +01:00
bool Title : : setTitleFromTags ( Xml * xml , int32_t maxTitleLen , uint8_t contentType ) {
2016-06-09 14:51:52 +02:00
/// @todo cater for CT_DOC (when antiword is replaced)
// only allow html & pdf documents for now
if ( contentType ! = CT_HTML & & contentType ! = CT_PDF ) {
2016-02-25 13:43:37 +01:00
return false ;
}
2016-01-20 13:32:13 +01:00
/// @todo ALC configurable minTitleLen so we can tweak this as needed
const int minTitleLen = 3 ;
2016-01-15 15:52:17 +01:00
// meta property = "og:title"
2016-06-09 14:51:52 +02:00
if ( contentType = = CT_HTML & &
xml - > getTagContent ( " property " , " og:title " , m_title , MAX_TITLE_LEN , minTitleLen , maxTitleLen , & m_titleLen , true , TAG_META ) ) {
logDebug ( g_conf . m_logDebugTitle , " title: generated from meta property og:title. title='%.*s' " , m_titleLen , m_title ) ;
2016-01-15 15:52:17 +01:00
return true ;
}
// meta name = "title"
2016-06-09 14:51:52 +02:00
if ( contentType = = CT_HTML & &
xml - > getTagContent ( " name " , " title " , m_title , MAX_TITLE_LEN , minTitleLen , maxTitleLen , & m_titleLen , true , TAG_META ) ) {
logDebug ( g_conf . m_logDebugTitle , " title: generated from meta property title. title='%.*s' " , m_titleLen , m_title ) ;
2016-01-15 15:52:17 +01:00
return true ;
}
// title
if ( xml - > getTagContent ( " " , " " , m_title , MAX_TITLE_LEN , minTitleLen , maxTitleLen , & m_titleLen , true , TAG_TITLE ) ) {
2016-06-09 14:51:52 +02:00
if ( contentType = = CT_PDF ) {
// when using pdftohtml, the title tag is the filename when PDF property does not have title tag
const char * result = strnstr ( m_title , " /in. " , m_titleLen ) ;
if ( result ! = NULL ) {
char * endp = NULL ;
// do some further verification to avoid screwing up title
if ( ( strtoll ( result + 4 , & endp , 10 ) > 0 ) & & ( endp = = m_title + m_titleLen ) ) {
m_title [ 0 ] = ' \0 ' ;
m_titleLen = 0 ;
return false ;
}
}
2016-01-15 15:52:17 +01:00
}
2016-06-09 14:51:52 +02:00
logDebug ( g_conf . m_logDebugTitle , " title: generated from title tag. title='%.*s' " , m_titleLen , m_title ) ;
2016-01-15 15:52:17 +01:00
return true ;
}
2016-06-09 14:51:52 +02:00
logDebug ( g_conf . m_logDebugTitle , " title: unable to generate title from meta/title tags " ) ;
2016-01-20 15:57:11 +01:00
2016-01-15 15:52:17 +01:00
return false ;
}
2013-08-02 13:12:24 -07:00
// Types of title candidates. Each value records where a candidate came from;
// setTitle() stores one per candidate in its types[] array.
// Note: the value 8 is not assigned to any type.

// link text of an inlink whose host matches the document's host
# define TT_LINKTEXTLOCAL 1
// link text of an inlink from a different host
# define TT_LINKTEXTREMOTE 2
// title of an RSS item from an inlink on the same host
# define TT_RSSITEMLOCAL 3
// title of an RSS item from an inlink on a different host
# define TT_RSSITEMREMOTE 4
// text found inside a <b> tag
# define TT_BOLDTAG 5
// text found inside an <h1>, <h2> or <h3> tag
# define TT_HTAG 6
// text found inside the <title> tag
# define TT_TITLETAG 7
// first line of a text document (content type unknown or plain text)
# define TT_FIRSTLINE 9
// text found inside a <div> tag
# define TT_DIVTAG 10
// text found inside a <font> tag
# define TT_FONTTAG 11
// text found inside an <a> tag
# define TT_ATAG 12
// text found inside a <td> tag
# define TT_TDTAG 13
// text found inside a <p> tag
# define TT_PTAG 14
// last url path component that contains an underscore or hyphen
# define TT_URLPATH 15
// "title" attribute of an <a> tag that links to the document itself
# define TT_TITLEATT 16
// capacity of the per-candidate arrays (as[], bs[], scores[], types[], ...)
// that setTitle() keeps on its stack
# define MAX_TIT_CANDIDATES 100
// does word qualify as a subtitle delimeter?
2014-11-10 14:45:11 -08:00
bool isWordQualified ( char * wp , int32_t wlen ) {
2013-08-02 13:12:24 -07:00
// must be punct word
2016-02-18 14:45:48 +01:00
if ( is_alnum_utf8 ( wp ) ) {
return false ;
}
2013-08-02 13:12:24 -07:00
// scan the chars
2016-02-18 14:45:48 +01:00
int32_t x ;
for ( x = 0 ; x < wlen ; x + + ) {
if ( wp [ x ] = = ' ' ) {
continue ;
}
2013-08-02 13:12:24 -07:00
break ;
}
2016-02-18 14:45:48 +01:00
2013-08-02 13:12:24 -07:00
// does it qualify as a subtitle delimeter?
bool qualified = false ;
2016-02-18 14:45:48 +01:00
if ( x < wlen ) {
qualified = true ;
}
2013-08-02 13:12:24 -07:00
// fix amazon.com from splitting on period
2016-02-18 14:45:48 +01:00
if ( wlen = = 1 ) {
qualified = false ;
}
2013-08-02 13:12:24 -07:00
return qualified ;
}
2016-02-25 13:43:37 +01:00
// returns false and sets g_errno on error
bool Title : : setTitle ( Xml * xml , Words * words , int32_t maxTitleLen , Query * query ,
2016-03-08 11:10:50 +01:00
LinkInfo * linkInfo , Url * firstUrl , const char * filteredRootTitleBuf , int32_t filteredRootTitleBufSize ,
2016-02-25 13:43:37 +01:00
uint8_t contentType , uint8_t langId , int32_t niceness ) {
// make Msg20.cpp faster: if it just has
// Msg20Request::m_setForLinkInfo set to true, there is no need to extract a title.
if ( maxTitleLen < = 0 ) {
return true ;
}
m_niceness = niceness ;
m_maxTitleLen = maxTitleLen ;
// if this is too big the "first line" algo can be huge!!!
// and really slow everything way down with a huge title candidate
int32_t maxTitleWords = 128 ;
2013-08-02 13:12:24 -07:00
// assume no title
reset ( ) ;
2016-02-25 13:43:37 +01:00
int32_t NW = words - > getNumWords ( ) ;
2013-08-02 13:12:24 -07:00
//
// now get all the candidates
//
// . allow up to 100 title CANDIDATES
// . "as" is the word # of the first word in the candidate
// . "bs" is the word # of the last word IN the candidate PLUS ONE
2015-12-01 12:38:51 +01:00
int32_t n = 0 ;
int32_t as [ MAX_TIT_CANDIDATES ] ;
int32_t bs [ MAX_TIT_CANDIDATES ] ;
float scores [ MAX_TIT_CANDIDATES ] ;
Words * cptrs [ MAX_TIT_CANDIDATES ] ;
int32_t types [ MAX_TIT_CANDIDATES ] ;
int32_t parent [ MAX_TIT_CANDIDATES ] ;
2015-11-25 16:51:27 +01:00
2013-08-02 13:12:24 -07:00
// record the scoring algos effects
2016-01-11 15:46:09 +01:00
float baseScore [ MAX_TIT_CANDIDATES ] ;
float noCapsBoost [ MAX_TIT_CANDIDATES ] ;
float qtermsBoost [ MAX_TIT_CANDIDATES ] ;
float inCommonCandBoost [ MAX_TIT_CANDIDATES ] ;
2015-11-25 16:51:27 +01:00
2013-08-02 13:12:24 -07:00
// reset these
2015-11-26 14:48:58 +01:00
for ( int32_t i = 0 ; i < MAX_TIT_CANDIDATES ; i + + ) {
2013-08-02 13:12:24 -07:00
// assume no parent
parent [ i ] = - 1 ;
2015-11-26 14:48:58 +01:00
}
2013-08-02 13:12:24 -07:00
// xml and words class for each link info, rss item
Xml tx [ MAX_TIT_CANDIDATES ] ;
Words tw [ MAX_TIT_CANDIDATES ] ;
2014-11-10 14:45:11 -08:00
int32_t ti = 0 ;
2013-08-02 13:12:24 -07:00
// restrict how many link texts and rss blobs we check for titles
// because title recs like www.google.com have hundreds and can
// really slow things down to like 50ms for title generation
2014-11-10 14:45:11 -08:00
int32_t kcount = 0 ;
int32_t rcount = 0 ;
2013-08-02 13:12:24 -07:00
2014-10-30 13:36:39 -06:00
//int64_t x = gettimeofdayInMilliseconds();
2013-08-02 13:12:24 -07:00
// . get every link text
// . TODO: repeat for linkInfo2, the imported link text
2016-02-25 13:43:37 +01:00
for ( Inlink * k = NULL ; linkInfo & & ( k = linkInfo - > getNextInlink ( k ) ) ; ) {
2013-08-02 13:12:24 -07:00
// breathe
QUICKPOLL ( m_niceness ) ;
// fast skip check for link text
if ( k - > size_linkText > = 3 & & + + kcount > = 20 ) continue ;
// fast skip check for rss item
if ( k - > size_rssItem > 10 & & + + rcount > = 20 ) continue ;
2016-02-18 14:45:48 +01:00
2013-08-02 13:12:24 -07:00
// set Url
Url u ;
2016-04-04 23:31:07 +02:00
u . set ( k - > getUrl ( ) , k - > size_urlBuf ) ;
2016-02-18 14:45:48 +01:00
2013-08-02 13:12:24 -07:00
// is it the same host as us?
bool sh = true ;
2016-02-18 14:45:48 +01:00
2013-08-02 13:12:24 -07:00
// skip if not from same host and should be
2016-02-18 14:45:48 +01:00
if ( firstUrl - > getHostLen ( ) ! = u . getHostLen ( ) ) {
sh = false ;
}
2013-08-02 13:12:24 -07:00
// skip if not from same host and should be
2016-02-18 14:45:48 +01:00
if ( strncmp ( firstUrl - > getHost ( ) , u . getHost ( ) , u . getHostLen ( ) ) ) {
sh = false ;
}
2013-08-02 13:12:24 -07:00
// get the link text
if ( k - > size_linkText > = 3 ) {
2014-11-17 18:13:36 -08:00
char * p = k - > getLinkText ( ) ;
2014-11-10 14:45:11 -08:00
int32_t plen = k - > size_linkText - 1 ;
2013-08-02 13:12:24 -07:00
if ( ! verifyUtf8 ( p , plen ) ) {
2015-12-01 12:38:51 +01:00
log ( " title: set4 bad link text from url=%s " , k - > getUrl ( ) ) ;
2013-08-02 13:12:24 -07:00
continue ;
}
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// now the words.
2016-01-11 15:46:09 +01:00
if ( ! tw [ ti ] . set ( k - > getLinkText ( ) , k - > size_linkText - 1 , true , 0 ) ) {
2013-08-02 13:12:24 -07:00
return false ;
2016-01-11 15:46:09 +01:00
}
2013-08-02 13:12:24 -07:00
// set the bookends, it is the whole thing
cptrs [ n ] = & tw [ ti ] ;
as [ n ] = 0 ;
bs [ n ] = tw [ ti ] . getNumWords ( ) ;
// score higher if same host
if ( sh ) scores [ n ] = 1.05 ;
// do not count so high if remote!
else scores [ n ] = 0.80 ;
// set the type
if ( sh ) types [ n ] = TT_LINKTEXTLOCAL ;
else types [ n ] = TT_LINKTEXTREMOTE ;
// another candidate
n + + ;
// use xml and words
ti + + ;
// break out if too many already. save some for below.
if ( n + 30 > = MAX_TIT_CANDIDATES ) break ;
}
// get the rss item
if ( k - > size_rssItem < = 10 ) continue ;
// . returns false and sets g_errno on error
// . use a 0 for niceness
if ( ! k - > setXmlFromRSS ( & tx [ ti ] , 0 ) ) return false ;
// get the word range
2014-11-10 14:45:11 -08:00
int32_t tslen ;
2013-08-02 13:12:24 -07:00
bool isHtmlEnc ;
char * ts = tx [ ti ] . getRSSTitle ( & tslen , & isHtmlEnc ) ;
// skip if not in the rss
if ( ! ts ) continue ;
// skip if empty
if ( tslen < = 0 ) continue ;
// now set words to that
2016-01-11 15:46:09 +01:00
if ( ! tw [ ti ] . set ( ts , tslen , true , 0 ) ) {
2013-08-02 13:12:24 -07:00
return false ;
2016-01-11 15:46:09 +01:00
}
2013-08-02 13:12:24 -07:00
// point to that
cptrs [ n ] = & tw [ ti ] ;
as [ n ] = 0 ;
bs [ n ] = tw [ ti ] . getNumWords ( ) ;
// increment since we are using it
ti + + ;
// base score for rss title
if ( sh ) scores [ n ] = 5.0 ;
// if not same host, treat like link text
else scores [ n ] = 2.0 ;
// set the type
if ( sh ) types [ n ] = TT_RSSITEMLOCAL ;
else types [ n ] = TT_RSSITEMREMOTE ;
// advance
n + + ;
// break out if too many already. save some for below.
if ( n + 30 > = MAX_TIT_CANDIDATES ) break ;
}
2016-05-20 09:18:32 +02:00
//logf(LOG_DEBUG,"title: took1=%" PRId64,gettimeofdayInMilliseconds()-x);
2013-08-02 13:12:24 -07:00
//x = gettimeofdayInMilliseconds();
// . set the flags array
// . indicates what words are in title candidates already, but
// that is set below
// . up here we set words that are not allowed to be in candidates,
// like words that are in a link that is not a self link
// . alloc for it
char * flags = NULL ;
2015-12-01 12:38:51 +01:00
char localBuf [ 10000 ] ;
2016-02-25 13:43:37 +01:00
int32_t need = words - > getNumWords ( ) ;
2015-12-01 12:38:51 +01:00
if ( need < = 10000 ) {
flags = ( char * ) localBuf ;
} else {
flags = ( char * ) mmalloc ( need , " TITLEflags " ) ;
}
if ( ! flags ) {
return false ;
}
2013-08-02 13:12:24 -07:00
// clear it
memset ( flags , 0 , need ) ;
// check tags in body
2016-02-25 13:43:37 +01:00
nodeid_t * tids = words - > getTagIds ( ) ;
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// scan to set link text flags
// loop over all "words" in the html body
char inLink = false ;
char selfLink = false ;
2014-11-10 14:45:11 -08:00
for ( int32_t i = 0 ; i < NW ; i + + ) {
2013-08-02 13:12:24 -07:00
// breathe
QUICKPOLL ( m_niceness ) ;
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// if in a link that is not self link, cannot be in a candidate
2015-12-01 12:38:51 +01:00
if ( inLink & & ! selfLink ) {
flags [ i ] | = 0x02 ;
}
2013-08-02 13:12:24 -07:00
// out of a link
2015-12-01 12:38:51 +01:00
if ( tids [ i ] = = ( TAG_A | BACKBIT ) ) {
inLink = false ;
}
2013-08-02 13:12:24 -07:00
// if not start of <a> tag, skip it
2015-12-01 12:38:51 +01:00
if ( tids [ i ] ! = TAG_A ) {
continue ;
}
2013-08-02 13:12:24 -07:00
// flag it
inLink = true ;
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// get the node in the xml
2016-05-23 16:39:52 +02:00
int32_t xn = words - > getNodes ( ) [ i ] ;
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// is it a self link?
2014-11-10 14:45:11 -08:00
int32_t len ;
2016-02-25 13:43:37 +01:00
char * link = xml - > getString ( xn , " href " , & len ) ;
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// . set the url class to this
// . TODO: use the base url in the doc
2015-12-01 12:38:51 +01:00
Url u ;
2016-04-18 17:30:26 +02:00
u . set ( link , len , true , false ) ;
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// compare
2016-02-18 14:45:48 +01:00
selfLink = u . equals ( firstUrl ) ;
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// skip if not selfLink
2015-12-01 12:38:51 +01:00
if ( ! selfLink ) {
continue ;
}
2016-01-13 11:52:10 +01:00
// if it is a selflink , check for an "onClick" tag in the
2013-08-02 13:12:24 -07:00
// anchor tag to fix that Mixx issue for:
// http://www.npr.org/templates/story/story.php?storyId=5417137
2015-12-01 12:38:51 +01:00
2014-11-10 14:45:11 -08:00
int32_t oclen ;
2016-02-25 13:43:37 +01:00
char * oc = xml - > getString ( xn , " onclick " , & oclen ) ;
2015-12-01 12:38:51 +01:00
if ( ! oc ) {
2016-02-25 13:43:37 +01:00
oc = xml - > getString ( xn , " onClick " , & oclen ) ;
2015-12-01 12:38:51 +01:00
}
2013-08-02 13:12:24 -07:00
// assume not a self link if we see that...
2015-12-01 12:38:51 +01:00
if ( oc ) {
selfLink = false ;
}
2013-08-02 13:12:24 -07:00
// if this <a href> link has a "title" attribute, use that
// instead! that thing is solid gold.
2014-11-10 14:45:11 -08:00
int32_t atlen ;
2016-02-25 13:43:37 +01:00
char * atitle = xml - > getString ( xn , " title " , & atlen ) ;
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// stop and use that, this thing is gold!
2015-12-01 12:38:51 +01:00
if ( ! atitle | | atlen < = 0 ) {
continue ;
}
2013-08-02 13:12:24 -07:00
// craziness? ignore it...
2015-12-01 12:38:51 +01:00
if ( atlen > 400 ) {
continue ;
}
2016-01-13 11:52:10 +01:00
// if it contains permanent, permalink or share, ignore it!
if ( strncasestr ( atitle , " permalink " , atlen ) | |
strncasestr ( atitle , " permanent " , atlen ) | |
strncasestr ( atitle , " share " , atlen ) ) {
2015-12-01 12:38:51 +01:00
continue ;
}
2013-08-02 13:12:24 -07:00
// do not count the link text as viable
selfLink = false ;
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// aw, dammit
2015-12-01 12:38:51 +01:00
if ( ti > = MAX_TIT_CANDIDATES ) {
continue ;
}
2013-08-02 13:12:24 -07:00
// other dammit
2015-12-01 12:38:51 +01:00
if ( n > = MAX_TIT_CANDIDATES ) {
break ;
}
2013-08-02 13:12:24 -07:00
// ok, process it
2015-12-01 12:38:51 +01:00
if ( ! tw [ ti ] . set ( atitle , atlen , true , 0 ) ) {
2013-08-02 13:12:24 -07:00
return false ;
2015-12-01 12:38:51 +01:00
}
2013-08-02 13:12:24 -07:00
// set the bookends, it is the whole thing
cptrs [ n ] = & tw [ ti ] ;
as [ n ] = 0 ;
bs [ n ] = tw [ ti ] . getNumWords ( ) ;
scores [ n ] = 3.0 ; // not ALWAYS solid gold!
types [ n ] = TT_TITLEATT ;
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// we are using the words class
ti + + ;
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// advance
n + + ;
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// break out if too many already. save some for below.
2015-12-01 12:38:51 +01:00
if ( n + 20 > = MAX_TIT_CANDIDATES ) {
break ;
}
2013-08-02 13:12:24 -07:00
}
2016-05-20 09:18:32 +02:00
//logf(LOG_DEBUG,"title: took2=%" PRId64,gettimeofdayInMilliseconds()-x);
2013-08-02 13:12:24 -07:00
//x = gettimeofdayInMilliseconds();
2014-10-30 13:36:39 -06:00
//int64_t *wids = WW->getWordIds();
2013-08-02 13:12:24 -07:00
// . find the last positive scoring guy
// . do not consider title candidates after "r" if "r" is non-zero
2015-12-01 12:38:51 +01:00
// . FIXES http://larvatusprodeo.net/2009/01/07/partisanship-politics-and-participation/
2013-08-02 13:12:24 -07:00
// the candidate # of the title tag
2014-11-10 14:45:11 -08:00
int32_t tti = - 1 ;
2013-08-02 13:12:24 -07:00
// allow up to 4 tags from each type
char table [ 512 ] ;
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// sanity check
2016-06-20 12:30:26 +02:00
if ( getNumXmlNodes ( ) > 512 ) { g_process . shutdownAbort ( true ) ; }
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// clear table counts
memset ( table , 0 , 512 ) ;
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// the first word
2015-12-01 12:38:51 +01:00
char * wstart = NULL ;
if ( NW > 0 ) {
2016-02-25 13:43:37 +01:00
wstart = words - > getWord ( 0 ) ;
2015-12-01 12:38:51 +01:00
}
2013-08-02 13:12:24 -07:00
// loop over all "words" in the html body
2014-11-10 14:45:11 -08:00
for ( int32_t i = 0 ; i < NW ; i + + ) {
2013-08-02 13:12:24 -07:00
// come back up here if we encounter another "title-ish" tag
// within our first alleged "title-ish" tag
subloop :
2015-12-01 12:38:51 +01:00
// stop after 200000 bytes of text
2016-02-25 13:43:37 +01:00
if ( words - > getWord ( i ) - wstart > 200000 ) {
2015-12-01 12:38:51 +01:00
break ; // 1106
}
2013-08-02 13:12:24 -07:00
// get the tag id minus the back tag bit
nodeid_t tid = tids [ i ] & BACKBITCOMP ;
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// pen up and pen down for these comment like tags
if ( tid = = TAG_SCRIPT | | tid = = TAG_STYLE ) {
2015-12-01 12:38:51 +01:00
// ignore "titles" in script or style tags
if ( ! ( tids [ i ] & BACKBIT ) ) {
continue ;
}
2013-08-02 13:12:24 -07:00
}
2015-12-01 12:38:51 +01:00
2016-06-09 14:51:52 +02:00
/// @todo ALC we should allow more tags than just link
// skip if not a good tag. we're already checking for title tag in Title::setTitleFromTags
if ( tid ! = TAG_A ) {
2015-12-01 12:38:51 +01:00
continue ;
}
2013-08-02 13:12:24 -07:00
// must NOT be a back tag
2015-12-01 12:38:51 +01:00
if ( tids [ i ] & BACKBIT ) {
continue ;
}
2013-08-02 13:12:24 -07:00
// skip if we hit our limit
2015-12-01 12:38:51 +01:00
if ( table [ tid ] > = 4 ) {
2013-08-02 13:12:24 -07:00
continue ;
2015-12-01 12:38:51 +01:00
}
2013-08-02 13:12:24 -07:00
// skip over tag/word #i
i + + ;
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// no words in links, unless it is a self link
2015-12-01 12:38:51 +01:00
if ( i < NW & & ( flags [ i ] & 0x02 ) ) {
continue ;
}
2013-08-02 13:12:24 -07:00
// the start should be here
2014-11-10 14:45:11 -08:00
int32_t start = - 1 ;
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// do not go too far
2014-11-10 14:45:11 -08:00
int32_t max = i + 200 ;
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// find the corresponding back tag for it
for ( ; i < NW & & i < max ; i + + ) {
// hey we got it, BUT we got no alnum word first
// so the thing was empty, so loop back to subloop
if ( ( tids [ i ] & BACKBITCOMP ) = = tid & &
( tids [ i ] & BACKBIT ) & &
2015-12-01 12:38:51 +01:00
start = = - 1 ) {
2013-08-02 13:12:24 -07:00
goto subloop ;
2015-12-01 12:38:51 +01:00
}
2013-08-02 13:12:24 -07:00
// if we hit another title-ish tag, loop back up
2015-12-01 12:38:51 +01:00
if ( ( tids [ i ] & BACKBITCOMP ) = = TAG_TITLE | | ( tids [ i ] & BACKBITCOMP ) = = TAG_A ) {
2013-08-02 13:12:24 -07:00
// if no alnum text, restart at the top
2015-12-01 12:38:51 +01:00
if ( start = = - 1 ) {
2013-08-02 13:12:24 -07:00
goto subloop ;
2015-12-01 12:38:51 +01:00
}
2013-08-02 13:12:24 -07:00
// otherwise, break out and see if title works
break ;
}
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// if we hit a breaking tag...
if ( isBreakingTagId ( tids [ i ] & BACKBITCOMP ) & &
// do not consider <span> tags breaking for
// our purposes. i saw a <h1><span> setup before.
2015-12-01 12:38:51 +01:00
tids [ i ] ! = TAG_SPAN ) {
2013-08-02 13:12:24 -07:00
break ;
2015-12-01 12:38:51 +01:00
}
2013-08-02 13:12:24 -07:00
// skip if not alnum word
2016-02-25 13:43:37 +01:00
if ( ! words - > isAlnum ( i ) ) {
2015-12-01 12:38:51 +01:00
continue ;
}
2013-08-02 13:12:24 -07:00
// if we hit an alnum word, break out
2015-12-01 12:38:51 +01:00
if ( start = = - 1 ) {
start = i ;
}
2013-08-02 13:12:24 -07:00
}
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// if no start was found, must have had a 0 score in there
2015-12-01 12:38:51 +01:00
if ( start = = - 1 ) {
continue ;
}
2013-08-02 13:12:24 -07:00
// if we exhausted the doc, we are done
2015-12-01 12:38:51 +01:00
if ( i > = NW ) {
2013-08-02 13:12:24 -07:00
break ;
2015-12-01 12:38:51 +01:00
}
2013-08-02 13:12:24 -07:00
// skip if way too big!
2015-12-01 12:38:51 +01:00
if ( i > = max ) {
continue ;
}
2014-11-17 18:24:38 -08:00
// if was too long do not consider a title
2015-12-01 12:38:51 +01:00
if ( i - start > 300 ) {
continue ;
}
2013-08-02 13:12:24 -07:00
// . skip if too many bytes
// . this does not include the length of word #i, but #(i-1)
2016-02-25 13:43:37 +01:00
if ( words - > getStringSize ( start , i ) > 1000 ) {
2015-12-01 12:38:51 +01:00
continue ;
}
2013-08-02 13:12:24 -07:00
// count it
table [ tid ] + + ;
2015-11-25 16:51:27 +01:00
2013-08-02 13:12:24 -07:00
// max it out if we are positive scoring. stop after the
// first positive scoring guy in a section. this might
// hurt the "Hamlet" thing though...
2015-11-25 16:51:27 +01:00
2013-08-02 13:12:24 -07:00
// store a point to the title tag guy. Msg20.cpp needs this
// because the zak's proximity algo uses it in Summary.cpp
// and in Msg20.cpp
2015-12-01 12:38:51 +01:00
// only get the first one! often the 2nd on is in an iframe!! which we now expand into here.
if ( tid = = TAG_TITLE & & m_titleTagStart = = - 1 ) {
2013-08-02 13:12:24 -07:00
m_titleTagStart = start ;
m_titleTagEnd = i ;
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// save the candidate # because we always use this
// as the title if we are a root
2015-12-01 12:38:51 +01:00
if ( tti < 0 ) {
tti = n ;
}
2013-08-02 13:12:24 -07:00
}
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// point to words class of the body that was passed in to us
2016-02-25 13:43:37 +01:00
cptrs [ n ] = words ;
2015-12-01 12:38:51 +01:00
as [ n ] = start ;
bs [ n ] = i ;
if ( tid = = TAG_B ) {
types [ n ] = TT_BOLDTAG ;
scores [ n ] = 1.0 ;
} else if ( tid = = TAG_H1 ) {
types [ n ] = TT_HTAG ;
scores [ n ] = 1.8 ;
} else if ( tid = = TAG_H2 ) {
types [ n ] = TT_HTAG ;
scores [ n ] = 1.7 ;
} else if ( tid = = TAG_H3 ) {
types [ n ] = TT_HTAG ;
scores [ n ] = 1.6 ;
} else if ( tid = = TAG_TITLE ) {
types [ n ] = TT_TITLETAG ;
scores [ n ] = 3.0 ;
} else if ( tid = = TAG_DIV ) {
types [ n ] = TT_DIVTAG ;
scores [ n ] = 1.0 ;
} else if ( tid = = TAG_TD ) {
types [ n ] = TT_TDTAG ;
scores [ n ] = 1.0 ;
} else if ( tid = = TAG_P ) {
types [ n ] = TT_PTAG ;
scores [ n ] = 1.0 ;
} else if ( tid = = TAG_FONT ) {
types [ n ] = TT_FONTTAG ;
scores [ n ] = 1.0 ;
} else if ( tid = = TAG_A ) {
types [ n ] = TT_ATAG ;
// . self link is very powerful BUT
// http://www.npr.org/templates/story/story.php?storyId=5417137
// doesn't use it right! so use
// 1.3 instead of 3.0. that has an "onClick" thing in the
// <a> tag, so check for that!
// this was bad for
// http://www.spiritualwoman.net/?cat=191
// so i am demoting from 3.0 to 1.5
scores [ n ] = 1.5 ;
}
2015-11-25 16:51:27 +01:00
2013-08-02 13:12:24 -07:00
// count it
n + + ;
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// start loop over at tag #i, for loop does an i++, so negate
// that so this will work
i - - ;
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// break out if too many already. save some for below.
2015-12-01 12:38:51 +01:00
if ( n + 10 > = MAX_TIT_CANDIDATES ) {
break ;
}
2013-08-02 13:12:24 -07:00
}
2016-05-20 09:18:32 +02:00
//logf(LOG_DEBUG,"title: took3=%" PRId64,gettimeofdayInMilliseconds()-x);
2013-08-02 13:12:24 -07:00
//x = gettimeofdayInMilliseconds();
// to handle text documents, throw in the first line of text
// as a title candidate, just make the score really low
2016-02-18 14:45:48 +01:00
bool textDoc = ( contentType = = CT_UNKNOWN | | contentType = = CT_TEXT ) ;
2015-12-01 12:38:51 +01:00
if ( textDoc ) {
// make "i" point to first alphabetical word in the document
int32_t i ;
2016-02-25 13:43:37 +01:00
for ( i = 0 ; i < NW & & ! words - > isAlpha ( i ) ; i + + ) ;
2015-12-01 12:38:51 +01:00
// if we got a first alphabetical word, then assume that to be the start of our title
if ( i < NW & & n < MAX_TIT_CANDIDATES ) {
// first word in title is "t0"
int32_t t0 = i ;
// find end of first line
int32_t numWords = 0 ;
// set i to the end now. we MUST find a \n to terminate the
// title, otherwise we will not have a valid title
2016-02-25 13:43:37 +01:00
while ( i < NW & & numWords < maxTitleWords & & ( words - > isAlnum ( i ) | | ! words - > hasChar ( i , ' \n ' ) ) ) {
if ( words - > isAlnum ( i ) ) {
2015-12-01 12:38:51 +01:00
numWords + + ;
}
+ + i ;
}
// "t1" is the end
int32_t t1 = - 1 ;
// we must have found our \n in order to set "t1"
if ( i < = NW & & numWords < maxTitleWords ) {
t1 = i ;
}
// set the ptrs
2016-02-25 13:43:37 +01:00
cptrs [ n ] = words ;
2015-12-01 12:38:51 +01:00
// this is the last resort i guess...
scores [ n ] = 0.5 ;
types [ n ] = TT_FIRSTLINE ;
as [ n ] = t0 ;
bs [ n ] = t1 ;
// add it as a candidate if t0 and t1 were valid
if ( t0 > = 0 & & t1 > t0 ) {
n + + ;
}
2013-08-02 13:12:24 -07:00
}
}
2016-05-20 09:18:32 +02:00
//logf(LOG_DEBUG,"title: took4=%" PRId64,gettimeofdayInMilliseconds()-x);
2013-08-02 13:12:24 -07:00
//x = gettimeofdayInMilliseconds();
2015-12-01 12:38:51 +01:00
{
// now add the last url path to contain underscores or hyphens
2016-02-18 14:45:48 +01:00
char * pstart = firstUrl - > getPath ( ) ;
2013-08-02 13:12:24 -07:00
2015-12-01 12:38:51 +01:00
// get first url
2016-02-18 14:45:48 +01:00
Url * fu = firstUrl ;
2015-12-01 12:38:51 +01:00
// start at the end
char * p = fu - > getUrl ( ) + fu - > getUrlLen ( ) ;
// end pointer
char * pend = NULL ;
// come up here for each path component
while ( p > = pstart ) {
// save end
pend = p ;
// skip over /
if ( * p = = ' / ' ) {
p - - ;
}
// now go back to next /
int32_t count = 0 ;
for ( ; p > = pstart & & * p ! = ' / ' ; p - - ) {
if ( * p = = ' _ ' | | * p = = ' - ' ) {
count + + ;
}
}
// did we get it?
if ( count > 0 ) {
break ;
}
}
// did we get any?
if ( p > pstart & & n < MAX_TIT_CANDIDATES ) {
// now set words to that
if ( ! tw [ ti ] . set ( p , ( pend - p ) , true , 0 ) ) {
return false ;
}
// point to that
cptrs [ n ] = & tw [ ti ] ;
as [ n ] = 0 ;
bs [ n ] = tw [ ti ] . getNumWords ( ) ;
scores [ n ] = 1.0 ;
types [ n ] = TT_URLPATH ;
// increment since we are using it
ti + + ;
// advance
n + + ;
}
2013-08-02 13:12:24 -07:00
}
// save old n
2014-11-10 14:45:11 -08:00
int32_t oldn = n ;
2015-12-01 12:38:51 +01:00
// . do not split titles if we are a root url maps.yahoo.com was getting "Maps" for the title
2016-02-18 14:45:48 +01:00
if ( firstUrl - > isRoot ( ) ) {
2015-12-01 12:38:51 +01:00
oldn = - 2 ;
}
2013-08-02 13:12:24 -07:00
// point to list of \0 separated titles
2016-03-08 11:10:50 +01:00
const char * rootTitleBuf = NULL ;
const char * rootTitleBufEnd = NULL ;
2013-08-02 13:12:24 -07:00
// get the root title if we are not root!
2016-02-18 14:45:48 +01:00
if ( filteredRootTitleBuf ) {
2016-03-08 11:10:50 +01:00
# ifdef _VALGRIND_
2016-03-08 11:59:25 +01:00
VALGRIND_CHECK_MEM_IS_DEFINED ( filteredRootTitleBuf , filteredRootTitleBufSize ) ;
2016-03-08 11:10:50 +01:00
# endif
2013-08-02 13:12:24 -07:00
// point to list of \0 separated titles
2016-03-08 11:10:50 +01:00
rootTitleBuf = filteredRootTitleBuf ;
rootTitleBufEnd = filteredRootTitleBuf + filteredRootTitleBufSize ;
2013-08-02 13:12:24 -07:00
}
2015-12-01 12:38:51 +01:00
{
// convert into an array
int32_t nr = 0 ;
2016-03-08 11:10:50 +01:00
const char * pr = rootTitleBuf ;
const char * rootTitles [ 20 ] ;
2015-12-01 12:38:51 +01:00
int32_t rootTitleLens [ 20 ] ;
// loop over each root title segment
2016-03-07 17:37:54 +01:00
for ( ; pr & & pr < rootTitleBufEnd ; pr + = strnlen ( pr , rootTitleBufEnd - pr ) + 1 ) {
2015-12-01 12:38:51 +01:00
// if we had a query...
2016-02-25 13:43:37 +01:00
if ( query ) {
2016-06-28 11:48:06 +02:00
Matches m ;
m . setQuery ( query ) ;
2015-12-01 12:38:51 +01:00
// see if root title segment has query terms in it
2016-03-08 11:10:50 +01:00
m . addMatches ( const_cast < char * > ( pr ) , strnlen ( pr , rootTitleBufEnd - pr ) , MF_TITLEGEN , m_niceness ) ;
2015-12-01 12:38:51 +01:00
// if matches query, do NOT add it, we only add it for
// removing from the title of the page...
if ( m . getNumMatches ( ) ) {
continue ;
}
2013-08-02 13:12:24 -07:00
}
2015-12-01 12:38:51 +01:00
// point to it. it should start with an alnum already
// since it is the "filtered" list of root titles...
// if not, fix it in xmldoc then.
rootTitles [ nr ] = pr ;
2016-07-28 17:04:35 +02:00
rootTitleLens [ nr ] = strlen ( pr ) ;
2015-12-01 12:38:51 +01:00
// advance
nr + + ;
// no breaching
if ( nr > = 20 ) break ;
}
// now split up candidates in children candidates by tokenizing
// using :, | and - as delimiters.
// the hyphen must have a space on at least one side, so "cd-rom" does
// not create a pair of tokens...
// FIX: for the title:
// Best Careers 2009: Librarian - US News and World Report
// we need to recognize "Best Careers 2009: Librarian" as a subtitle
// otherwise we don't get it as the title. so my question is are we
// going to have to do all the permutations at some point? for now
// let's just add in pairs...
for ( int32_t i = 0 ; i < oldn & & n + 3 < MAX_TIT_CANDIDATES ; i + + ) {
// stop if no root title segments
if ( nr < = 0 ) break ;
// get the word info
Words * w = cptrs [ i ] ;
int32_t a = as [ i ] ;
int32_t b = bs [ i ] ;
// init
int32_t lasta = a ;
char prev = false ;
// char length in bytes
//int32_t charlen = 1;
// see how many we add
int32_t added = 0 ;
char * skipTo = NULL ;
bool qualified = true ;
// . scan the words looking for a token
// . sometimes the candidates end in ": " so put in "k < b-1"
// . made this from k<b-1 to k<b to fix
// "Hot Tub Time Machine (2010) - IMDb" to strip IMDb
for ( int32_t k = a ; k < b & & n + 3 < MAX_TIT_CANDIDATES ; k + + ) {
// get word
char * wp = w - > getWord ( k ) ;
// skip if not alnum
if ( ! w - > isAlnum ( k ) ) {
// in order for next alnum word to
// qualify for "clipping" if it matches
// the root title, there has to be more
// than just spaces here, some punct.
// otherwise title
// "T. D. Jakes: Biography from Answers.com"
// becomes
// "T. D. Jakes: Biography from"
qualified = isWordQualified ( wp , w - > getWordLen ( k ) ) ;
continue ;
}
// gotta be qualified!
if ( ! qualified ) continue ;
// skip if in root title
if ( skipTo & & wp < skipTo ) continue ;
// does this match any root page title segments?
int32_t j ;
for ( j = 0 ; j < nr ; j + + ) {
// . compare to root title
// . break out if we matched!
if ( ! strncmp ( wp , rootTitles [ j ] , rootTitleLens [ j ] ) ) {
break ;
}
}
// if we did not match a root title segment,
// keep on chugging
if ( j > = nr ) continue ;
// . we got a root title match!
// . skip over
skipTo = wp + rootTitleLens [ j ] ;
// must land on qualified punct then!!
int32_t e = k + 1 ;
2016-05-23 16:39:52 +02:00
for ( ; e < b & & w - > getWord ( e ) < skipTo ; e + + ) ;
2015-12-01 12:38:51 +01:00
// ok, word #e must be a qualified punct
if ( e < b & &
! isWordQualified ( w - > getWord ( e ) , w - > getWordLen ( e ) ) )
// assume no match then!!
continue ;
// if we had a previous guy, reset the end of the
// previous candidate
if ( prev ) {
bs [ n - 2 ] = k ;
bs [ n - 1 ] = k ;
}
// . ok, we got two more candidates
// . well, only one more if this is not the 1st time
if ( ! prev ) {
cptrs [ n ] = cptrs [ i ] ;
scores [ n ] = scores [ i ] ;
types [ n ] = types [ i ] ;
as [ n ] = lasta ;
bs [ n ] = k ;
parent [ n ] = i ;
n + + ;
added + + ;
}
// the 2nd one
cptrs [ n ] = cptrs [ i ] ;
scores [ n ] = scores [ i ] ;
types [ n ] = types [ i ] ;
as [ n ] = e + 1 ;
bs [ n ] = bs [ i ] ;
parent [ n ] = i ;
n + + ;
added + + ;
// now add in the last pair as a whole token
2013-08-02 13:12:24 -07:00
cptrs [ n ] = cptrs [ i ] ;
scores [ n ] = scores [ i ] ;
types [ n ] = types [ i ] ;
as [ n ] = lasta ;
2015-12-01 12:38:51 +01:00
bs [ n ] = bs [ i ] ;
2013-08-02 13:12:24 -07:00
parent [ n ] = i ;
n + + ;
added + + ;
2015-12-01 12:38:51 +01:00
// nuke the current candidate then since it got
// split up to not contain the root title...
//cptrs[i] = NULL;
// update this
lasta = k + 1 ;
// if we encounter another delimiter we will have to revise bs[n-1], so note that
prev = true ;
2013-08-02 13:12:24 -07:00
}
// nuke the current candidate then since it got
// split up to not contain the root title...
2015-12-01 12:38:51 +01:00
if ( added ) {
scores [ i ] = 0.001 ;
//cptrs[i] = NULL;
}
2013-08-02 13:12:24 -07:00
2015-12-01 12:38:51 +01:00
// erase the pair if that there was only one token
if ( added = = 3 ) n - - ;
2013-08-02 13:12:24 -07:00
}
}
2016-01-11 15:46:09 +01:00
for ( int32_t i = 0 ; i < n ; i + + ) baseScore [ i ] = scores [ i ] ;
2013-08-02 13:12:24 -07:00
//
// . now punish by 0.85 for every lower case non-stop word it has
// . reward by 1.1 if has a non-stopword in the query
//
2014-11-10 14:45:11 -08:00
for ( int32_t i = 0 ; i < n ; i + + ) {
2013-08-02 13:12:24 -07:00
// point to the words
Words * w = cptrs [ i ] ;
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// skip if got nuked above
2015-12-01 12:38:51 +01:00
if ( ! w ) {
continue ;
}
2013-08-02 13:12:24 -07:00
// the word ptrs
char * * wptrs = w - > getWordPtrs ( ) ;
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// skip if empty
2015-12-01 12:38:51 +01:00
if ( w - > getNumWords ( ) < = 0 ) {
continue ;
}
2013-08-02 13:12:24 -07:00
// get the word boundaries
2014-11-10 14:45:11 -08:00
int32_t a = as [ i ] ;
int32_t b = bs [ i ] ;
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// record the boosts
float ncb = 1.0 ;
2016-01-11 15:46:09 +01:00
float qtb = 1.0 ;
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// a flag
char uncapped = false ;
2016-02-18 14:45:48 +01:00
2013-08-02 13:12:24 -07:00
// scan the words in this title candidate
2014-11-10 14:45:11 -08:00
for ( int32_t j = a ; j < b ; j + + ) {
2013-08-02 13:12:24 -07:00
// skip stop words
2016-02-18 14:45:48 +01:00
if ( w - > isQueryStopWord ( j , langId ) ) {
2015-12-01 12:38:51 +01:00
continue ;
}
2013-08-02 13:12:24 -07:00
// punish if uncapitalized non-stopword
2015-12-01 12:38:51 +01:00
if ( ! w - > isCapitalized ( j ) ) {
uncapped = true ;
}
2013-08-02 13:12:24 -07:00
// skip if no query
2016-02-25 13:43:37 +01:00
if ( ! query ) {
2015-12-01 12:38:51 +01:00
continue ;
}
2014-10-30 13:36:39 -06:00
int64_t wid = w - > getWordId ( j ) ;
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// reward if in the query
2016-02-25 13:43:37 +01:00
if ( query - > getWordNum ( wid ) > = 0 ) {
2016-01-11 15:46:09 +01:00
qtb * = 1.5 ;
2013-08-02 13:12:24 -07:00
scores [ i ] * = 1.5 ;
}
}
2015-12-01 12:38:51 +01:00
// . only punish once if missing a capitalized word hurts us for:
// http://content-uk.cricinfo.com/ausvrsa2008_09/engine/current/match/351682.html
2013-08-02 13:12:24 -07:00
if ( uncapped ) {
2015-12-01 12:38:51 +01:00
ncb * = 1.00 ;
scores [ i ] * = 1.00 ;
2013-08-02 13:12:24 -07:00
}
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// punish if a http:// title thingy
2015-12-01 12:38:51 +01:00
char * s = wptrs [ a ] ;
int32_t size = w - > getStringSize ( a , b ) ;
if ( size > 9 & & memcmp ( " http:// " , s , 7 ) = = 0 ) {
2013-08-02 13:12:24 -07:00
ncb * = .10 ;
2015-12-01 12:38:51 +01:00
}
if ( size > 14 & & memcmp ( " h \0 t \0 t \0 p \0 : \0 / \0 / " , s , 14 ) = = 0 ) {
2013-08-02 13:12:24 -07:00
ncb * = .10 ;
2015-12-01 12:38:51 +01:00
}
2013-08-02 13:12:24 -07:00
// set these guys
2015-12-01 12:38:51 +01:00
scores [ i ] * = ncb ;
2015-12-04 13:18:54 +01:00
2016-01-11 15:46:09 +01:00
noCapsBoost [ i ] = ncb ;
qtermsBoost [ i ] = qtb ;
2013-08-02 13:12:24 -07:00
}
// . now compare each candidate to the other candidates
// . give a boost if matches
2014-11-10 14:45:11 -08:00
for ( int32_t i = 0 ; i < n ; i + + ) {
2013-08-02 13:12:24 -07:00
// point to the words
Words * w1 = cptrs [ i ] ;
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// skip if got nuked above
2015-12-01 12:38:51 +01:00
if ( ! w1 ) {
continue ;
}
int32_t a1 = as [ i ] ;
int32_t b1 = bs [ i ] ;
2013-08-02 13:12:24 -07:00
// reset some flags
char localFlag1 = 0 ;
char localFlag2 = 0 ;
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// record the boost
2016-01-11 15:46:09 +01:00
float iccb = 1.0 ;
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// total boost
float total = 1.0 ;
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// to each other candidate
2014-11-10 14:45:11 -08:00
for ( int32_t j = 0 ; j < n ; j + + ) {
2013-08-02 13:12:24 -07:00
// not to ourselves
2015-12-01 12:38:51 +01:00
if ( j = = i ) {
continue ;
}
2013-08-02 13:12:24 -07:00
// or our derivatives
2015-12-01 12:38:51 +01:00
if ( parent [ j ] = = i ) {
continue ;
}
2013-08-02 13:12:24 -07:00
// or derivatives to their parent
2015-12-01 12:38:51 +01:00
if ( parent [ i ] = = j ) {
continue ;
}
2013-08-02 13:12:24 -07:00
// only check parents now. do not check kids.
// this was only for when doing percent contained
// not getSimilarity() per se
//if ( parent[j] != -1 ) continue;
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// TODO: do not accumulate boosts from a parent
// and its kids, subtitles...
//
// do not compare type X to type Y
if ( types [ i ] = = TT_TITLETAG ) {
2015-12-01 12:38:51 +01:00
if ( types [ j ] = = TT_TITLETAG ) {
continue ;
}
2013-08-02 13:12:24 -07:00
}
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// do not compare a div candidate to another div cand
// http://friendfeed.com/foxiewire?start=30
// likewise, a TD to another TD
2015-12-01 12:38:51 +01:00
// http://content-uk.cricinfo.com/ausvrsa2008_09/engine/match/351681.html
// ... etc.
2013-08-02 13:12:24 -07:00
if ( types [ i ] = = TT_BOLDTAG | |
types [ i ] = = TT_HTAG | |
types [ i ] = = TT_DIVTAG | |
types [ i ] = = TT_TDTAG | |
types [ i ] = = TT_FONTTAG ) {
if ( types [ j ] = = types [ i ] ) continue ;
}
// . do not compare one kid to another kid
// . i.e. if we got "x | y" as a title and "x | z"
// as a link text, it will emphasize "x" too much
2015-11-25 16:51:27 +01:00
// http://content-uk.cricinfo.com/ausvrsa2008_09/engine/current/match/351682.html
2013-08-02 13:12:24 -07:00
if ( parent [ j ] ! = - 1 & & parent [ i ] ! = - 1 ) continue ;
2015-11-25 16:51:27 +01:00
2013-08-02 13:12:24 -07:00
// . body type tags are mostly mutually exclusive
// . for the legacy.com url mentioned below, we have
// good stuff in <td> tags, so this hurts us...
// . but for the sake of
2015-11-25 16:51:27 +01:00
// http://larvatusprodeo.net/2009/01/07/partisanship-politics-and-participation/
// i put bold tags back
2013-08-02 13:12:24 -07:00
if ( types [ i ] = = TT_LINKTEXTLOCAL ) {
if ( types [ j ] = = TT_LINKTEXTLOCAL ) continue ;
}
if ( types [ i ] = = TT_RSSITEMLOCAL ) {
if ( types [ j ] = = TT_RSSITEMLOCAL ) continue ;
}
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// only compare to one local link text for each i
2015-12-01 12:38:51 +01:00
if ( types [ j ] = = TT_LINKTEXTLOCAL & & localFlag1 ) {
2013-08-02 13:12:24 -07:00
continue ;
2015-12-01 12:38:51 +01:00
}
if ( types [ j ] = = TT_RSSITEMLOCAL & & localFlag2 ) {
2013-08-02 13:12:24 -07:00
continue ;
2015-12-01 12:38:51 +01:00
}
if ( types [ j ] = = TT_LINKTEXTLOCAL ) {
localFlag1 = 1 ;
}
if ( types [ j ] = = TT_RSSITEMLOCAL ) {
localFlag2 = 1 ;
}
2015-11-25 16:51:27 +01:00
2013-08-02 13:12:24 -07:00
// not link title attr to link title attr either
// fixes http://www.spiritualwoman.net/?cat=191
if ( types [ i ] = = TT_TITLEATT & &
types [ j ] = = TT_TITLEATT )
continue ;
2015-11-25 16:51:27 +01:00
2013-08-02 13:12:24 -07:00
// get our words
Words * w2 = cptrs [ j ] ;
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// skip if got nuked above
if ( ! w2 ) continue ;
2014-11-10 14:45:11 -08:00
int32_t a2 = as [ j ] ;
int32_t b2 = bs [ j ] ;
2015-11-25 16:51:27 +01:00
2013-08-02 13:12:24 -07:00
// how similar is title #i to title #j ?
2015-11-25 16:51:27 +01:00
float fp = getSimilarity ( w2 , a2 , b2 , w1 , a1 , b1 ) ;
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// error?
if ( fp = = - 1.0 ) return false ;
2015-11-25 16:51:27 +01:00
2013-08-02 13:12:24 -07:00
// custom boosting...
float boost = 1.0 ;
if ( fp > = .95 ) boost = 3.0 ;
else if ( fp > = .90 ) boost = 2.0 ;
else if ( fp > = .85 ) boost = 1.5 ;
else if ( fp > = .80 ) boost = 1.4 ;
else if ( fp > = .75 ) boost = 1.3 ;
else if ( fp > = .70 ) boost = 1.2 ;
else if ( fp > = .60 ) boost = 1.1 ;
else if ( fp > = .50 ) boost = 1.08 ;
else if ( fp > = .40 ) boost = 1.04 ;
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// limit total
total * = boost ;
if ( total > 100.0 ) break ;
// if you are matching the url path, that is pretty
// good so give more!
// actually, that would hurt:
// http://michellemalkin.com/2008/12/29/gag-worthy/
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// custom boosting!
if ( fp > 0.0 & & g_conf . m_logDebugTitle )
2016-05-20 09:18:32 +02:00
logf ( LOG_DEBUG , " title: i=% " PRId32 " j=% " PRId32 " fp=%.02f "
2013-08-02 13:12:24 -07:00
" b=%.02f " , i , j , fp , boost ) ;
// apply it
scores [ i ] * = boost ;
2015-12-04 13:18:54 +01:00
2016-01-11 15:46:09 +01:00
iccb * = boost ;
2013-08-02 13:12:24 -07:00
}
2015-11-25 16:51:27 +01:00
2016-01-11 15:46:09 +01:00
inCommonCandBoost [ i ] = iccb ;
2013-08-02 13:12:24 -07:00
}
2016-05-20 09:18:32 +02:00
//logf(LOG_DEBUG,"title: took7=%" PRId64,gettimeofdayInMilliseconds()-x);
2013-08-02 13:12:24 -07:00
//x = gettimeofdayInMilliseconds();
// loop over all n candidates
2014-11-10 14:45:11 -08:00
for ( int32_t i = 0 ; i < n ; i + + ) {
2013-08-02 13:12:24 -07:00
// skip if not in the document body
2016-02-25 13:43:37 +01:00
if ( cptrs [ i ] ! = words ) continue ;
2013-08-02 13:12:24 -07:00
// point to the words
2014-11-10 14:45:11 -08:00
int32_t a1 = as [ i ] ;
int32_t b1 = bs [ i ] ;
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// . loop through this candidates words
// . TODO: use memset here?
2015-12-01 12:38:51 +01:00
for ( int32_t j = a1 ; j < = b1 & & j < NW ; j + + ) {
2013-08-02 13:12:24 -07:00
// flag it
flags [ j ] | = 0x01 ;
2015-12-01 12:38:51 +01:00
}
2013-08-02 13:12:24 -07:00
}
// free our stuff
2015-12-01 12:38:51 +01:00
if ( flags ! = localBuf ) {
mfree ( flags , need , " TITLEflags " ) ;
}
2013-08-02 13:12:24 -07:00
// now get the highest scoring candidate title
float max = - 1.0 ;
2014-11-10 14:45:11 -08:00
int32_t winner = - 1 ;
for ( int32_t i = 0 ; i < n ; i + + ) {
2013-08-02 13:12:24 -07:00
// skip if got nuked
2015-12-01 12:38:51 +01:00
if ( ! cptrs [ i ] ) {
continue ;
}
if ( winner ! = - 1 & & scores [ i ] < = max ) {
continue ;
}
2013-08-02 13:12:24 -07:00
// url path's cannot be titles in and of themselves
2015-12-01 12:38:51 +01:00
if ( types [ i ] = = TT_URLPATH ) {
continue ;
}
2013-08-02 13:12:24 -07:00
// skip if empty basically, like if title was exact
// copy of root, then the whole thing got nuked and
// some empty string added, where a > b
2015-12-01 12:38:51 +01:00
if ( as [ i ] > = bs [ i ] ) {
continue ;
}
2013-08-02 13:12:24 -07:00
// got one
max = scores [ i ] ;
2015-12-01 12:38:51 +01:00
2013-08-02 13:12:24 -07:00
// save it
winner = i ;
}
// if we are a root, always pick the title tag as the title
2015-12-01 12:38:51 +01:00
if ( oldn = = - 2 & & tti > = 0 ) {
winner = tti ;
}
2013-08-02 13:12:24 -07:00
// if no winner, all done. no title
2015-12-01 12:38:51 +01:00
if ( winner = = - 1 ) {
// last resort use file name
2016-02-18 14:45:48 +01:00
if ( ( contentType = = CT_PDF ) & & ( firstUrl - > getFilenameLen ( ) ! = 0 ) ) {
2015-12-01 12:38:51 +01:00
Words w ;
2016-02-18 14:45:48 +01:00
w . set ( firstUrl - > getFilename ( ) , firstUrl - > getFilenameLen ( ) , true ) ;
2015-12-01 12:38:51 +01:00
if ( ! copyTitle ( & w , 0 , w . getNumWords ( ) ) ) {
return false ;
}
}
return true ;
}
2013-08-02 13:12:24 -07:00
// point to the words class of the winner
Words * w = cptrs [ winner ] ;
// skip if got nuked above
2016-06-20 12:30:26 +02:00
if ( ! w ) { g_process . shutdownAbort ( true ) ; }
2013-08-02 13:12:24 -07:00
// need to make our own Pos class if title not from body
Pos tp ;
2016-02-25 13:43:37 +01:00
if ( w ! = words ) {
2013-08-02 13:12:24 -07:00
// set "Scores" ptr to NULL. we assume all are positive scores
2015-11-25 16:51:27 +01:00
if ( ! tp . set ( w ) ) {
return false ;
}
2013-08-02 13:12:24 -07:00
}
2015-11-25 16:51:27 +01:00
2013-08-02 13:12:24 -07:00
// the string ranges from word #a up to and including word #b
2014-11-10 14:45:11 -08:00
int32_t a = as [ winner ] ;
int32_t b = bs [ winner ] ;
2013-08-02 13:12:24 -07:00
// sanity check
2016-06-20 12:30:26 +02:00
if ( a < 0 | | b > w - > getNumWords ( ) ) { g_process . shutdownAbort ( true ) ; }
2015-11-25 16:51:27 +01:00
2013-08-02 13:12:24 -07:00
// save the title
2015-11-25 16:51:27 +01:00
if ( ! copyTitle ( w , a , b ) ) {
2013-08-02 13:12:24 -07:00
return false ;
2015-11-25 16:51:27 +01:00
}
2013-08-02 13:12:24 -07:00
2015-11-25 16:51:27 +01:00
/*
// debug logging
SafeBuf sb ;
SafeBuf * pbuf = & sb ;
2016-01-11 15:46:09 +01:00
log ( " title: candidates for %s " , xd - > getFirstUrl ( ) - > getUrl ( ) ) ;
2015-11-25 16:51:27 +01:00
pbuf - > safePrintf ( " <div stype= \" border:1px solid black \" > " ) ;
pbuf - > safePrintf ( " <b>***Finding Title***</b><br> \n " ) ;
2013-08-02 13:12:24 -07:00
pbuf - > safePrintf ( " <table cellpadding=5 border=2><tr> "
" <td colspan=20><center><b>Title Generation</b> "
" </center></td> "
" </tr> \n <tr> "
" <td>#</td> "
" <td>type</td> "
" <td>parent</td> "
" <td>base score</td> "
" <td>format penalty</td> "
" <td>query term boost</td> "
" <td>candidate intersection boost</td> "
" <td>FINAL SCORE</td> "
" <td>title</td> "
" </tr> \n " ) ;
// print out all candidates
2014-11-10 14:45:11 -08:00
for ( int32_t i = 0 ; i < n ; i + + ) {
2013-08-02 13:12:24 -07:00
char * ts = " unknown " ;
if ( types [ i ] = = TT_LINKTEXTLOCAL ) ts = " local inlink text " ;
if ( types [ i ] = = TT_LINKTEXTREMOTE ) ts = " remote inlink text " ;
if ( types [ i ] = = TT_RSSITEMLOCAL ) ts = " local rss title " ;
if ( types [ i ] = = TT_RSSITEMREMOTE ) ts = " remote rss title " ;
if ( types [ i ] = = TT_BOLDTAG ) ts = " bold tag " ;
if ( types [ i ] = = TT_HTAG ) ts = " header tag " ;
if ( types [ i ] = = TT_TITLETAG ) ts = " title tag " ;
if ( types [ i ] = = TT_FIRSTLINE ) ts = " first line in text " ;
if ( types [ i ] = = TT_FONTTAG ) ts = " font tag " ;
if ( types [ i ] = = TT_ATAG ) ts = " anchor tag " ;
if ( types [ i ] = = TT_DIVTAG ) ts = " div tag " ;
if ( types [ i ] = = TT_TDTAG ) ts = " td tag " ;
if ( types [ i ] = = TT_PTAG ) ts = " p tag " ;
if ( types [ i ] = = TT_URLPATH ) ts = " url path " ;
if ( types [ i ] = = TT_TITLEATT ) ts = " title attribute " ;
// get the title
pbuf - > safePrintf (
" <tr> "
2016-05-20 09:18:32 +02:00
" <td>#% " PRId32 " </td> "
2013-08-02 13:12:24 -07:00
" <td><nobr>%s</nobr></td> "
2016-05-20 09:18:32 +02:00
" <td>% " PRId32 " </td> "
2013-08-02 13:12:24 -07:00
" <td>%0.2f</td> " // baseScore
" <td>%0.2f</td> "
" <td>%0.2f</td> "
" <td>%0.2f</td> "
" <td>%0.2f</td> "
" <td> " ,
i ,
ts ,
parent [ i ] ,
baseScore [ i ] ,
noCapsBoost [ i ] ,
qtermsBoost [ i ] ,
inCommonCandBoost [ i ] ,
scores [ i ] ) ;
// ptrs
Words * w = cptrs [ i ] ;
2014-11-10 14:45:11 -08:00
int32_t a = as [ i ] ;
int32_t b = bs [ i ] ;
2013-08-02 13:12:24 -07:00
// skip if no words
if ( w - > getNumWords ( ) < = 0 ) continue ;
// the word ptrs
char * * wptrs = w - > getWordPtrs ( ) ;
// string ptrs
char * ptr = wptrs [ a ] ; //w->getWord(a);
2014-11-10 14:45:11 -08:00
int32_t size = w - > getStringSize ( a , b ) ;
2013-08-02 13:12:24 -07:00
// it is utf8
pbuf - > safeMemcpy ( ptr , size ) ;
// end the line
pbuf - > safePrintf ( " </td></tr> \n " ) ;
}
pbuf - > safePrintf ( " </table> \n <br> \n " ) ;
// log these for now
2016-01-11 15:46:09 +01:00
log ( " title: %s " , sb . getBufStart ( ) ) ;
2015-12-04 13:18:54 +01:00
*/
2015-11-25 16:51:27 +01:00
2013-08-02 13:12:24 -07:00
return true ;
2015-12-04 13:18:54 +01:00
2013-08-02 13:12:24 -07:00
}
// . returns 0.0 to 1.0
// . what percent of the alnum words in "w1" are in "w2" from words in [t0,t1)
// . gets 50% points if has all single words, and the other 50% if all phrases
// . Scores class applies to w1 only, use NULL if none
// . use word popularity information for scoring rarer term matches more
// . ONLY CHECKS FIRST 1000 WORDS of w2 for speed
2014-11-10 14:45:11 -08:00
float Title : : getSimilarity ( Words * w1 , int32_t i0 , int32_t i1 ,
Words * w2 , int32_t t0 , int32_t t1 ) {
2013-08-02 13:12:24 -07:00
// if either empty, that's 0% contained
if ( w1 - > getNumWords ( ) < = 0 ) return 0 ;
if ( w2 - > getNumWords ( ) < = 0 ) return 0 ;
if ( i0 > = i1 ) return 0 ;
if ( t0 > = t1 ) return 0 ;
2016-02-18 14:45:48 +01:00
2013-08-02 13:12:24 -07:00
// invalids vals
if ( i0 < 0 ) return 0 ;
if ( t0 < 0 ) return 0 ;
2016-02-18 14:45:48 +01:00
2013-08-02 13:12:24 -07:00
// . for this to be useful we must use idf
// . get the popularity of each word in w1
// . w1 should only be a few words since it is a title candidate
// . does not add pop for word #i if scores[i] <= 0
// . take this out for now since i removed the unified dict,
// we could use this if we added popularity to g_wiktionary
// but it would have to be language dependent
Pops pops1 ;
Pops pops2 ;
if ( ! pops1 . set ( w1 , i0 , i1 ) ) return - 1.0 ;
if ( ! pops2 . set ( w2 , t0 , t1 ) ) return - 1.0 ;
2016-02-18 14:45:48 +01:00
2013-08-02 13:12:24 -07:00
// now hash the words in w1, the needle in the haystack
2014-11-10 14:45:11 -08:00
int32_t nw1 = w1 - > getNumWords ( ) ;
2013-08-02 13:12:24 -07:00
if ( i1 > nw1 ) i1 = nw1 ;
HashTable table ;
2016-02-18 14:45:48 +01:00
2013-08-02 13:12:24 -07:00
// this augments the hash table
2014-10-30 13:36:39 -06:00
int64_t lastWid = - 1 ;
2013-08-02 13:12:24 -07:00
float lastScore = 0.0 ;
2016-02-18 14:45:48 +01:00
2013-08-02 13:12:24 -07:00
// but we cannot have more than 1024 slots then
if ( ! table . set ( 1024 ) ) return - 1.0 ;
2016-02-18 14:45:48 +01:00
2013-08-02 13:12:24 -07:00
// and table auto grows when 90% full, so limit us here
2014-11-10 14:45:11 -08:00
int32_t count = 0 ;
2016-02-18 14:45:48 +01:00
int32_t maxCount = 20 ;
2013-08-02 13:12:24 -07:00
// sum up everything we add
float sum = 0.0 ;
2016-02-18 14:45:48 +01:00
2013-08-02 13:12:24 -07:00
// loop over all words in "w1" and hash them
2014-11-10 14:45:11 -08:00
for ( int32_t i = i0 ; i < i1 ; i + + ) {
2013-08-02 13:12:24 -07:00
// the word id
2016-05-23 16:39:52 +02:00
int64_t wid = w1 - > getWordId ( i ) ;
2016-02-18 14:45:48 +01:00
2013-08-02 13:12:24 -07:00
// skip if not indexable
2016-02-18 14:45:48 +01:00
if ( wid = = 0 ) {
continue ;
}
2013-08-02 13:12:24 -07:00
// no room left in table!
if ( count + + > maxCount ) {
//logf(LOG_DEBUG, "query: Hash table for title "
// "generation too small. Truncating words from w1.");
break ;
}
2016-02-18 14:45:48 +01:00
2013-08-02 13:12:24 -07:00
// . make this a float. it ranges from 0.0 to 1.0
// . 1.0 means the word occurs in 100% of documents sampled
// . 0.0 means it occurs in none of them
// . but "val" is the complement of those two statements!
float score = 1.0 - pops1 . getNormalizedPop ( i ) ;
2016-02-18 14:45:48 +01:00
2013-08-02 13:12:24 -07:00
// accumulate
sum + = score ;
2016-02-18 14:45:48 +01:00
2013-08-02 13:12:24 -07:00
// add to table
2016-02-18 14:45:48 +01:00
if ( ! table . addKey ( ( int32_t ) wid , ( int32_t ) score , NULL ) ) {
2013-08-02 13:12:24 -07:00
return - 1.0 ;
2016-02-18 14:45:48 +01:00
}
2013-08-02 13:12:24 -07:00
// if no last wid, continue
2016-02-18 14:45:48 +01:00
if ( lastWid = = - 1LL ) {
lastWid = wid ;
lastScore = score ;
continue ;
}
2013-08-02 13:12:24 -07:00
// . what was his val?
// . the "val" of the phrase:
float phrScore = score + lastScore ;
2016-02-18 14:45:48 +01:00
2013-08-02 13:12:24 -07:00
// do not count as much as single words
phrScore * = 0.5 ;
2016-02-18 14:45:48 +01:00
2013-08-02 13:12:24 -07:00
// accumulate
sum + = phrScore ;
2016-02-18 14:45:48 +01:00
2013-08-02 13:12:24 -07:00
// get the phrase id
2014-10-30 13:36:39 -06:00
int64_t pid = hash64 ( wid , lastWid ) ;
2016-02-18 14:45:48 +01:00
2013-08-02 13:12:24 -07:00
// now add that
2014-11-10 14:45:11 -08:00
if ( ! table . addKey ( ( int32_t ) pid , ( int32_t ) phrScore , NULL ) )
2013-08-02 13:12:24 -07:00
return - 1.0 ;
// we are now the last wid
lastWid = wid ;
lastScore = score ;
}
2016-02-18 14:45:48 +01:00
2013-08-02 13:12:24 -07:00
// sanity check. it can't grow cuz we keep lastWids[] 1-1 with it
if ( table . getNumSlots ( ) ! = 1024 ) {
log ( LOG_LOGIC , " query: Title has logic bug. " ) ;
return - 1.0 ;
}
// accumulate scores of words that are found
float found = 0.0 ;
2016-02-18 14:45:48 +01:00
2013-08-02 13:12:24 -07:00
// reset
lastWid = - 1LL ;
2016-02-18 14:45:48 +01:00
2013-08-02 13:12:24 -07:00
// loop over all words in "w1" and hash them
2014-11-10 14:45:11 -08:00
for ( int32_t i = t0 ; i < t1 ; i + + ) {
2013-08-02 13:12:24 -07:00
// the word id
2016-05-23 16:39:52 +02:00
int64_t wid = w2 - > getWordId ( i ) ;
2016-02-18 14:45:48 +01:00
2013-08-02 13:12:24 -07:00
// skip if not indexable
2016-02-18 14:45:48 +01:00
if ( wid = = 0 ) {
continue ;
}
2013-08-02 13:12:24 -07:00
// . make this a float. it ranges from 0.0 to 1.0
// . 1.0 means the word occurs in 100% of documents sampled
// . 0.0 means it occurs in none of them
// . but "val" is the complement of those two statements!
float score = 1.0 - pops2 . getNormalizedPop ( i ) ;
2016-02-18 14:45:48 +01:00
2013-08-02 13:12:24 -07:00
// accumulate
sum + = score ;
2016-02-18 14:45:48 +01:00
2013-08-02 13:12:24 -07:00
// is it in table?
2014-11-10 14:45:11 -08:00
int32_t slot = table . getSlot ( ( int32_t ) wid ) ;
2016-02-18 14:45:48 +01:00
2013-08-02 13:12:24 -07:00
// . if in table, add that up to "found"
// . we essentially find his wid AND our wid, so 2.0 times
2016-02-18 14:45:48 +01:00
if ( slot > = 0 ) {
found + = 2.0 * score ;
}
2013-08-02 13:12:24 -07:00
// now the phrase
2016-02-18 14:45:48 +01:00
if ( lastWid = = - 1LL ) {
lastWid = wid ;
lastScore = score ;
continue ;
}
2013-08-02 13:12:24 -07:00
// . what was his val?
// . the "val" of the phrase:
float phrScore = score + lastScore ;
2016-02-18 14:45:48 +01:00
2013-08-02 13:12:24 -07:00
// do not count as much as single words
phrScore * = 0.5 ;
2016-02-18 14:45:48 +01:00
2013-08-02 13:12:24 -07:00
// accumulate
sum + = phrScore ;
2016-02-18 14:45:48 +01:00
2013-08-02 13:12:24 -07:00
// get the phrase id
2014-10-30 13:36:39 -06:00
int64_t pid = hash64 ( wid , lastWid ) ;
2016-02-18 14:45:48 +01:00
2013-08-02 13:12:24 -07:00
// is it in table?
2014-11-10 14:45:11 -08:00
slot = table . getSlot ( ( int32_t ) pid ) ;
2016-02-18 14:45:48 +01:00
2013-08-02 13:12:24 -07:00
// . accumulate if in there
// . we essentially find his wid AND our wid, so 2.0 times
if ( slot > = 0 ) found + = 2.0 * phrScore ;
2016-02-18 14:45:48 +01:00
2013-08-02 13:12:24 -07:00
// we are now the last wid
lastWid = wid ;
lastScore = score ;
}
// do not divide by zero
if ( sum = = 0.0 ) return 0.0 ;
// sanity check
2016-06-20 12:30:26 +02:00
//if ( found > sum ) { g_process.shutdownAbort(true); }
if ( found < 0.0 | | sum < 0.0 ) { g_process . shutdownAbort ( true ) ; }
2013-08-02 13:12:24 -07:00
// . return the percentage matched
// . will range from 0.0 to 1.0
return found / sum ;
}
// . copy just words in [t0,t1)
// . returns false on error and sets g_errno
2015-11-25 16:51:27 +01:00
bool Title : : copyTitle ( Words * w , int32_t t0 , int32_t t1 ) {
2013-08-02 13:12:24 -07:00
// skip initial punct
2016-05-23 16:39:52 +02:00
const char * const * wp = w - > getWords ( ) ;
const int32_t * wlens = w - > getWordLens ( ) ;
int32_t nw = w - > getNumWords ( ) ;
2013-08-02 13:12:24 -07:00
// sanity check
2016-06-20 12:30:26 +02:00
if ( t1 < t0 ) { g_process . shutdownAbort ( true ) ; }
2013-08-02 13:12:24 -07:00
// don't breech number of words
2015-12-02 14:05:42 +01:00
if ( t1 > nw ) {
t1 = nw ;
}
2013-08-02 13:12:24 -07:00
// no title?
2015-12-02 14:05:42 +01:00
if ( nw = = 0 | | t0 = = t1 ) {
reset ( ) ;
return true ;
}
2013-08-02 13:12:24 -07:00
2016-05-23 16:39:52 +02:00
const char * end = wp [ t1 - 1 ] + wlens [ t1 - 1 ] ;
2013-08-02 13:12:24 -07:00
// allocate title
2014-11-10 14:45:11 -08:00
int32_t need = end - wp [ t0 ] ;
2013-08-02 13:12:24 -07:00
// add 3 bytes for "..." and 1 for \0
need + = 5 ;
2016-01-15 15:52:17 +01:00
// return false if could not hold the title
if ( need > MAX_TITLE_LEN ) {
m_title [ 0 ] = ' \0 ' ;
m_titleLen = 0 ;
2016-05-20 09:18:32 +02:00
log ( " query: Could not alloc % " PRId32 " bytes for title. " , need ) ;
2013-08-02 13:12:24 -07:00
return false ;
}
2015-12-02 14:05:42 +01:00
2013-08-02 13:12:24 -07:00
// point to the title to transcribe
2016-05-23 16:39:52 +02:00
const char * src = wp [ t0 ] ;
const char * srcEnd = end ;
2013-08-02 13:12:24 -07:00
// include a \" or \'
2015-12-02 14:05:42 +01:00
if ( t0 > 0 & & ( src [ - 1 ] = = ' \' ' | | src [ - 1 ] = = ' \" ' ) ) {
2013-08-02 13:12:24 -07:00
src - - ;
2015-12-01 12:38:51 +01:00
}
2013-08-02 13:12:24 -07:00
// and remove terminating | or :
for ( ;
srcEnd > src & &
( srcEnd [ - 1 ] = = ' : ' | |
srcEnd [ - 1 ] = = ' ' | |
srcEnd [ - 1 ] = = ' - ' | |
2014-07-15 10:06:33 -07:00
srcEnd [ - 1 ] = = ' \n ' | |
srcEnd [ - 1 ] = = ' \r ' | |
2013-08-02 13:12:24 -07:00
srcEnd [ - 1 ] = = ' | ' ) ;
srcEnd - - ) ;
// store in here
char * dst = m_title ;
2015-12-02 14:05:42 +01:00
2013-08-02 13:12:24 -07:00
// leave room for "...\0"
2016-01-15 15:52:17 +01:00
char * dstEnd = m_title + need - 4 ;
2015-12-02 14:05:42 +01:00
2013-08-02 13:12:24 -07:00
// size of character in bytes, usually 1
char cs ;
2015-12-02 14:05:42 +01:00
2013-08-02 13:12:24 -07:00
// point to last punct char
2014-07-02 08:03:33 -07:00
char * lastp = dst ; //NULL;
2015-12-02 14:05:42 +01:00
2014-11-10 14:45:11 -08:00
int32_t charCount = 0 ;
2013-08-02 13:12:24 -07:00
// copy the node @p into "dst"
for ( ; src < srcEnd ; src + = cs , dst + = cs ) {
// get src size
cs = getUtf8CharSize ( src ) ;
2015-11-26 11:57:21 +01:00
2013-08-02 13:12:24 -07:00
// break if we are full!
2015-11-26 11:57:21 +01:00
if ( dst + cs > = dstEnd ) {
break ;
}
2014-07-02 08:03:33 -07:00
// or hit our max char limit
2016-02-25 13:43:37 +01:00
if ( charCount + + > = m_maxTitleLen ) {
2015-11-26 11:57:21 +01:00
break ;
}
2015-12-02 14:05:42 +01:00
// skip unwanted character
if ( isUtf8UnwantedSymbols ( src ) ) {
dst - = cs ;
continue ;
}
2013-08-02 13:12:24 -07:00
// remember last punct for cutting purposes
2015-11-26 11:57:21 +01:00
if ( ! is_alnum_utf8 ( src ) ) {
lastp = dst ;
}
2013-08-02 13:12:24 -07:00
// encode it as an html entity if asked to
2015-12-02 14:05:42 +01:00
if ( * src = = ' < ' ) {
if ( dst + 4 > = dstEnd ) {
break ;
}
2015-01-13 12:25:42 -07:00
gbmemcpy ( dst , " < " , 4 ) ;
2013-08-02 13:12:24 -07:00
dst + = 4 - cs ;
continue ;
}
2015-11-26 11:57:21 +01:00
2013-08-02 13:12:24 -07:00
// encode it as an html entity if asked to
2015-12-02 14:05:42 +01:00
if ( * src = = ' > ' ) {
if ( dst + 4 > = dstEnd ) {
break ;
}
2015-01-13 12:25:42 -07:00
gbmemcpy ( dst , " > " , 4 ) ;
2013-08-02 13:12:24 -07:00
dst + = 4 - cs ;
continue ;
}
2015-11-26 11:57:21 +01:00
2015-01-13 12:25:42 -07:00
// if more than 1 byte in char, use gbmemcpy
2015-11-26 11:57:21 +01:00
if ( cs = = 1 ) {
* dst = * src ;
} else {
gbmemcpy ( dst , src , cs ) ;
}
2013-08-02 13:12:24 -07:00
}
// null term always
* dst = ' \0 ' ;
// do not split a word in the middle!
if ( src < srcEnd ) {
if ( lastp ) {
2015-01-13 12:25:42 -07:00
gbmemcpy ( lastp , " ... \0 " , 4 ) ;
2013-08-02 13:12:24 -07:00
dst = lastp + 3 ;
2015-11-26 11:57:21 +01:00
} else {
2015-01-13 12:25:42 -07:00
gbmemcpy ( dst , " ... \0 " , 4 ) ;
2013-08-02 13:12:24 -07:00
dst + = 3 ;
}
}
// set size. does not include the terminating \0
2016-01-15 15:52:17 +01:00
m_titleLen = dst - m_title ;
2013-08-02 13:12:24 -07:00
return true ;
}