Fix title for PDF files & add some simple tests for it

Ai Lin Chia
2015-12-01 12:38:51 +01:00
parent cb12d07632
commit 0884edf08e
15 changed files with 962 additions and 676 deletions

@@ -344,10 +344,10 @@ bool Matches::set ( XmlDoc *xd ,
niceness ) )
return false;
// also use the title from the title tag, because sometimes
// it does not equal "tt->getTitle()"
int32_t a = tt->m_titleTagStart;
int32_t b = tt->m_titleTagEnd;
// also use the title from the title tag, because sometimes it does not equal "tt->getTitle()"
int32_t a = tt->getTitleTagStart();
int32_t b = tt->getTitleTagEnd();
char *start = NULL;
char *end = NULL;
if ( a >= 0 && b >= 0 && b>a ) {

@@ -2738,7 +2738,7 @@ bool printCrawlDetails2 (SafeBuf *sb , CollectionRec *cx , char format ) {
if ( format == FORMAT_JSON ) {
sb->safePrintf("{"
"\"response:{\n"
"\"response\":{\n"
"\t\"statusCode\":%"INT32",\n"
"\t\"statusMsg\":\"%s\",\n"
"\t\"jobCreationTimeUTC\":%"INT32",\n"
@@ -2758,7 +2758,7 @@ bool printCrawlDetails2 (SafeBuf *sb , CollectionRec *cx , char format ) {
);
sb->safePrintf("\t\"currentTime\":%"UINT32",\n",
(uint32_t)getTimeGlobal() );
sb->safePrintf("\t\"currentTimeUTC\":%"UINT32",\n",
sb->safePrintf("\t\"currentTimeUTC\":%"UINT32"\n",
(uint32_t)getTimeGlobal() );
sb->safePrintf("\t}\n");
sb->safePrintf("}\n");

Pos.cpp (148 lines changed)

@@ -32,83 +32,65 @@ int32_t Pos::filter( char *p, char *pend, class Words *words, int32_t a, int32_t
// . returns false and sets g_errno on error
// . if f is non-NULL store filtered words into there. back to back spaces
// are eliminated.
bool Pos::set (Words *words ,
char *f ,
char *fend,
int32_t *len ,
int32_t a ,
int32_t b ,
char *buf ,
int32_t bufSize ) {
bool Pos::set (Words *words, char *f, char *fend, int32_t *len , int32_t a , int32_t b, char *buf, int32_t bufSize ) {
// free m_buf in case this is a second call
if ( ! f ) reset();
if ( ! f ) {
reset();
}
int32_t nw = words->getNumWords();
int32_t *wlens = words->m_wordLens;
nodeid_t *tids = words->getTagIds(); // m_tagIds;
char **wp = words->m_words;
//int32_t *ss = NULL;
//int64_t *wids = words->m_wordIds;
//if ( scores ) ss = scores->m_scores;
int32_t nw = words->getNumWords();
int32_t *wlens = words->m_wordLens;
nodeid_t *tids = words->getTagIds(); // m_tagIds;
char **wp = words->m_words;
// save start point for filtering
char *fstart = f;
// -1 is the default value
if ( b == -1 ) b = nw;
if ( b == -1 ) {
b = nw;
}
// alloc array if need to
int32_t need = (nw+1) * 4;
// do not destroy m_pos/m_numWords if only filtering into a buffer
if ( f ) goto skip;
if ( !f ) {
m_needsFree = false;
m_needsFree = false;
m_buf = m_localBuf;
if ( need > POS_LOCALBUFSIZE && need < bufSize )
m_buf = buf;
else if ( need > POS_LOCALBUFSIZE ) {
m_buf = (char *)mmalloc(need,"Pos");
m_needsFree = true;
m_buf = m_localBuf;
if ( need > POS_LOCALBUFSIZE && need < bufSize )
m_buf = buf;
else if ( need > POS_LOCALBUFSIZE ) {
m_buf = (char *)mmalloc(need,"Pos");
m_needsFree = true;
}
// bail on error
if ( ! m_buf ) return false;
m_bufSize = need;
m_pos = (int32_t *)m_buf;
m_numWords = nw;
}
// bail on error
if ( ! m_buf ) return false;
m_bufSize = need;
m_pos = (int32_t *)m_buf;
m_numWords = nw;
skip:
// this is the CHARACTER count.
int32_t pos = 0;
bool trunc = false;
char *p , *pend;
//char *nextp;
//int32_t skip;
char* lastBreak = NULL;
// utf8 char
//int32_t c;
// its size in bytes
//char cs;
// int16_tcut
//Section **sp = NULL;
//if ( sections ) sp = sections->m_sectionPtrs;
//int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_MARQUEE;
// flag for stopping back-to-back spaces. only count those as one char.
bool lastSpace = false;
int32_t maxCharSize = 4; // we are utf8
int32_t maxCharSize = 4; // we are utf8
for ( int32_t i = a ; i < b ; i++ ) {
if (trunc) break;
// set pos for the ith word to "pos"
if ( ! f ) m_pos[i] = pos;
if (trunc) {
break;
}
// if inside a bad tag, skip it
//if ( sp && (sp[i]->m_flags & badFlags) ) continue;
// set pos for the ith word to "pos"
if ( ! f ) {
m_pos[i] = pos;
}
// is tag?
if ( tids && tids[i] ) {
@@ -130,35 +112,44 @@ bool Pos::set (Words *words ,
}
// if had a previous breaking tag and no non-tag
// word after it, do not count back-to-back spaces
if ( lastSpace ) continue;
if ( lastSpace ) {
continue;
}
// if had a br tag count it as a '.'
if ( tids[i] ) { // == 20 ) { // <br>
if ( tids[i] ) { // <br>
// are we filtering?
if ( f && f != fstart ) {
if ((fend-f>2*maxCharSize)) {
*f++ = '.';
*f++ = ' ';
} else {
trunc = true;
}
else trunc = true;
}
// count as double periods
//pos += 3;
// no, just single period.
pos += 2;
lastSpace = true;
continue;
}
// are we filtering?
if ( f ) {
if ((fend-f > maxCharSize)) {
*f++ = ' ';
} else {
trunc = true;
}
else trunc = true;
}
// count as a single space
pos++;
// do not allow back-to-back spaces
lastSpace = true;
continue;
}
@@ -171,52 +162,71 @@ bool Pos::set (Words *words ,
for ( ; p < pend ; p += cs ) {
// get size
cs = getUtf8CharSize(p);
// do not count space if one before
if ( is_wspace_utf8 (p) ) {
if ( lastSpace ) continue;
if ( lastSpace ) {
continue;
}
lastSpace = true;
// are we filtering?
if ( f ) {
if (fend-f > 1 ) {
lastBreak = f;
*f++ = ' ';
} else {
trunc = true;
}
else trunc = true;
}
pos++;
continue;
}
if ( f ) {
if (fend-f > cs){
if (fend-f > cs) {
// change '|' to commas
if ( *p == '|' )
if ( *p == '|' ) {
*f++ = ',';
else if ( cs == 1 )
} else if ( cs == 1 ) {
*f++ = *p;
else {
} else {
gbmemcpy(f,p,cs);
f += cs;
}
}
else trunc = true;
else {
trunc = true;
}
}
pos++;
lastSpace = false;
}
}
if (trunc) {
if(lastBreak == NULL) {
*len = 0;
return false;
}
else if(f) f = lastBreak;
else if (f) {
f = lastBreak;
}
}
// set pos for the END of the last word here (used in Summary.cpp)
if ( ! f ) m_pos[nw] = pos;
// NULL terminate f
else { *len = f - fstart; }
if ( fend-f > maxCharSize) { *f = '\0';}
if ( ! f ) {
m_pos[nw] = pos;
} else { // NULL terminate f
*len = f - fstart;
}
if ( fend-f > maxCharSize) {
*f = '\0';
}
// Success
return true;
}
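
For reference, a minimal sketch of the two calling modes of the reformatted Pos::set(), inferred from the code above; the wrapper function name, the Words pointer and the buffer size are assumptions for illustration only.

#include "Pos.h"

// Hypothetical helper sketching both modes of Pos::set(); "words" is assumed
// to be a Words object already set from a parsed document.
static bool demoPosSet ( Words *words ) {
	Pos pos;
	// Mode 1: f == NULL. Fills m_pos[] so that m_pos[i] is the CHARACTER
	// (not byte) offset of word i; m_pos[nw] marks the end of the last word.
	if ( ! pos.set( words, NULL, NULL, NULL, 0, -1, NULL, 0 ) ) {
		return false;
	}
	// Mode 2: f != NULL. Filters the words into the caller's buffer,
	// collapsing back-to-back spaces and turning breaking tags into ". ";
	// *len is set to the bytes written, NOT counting the terminating \0.
	char out[1024];
	int32_t outLen = 0;
	return pos.set( words, out, out + sizeof(out), &outLen, 0, -1, NULL, 0 );
}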

Pos.h (2 lines changed)

@@ -38,8 +38,6 @@ class Pos {
int32_t a = 0 ,
int32_t b = -1 );
int32_t getMemUsed () { return m_bufSize; };
// . the position in CHARACTERS of word i is given by m_pos[i]
// . this is NOT the byte position. you can have 2, 3 or even 4
// byte characters in utf8. the purpose here is for counting
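
The character/byte distinction described above is easy to trip over; a tiny, hypothetical illustration using the same getUtf8CharSize() helper that Pos::set() calls (the sample string is arbitrary):

char s[] = "naïve";        // 6 bytes, 5 characters in UTF-8
int32_t chars = 0;
for ( char *p = s ; *p ; p += getUtf8CharSize(p) ) {
	chars++;
}
// chars ends up 5 while the byte length is 6, so the value stored in
// m_pos[i] can be smaller than the corresponding byte offset.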

Title.cpp (1185 lines changed)

File diff suppressed because it is too large

Title.h (66 lines changed)

@@ -3,10 +3,16 @@
#ifndef _TITLE_H_
#define _TITLE_H_
#include "Query.h"
#include <stdint.h>
#define TITLE_LOCAL_SIZE 2048
// forward declaration
class XmlDoc;
class Xml;
class Words;
class Query;
class Title {
public:
Title();
@@ -14,6 +20,7 @@ public:
void reset();
/// @todo correct comments
// . set m_title to the title of the document represented by "xd"
// . if getHardTitle is true will always use the title in the <title>
// tag, but if that is not present, will try dmoz titles before
@@ -30,42 +37,45 @@ public:
// . does not consult words with scores of 0 (unless a meta tag)
// . maxTitleChars is in chars (a utf16 char is 2 bytes or more)
// . maxTitleChars of -1 means no max
bool setTitle (class XmlDoc *xd ,
class Xml *xml ,
class Words *words ,
int32_t maxTitleChars ,
int32_t maxTitleWords ,
Query *q ,
int32_t niceness );
bool setTitle(XmlDoc *xd, Xml *xml, Words *words, int32_t maxTitleChars, Query *q, int32_t niceness);
char *getTitle() {
return m_title;
}
char *getTitle ( ) { return m_title; }
int32_t getTitleLen ( ) { return m_titleBytes; } // does NOT include \0
// does NOT include \0
int32_t getTitleLen() {
return m_titleBytes;
}
int32_t getTitleTagStart() {
return m_titleTagStart;
}
bool copyTitle (class Words *words, int32_t t0, int32_t t1 );
int32_t getTitleTagEnd() {
return m_titleTagEnd;
}
float getSimilarity ( Words *w1 , int32_t i0 , int32_t i1 , Words *w2 , int32_t t0 , int32_t t1 );
bool copyTitle(Words *words, int32_t t0, int32_t t1);
char *m_title;
int32_t m_titleBytes; // in bytes. does NOT include \0
int32_t m_titleAllocSize;
char m_localBuf [ TITLE_LOCAL_SIZE ];
char m_niceness;
int32_t m_maxTitleChars;
int32_t m_titleTagStart ;
int32_t m_titleTagEnd ;
float getSimilarity(Words *w1, int32_t i0, int32_t i1, Words *w2, int32_t t0, int32_t t1);
private:
bool setTitle4 (class XmlDoc *xd,
class Xml *xml,
class Words *words,
int32_t maxTitleChars,
int32_t maxTitleWords,
Query *q);
char *m_title;
int32_t m_titleBytes; // in bytes. does NOT include \0
int32_t m_titleAllocSize;
char m_localBuf[TITLE_LOCAL_SIZE];
char m_niceness;
int32_t m_maxTitleChars;
int32_t m_titleTagStart;
int32_t m_titleTagEnd;
bool setTitle4(XmlDoc *xd, Xml *xml, Words *words, int32_t maxTitleChars, int32_t maxTitleWords, Query *q);
};
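
For orientation, a minimal sketch of how a caller uses the refactored Title interface, mirroring the call sites changed in this commit (Matches.cpp, XmlDoc.cpp); the wrapper function and its arguments are hypothetical.

#include "Title.h"

// Hypothetical caller; xd, xml, ww, q and niceness are assumed to be already
// built by the surrounding XmlDoc machinery.
static bool hasUsableTitle ( XmlDoc *xd, Xml *xml, Words *ww, Query *q, int32_t niceness ) {
	Title ti;
	// new 6-argument signature: maxTitleWords is gone; 256 mirrors the cap
	// XmlDoc::getTitle() applies below
	if ( ! ti.setTitle( xd, xml, ww, 256, q, niceness ) ) {
		return false;
	}
	// accessors replace the direct m_title / m_titleBytes / m_titleTagStart /
	// m_titleTagEnd reads updated elsewhere in this commit
	char    *title    = ti.getTitle();
	int32_t  titleLen = ti.getTitleLen();       // bytes, does NOT include \0
	int32_t  tagStart = ti.getTitleTagStart();
	int32_t  tagEnd   = ti.getTitleTagEnd();
	return title && titleLen > 0 && tagStart >= 0 && tagEnd > tagStart;
}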

@@ -3081,7 +3081,7 @@ bool XmlDoc::indexDoc2 ( ) {
//if ( m_forceDelete ) m_decCount = true;
// fix for the exact quota bug found on eurekster collection. bug 229
// if we're not a new doc, then don't increment the count because
// if we're not a new doc, then don't increment the count because
// we have been already counted as the old doc. MDW: i added the
// condition that if decCount is true we need to update the count!
if ( *isIndexed && ! m_decCount ) return logIt();
@@ -3106,12 +3106,12 @@ bool XmlDoc::indexDoc2 ( ) {
if ( m_msg16.m_termIdHost == 0 ) { char *xx = NULL; *xx = 0; }
if ( m_msg16.m_termIdDom == 0 ) { char *xx = NULL; *xx = 0; }
// . Msg36 gets the correct count from disk and puts it in cache. It
// . Msg36 gets the correct count from disk and puts it in cache. It
// doesn't try to increment or decrement the quotas in cache, because
// then it would have to be done on all twins, and also the correct
// split will have to be found.
// . Actually, we should only use the cache on one host to hold the
// sum of all splits. This will be the authority cache.
// split will have to be found.
// . Actually, we should only use the cache on one host to hold the
// sum of all splits. This will be the authority cache.
if ( ! m_updatedCounts ) {
// only call this once
m_updatedCounts = true;
@@ -9247,7 +9247,7 @@ int32_t *XmlDoc::getSummaryVector ( ) {
//int32_t avail = 5000;
//int32_t len;
// put title into there
int32_t tlen = ti->m_titleBytes - 1;
int32_t tlen = ti->getTitleLen() - 1;
//if ( len > avail ) len = avail - 10;
if ( tlen < 0 ) tlen = 0;
@@ -9260,7 +9260,7 @@ int32_t *XmlDoc::getSummaryVector ( ) {
//gbmemcpy ( p , ti->m_title , len );
//p += len;
sb.safeMemcpy ( ti->m_title , tlen );
sb.safeMemcpy ( ti->getTitle() , tlen );
// space separting the title from summary
if ( tlen > 0 ) sb.pushChar(' ');
@@ -21547,7 +21547,7 @@ int32_t *XmlDoc::getSpiderPriority ( ) {
return &m_priority;
}
bool XmlDoc::logIt ( SafeBuf *bb ) {
bool XmlDoc::logIt (SafeBuf *bb ) {
// set errCode
int32_t errCode = m_indexCode;
@@ -33774,31 +33774,47 @@ SafeBuf *XmlDoc::getHeaderTagBuf() {
Title *XmlDoc::getTitle ( ) {
if ( m_titleValid ) return &m_title;
if ( m_titleValid ) {
return &m_title;
}
// need a buncha crap
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (Title *)xml;
if ( ! xml || xml == (Xml *)-1 ) {
return (Title *)xml;
}
Words *ww = getWords();
if ( ! ww || ww == (Words *)-1 ) return (Title *)ww;
if ( ! ww || ww == (Words *)-1 ) {
return (Title *)ww;
}
Query *q = getQuery();
if ( ! q ) return (Title *)q;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
int32_t titleMaxLen = cr->m_titleMaxLen;
if ( m_req ) titleMaxLen = m_req->m_titleMaxLen;
if ( ! q ) {
return (Title *)q;
}
int32_t titleMaxLen = 256;
if ( m_req ) {
titleMaxLen = m_req->m_titleMaxLen;
} else {
CollectionRec *cr = getCollRec();
if (cr) {
titleMaxLen = cr->m_titleMaxLen;
}
}
// limit for speed, some guys have a 100k word title!
if ( titleMaxLen > 256 ) titleMaxLen = 256;
if ( titleMaxLen > 256 ) {
titleMaxLen = 256;
}
m_titleValid = true;
if ( ! m_title.setTitle ( this ,
xml ,
ww ,
titleMaxLen ,
0xffff ,
q ,
m_niceness ) )
if ( ! m_title.setTitle( this, xml, ww, titleMaxLen, q, m_niceness) ) {
return NULL;
}
return &m_title;
}
@@ -33857,9 +33873,9 @@ Summary *XmlDoc::getSummary () {
m_cpuSummaryStartTime = start;
// make sure summary does not include title
char *tbuf = ti->m_title;
char *tbuf = ti->getTitle();
// this does not include the terminating \0
int32_t tbufLen = ti->m_titleBytes;
int32_t tbufLen = ti->getTitleLen();
// compute the summary
bool status;

@@ -17,7 +17,8 @@ def gb(request):
# verify gb is running
try:
requests.get('http://' + gb_config.host + ':' + gb_config.port)
api = gigablast.GigablastAPI(gb_config)
api.status()
except requests.exceptions.ConnectionError:
pytest.skip('Gigablast instance down')
@@ -26,3 +27,14 @@ def gb(request):
request.addfinalizer(finalize)
return gb_config
@pytest.fixture(scope='function')
def gb_api(request, gb):
api = gigablast.GigablastAPI(gb)
def finalize():
api.finalize()
request.addfinalizer(finalize)
return api

Binary file not shown.

Binary file not shown.

@@ -7,17 +7,113 @@ class GigablastConfig:
self.port = config['port']
class GigablastSearch:
class GigablastAPI:
class _HTTPStatus:
@staticmethod
def compare(status, expected_status):
return status[status.find('(')+1:status.find(')')] == expected_status
@staticmethod
def doc_force_delete():
return 'Doc force deleted'
def __init__(self, gb_config):
self.config = gb_config
self._config = gb_config
self._add_urls = set()
def finalize(self):
# cleanup urls
for url in self._add_urls:
self.delete_url(url, True)
def _get_url(self, path):
return 'http://' + self._config.host + ':' + self._config.port + '/' + path
@staticmethod
def _apply_default_payload(payload):
payload.setdefault('c', 'main')
payload.setdefault('format', 'json')
payload.setdefault('showinput', '0')
def _check_http_status(self, e, expected_status):
# hacks to cater for inject returning invalid status line
if (len(e.args) == 1 and
type(e.args[0]) == requests.packages.urllib3.exceptions.ProtocolError and
len(e.args[0].args) == 2):
import http.client
if type(e.args[0].args[1]) == http.client.BadStatusLine:
if self._HTTPStatus.compare(str(e.args[0].args[1]), expected_status):
return True
return False
def _add_url(self, url, payload=None):
if not payload:
payload = {}
self._apply_default_payload(payload)
payload.update({'urls': url})
response = requests.get(self._get_url('admin/addurl'), params=payload)
return response.json()
def _inject(self, url, payload=None):
if not payload:
payload = {}
self._apply_default_payload(payload)
payload.update({'url': url})
response = requests.get(self._get_url('admin/inject'), params=payload)
# inject doesn't seem to wait until document is completely indexed
from time import sleep
sleep(0.1)
return response.json()
def add_url(self, url, real_time=True):
self._add_urls.add(url)
if real_time:
return self._inject(url)['response']['statusCode'] == 0
else:
return self._add_url(url)['response']['statusCode'] == 0
def delete_url(self, url, finalizer=False):
if not finalizer:
self._add_urls.discard(url)
payload = {'deleteurl': '1'}
try:
self._inject(url, payload)
except requests.exceptions.ConnectionError as e:
# delete url returns invalid HTTP status line
return self._check_http_status(e, self._HTTPStatus.doc_force_delete())
return False
def search(self, query, payload=None):
if not payload:
payload = {}
payload.update({'format': 'json'})
self._apply_default_payload(payload)
payload.update({'q': query})
response = requests.get('http://' + self.config.host + ':' + self.config.port + '/search', params=payload)
response = requests.get(self._get_url('search'), params=payload)
return response.json()
def status(self, payload=None):
if not payload:
payload = {}
self._apply_default_payload(payload)
response = requests.get(self._get_url('admin/status'), params=payload)
return response.json()

test/system/pytest.ini (new file, 2 lines)

@@ -0,0 +1,2 @@
[pytest]
norecursedirs = data

@@ -3,3 +3,4 @@
pytest==2.8.3
requests==2.7.0
pytest-localserver==0.3.4

@@ -0,0 +1,27 @@
import pytest
import os
@pytest.mark.parametrize('file_location, url_with_file, content_type, expected_title', [
# file_location url_with_file content_type expected_title
('test_word_no_properties.pdf', True, 'application/pdf', 'test_word_no_properties.pdf'),
('test_word_no_properties.pdf', False, 'application/pdf', ''),
('test_word_with_properties.pdf', True, 'application/pdf', 'Title for Microsoft Word (in title)'),
])
def test_index_documents_office(gb_api, httpserver, file_location, url_with_file, content_type, expected_title):
httpserver.serve_content(content=open('data/office/' + file_location, 'rb').read(),
headers={'content-type': content_type})
print(httpserver.url)
# format url
file_url = httpserver.url + '/'
if url_with_file:
file_url += os.path.basename(file_location)
# add url
assert gb_api.add_url(file_url) == True
# verify result
result = gb_api.search('url:' + file_url)
assert len(result['results']) == 1
assert result['results'][0]['title'] == expected_title

@@ -1,5 +1,4 @@
import pytest
import gigablast
@pytest.mark.parametrize('query, fx_qlang, fx_blang, fx_fetld, fx_country, expected_lang', [
@@ -21,11 +20,9 @@ import gigablast
('Smurfene', '', '', '', '', 'is'), # wrong
('Smurfene', '', 'en-US', '', '', 'en'), # wrong
('Smurfene', '', 'no-NO', '', '', 'no'),
('Smurfene', '', '', '', 'no', 'no')
('Smurfene', '', '', '', 'no', 'no'),
])
def test_search_language_hint(gb, query, fx_qlang, fx_blang, fx_fetld, fx_country, expected_lang):
gb_search = gigablast.GigablastSearch(gb)
def test_search_language_hint(gb_api, query, fx_qlang, fx_blang, fx_fetld, fx_country, expected_lang):
payload = {}
# add language hints
@@ -34,6 +31,6 @@ def test_search_language_hint(gb, query, fx_qlang, fx_blang, fx_fetld, fx_countr
payload.update({'fx_fetld': fx_fetld})
payload.update({'fx_country': fx_country})
result = gb_search.search(query, payload)
result = gb_api.search(query, payload)
assert result['queryInfo']['queryLanguageAbbr'] == expected_lang