mirror of
https://github.com/privacore/open-source-search-engine.git
synced 2025-07-11 02:16:07 -04:00
Fix title for PDF files & add some simple tests for it
This commit is contained in:
@ -344,10 +344,10 @@ bool Matches::set ( XmlDoc *xd ,
|
||||
niceness ) )
|
||||
return false;
|
||||
|
||||
// also use the title from the title tag, because sometimes
|
||||
// it does not equal "tt->getTitle()"
|
||||
int32_t a = tt->m_titleTagStart;
|
||||
int32_t b = tt->m_titleTagEnd;
|
||||
// also use the title from the title tag, because sometimes it does not equal "tt->getTitle()"
|
||||
int32_t a = tt->getTitleTagStart();
|
||||
int32_t b = tt->getTitleTagEnd();
|
||||
|
||||
char *start = NULL;
|
||||
char *end = NULL;
|
||||
if ( a >= 0 && b >= 0 && b>a ) {
|
||||
|
@ -2738,7 +2738,7 @@ bool printCrawlDetails2 (SafeBuf *sb , CollectionRec *cx , char format ) {
|
||||
|
||||
if ( format == FORMAT_JSON ) {
|
||||
sb->safePrintf("{"
|
||||
"\"response:{\n"
|
||||
"\"response\":{\n"
|
||||
"\t\"statusCode\":%"INT32",\n"
|
||||
"\t\"statusMsg\":\"%s\",\n"
|
||||
"\t\"jobCreationTimeUTC\":%"INT32",\n"
|
||||
@ -2758,7 +2758,7 @@ bool printCrawlDetails2 (SafeBuf *sb , CollectionRec *cx , char format ) {
|
||||
);
|
||||
sb->safePrintf("\t\"currentTime\":%"UINT32",\n",
|
||||
(uint32_t)getTimeGlobal() );
|
||||
sb->safePrintf("\t\"currentTimeUTC\":%"UINT32",\n",
|
||||
sb->safePrintf("\t\"currentTimeUTC\":%"UINT32"\n",
|
||||
(uint32_t)getTimeGlobal() );
|
||||
sb->safePrintf("\t}\n");
|
||||
sb->safePrintf("}\n");
|
||||
|
148
Pos.cpp
148
Pos.cpp
@ -32,83 +32,65 @@ int32_t Pos::filter( char *p, char *pend, class Words *words, int32_t a, int32_t
|
||||
// . returns false and sets g_errno on error
|
||||
// . if f is non-NULL store filtered words into there. back to back spaces
|
||||
// are eliminated.
|
||||
bool Pos::set (Words *words ,
|
||||
char *f ,
|
||||
char *fend,
|
||||
int32_t *len ,
|
||||
int32_t a ,
|
||||
int32_t b ,
|
||||
char *buf ,
|
||||
int32_t bufSize ) {
|
||||
|
||||
bool Pos::set (Words *words, char *f, char *fend, int32_t *len , int32_t a , int32_t b, char *buf, int32_t bufSize ) {
|
||||
// free m_buf in case this is a second call
|
||||
if ( ! f ) reset();
|
||||
if ( ! f ) {
|
||||
reset();
|
||||
}
|
||||
|
||||
int32_t nw = words->getNumWords();
|
||||
int32_t *wlens = words->m_wordLens;
|
||||
nodeid_t *tids = words->getTagIds(); // m_tagIds;
|
||||
char **wp = words->m_words;
|
||||
//int32_t *ss = NULL;
|
||||
//int64_t *wids = words->m_wordIds;
|
||||
//if ( scores ) ss = scores->m_scores;
|
||||
int32_t nw = words->getNumWords();
|
||||
int32_t *wlens = words->m_wordLens;
|
||||
nodeid_t *tids = words->getTagIds(); // m_tagIds;
|
||||
char **wp = words->m_words;
|
||||
|
||||
// save start point for filtering
|
||||
char *fstart = f;
|
||||
|
||||
// -1 is the default value
|
||||
if ( b == -1 ) b = nw;
|
||||
if ( b == -1 ) {
|
||||
b = nw;
|
||||
}
|
||||
|
||||
// alloc array if need to
|
||||
int32_t need = (nw+1) * 4;
|
||||
|
||||
// do not destroy m_pos/m_numWords if only filtering into a buffer
|
||||
if ( f ) goto skip;
|
||||
if ( !f ) {
|
||||
m_needsFree = false;
|
||||
|
||||
m_needsFree = false;
|
||||
|
||||
m_buf = m_localBuf;
|
||||
if ( need > POS_LOCALBUFSIZE && need < bufSize )
|
||||
m_buf = buf;
|
||||
else if ( need > POS_LOCALBUFSIZE ) {
|
||||
m_buf = (char *)mmalloc(need,"Pos");
|
||||
m_needsFree = true;
|
||||
m_buf = m_localBuf;
|
||||
if ( need > POS_LOCALBUFSIZE && need < bufSize )
|
||||
m_buf = buf;
|
||||
else if ( need > POS_LOCALBUFSIZE ) {
|
||||
m_buf = (char *)mmalloc(need,"Pos");
|
||||
m_needsFree = true;
|
||||
}
|
||||
// bail on error
|
||||
if ( ! m_buf ) return false;
|
||||
m_bufSize = need;
|
||||
m_pos = (int32_t *)m_buf;
|
||||
m_numWords = nw;
|
||||
}
|
||||
// bail on error
|
||||
if ( ! m_buf ) return false;
|
||||
m_bufSize = need;
|
||||
m_pos = (int32_t *)m_buf;
|
||||
m_numWords = nw;
|
||||
|
||||
skip:
|
||||
// this is the CHARACTER count.
|
||||
int32_t pos = 0;
|
||||
bool trunc = false;
|
||||
char *p , *pend;
|
||||
//char *nextp;
|
||||
//int32_t skip;
|
||||
|
||||
char* lastBreak = NULL;
|
||||
// utf8 char
|
||||
//int32_t c;
|
||||
// its size in bytes
|
||||
//char cs;
|
||||
|
||||
// shortcut
|
||||
//Section **sp = NULL;
|
||||
//if ( sections ) sp = sections->m_sectionPtrs;
|
||||
|
||||
//int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_MARQUEE;
|
||||
|
||||
// flag for stopping back-to-back spaces. only count those as one char.
|
||||
bool lastSpace = false;
|
||||
int32_t maxCharSize = 4; // we are utf8
|
||||
int32_t maxCharSize = 4; // we are utf8
|
||||
for ( int32_t i = a ; i < b ; i++ ) {
|
||||
if (trunc) break;
|
||||
// set pos for the ith word to "pos"
|
||||
if ( ! f ) m_pos[i] = pos;
|
||||
if (trunc) {
|
||||
break;
|
||||
}
|
||||
|
||||
// if inside a bad tag, skip it
|
||||
//if ( sp && (sp[i]->m_flags & badFlags) ) continue;
|
||||
// set pos for the ith word to "pos"
|
||||
if ( ! f ) {
|
||||
m_pos[i] = pos;
|
||||
}
|
||||
|
||||
// is tag?
|
||||
if ( tids && tids[i] ) {
|
||||
@ -130,35 +112,44 @@ bool Pos::set (Words *words ,
|
||||
}
|
||||
// if had a previous breaking tag and no non-tag
|
||||
// word after it, do not count back-to-back spaces
|
||||
if ( lastSpace ) continue;
|
||||
if ( lastSpace ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// if had a br tag count it as a '.'
|
||||
if ( tids[i] ) { // == 20 ) { // <br>
|
||||
if ( tids[i] ) { // <br>
|
||||
// are we filtering?
|
||||
if ( f && f != fstart ) {
|
||||
if ((fend-f>2*maxCharSize)) {
|
||||
*f++ = '.';
|
||||
*f++ = ' ';
|
||||
} else {
|
||||
trunc = true;
|
||||
}
|
||||
else trunc = true;
|
||||
}
|
||||
// count as double periods
|
||||
//pos += 3;
|
||||
|
||||
// no, just single period.
|
||||
pos += 2;
|
||||
lastSpace = true;
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// are we filtering?
|
||||
if ( f ) {
|
||||
if ((fend-f > maxCharSize)) {
|
||||
*f++ = ' ';
|
||||
} else {
|
||||
trunc = true;
|
||||
}
|
||||
else trunc = true;
|
||||
}
|
||||
|
||||
// count as a single space
|
||||
pos++;
|
||||
|
||||
// do not allow back-to-back spaces
|
||||
lastSpace = true;
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -171,52 +162,71 @@ bool Pos::set (Words *words ,
|
||||
for ( ; p < pend ; p += cs ) {
|
||||
// get size
|
||||
cs = getUtf8CharSize(p);
|
||||
|
||||
// do not count space if one before
|
||||
if ( is_wspace_utf8 (p) ) {
|
||||
if ( lastSpace ) continue;
|
||||
if ( lastSpace ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
lastSpace = true;
|
||||
|
||||
// are we filtering?
|
||||
if ( f ) {
|
||||
if (fend-f > 1 ) {
|
||||
lastBreak = f;
|
||||
*f++ = ' ';
|
||||
} else {
|
||||
trunc = true;
|
||||
}
|
||||
else trunc = true;
|
||||
}
|
||||
|
||||
pos++;
|
||||
continue;
|
||||
}
|
||||
if ( f ) {
|
||||
if (fend-f > cs){
|
||||
if (fend-f > cs) {
|
||||
// change '|' to commas
|
||||
if ( *p == '|' )
|
||||
if ( *p == '|' ) {
|
||||
*f++ = ',';
|
||||
else if ( cs == 1 )
|
||||
} else if ( cs == 1 ) {
|
||||
*f++ = *p;
|
||||
else {
|
||||
} else {
|
||||
gbmemcpy(f,p,cs);
|
||||
f += cs;
|
||||
}
|
||||
}
|
||||
else trunc = true;
|
||||
else {
|
||||
trunc = true;
|
||||
}
|
||||
}
|
||||
|
||||
pos++;
|
||||
lastSpace = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (trunc) {
|
||||
if(lastBreak == NULL) {
|
||||
*len = 0;
|
||||
return false;
|
||||
}
|
||||
else if(f) f = lastBreak;
|
||||
else if (f) {
|
||||
f = lastBreak;
|
||||
}
|
||||
}
|
||||
|
||||
// set pos for the END of the last word here (used in Summary.cpp)
|
||||
if ( ! f ) m_pos[nw] = pos;
|
||||
// NULL terminate f
|
||||
else { *len = f - fstart; }
|
||||
if ( fend-f > maxCharSize) { *f = '\0';}
|
||||
if ( ! f ) {
|
||||
m_pos[nw] = pos;
|
||||
} else { // NULL terminate f
|
||||
*len = f - fstart;
|
||||
}
|
||||
|
||||
if ( fend-f > maxCharSize) {
|
||||
*f = '\0';
|
||||
}
|
||||
|
||||
// Success
|
||||
return true;
|
||||
}
|
||||
|
2
Pos.h
2
Pos.h
@ -38,8 +38,6 @@ class Pos {
|
||||
int32_t a = 0 ,
|
||||
int32_t b = -1 );
|
||||
|
||||
int32_t getMemUsed () { return m_bufSize; };
|
||||
|
||||
// . the position in CHARACTERS of word i is given by m_pos[i]
|
||||
// . this is NOT the byte position. you can have 2, 3 or even 4
|
||||
// byte characters in utf8. the purpose here is for counting
|
||||
|
66
Title.h
66
Title.h
@ -3,10 +3,16 @@
|
||||
#ifndef _TITLE_H_
|
||||
#define _TITLE_H_
|
||||
|
||||
#include "Query.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#define TITLE_LOCAL_SIZE 2048
|
||||
|
||||
// forward declaration
|
||||
class XmlDoc;
|
||||
class Xml;
|
||||
class Words;
|
||||
class Query;
|
||||
|
||||
class Title {
|
||||
public:
|
||||
Title();
|
||||
@ -14,6 +20,7 @@ public:
|
||||
|
||||
void reset();
|
||||
|
||||
/// @todo correct comments
|
||||
// . set m_title to the title of the document represented by "xd"
|
||||
// . if getHardTitle is true will always use the title in the <title>
|
||||
// tag, but if that is not present, will try dmoz titles before
|
||||
@ -30,42 +37,45 @@ public:
|
||||
// . does not consult words with scores of 0 (unless a meta tag)
|
||||
// . maxTitleChars is in chars (a utf16 char is 2 bytes or more)
|
||||
// . maxTitleChars of -1 means no max
|
||||
bool setTitle (class XmlDoc *xd ,
|
||||
class Xml *xml ,
|
||||
class Words *words ,
|
||||
int32_t maxTitleChars ,
|
||||
int32_t maxTitleWords ,
|
||||
Query *q ,
|
||||
int32_t niceness );
|
||||
bool setTitle(XmlDoc *xd, Xml *xml, Words *words, int32_t maxTitleChars, Query *q, int32_t niceness);
|
||||
|
||||
char *getTitle() {
|
||||
return m_title;
|
||||
}
|
||||
|
||||
char *getTitle ( ) { return m_title; }
|
||||
int32_t getTitleLen ( ) { return m_titleBytes; } // does NOT include \0
|
||||
// does NOT include \0
|
||||
int32_t getTitleLen() {
|
||||
return m_titleBytes;
|
||||
}
|
||||
|
||||
int32_t getTitleTagStart() {
|
||||
return m_titleTagStart;
|
||||
}
|
||||
|
||||
bool copyTitle (class Words *words, int32_t t0, int32_t t1 );
|
||||
int32_t getTitleTagEnd() {
|
||||
return m_titleTagEnd;
|
||||
}
|
||||
|
||||
float getSimilarity ( Words *w1 , int32_t i0 , int32_t i1 , Words *w2 , int32_t t0 , int32_t t1 );
|
||||
bool copyTitle(Words *words, int32_t t0, int32_t t1);
|
||||
|
||||
char *m_title;
|
||||
int32_t m_titleBytes; // in bytes. does NOT include \0
|
||||
int32_t m_titleAllocSize;
|
||||
char m_localBuf [ TITLE_LOCAL_SIZE ];
|
||||
char m_niceness;
|
||||
|
||||
int32_t m_maxTitleChars;
|
||||
|
||||
int32_t m_titleTagStart ;
|
||||
int32_t m_titleTagEnd ;
|
||||
float getSimilarity(Words *w1, int32_t i0, int32_t i1, Words *w2, int32_t t0, int32_t t1);
|
||||
|
||||
private:
|
||||
|
||||
bool setTitle4 (class XmlDoc *xd,
|
||||
class Xml *xml,
|
||||
class Words *words,
|
||||
int32_t maxTitleChars,
|
||||
int32_t maxTitleWords,
|
||||
Query *q);
|
||||
char *m_title;
|
||||
int32_t m_titleBytes; // in bytes. does NOT include \0
|
||||
int32_t m_titleAllocSize;
|
||||
char m_localBuf[TITLE_LOCAL_SIZE];
|
||||
char m_niceness;
|
||||
|
||||
int32_t m_maxTitleChars;
|
||||
|
||||
int32_t m_titleTagStart;
|
||||
int32_t m_titleTagEnd;
|
||||
|
||||
|
||||
|
||||
bool setTitle4(XmlDoc *xd, Xml *xml, Words *words, int32_t maxTitleChars, int32_t maxTitleWords, Query *q);
|
||||
|
||||
};
|
||||
|
||||
|
68
XmlDoc.cpp
68
XmlDoc.cpp
@ -3081,7 +3081,7 @@ bool XmlDoc::indexDoc2 ( ) {
|
||||
//if ( m_forceDelete ) m_decCount = true;
|
||||
|
||||
// fix for the exact quota bug found on eurekster collection. bug 229
|
||||
// if we're not a new doc, then don't increment the count because
|
||||
// if we're not a new doc, then don't increment the count because
|
||||
// we have been already counted as the old doc. MDW: i added the
|
||||
// condition that if decCount is true we need to update the count!
|
||||
if ( *isIndexed && ! m_decCount ) return logIt();
|
||||
@ -3106,12 +3106,12 @@ bool XmlDoc::indexDoc2 ( ) {
|
||||
if ( m_msg16.m_termIdHost == 0 ) { char *xx = NULL; *xx = 0; }
|
||||
if ( m_msg16.m_termIdDom == 0 ) { char *xx = NULL; *xx = 0; }
|
||||
|
||||
// . Msg36 gets the correct count from disk and puts it in cache. It
|
||||
// . Msg36 gets the correct count from disk and puts it in cache. It
|
||||
// doesn't try to increment or decrement the quotas in cache, because
|
||||
// then it would have to be done on all twins, and also the correct
|
||||
// split will have to be found.
|
||||
// . Actually, we should only use the cache on one host to hold the
|
||||
// sum of all splits. This will be the authority cache.
|
||||
// split will have to be found.
|
||||
// . Actually, we should only use the cache on one host to hold the
|
||||
// sum of all splits. This will be the authority cache.
|
||||
if ( ! m_updatedCounts ) {
|
||||
// only call this once
|
||||
m_updatedCounts = true;
|
||||
@ -9247,7 +9247,7 @@ int32_t *XmlDoc::getSummaryVector ( ) {
|
||||
//int32_t avail = 5000;
|
||||
//int32_t len;
|
||||
// put title into there
|
||||
int32_t tlen = ti->m_titleBytes - 1;
|
||||
int32_t tlen = ti->getTitleLen() - 1;
|
||||
//if ( len > avail ) len = avail - 10;
|
||||
if ( tlen < 0 ) tlen = 0;
|
||||
|
||||
@ -9260,7 +9260,7 @@ int32_t *XmlDoc::getSummaryVector ( ) {
|
||||
|
||||
//gbmemcpy ( p , ti->m_title , len );
|
||||
//p += len;
|
||||
sb.safeMemcpy ( ti->m_title , tlen );
|
||||
sb.safeMemcpy ( ti->getTitle() , tlen );
|
||||
// space separating the title from summary
|
||||
if ( tlen > 0 ) sb.pushChar(' ');
|
||||
|
||||
@ -21547,7 +21547,7 @@ int32_t *XmlDoc::getSpiderPriority ( ) {
|
||||
return &m_priority;
|
||||
}
|
||||
|
||||
bool XmlDoc::logIt ( SafeBuf *bb ) {
|
||||
bool XmlDoc::logIt (SafeBuf *bb ) {
|
||||
|
||||
// set errCode
|
||||
int32_t errCode = m_indexCode;
|
||||
@ -33774,31 +33774,47 @@ SafeBuf *XmlDoc::getHeaderTagBuf() {
|
||||
|
||||
|
||||
Title *XmlDoc::getTitle ( ) {
|
||||
if ( m_titleValid ) return &m_title;
|
||||
if ( m_titleValid ) {
|
||||
return &m_title;
|
||||
}
|
||||
|
||||
// need a buncha crap
|
||||
Xml *xml = getXml();
|
||||
if ( ! xml || xml == (Xml *)-1 ) return (Title *)xml;
|
||||
if ( ! xml || xml == (Xml *)-1 ) {
|
||||
return (Title *)xml;
|
||||
}
|
||||
|
||||
Words *ww = getWords();
|
||||
if ( ! ww || ww == (Words *)-1 ) return (Title *)ww;
|
||||
if ( ! ww || ww == (Words *)-1 ) {
|
||||
return (Title *)ww;
|
||||
}
|
||||
|
||||
Query *q = getQuery();
|
||||
if ( ! q ) return (Title *)q;
|
||||
CollectionRec *cr = getCollRec();
|
||||
if ( ! cr ) return NULL;
|
||||
int32_t titleMaxLen = cr->m_titleMaxLen;
|
||||
if ( m_req ) titleMaxLen = m_req->m_titleMaxLen;
|
||||
if ( ! q ) {
|
||||
return (Title *)q;
|
||||
}
|
||||
|
||||
int32_t titleMaxLen = 256;
|
||||
if ( m_req ) {
|
||||
titleMaxLen = m_req->m_titleMaxLen;
|
||||
} else {
|
||||
CollectionRec *cr = getCollRec();
|
||||
if (cr) {
|
||||
titleMaxLen = cr->m_titleMaxLen;
|
||||
}
|
||||
}
|
||||
|
||||
// limit for speed, some guys have a 100k word title!
|
||||
if ( titleMaxLen > 256 ) titleMaxLen = 256;
|
||||
if ( titleMaxLen > 256 ) {
|
||||
titleMaxLen = 256;
|
||||
}
|
||||
|
||||
m_titleValid = true;
|
||||
|
||||
if ( ! m_title.setTitle ( this ,
|
||||
xml ,
|
||||
ww ,
|
||||
titleMaxLen ,
|
||||
0xffff ,
|
||||
q ,
|
||||
m_niceness ) )
|
||||
if ( ! m_title.setTitle( this, xml, ww, titleMaxLen, q, m_niceness) ) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return &m_title;
|
||||
}
|
||||
|
||||
@ -33857,9 +33873,9 @@ Summary *XmlDoc::getSummary () {
|
||||
m_cpuSummaryStartTime = start;
|
||||
|
||||
// make sure summary does not include title
|
||||
char *tbuf = ti->m_title;
|
||||
char *tbuf = ti->getTitle();
|
||||
// this does not include the terminating \0
|
||||
int32_t tbufLen = ti->m_titleBytes;
|
||||
int32_t tbufLen = ti->getTitleLen();
|
||||
|
||||
// compute the summary
|
||||
bool status;
|
||||
|
@ -17,7 +17,8 @@ def gb(request):
|
||||
|
||||
# verify gb is running
|
||||
try:
|
||||
requests.get('http://' + gb_config.host + ':' + gb_config.port)
|
||||
api = gigablast.GigablastAPI(gb_config)
|
||||
api.status()
|
||||
except requests.exceptions.ConnectionError:
|
||||
pytest.skip('Gigablast instance down')
|
||||
|
||||
@ -26,3 +27,14 @@ def gb(request):
|
||||
|
||||
request.addfinalizer(finalize)
|
||||
return gb_config
|
||||
|
||||
|
||||
@pytest.fixture(scope='function')
|
||||
def gb_api(request, gb):
|
||||
api = gigablast.GigablastAPI(gb)
|
||||
|
||||
def finalize():
|
||||
api.finalize()
|
||||
|
||||
request.addfinalizer(finalize)
|
||||
return api
|
||||
|
BIN
test/system/data/office/test_word_no_properties.pdf
Normal file
BIN
test/system/data/office/test_word_no_properties.pdf
Normal file
Binary file not shown.
BIN
test/system/data/office/test_word_with_properties.pdf
Normal file
BIN
test/system/data/office/test_word_with_properties.pdf
Normal file
Binary file not shown.
@ -7,17 +7,113 @@ class GigablastConfig:
|
||||
self.port = config['port']
|
||||
|
||||
|
||||
class GigablastSearch:
|
||||
class GigablastAPI:
|
||||
class _HTTPStatus:
|
||||
@staticmethod
|
||||
def compare(status, expected_status):
|
||||
return status[status.find('(')+1:status.find(')')] == expected_status
|
||||
|
||||
@staticmethod
|
||||
def doc_force_delete():
|
||||
return 'Doc force deleted'
|
||||
|
||||
def __init__(self, gb_config):
|
||||
self.config = gb_config
|
||||
self._config = gb_config
|
||||
self._add_urls = set()
|
||||
|
||||
def finalize(self):
|
||||
# cleanup urls
|
||||
for url in self._add_urls:
|
||||
self.delete_url(url, True)
|
||||
|
||||
def _get_url(self, path):
|
||||
return 'http://' + self._config.host + ':' + self._config.port + '/' + path
|
||||
|
||||
@staticmethod
|
||||
def _apply_default_payload(payload):
|
||||
payload.setdefault('c', 'main')
|
||||
payload.setdefault('format', 'json')
|
||||
payload.setdefault('showinput', '0')
|
||||
|
||||
def _check_http_status(self, e, expected_status):
|
||||
# hacks to cater for inject returning invalid status line
|
||||
if (len(e.args) == 1 and
|
||||
type(e.args[0]) == requests.packages.urllib3.exceptions.ProtocolError and
|
||||
len(e.args[0].args) == 2):
|
||||
import http.client
|
||||
if type(e.args[0].args[1]) == http.client.BadStatusLine:
|
||||
if self._HTTPStatus.compare(str(e.args[0].args[1]), expected_status):
|
||||
return True
|
||||
return False
|
||||
|
||||
def _add_url(self, url, payload=None):
|
||||
if not payload:
|
||||
payload = {}
|
||||
|
||||
self._apply_default_payload(payload)
|
||||
|
||||
payload.update({'urls': url})
|
||||
|
||||
response = requests.get(self._get_url('admin/addurl'), params=payload)
|
||||
|
||||
return response.json()
|
||||
|
||||
def _inject(self, url, payload=None):
|
||||
if not payload:
|
||||
payload = {}
|
||||
|
||||
self._apply_default_payload(payload)
|
||||
|
||||
payload.update({'url': url})
|
||||
|
||||
response = requests.get(self._get_url('admin/inject'), params=payload)
|
||||
|
||||
# inject doesn't seem to wait until document is completely indexed
|
||||
from time import sleep
|
||||
sleep(0.1)
|
||||
|
||||
return response.json()
|
||||
|
||||
def add_url(self, url, real_time=True):
|
||||
self._add_urls.add(url)
|
||||
|
||||
if real_time:
|
||||
return self._inject(url)['response']['statusCode'] == 0
|
||||
else:
|
||||
return self._add_url(url)['response']['statusCode'] == 0
|
||||
|
||||
def delete_url(self, url, finalizer=False):
|
||||
if not finalizer:
|
||||
self._add_urls.discard(url)
|
||||
|
||||
payload = {'deleteurl': '1'}
|
||||
|
||||
try:
|
||||
self._inject(url, payload)
|
||||
except requests.exceptions.ConnectionError as e:
|
||||
# delete url returns invalid HTTP status line
|
||||
return self._check_http_status(e, self._HTTPStatus.doc_force_delete())
|
||||
|
||||
return False
|
||||
|
||||
def search(self, query, payload=None):
|
||||
if not payload:
|
||||
payload = {}
|
||||
|
||||
payload.update({'format': 'json'})
|
||||
self._apply_default_payload(payload)
|
||||
|
||||
payload.update({'q': query})
|
||||
|
||||
response = requests.get('http://' + self.config.host + ':' + self.config.port + '/search', params=payload)
|
||||
response = requests.get(self._get_url('search'), params=payload)
|
||||
|
||||
return response.json()
|
||||
|
||||
def status(self, payload=None):
|
||||
if not payload:
|
||||
payload = {}
|
||||
|
||||
self._apply_default_payload(payload)
|
||||
|
||||
response = requests.get(self._get_url('admin/status'), params=payload)
|
||||
|
||||
return response.json()
|
||||
|
2
test/system/pytest.ini
Normal file
2
test/system/pytest.ini
Normal file
@ -0,0 +1,2 @@
|
||||
[pytest]
|
||||
norecursedirs = data
|
@ -3,3 +3,4 @@
|
||||
|
||||
pytest==2.8.3
|
||||
requests==2.7.0
|
||||
pytest-localserver==0.3.4
|
||||
|
27
test/system/test_index_documents.py
Normal file
27
test/system/test_index_documents.py
Normal file
@ -0,0 +1,27 @@
|
||||
import pytest
|
||||
import os
|
||||
|
||||
|
||||
@pytest.mark.parametrize('file_location, url_with_file, content_type, expected_title', [
|
||||
# file_location url_with_file content_type expected_title
|
||||
('test_word_no_properties.pdf', True, 'application/pdf', 'test_word_no_properties.pdf'),
|
||||
('test_word_no_properties.pdf', False, 'application/pdf', ''),
|
||||
('test_word_with_properties.pdf', True, 'application/pdf', 'Title for Microsoft Word (in title)'),
|
||||
])
|
||||
def test_index_documents_office(gb_api, httpserver, file_location, url_with_file, content_type, expected_title):
|
||||
httpserver.serve_content(content=open('data/office/' + file_location, 'rb').read(),
|
||||
headers={'content-type': content_type})
|
||||
print(httpserver.url)
|
||||
|
||||
# format url
|
||||
file_url = httpserver.url + '/'
|
||||
if url_with_file:
|
||||
file_url += os.path.basename(file_location)
|
||||
|
||||
# add url
|
||||
assert gb_api.add_url(file_url) == True
|
||||
|
||||
# verify result
|
||||
result = gb_api.search('url:' + file_url)
|
||||
assert len(result['results']) == 1
|
||||
assert result['results'][0]['title'] == expected_title
|
@ -1,5 +1,4 @@
|
||||
import pytest
|
||||
import gigablast
|
||||
|
||||
|
||||
@pytest.mark.parametrize('query, fx_qlang, fx_blang, fx_fetld, fx_country, expected_lang', [
|
||||
@ -21,11 +20,9 @@ import gigablast
|
||||
('Smurfene', '', '', '', '', 'is'), # wrong
|
||||
('Smurfene', '', 'en-US', '', '', 'en'), # wrong
|
||||
('Smurfene', '', 'no-NO', '', '', 'no'),
|
||||
('Smurfene', '', '', '', 'no', 'no')
|
||||
('Smurfene', '', '', '', 'no', 'no'),
|
||||
])
|
||||
def test_search_language_hint(gb, query, fx_qlang, fx_blang, fx_fetld, fx_country, expected_lang):
|
||||
gb_search = gigablast.GigablastSearch(gb)
|
||||
|
||||
def test_search_language_hint(gb_api, query, fx_qlang, fx_blang, fx_fetld, fx_country, expected_lang):
|
||||
payload = {}
|
||||
|
||||
# add language hints
|
||||
@ -34,6 +31,6 @@ def test_search_language_hint(gb, query, fx_qlang, fx_blang, fx_fetld, fx_countr
|
||||
payload.update({'fx_fetld': fx_fetld})
|
||||
payload.update({'fx_country': fx_country})
|
||||
|
||||
result = gb_search.search(query, payload)
|
||||
result = gb_api.search(query, payload)
|
||||
|
||||
assert result['queryInfo']['queryLanguageAbbr'] == expected_lang
|
||||
|
Reference in New Issue
Block a user