Made unittest compile&link again
parent a009153233
commit 91c6919c02
@@ -16,7 +16,6 @@ OBJECTS = GigablastTest.o GigablastTestUtils.o \
RdbBaseTest.o RdbBucketsTest.o RdbIndexTest.o RdbListTest.o RdbTreeTest.o ResultOverrideTest.o RobotRuleTest.o RobotsCheckListTest.o RobotsTest.o \
ScalingFunctionsTest.o SiteGetterTest.o SummaryTest.o \
UnicodeTest.o UrlBlockCheckTest.o UrlComponentTest.o UrlMatchListTest.o UrlParserTest.o UrlTest.o \
WordsTest.o \
XmlDocTest.o XmlTest.o \

.PHONY: all
@@ -53,7 +52,7 @@ libgtest.so:
CPPFLAGS += -g
CPPFLAGS += -Wno-write-strings
CPPFLAGS += -Wl,-rpath=. -Wl,-rpath=$(BASE_DIR)
CPPFLAGS += -I$(BASE_DIR) -I$(BASE_DIR)/word_variations -I$(BASE_DIR)/unicode -isystem $(GTEST_DIR)/include
CPPFLAGS += -I$(BASE_DIR) -I$(BASE_DIR)/word_variations -I$(BASE_DIR)/tokenizer -I$(BASE_DIR)/unicode -isystem $(GTEST_DIR)/include
CPPFLAGS += -std=c++11

# exported in parent make
@@ -61,7 +60,7 @@ CPPFLAGS += $(CONFIG_CPPFLAGS)

LIBS += -L./ -lgtest
LIBS += $(BASE_DIR)/libgb.a -lz -lpthread -lssl -lcrypto -lpcre -lsqlite3 -ldl
LIBS += -L$(BASE_DIR) -lcld2_full -lcld3 -lprotobuf -lced -lcares -lword_variations -lsto -lunicode
LIBS += -L$(BASE_DIR) -lcld2_full -lcld3 -lprotobuf -lced -lcares -lword_variations -lsto -ltokenizer -lunicode

$(TARGET): libgtest.so libgb.a $(BASE_DIR)/libcld2_full.so $(BASE_DIR)/libcld3.so $(BASE_DIR)/libced.so $(OBJECTS)
$(CXX) $(CPPFLAGS) $(OBJECTS) $(LIBS) -o $@

@@ -1,7 +1,7 @@
#include <gtest/gtest.h>

#include "Pos.h"
#include "Words.h"
#include "tokenizer.h"
#include "Xml.h"
#include "HttpMime.h"

@@ -25,13 +25,13 @@ TEST( PosTest, FilterAllCaps ) {

size_t input_len = sizeof( input_strs ) / sizeof( input_strs[0] );
for ( size_t i = 0; i < input_len; i++ ) {
Words words;
TokenizerResult tr;
Pos pos;
char buf[MAX_BUF_SIZE];

ASSERT_TRUE( words.set( const_cast<char*>(input_strs[i]) ) );
plain_tokenizer_phase_1(input_strs[i], strlen(input_strs[i]), &tr);

int32_t len = pos.filter( &words, 0, words.getNumWords(), false, buf, buf + MAX_BUF_SIZE );
int32_t len = pos.filter( &tr, 0, tr.size(), false, buf, buf + MAX_BUF_SIZE );

EXPECT_STREQ( expected_output[i], buf );
EXPECT_EQ( strlen( expected_output[i] ), len );
@@ -106,13 +106,13 @@ TEST( PosTest, FilterEnding ) {

size_t input_len = sizeof( input_strs ) / sizeof( input_strs[0] );
for ( size_t i = 0; i < input_len; i++ ) {
Words words;
TokenizerResult tr;
Pos pos;
char buf[MAX_BUF_SIZE];

ASSERT_TRUE( words.set( const_cast<char*>(input_strs[i]) ) );
plain_tokenizer_phase_1(input_strs[i], strlen(input_strs[i]), &tr);

int32_t len = pos.filter( &words, 0, -1, true, buf, buf + 180 );
int32_t len = pos.filter( &tr, 0, tr.size(), true, buf, buf + 180 );

EXPECT_STREQ( expected_output[i], buf );
EXPECT_EQ( strlen( expected_output[i] ), len );
@@ -136,7 +136,7 @@ TEST( PosTest, FilterTags ) {
size_t input_len = sizeof( input_strs ) / sizeof( input_strs[0] );
for ( size_t i = 0; i < input_len; i++ ) {
Xml xml;
Words words;
TokenizerResult tr;
Pos pos;
char input[MAX_BUF_SIZE];
char buf[MAX_BUF_SIZE];
@@ -144,9 +144,9 @@ TEST( PosTest, FilterTags ) {
std::sprintf(input, input_strs[i]);

ASSERT_TRUE( xml.set( input, strlen( input ), TITLEREC_CURRENT_VERSION, CT_HTML ) );
ASSERT_TRUE( words.set( &xml ) );
xml_tokenizer_phase_1(&xml, &tr);

int32_t len = pos.filter( &words, 0, words.getNumWords(), false, buf, buf + MAX_BUF_SIZE );
int32_t len = pos.filter( &tr, 0, tr.size(), false, buf, buf + MAX_BUF_SIZE );

EXPECT_STREQ( expected_output[i], buf );
EXPECT_EQ( strlen( expected_output[i] ), len );
@@ -178,13 +178,13 @@ TEST( PosTest, FilterSamePunct ) {

size_t input_len = sizeof( input_strs ) / sizeof( input_strs[0] );
for ( size_t i = 0; i < input_len; i++ ) {
Words words;
TokenizerResult tr;
Pos pos;
char buf[MAX_BUF_SIZE];

ASSERT_TRUE( words.set( const_cast<char*>(input_strs[i]) ) );
plain_tokenizer_phase_1(input_strs[i], strlen(input_strs[i]), &tr);

int32_t len = pos.filter( &words, 0, -1, true, buf, buf + 180 );
int32_t len = pos.filter( &tr, 0, tr.size(), true, buf, buf + 180 );

EXPECT_STREQ( expected_output[i], buf );
EXPECT_EQ( strlen( expected_output[i] ), len );
@@ -219,13 +219,13 @@ TEST( PosTest, DecodeHTMLEntities ) {

size_t input_len = sizeof( input_strs ) / sizeof( input_strs[0] );
for ( size_t i = 0; i < input_len; i++ ) {
Words words;
TokenizerResult tr;
Pos pos;
char buf[MAX_BUF_SIZE];

ASSERT_TRUE( words.set( const_cast<char*>(input_strs[i]) ) );
plain_tokenizer_phase_1(input_strs[i], strlen(input_strs[i]), &tr);

int32_t len = pos.filter( &words, 0, -1, true, buf, buf + 180 );
int32_t len = pos.filter( &tr, 0, tr.size(), true, buf, buf + 180 );

EXPECT_STREQ( expected_output[i], buf );
EXPECT_EQ( strlen( expected_output[i] ), len );
@@ -233,15 +233,15 @@ TEST( PosTest, DecodeHTMLEntities ) {
}

TEST(PosTest, SegFaultDotPrevChar) {
Words words;
TokenizerResult tr;
Pos pos;
char buf[MAX_BUF_SIZE];

const char *input_str = ". . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .. . . . . . . . . . . . . . . . . . . .. . . . . . . . . . . ...";

ASSERT_TRUE( words.set( const_cast<char*>(input_str) ) );
plain_tokenizer_phase_1(input_str, strlen(input_str), &tr);

int32_t len = pos.filter( &words, 0, -1, true, buf, buf + 180 );
int32_t len = pos.filter( &tr, 0, tr.size(), true, buf, buf + 180 );

EXPECT_EQ( 0, len );
}

@@ -5,7 +5,7 @@
#include <cstdio>

#include "Xml.h"
#include "Words.h"
#include "tokenizer.h"
#include "Phrases.h"
#include "Sections.h"
#include "Pos.h"
@@ -22,17 +22,17 @@ static void generateSummary( Summary &summary, char *htmlInput, const char *quer
Xml xml;
ASSERT_TRUE(xml.set(htmlInput, strlen(htmlInput), 0, CT_HTML));

Words words;
ASSERT_TRUE(words.set(&xml));
TokenizerResult tr;
xml_tokenizer_phase_1(&xml,&tr);

Bits bits;
ASSERT_TRUE(bits.set(&words));
ASSERT_TRUE(bits.set(&tr));

Url url;
url.set(urlStr);

Sections sections;
ASSERT_TRUE(sections.set(&words, &bits, &url, CT_HTML));
ASSERT_TRUE(sections.set(&tr, &bits, &url, CT_HTML));

Query query;
ASSERT_TRUE(query.set(queryStr, langEnglish, 1.0, 1.0, nullptr, false, true, ABS_MAX_QUERY_TERMS));
@@ -42,22 +42,22 @@ static void generateSummary( Summary &summary, char *htmlInput, const char *quer
linkInfo.m_lisize = sizeof(LinkInfo);

Title title;
ASSERT_TRUE(title.setTitle(&xml, &words, 80, &query, &linkInfo, &url, NULL, 0, CT_HTML, langEnglish));
ASSERT_TRUE(title.setTitle(&xml, &tr, 80, &query, &linkInfo, &url, NULL, 0, CT_HTML, langEnglish));

Pos pos;
ASSERT_TRUE(pos.set(&words));
ASSERT_TRUE(pos.set(&tr));

Bits bitsForSummary;
ASSERT_TRUE(bitsForSummary.setForSummary(&words));
ASSERT_TRUE(bitsForSummary.setForSummary(&tr));

Phrases phrases;
ASSERT_TRUE(phrases.set(&words, &bits));
ASSERT_TRUE(phrases.set(tr, bits));

Matches matches;
matches.setQuery(&query);
ASSERT_TRUE(matches.set(&words, &phrases, &sections, &bitsForSummary, &pos, &xml, &title, &url, &linkInfo));
ASSERT_TRUE(matches.set(&tr, &phrases, &sections, &bitsForSummary, &pos, &xml, &title, &url, &linkInfo));

summary.setSummary(&xml, &words, &sections, &pos, &query, 180, 3, 3, 180, &url, &matches, title.getTitle(), title.getTitleLen());
summary.setSummary(&xml, &tr, &sections, &pos, &query, 180, 3, 3, 180, &url, &matches, title.getTitle(), title.getTitleLen());
}

TEST( SummaryTest, StripSamePunct ) {

@@ -1,157 +0,0 @@
#include <gtest/gtest.h>

#include "Words.h"

TEST(WordsTest, VerifySize) {
// set c to a curling quote in unicode
int32_t c = 0x201c; // 0x235e;

// encode it into utf8
char dst[5];

// point to it
char *p = dst;

// put space in there
*p++ = ' ';

// "numBytes" is how many bytes it stored into 'dst"
int32_t numBytes = utf8Encode ( c , p );

// must be 3 bytes
EXPECT_EQ(3, numBytes);

// check it
int32_t size = getUtf8CharSize(p);
EXPECT_EQ(3, size);

// is that punct
EXPECT_TRUE(is_punct_utf8(p));
}

TEST(WordsTest, simple_tokenization) {
char buf[256];
{
strcpy(buf,"hello");
Words words;
EXPECT_TRUE(words.set(buf));
EXPECT_EQ(words.getNumWords(),1);
}
{
strcpy(buf," ");
Words words;
EXPECT_TRUE(words.set(buf));
EXPECT_EQ(words.getNumWords(),1);
}
{
strcpy(buf,"hello ");
Words words;
EXPECT_TRUE(words.set(buf));
EXPECT_EQ(words.getNumWords(),2);
}
{
strcpy(buf," hello");
Words words;
EXPECT_TRUE(words.set(buf));
EXPECT_EQ(words.getNumWords(),2);
}
{
strcpy(buf,"hello world");
Words words;
EXPECT_TRUE(words.set(buf));
EXPECT_EQ(words.getNumWords(),3);
}
{
strcpy(buf,"Hello world!");
Words words;
EXPECT_TRUE(words.set(buf));
EXPECT_EQ(words.getNumWords(),4);
EXPECT_EQ(words.getWordLen(0),5);
EXPECT_EQ(words.getWordLen(1),1);
EXPECT_EQ(words.getWordLen(2),5);
EXPECT_EQ(words.getWordLen(3),1);
}
{
strcpy(buf,"Hello, world");
Words words;
EXPECT_TRUE(words.set(buf));
EXPECT_EQ(words.getNumWords(),3);
EXPECT_EQ(words.getWordLen(0),5);
EXPECT_EQ(words.getWordLen(1),2);
EXPECT_EQ(words.getWordLen(2),5);
}
}

TEST(WordsTest, latin1_tokenization) {
char buf[256];
{
strcpy(buf,"Æbleflæsk og øl");
Words words;
EXPECT_TRUE(words.set(buf));
EXPECT_EQ(words.getNumWords(),5);
EXPECT_EQ(words.getWordLen(0),11);
EXPECT_EQ(words.getWordLen(1),1);
EXPECT_EQ(words.getWordLen(2),2);
EXPECT_EQ(words.getWordLen(3),1);
EXPECT_EQ(words.getWordLen(4),3);
}
}

TEST(WordsTest, mixed_script_tokenization) {
char buf[256];
{
strcpy(buf,"Æbleflæsk og γιαούρτι");
Words words;
EXPECT_TRUE(words.set(buf));
EXPECT_EQ(words.getNumWords(),5);
EXPECT_EQ(words.getWordLen(0),11);
EXPECT_EQ(words.getWordLen(1),1);
EXPECT_EQ(words.getWordLen(2),2);
EXPECT_EQ(words.getWordLen(3),1);
EXPECT_EQ(words.getWordLen(4),16);
}
{
strcpy(buf,"Æbleflæskγιαούρτι");
Words words;
EXPECT_TRUE(words.set(buf));
EXPECT_EQ(words.getNumWords(),2);
EXPECT_EQ(words.getWordLen(0),11);
EXPECT_EQ(words.getWordLen(1),16);
}
}

TEST(WordsTest, buffer_tokenization) {
char buf[256];
{
strcpy(buf,"Hello world");
Words words;
EXPECT_TRUE(words.set(buf,8));
EXPECT_EQ(words.getNumWords(),3);
EXPECT_EQ(words.getWordLen(0),5);
EXPECT_EQ(words.getWordLen(1),1);
EXPECT_EQ(words.getWordLen(2),2);
}
}

TEST(WordsTest, html_tokenization) {
char buf[256];
{
strcpy(buf,"<p>Hello <em>world</em>!</p>");
Words words;
EXPECT_TRUE(words.set(buf));
EXPECT_EQ(words.getNumWords(),13);
EXPECT_EQ(words.getWordLen( 0),1);
EXPECT_EQ(words.getWordLen( 1),1);
EXPECT_EQ(words.getWordLen( 2),1);
EXPECT_EQ(words.getWordLen( 3),5);
EXPECT_EQ(words.getWordLen( 4),2);
EXPECT_EQ(words.getWordLen( 5),2);
EXPECT_EQ(words.getWordLen( 6),1);
EXPECT_EQ(words.getWordLen( 7),5);
EXPECT_EQ(words.getWordLen( 8),2);
EXPECT_EQ(words.getWordLen( 9),2);
EXPECT_EQ(words.getWordLen(10),4);
EXPECT_EQ(words.getWordLen(11),1);
EXPECT_EQ(words.getWordLen(12),1);
}
}
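
The pattern the updated tests follow throughout the hunks above is: build a TokenizerResult with plain_tokenizer_phase_1() (or xml_tokenizer_phase_1() for parsed HTML), then hand that result to consumers such as Pos::filter() where a Words object was passed before. A minimal, illustrative sketch of that call sequence, assuming the repository headers Pos.h and tokenizer.h and the same gtest harness the tests use (the test name, input string, and buffer size here are hypothetical, not from the commit):

#include <gtest/gtest.h>
#include <cstdint>
#include <cstring>

#include "Pos.h"
#include "tokenizer.h"

// Sketch only: mirrors the plain_tokenizer_phase_1 -> Pos::filter sequence
// used by the updated PosTest cases above.
TEST(TokenizerUsageSketch, FilterPlainText) {
	const char *input = "Hello, world";   // hypothetical input
	char buf[1024];                       // hypothetical output buffer

	// Phase-1 tokenization of the raw text into a TokenizerResult.
	TokenizerResult tr;
	plain_tokenizer_phase_1(input, strlen(input), &tr);

	// Pos::filter now takes the TokenizerResult and its token count
	// instead of a Words object.
	Pos pos;
	int32_t len = pos.filter(&tr, 0, tr.size(), false, buf, buf + sizeof(buf));

	EXPECT_GE(len, 0);
}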