Made unittest compile & link again

This commit is contained in:
Ivan Skytte Jørgensen 2018-03-19 16:03:06 +01:00
parent a009153233
commit 91c6919c02
4 changed files with 32 additions and 190 deletions

@@ -16,7 +16,6 @@ OBJECTS = GigablastTest.o GigablastTestUtils.o \
RdbBaseTest.o RdbBucketsTest.o RdbIndexTest.o RdbListTest.o RdbTreeTest.o ResultOverrideTest.o RobotRuleTest.o RobotsCheckListTest.o RobotsTest.o \
ScalingFunctionsTest.o SiteGetterTest.o SummaryTest.o \
UnicodeTest.o UrlBlockCheckTest.o UrlComponentTest.o UrlMatchListTest.o UrlParserTest.o UrlTest.o \
WordsTest.o \
XmlDocTest.o XmlTest.o \
.PHONY: all
@@ -53,7 +52,7 @@ libgtest.so:
CPPFLAGS += -g
CPPFLAGS += -Wno-write-strings
CPPFLAGS += -Wl,-rpath=. -Wl,-rpath=$(BASE_DIR)
CPPFLAGS += -I$(BASE_DIR) -I$(BASE_DIR)/word_variations -I$(BASE_DIR)/unicode -isystem $(GTEST_DIR)/include
CPPFLAGS += -I$(BASE_DIR) -I$(BASE_DIR)/word_variations -I$(BASE_DIR)/tokenizer -I$(BASE_DIR)/unicode -isystem $(GTEST_DIR)/include
CPPFLAGS += -std=c++11
# exported in parent make
@@ -61,7 +60,7 @@ CPPFLAGS += $(CONFIG_CPPFLAGS)
LIBS += -L./ -lgtest
LIBS += $(BASE_DIR)/libgb.a -lz -lpthread -lssl -lcrypto -lpcre -lsqlite3 -ldl
LIBS += -L$(BASE_DIR) -lcld2_full -lcld3 -lprotobuf -lced -lcares -lword_variations -lsto -lunicode
LIBS += -L$(BASE_DIR) -lcld2_full -lcld3 -lprotobuf -lced -lcares -lword_variations -lsto -ltokenizer -lunicode
$(TARGET): libgtest.so libgb.a $(BASE_DIR)/libcld2_full.so $(BASE_DIR)/libcld3.so $(BASE_DIR)/libced.so $(OBJECTS)
$(CXX) $(CPPFLAGS) $(OBJECTS) $(LIBS) -o $@

@@ -1,7 +1,7 @@
#include <gtest/gtest.h>
#include "Pos.h"
#include "Words.h"
#include "tokenizer.h"
#include "Xml.h"
#include "HttpMime.h"
@@ -25,13 +25,13 @@ TEST( PosTest, FilterAllCaps ) {
size_t input_len = sizeof( input_strs ) / sizeof( input_strs[0] );
for ( size_t i = 0; i < input_len; i++ ) {
Words words;
TokenizerResult tr;
Pos pos;
char buf[MAX_BUF_SIZE];
ASSERT_TRUE( words.set( const_cast<char*>(input_strs[i]) ) );
plain_tokenizer_phase_1(input_strs[i], strlen(input_strs[i]), &tr);
int32_t len = pos.filter( &words, 0, words.getNumWords(), false, buf, buf + MAX_BUF_SIZE );
int32_t len = pos.filter( &tr, 0, tr.size(), false, buf, buf + MAX_BUF_SIZE );
EXPECT_STREQ( expected_output[i], buf );
EXPECT_EQ( strlen( expected_output[i] ), len );
@@ -106,13 +106,13 @@ TEST( PosTest, FilterEnding ) {
size_t input_len = sizeof( input_strs ) / sizeof( input_strs[0] );
for ( size_t i = 0; i < input_len; i++ ) {
Words words;
TokenizerResult tr;
Pos pos;
char buf[MAX_BUF_SIZE];
ASSERT_TRUE( words.set( const_cast<char*>(input_strs[i]) ) );
plain_tokenizer_phase_1(input_strs[i], strlen(input_strs[i]), &tr);
int32_t len = pos.filter( &words, 0, -1, true, buf, buf + 180 );
int32_t len = pos.filter( &tr, 0, tr.size(), true, buf, buf + 180 );
EXPECT_STREQ( expected_output[i], buf );
EXPECT_EQ( strlen( expected_output[i] ), len );
@@ -136,7 +136,7 @@ TEST( PosTest, FilterTags ) {
size_t input_len = sizeof( input_strs ) / sizeof( input_strs[0] );
for ( size_t i = 0; i < input_len; i++ ) {
Xml xml;
Words words;
TokenizerResult tr;
Pos pos;
char input[MAX_BUF_SIZE];
char buf[MAX_BUF_SIZE];
@@ -144,9 +144,9 @@ TEST( PosTest, FilterTags ) {
std::sprintf(input, input_strs[i]);
ASSERT_TRUE( xml.set( input, strlen( input ), TITLEREC_CURRENT_VERSION, CT_HTML ) );
ASSERT_TRUE( words.set( &xml ) );
xml_tokenizer_phase_1(&xml, &tr);
int32_t len = pos.filter( &words, 0, words.getNumWords(), false, buf, buf + MAX_BUF_SIZE );
int32_t len = pos.filter( &tr, 0, tr.size(), false, buf, buf + MAX_BUF_SIZE );
EXPECT_STREQ( expected_output[i], buf );
EXPECT_EQ( strlen( expected_output[i] ), len );
@@ -178,13 +178,13 @@ TEST( PosTest, FilterSamePunct ) {
size_t input_len = sizeof( input_strs ) / sizeof( input_strs[0] );
for ( size_t i = 0; i < input_len; i++ ) {
Words words;
TokenizerResult tr;
Pos pos;
char buf[MAX_BUF_SIZE];
ASSERT_TRUE( words.set( const_cast<char*>(input_strs[i]) ) );
plain_tokenizer_phase_1(input_strs[i], strlen(input_strs[i]), &tr);
int32_t len = pos.filter( &words, 0, -1, true, buf, buf + 180 );
int32_t len = pos.filter( &tr, 0, tr.size(), true, buf, buf + 180 );
EXPECT_STREQ( expected_output[i], buf );
EXPECT_EQ( strlen( expected_output[i] ), len );
@@ -219,13 +219,13 @@ TEST( PosTest, DecodeHTMLEntities ) {
size_t input_len = sizeof( input_strs ) / sizeof( input_strs[0] );
for ( size_t i = 0; i < input_len; i++ ) {
Words words;
TokenizerResult tr;
Pos pos;
char buf[MAX_BUF_SIZE];
ASSERT_TRUE( words.set( const_cast<char*>(input_strs[i]) ) );
plain_tokenizer_phase_1(input_strs[i], strlen(input_strs[i]), &tr);
int32_t len = pos.filter( &words, 0, -1, true, buf, buf + 180 );
int32_t len = pos.filter( &tr, 0, tr.size(), true, buf, buf + 180 );
EXPECT_STREQ( expected_output[i], buf );
EXPECT_EQ( strlen( expected_output[i] ), len );
@@ -233,15 +233,15 @@ TEST( PosTest, DecodeHTMLEntities ) {
}
TEST(PosTest, SegFaultDotPrevChar) {
Words words;
TokenizerResult tr;
Pos pos;
char buf[MAX_BUF_SIZE];
const char *input_str = ". . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .. . . . . . . . . . . . . . . . . . . .. . . . . . . . . . . ...";
ASSERT_TRUE( words.set( const_cast<char*>(input_str) ) );
plain_tokenizer_phase_1(input_str, strlen(input_str), &tr);
int32_t len = pos.filter( &words, 0, -1, true, buf, buf + 180 );
int32_t len = pos.filter( &tr, 0, tr.size(), true, buf, buf + 180 );
EXPECT_EQ( 0, len );
}

@@ -5,7 +5,7 @@
#include <cstdio>
#include "Xml.h"
#include "Words.h"
#include "tokenizer.h"
#include "Phrases.h"
#include "Sections.h"
#include "Pos.h"
@@ -22,17 +22,17 @@ static void generateSummary( Summary &summary, char *htmlInput, const char *quer
Xml xml;
ASSERT_TRUE(xml.set(htmlInput, strlen(htmlInput), 0, CT_HTML));
Words words;
ASSERT_TRUE(words.set(&xml));
TokenizerResult tr;
xml_tokenizer_phase_1(&xml,&tr);
Bits bits;
ASSERT_TRUE(bits.set(&words));
ASSERT_TRUE(bits.set(&tr));
Url url;
url.set(urlStr);
Sections sections;
ASSERT_TRUE(sections.set(&words, &bits, &url, CT_HTML));
ASSERT_TRUE(sections.set(&tr, &bits, &url, CT_HTML));
Query query;
ASSERT_TRUE(query.set(queryStr, langEnglish, 1.0, 1.0, nullptr, false, true, ABS_MAX_QUERY_TERMS));
@@ -42,22 +42,22 @@ static void generateSummary( Summary &summary, char *htmlInput, const char *quer
linkInfo.m_lisize = sizeof(LinkInfo);
Title title;
ASSERT_TRUE(title.setTitle(&xml, &words, 80, &query, &linkInfo, &url, NULL, 0, CT_HTML, langEnglish));
ASSERT_TRUE(title.setTitle(&xml, &tr, 80, &query, &linkInfo, &url, NULL, 0, CT_HTML, langEnglish));
Pos pos;
ASSERT_TRUE(pos.set(&words));
ASSERT_TRUE(pos.set(&tr));
Bits bitsForSummary;
ASSERT_TRUE(bitsForSummary.setForSummary(&words));
ASSERT_TRUE(bitsForSummary.setForSummary(&tr));
Phrases phrases;
ASSERT_TRUE(phrases.set(&words, &bits));
ASSERT_TRUE(phrases.set(tr, bits));
Matches matches;
matches.setQuery(&query);
ASSERT_TRUE(matches.set(&words, &phrases, &sections, &bitsForSummary, &pos, &xml, &title, &url, &linkInfo));
ASSERT_TRUE(matches.set(&tr, &phrases, &sections, &bitsForSummary, &pos, &xml, &title, &url, &linkInfo));
summary.setSummary(&xml, &words, &sections, &pos, &query, 180, 3, 3, 180, &url, &matches, title.getTitle(), title.getTitleLen());
summary.setSummary(&xml, &tr, &sections, &pos, &query, 180, 3, 3, 180, &url, &matches, title.getTitle(), title.getTitleLen());
}
TEST( SummaryTest, StripSamePunct ) {

@@ -1,157 +0,0 @@
#include <gtest/gtest.h>
#include "Words.h"
// Verifies the UTF-8 helpers used by the tokenizer: U+201C (left double
// quotation mark) must encode to 3 bytes, be reported as 3 bytes wide by
// getUtf8CharSize(), and classify as punctuation.
TEST(WordsTest, VerifySize) {
// c is U+201C, a curling double quote
int32_t c = 0x201c; // 0x235e;
// buffer receiving the UTF-8 encoding (plus a leading space)
char dst[5];
// point p at the start of the buffer
char *p = dst;
// store a space first, then encode the quote right after it
*p++ = ' ';
// numBytes is how many bytes utf8Encode() wrote at p
int32_t numBytes = utf8Encode ( c , p );
// U+201C is a 3-byte sequence in UTF-8
EXPECT_EQ(3, numBytes);
// getUtf8CharSize() must agree with the encoded length
int32_t size = getUtf8CharSize(p);
EXPECT_EQ(3, size);
// curly quotes must be classified as punctuation
EXPECT_TRUE(is_punct_utf8(p));
}
// Tokenization of plain ASCII input. The expected counts show that Words
// emits tokens for separator/punctuation runs as well as for words
// (e.g. "hello world" -> 3 tokens; "Hello world!" -> 4).
TEST(WordsTest, simple_tokenization) {
char buf[256];
{
// a single word with no separators -> one token
strcpy(buf,"hello");
Words words;
EXPECT_TRUE(words.set(buf));
EXPECT_EQ(words.getNumWords(),1);
}
{
// a lone separator run is itself one token
strcpy(buf," ");
Words words;
EXPECT_TRUE(words.set(buf));
EXPECT_EQ(words.getNumWords(),1);
}
{
// word + trailing space -> two tokens
strcpy(buf,"hello ");
Words words;
EXPECT_TRUE(words.set(buf));
EXPECT_EQ(words.getNumWords(),2);
}
{
// leading space + word -> two tokens
strcpy(buf," hello");
Words words;
EXPECT_TRUE(words.set(buf));
EXPECT_EQ(words.getNumWords(),2);
}
{
// word, space, word -> three tokens
strcpy(buf,"hello world");
Words words;
EXPECT_TRUE(words.set(buf));
EXPECT_EQ(words.getNumWords(),3);
}
{
// trailing "!" adds a fourth token; lengths pin the exact boundaries:
// "Hello"(5), " "(1), "world"(5), "!"(1)
strcpy(buf,"Hello world!");
Words words;
EXPECT_TRUE(words.set(buf));
EXPECT_EQ(words.getNumWords(),4);
EXPECT_EQ(words.getWordLen(0),5);
EXPECT_EQ(words.getWordLen(1),1);
EXPECT_EQ(words.getWordLen(2),5);
EXPECT_EQ(words.getWordLen(3),1);
}
{
// ", " is a single 2-byte separator token between the two words
strcpy(buf,"Hello, world");
Words words;
EXPECT_TRUE(words.set(buf));
EXPECT_EQ(words.getNumWords(),3);
EXPECT_EQ(words.getWordLen(0),5);
EXPECT_EQ(words.getWordLen(1),2);
EXPECT_EQ(words.getWordLen(2),5);
}
}
// Tokenization of non-ASCII Latin text. Token lengths are in BYTES, not
// characters: "Æbleflæsk" is 9 characters but 11 bytes because Æ/æ are
// 2-byte UTF-8 sequences.
TEST(WordsTest, latin1_tokenization) {
char buf[256];
{
strcpy(buf,"Æbleflæsk og øl");
Words words;
EXPECT_TRUE(words.set(buf));
// 3 words + 2 separating spaces = 5 tokens
EXPECT_EQ(words.getNumWords(),5);
EXPECT_EQ(words.getWordLen(0),11); // "Æbleflæsk" (11 bytes)
EXPECT_EQ(words.getWordLen(1),1);  // " "
EXPECT_EQ(words.getWordLen(2),2);  // "og"
EXPECT_EQ(words.getWordLen(3),1);  // " "
EXPECT_EQ(words.getWordLen(4),3);  // "øl" (3 bytes)
}
}
// Mixed Latin/Greek input. "γιαούρτι" is 8 Greek characters = 16 bytes of
// UTF-8. The second case has no separator character at all, yet still
// produces two tokens (11 + 16 bytes) — a token break at the Latin/Greek
// script boundary.
TEST(WordsTest, mixed_script_tokenization) {
char buf[256];
{
// same shape as the latin1 case, with a Greek final word
strcpy(buf,"Æbleflæsk og γιαούρτι");
Words words;
EXPECT_TRUE(words.set(buf));
EXPECT_EQ(words.getNumWords(),5);
EXPECT_EQ(words.getWordLen(0),11); // "Æbleflæsk"
EXPECT_EQ(words.getWordLen(1),1);  // " "
EXPECT_EQ(words.getWordLen(2),2);  // "og"
EXPECT_EQ(words.getWordLen(3),1);  // " "
EXPECT_EQ(words.getWordLen(4),16); // "γιαούρτι"
}
{
// no separators: the split falls exactly where the script changes
strcpy(buf,"Æbleflæskγιαούρτι");
Words words;
EXPECT_TRUE(words.set(buf));
EXPECT_EQ(words.getNumWords(),2);
EXPECT_EQ(words.getWordLen(0),11); // Latin part
EXPECT_EQ(words.getWordLen(1),16); // Greek part
}
}
// Length-limited tokenization: set(buf,8) restricts Words to the first
// 8 bytes of "Hello world" ("Hello wo"), so the final token is the
// truncated "wo" (length 2).
TEST(WordsTest, buffer_tokenization) {
char buf[256];
{
strcpy(buf,"Hello world");
Words words;
EXPECT_TRUE(words.set(buf,8));
EXPECT_EQ(words.getNumWords(),3);
EXPECT_EQ(words.getWordLen(0),5); // "Hello"
EXPECT_EQ(words.getWordLen(1),1); // " "
EXPECT_EQ(words.getWordLen(2),2); // "wo" — cut off at byte 8
}
}
// HTML input fed straight to Words (no Xml pre-parse): the markup itself
// contributes tokens, so "<p>Hello <em>world</em>!</p>" yields 13 tokens.
// The exact per-token byte lengths are pinned below; presumably the tag
// text is split into bracket/name pieces — confirm against Words' handling
// of '<' and '>' if these expectations ever change.
TEST(WordsTest, html_tokenization) {
char buf[256];
{
strcpy(buf,"<p>Hello <em>world</em>!</p>");
Words words;
EXPECT_TRUE(words.set(buf));
EXPECT_EQ(words.getNumWords(),13);
EXPECT_EQ(words.getWordLen( 0),1);
EXPECT_EQ(words.getWordLen( 1),1);
EXPECT_EQ(words.getWordLen( 2),1);
EXPECT_EQ(words.getWordLen( 3),5); // "Hello"
EXPECT_EQ(words.getWordLen( 4),2);
EXPECT_EQ(words.getWordLen( 5),2);
EXPECT_EQ(words.getWordLen( 6),1);
EXPECT_EQ(words.getWordLen( 7),5); // "world"
EXPECT_EQ(words.getWordLen( 8),2);
EXPECT_EQ(words.getWordLen( 9),2);
EXPECT_EQ(words.getWordLen(10),4);
EXPECT_EQ(words.getWordLen(11),1);
EXPECT_EQ(words.getWordLen(12),1);
}
}