Made unittest compile & link again

This commit is contained in:
Ivan Skytte Jørgensen 2018-03-19 16:03:06 +01:00
parent a009153233
commit 91c6919c02
4 changed files with 32 additions and 190 deletions

@@ -16,7 +16,6 @@ OBJECTS = GigablastTest.o GigablastTestUtils.o \
RdbBaseTest.o RdbBucketsTest.o RdbIndexTest.o RdbListTest.o RdbTreeTest.o ResultOverrideTest.o RobotRuleTest.o RobotsCheckListTest.o RobotsTest.o \
ScalingFunctionsTest.o SiteGetterTest.o SummaryTest.o \
UnicodeTest.o UrlBlockCheckTest.o UrlComponentTest.o UrlMatchListTest.o UrlParserTest.o UrlTest.o \
WordsTest.o \
XmlDocTest.o XmlTest.o \
.PHONY: all
@@ -53,7 +52,7 @@ libgtest.so:
CPPFLAGS += -g
CPPFLAGS += -Wno-write-strings
CPPFLAGS += -Wl,-rpath=. -Wl,-rpath=$(BASE_DIR)
CPPFLAGS += -I$(BASE_DIR) -I$(BASE_DIR)/word_variations -I$(BASE_DIR)/unicode -isystem $(GTEST_DIR)/include
CPPFLAGS += -I$(BASE_DIR) -I$(BASE_DIR)/word_variations -I$(BASE_DIR)/tokenizer -I$(BASE_DIR)/unicode -isystem $(GTEST_DIR)/include
CPPFLAGS += -std=c++11
# exported in parent make
@@ -61,7 +60,7 @@ CPPFLAGS += $(CONFIG_CPPFLAGS)
LIBS += -L./ -lgtest
LIBS += $(BASE_DIR)/libgb.a -lz -lpthread -lssl -lcrypto -lpcre -lsqlite3 -ldl
LIBS += -L$(BASE_DIR) -lcld2_full -lcld3 -lprotobuf -lced -lcares -lword_variations -lsto -lunicode
LIBS += -L$(BASE_DIR) -lcld2_full -lcld3 -lprotobuf -lced -lcares -lword_variations -lsto -ltokenizer -lunicode
$(TARGET): libgtest.so libgb.a $(BASE_DIR)/libcld2_full.so $(BASE_DIR)/libcld3.so $(BASE_DIR)/libced.so $(OBJECTS)
$(CXX) $(CPPFLAGS) $(OBJECTS) $(LIBS) -o $@

@@ -1,7 +1,7 @@
#include <gtest/gtest.h>
#include "Pos.h"
#include "Words.h"
#include "tokenizer.h"
#include "Xml.h"
#include "HttpMime.h"
@@ -25,13 +25,13 @@ TEST( PosTest, FilterAllCaps ) {
size_t input_len = sizeof( input_strs ) / sizeof( input_strs[0] );
for ( size_t i = 0; i < input_len; i++ ) {
Words words;
TokenizerResult tr;
Pos pos;
char buf[MAX_BUF_SIZE];
ASSERT_TRUE( words.set( const_cast<char*>(input_strs[i]) ) );
plain_tokenizer_phase_1(input_strs[i], strlen(input_strs[i]), &tr);
int32_t len = pos.filter( &words, 0, words.getNumWords(), false, buf, buf + MAX_BUF_SIZE );
int32_t len = pos.filter( &tr, 0, tr.size(), false, buf, buf + MAX_BUF_SIZE );
EXPECT_STREQ( expected_output[i], buf );
EXPECT_EQ( strlen( expected_output[i] ), len );
@@ -106,13 +106,13 @@ TEST( PosTest, FilterEnding ) {
size_t input_len = sizeof( input_strs ) / sizeof( input_strs[0] );
for ( size_t i = 0; i < input_len; i++ ) {
Words words;
TokenizerResult tr;
Pos pos;
char buf[MAX_BUF_SIZE];
ASSERT_TRUE( words.set( const_cast<char*>(input_strs[i]) ) );
plain_tokenizer_phase_1(input_strs[i], strlen(input_strs[i]), &tr);
int32_t len = pos.filter( &words, 0, -1, true, buf, buf + 180 );
int32_t len = pos.filter( &tr, 0, tr.size(), true, buf, buf + 180 );
EXPECT_STREQ( expected_output[i], buf );
EXPECT_EQ( strlen( expected_output[i] ), len );
@@ -136,7 +136,7 @@ TEST( PosTest, FilterTags ) {
size_t input_len = sizeof( input_strs ) / sizeof( input_strs[0] );
for ( size_t i = 0; i < input_len; i++ ) {
Xml xml;
Words words;
TokenizerResult tr;
Pos pos;
char input[MAX_BUF_SIZE];
char buf[MAX_BUF_SIZE];
@@ -144,9 +144,9 @@ TEST( PosTest, FilterTags ) {
std::sprintf(input, input_strs[i]);
ASSERT_TRUE( xml.set( input, strlen( input ), TITLEREC_CURRENT_VERSION, CT_HTML ) );
ASSERT_TRUE( words.set( &xml ) );
xml_tokenizer_phase_1(&xml, &tr);
int32_t len = pos.filter( &words, 0, words.getNumWords(), false, buf, buf + MAX_BUF_SIZE );
int32_t len = pos.filter( &tr, 0, tr.size(), false, buf, buf + MAX_BUF_SIZE );
EXPECT_STREQ( expected_output[i], buf );
EXPECT_EQ( strlen( expected_output[i] ), len );
@@ -178,13 +178,13 @@ TEST( PosTest, FilterSamePunct ) {
size_t input_len = sizeof( input_strs ) / sizeof( input_strs[0] );
for ( size_t i = 0; i < input_len; i++ ) {
Words words;
TokenizerResult tr;
Pos pos;
char buf[MAX_BUF_SIZE];
ASSERT_TRUE( words.set( const_cast<char*>(input_strs[i]) ) );
plain_tokenizer_phase_1(input_strs[i], strlen(input_strs[i]), &tr);
int32_t len = pos.filter( &words, 0, -1, true, buf, buf + 180 );
int32_t len = pos.filter( &tr, 0, tr.size(), true, buf, buf + 180 );
EXPECT_STREQ( expected_output[i], buf );
EXPECT_EQ( strlen( expected_output[i] ), len );
@@ -219,13 +219,13 @@ TEST( PosTest, DecodeHTMLEntities ) {
size_t input_len = sizeof( input_strs ) / sizeof( input_strs[0] );
for ( size_t i = 0; i < input_len; i++ ) {
Words words;
TokenizerResult tr;
Pos pos;
char buf[MAX_BUF_SIZE];
ASSERT_TRUE( words.set( const_cast<char*>(input_strs[i]) ) );
plain_tokenizer_phase_1(input_strs[i], strlen(input_strs[i]), &tr);
int32_t len = pos.filter( &words, 0, -1, true, buf, buf + 180 );
int32_t len = pos.filter( &tr, 0, tr.size(), true, buf, buf + 180 );
EXPECT_STREQ( expected_output[i], buf );
EXPECT_EQ( strlen( expected_output[i] ), len );
@@ -233,15 +233,15 @@ TEST( PosTest, DecodeHTMLEntities ) {
}
TEST(PosTest, SegFaultDotPrevChar) {
Words words;
TokenizerResult tr;
Pos pos;
char buf[MAX_BUF_SIZE];
const char *input_str = ". . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .. . . . . . . . . . . . . . . . . . . .. . . . . . . . . . . ...";
ASSERT_TRUE( words.set( const_cast<char*>(input_str) ) );
plain_tokenizer_phase_1(input_str, strlen(input_str), &tr);
int32_t len = pos.filter( &words, 0, -1, true, buf, buf + 180 );
int32_t len = pos.filter( &tr, 0, tr.size(), true, buf, buf + 180 );
EXPECT_EQ( 0, len );
}

@@ -5,7 +5,7 @@
#include <cstdio>
#include "Xml.h"
#include "Words.h"
#include "tokenizer.h"
#include "Phrases.h"
#include "Sections.h"
#include "Pos.h"
@@ -22,17 +22,17 @@ static void generateSummary( Summary &summary, char *htmlInput, const char *quer
Xml xml;
ASSERT_TRUE(xml.set(htmlInput, strlen(htmlInput), 0, CT_HTML));
Words words;
ASSERT_TRUE(words.set(&xml));
TokenizerResult tr;
xml_tokenizer_phase_1(&xml,&tr);
Bits bits;
ASSERT_TRUE(bits.set(&words));
ASSERT_TRUE(bits.set(&tr));
Url url;
url.set(urlStr);
Sections sections;
ASSERT_TRUE(sections.set(&words, &bits, &url, CT_HTML));
ASSERT_TRUE(sections.set(&tr, &bits, &url, CT_HTML));
Query query;
ASSERT_TRUE(query.set(queryStr, langEnglish, 1.0, 1.0, nullptr, false, true, ABS_MAX_QUERY_TERMS));
@@ -42,22 +42,22 @@ static void generateSummary( Summary &summary, char *htmlInput, const char *quer
linkInfo.m_lisize = sizeof(LinkInfo);
Title title;
ASSERT_TRUE(title.setTitle(&xml, &words, 80, &query, &linkInfo, &url, NULL, 0, CT_HTML, langEnglish));
ASSERT_TRUE(title.setTitle(&xml, &tr, 80, &query, &linkInfo, &url, NULL, 0, CT_HTML, langEnglish));
Pos pos;
ASSERT_TRUE(pos.set(&words));
ASSERT_TRUE(pos.set(&tr));
Bits bitsForSummary;
ASSERT_TRUE(bitsForSummary.setForSummary(&words));
ASSERT_TRUE(bitsForSummary.setForSummary(&tr));
Phrases phrases;
ASSERT_TRUE(phrases.set(&words, &bits));
ASSERT_TRUE(phrases.set(tr, bits));
Matches matches;
matches.setQuery(&query);
ASSERT_TRUE(matches.set(&words, &phrases, &sections, &bitsForSummary, &pos, &xml, &title, &url, &linkInfo));
ASSERT_TRUE(matches.set(&tr, &phrases, &sections, &bitsForSummary, &pos, &xml, &title, &url, &linkInfo));
summary.setSummary(&xml, &words, &sections, &pos, &query, 180, 3, 3, 180, &url, &matches, title.getTitle(), title.getTitleLen());
summary.setSummary(&xml, &tr, &sections, &pos, &query, 180, 3, 3, 180, &url, &matches, title.getTitle(), title.getTitleLen());
}
TEST( SummaryTest, StripSamePunct ) {

@@ -1,157 +0,0 @@
#include <gtest/gtest.h>
#include "Words.h"
// Verifies the UTF-8 helpers used by the tokenizer: U+201C (left double
// quotation mark) must encode to 3 bytes, be reported as 3 bytes wide by
// getUtf8CharSize(), and classify as punctuation.
TEST(WordsTest, VerifySize) {
// c is U+201C, a curling double quote
int32_t c = 0x201c; // 0x235e;
// buffer receiving the UTF-8 encoding (plus a leading space)
char dst[5];
// point p at the start of the buffer
char *p = dst;
// store a space first, then encode the quote right after it
*p++ = ' ';
// numBytes is how many bytes utf8Encode() wrote at p
int32_t numBytes = utf8Encode ( c , p );
// U+201C is a 3-byte sequence in UTF-8
EXPECT_EQ(3, numBytes);
// getUtf8CharSize() must agree with the encoded length
int32_t size = getUtf8CharSize(p);
EXPECT_EQ(3, size);
// curly quotes must be classified as punctuation
EXPECT_TRUE(is_punct_utf8(p));
}
// Tokenization of plain ASCII input. The expected counts show that Words
// emits tokens for separator/punctuation runs as well as for words
// (e.g. "hello world" -> 3 tokens; "Hello world!" -> 4).
TEST(WordsTest, simple_tokenization) {
char buf[256];
{
// a single word with no separators -> one token
strcpy(buf,"hello");
Words words;
EXPECT_TRUE(words.set(buf));
EXPECT_EQ(words.getNumWords(),1);
}
{
// a lone separator run is itself one token
strcpy(buf," ");
Words words;
EXPECT_TRUE(words.set(buf));
EXPECT_EQ(words.getNumWords(),1);
}
{
// word + trailing space -> two tokens
strcpy(buf,"hello ");
Words words;
EXPECT_TRUE(words.set(buf));
EXPECT_EQ(words.getNumWords(),2);
}
{
// leading space + word -> two tokens
strcpy(buf," hello");
Words words;
EXPECT_TRUE(words.set(buf));
EXPECT_EQ(words.getNumWords(),2);
}
{
// word, space, word -> three tokens
strcpy(buf,"hello world");
Words words;
EXPECT_TRUE(words.set(buf));
EXPECT_EQ(words.getNumWords(),3);
}
{
// trailing "!" adds a fourth token; lengths pin the exact boundaries:
// "Hello"(5), " "(1), "world"(5), "!"(1)
strcpy(buf,"Hello world!");
Words words;
EXPECT_TRUE(words.set(buf));
EXPECT_EQ(words.getNumWords(),4);
EXPECT_EQ(words.getWordLen(0),5);
EXPECT_EQ(words.getWordLen(1),1);
EXPECT_EQ(words.getWordLen(2),5);
EXPECT_EQ(words.getWordLen(3),1);
}
{
// ", " is a single 2-byte separator token between the two words
strcpy(buf,"Hello, world");
Words words;
EXPECT_TRUE(words.set(buf));
EXPECT_EQ(words.getNumWords(),3);
EXPECT_EQ(words.getWordLen(0),5);
EXPECT_EQ(words.getWordLen(1),2);
EXPECT_EQ(words.getWordLen(2),5);
}
}
// Tokenization of non-ASCII Latin text. Token lengths are in BYTES, not
// characters: "Æbleflæsk" is 9 characters but 11 bytes because Æ/æ are
// 2-byte UTF-8 sequences.
TEST(WordsTest, latin1_tokenization) {
char buf[256];
{
strcpy(buf,"Æbleflæsk og øl");
Words words;
EXPECT_TRUE(words.set(buf));
// 3 words + 2 separating spaces = 5 tokens
EXPECT_EQ(words.getNumWords(),5);
EXPECT_EQ(words.getWordLen(0),11); // "Æbleflæsk" (11 bytes)
EXPECT_EQ(words.getWordLen(1),1);  // " "
EXPECT_EQ(words.getWordLen(2),2);  // "og"
EXPECT_EQ(words.getWordLen(3),1);  // " "
EXPECT_EQ(words.getWordLen(4),3);  // "øl" (3 bytes)
}
}
// Mixed Latin/Greek input. "γιαούρτι" is 8 Greek characters = 16 bytes of
// UTF-8. The second case has no separator character at all, yet still
// produces two tokens (11 + 16 bytes) — a token break at the Latin/Greek
// script boundary.
TEST(WordsTest, mixed_script_tokenization) {
char buf[256];
{
// same shape as the latin1 case, with a Greek final word
strcpy(buf,"Æbleflæsk og γιαούρτι");
Words words;
EXPECT_TRUE(words.set(buf));
EXPECT_EQ(words.getNumWords(),5);
EXPECT_EQ(words.getWordLen(0),11); // "Æbleflæsk"
EXPECT_EQ(words.getWordLen(1),1);  // " "
EXPECT_EQ(words.getWordLen(2),2);  // "og"
EXPECT_EQ(words.getWordLen(3),1);  // " "
EXPECT_EQ(words.getWordLen(4),16); // "γιαούρτι"
}
{
// no separators: the split falls exactly where the script changes
strcpy(buf,"Æbleflæskγιαούρτι");
Words words;
EXPECT_TRUE(words.set(buf));
EXPECT_EQ(words.getNumWords(),2);
EXPECT_EQ(words.getWordLen(0),11); // Latin part
EXPECT_EQ(words.getWordLen(1),16); // Greek part
}
}
// Length-limited tokenization: set(buf,8) restricts Words to the first
// 8 bytes of "Hello world" ("Hello wo"), so the final token is the
// truncated "wo" (length 2).
TEST(WordsTest, buffer_tokenization) {
char buf[256];
{
strcpy(buf,"Hello world");
Words words;
EXPECT_TRUE(words.set(buf,8));
EXPECT_EQ(words.getNumWords(),3);
EXPECT_EQ(words.getWordLen(0),5); // "Hello"
EXPECT_EQ(words.getWordLen(1),1); // " "
EXPECT_EQ(words.getWordLen(2),2); // "wo" — cut off at byte 8
}
}
// HTML input fed straight to Words (no Xml pre-parse): the markup itself
// contributes tokens, so "<p>Hello <em>world</em>!</p>" yields 13 tokens.
// The exact per-token byte lengths are pinned below; presumably the tag
// text is split into bracket/name pieces — confirm against Words' handling
// of '<' and '>' if these expectations ever change.
TEST(WordsTest, html_tokenization) {
char buf[256];
{
strcpy(buf,"<p>Hello <em>world</em>!</p>");
Words words;
EXPECT_TRUE(words.set(buf));
EXPECT_EQ(words.getNumWords(),13);
EXPECT_EQ(words.getWordLen( 0),1);
EXPECT_EQ(words.getWordLen( 1),1);
EXPECT_EQ(words.getWordLen( 2),1);
EXPECT_EQ(words.getWordLen( 3),5); // "Hello"
EXPECT_EQ(words.getWordLen( 4),2);
EXPECT_EQ(words.getWordLen( 5),2);
EXPECT_EQ(words.getWordLen( 6),1);
EXPECT_EQ(words.getWordLen( 7),5); // "world"
EXPECT_EQ(words.getWordLen( 8),2);
EXPECT_EQ(words.getWordLen( 9),2);
EXPECT_EQ(words.getWordLen(10),4);
EXPECT_EQ(words.getWordLen(11),1);
EXPECT_EQ(words.getWordLen(12),1);
}
}