privacore-open-source-searc.../FxTermCheckList.cpp

242 lines
7.3 KiB
C++

//
// Copyright (C) 2017 Privacore ApS - https://www.privacore.com
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//
// License TL;DR: If you change this file, you must publish your changes.
//
#include "FxTermCheckList.h"
#include "Conf.h"
#include "Log.h"
#include "termid_mask.h"
#include "Phrases.h"
#include "tokenizer.h"
#include <fstream>
#include <sys/stat.h>
// Constructs an empty term check list. The object starts out flagged as
// not initialized; init() sets the flag once the term table has been set up.
TermCheckList::TermCheckList()
	: m_initialized(false)
{
}
// Nothing is released explicitly here; members are cleaned up by their
// own destructors.
TermCheckList::~TermCheckList()
{
}
//
// Sets up the term hash table and loads the scored term lists into it.
//
// fname1: file with single words (may be NULL, in which case it is skipped)
// fname2: file with phrases/bigrams (may be NULL, in which case it is skipped)
//
// Words and phrases share the same hash table (m_terms), keyed by a 64-bit
// term id with a 32-bit score as value.
//
// Returns false only if the hash table itself could not be set up. A list
// file that fails to load is logged but does not make init() fail.
//
bool TermCheckList::init(const char *fname1, const char *fname2) {
	// Size passed to the hash table up front (presumably the initial number
	// of slots - confirm against HashTableX::set).
	int32_t need4 = 10000 * 4 + 5000;

	if (!m_terms.set(sizeof(int64_t), sizeof(int32_t), need4, NULL, 0, false, "termchecklist", false, 0)) {
		log(LOG_ERROR,"Could not initialize term hashtable");
		return false;
	}

	//
	// Initialize single words
	//
	if( fname1 && !loadScoredTermList(&m_terms, fname1) ) {
		log(LOG_ERROR,"Could not load '%s'", fname1);
	}

	//
	// Initialize phrases (bigrams) - use same hash table as words
	//
	if( fname2 && !loadScoredTermList(&m_terms, fname2) ) {
		// Report the file that actually failed (fname2, not fname1)
		log(LOG_ERROR,"Could not load '%s'", fname2);
	}

	m_initialized = true;
	return m_initialized;
}
//
// Loads a scored term list from 'filename' into the hash table 'ht'.
//
// File format, one entry per line, columns separated by one or more '|':
//
//   <col1>|<term>|<score>
//
// - Empty lines and lines starting with '#' are ignored.
// - The first column is parsed past but not used by this function.
// - <term> is hashed with hash64Lower_utf8_nospaces() and used as the key.
// - <score> must be an integer in the range [1, INT32_MAX]; it is stored as
//   the value. Anything following the leading number in the third column is
//   ignored.
// - Lines that do not match this format are logged and skipped.
//
// Returns false if the file cannot be stat'ed or opened, or if a key could
// not be added to 'ht' (loading stops at that point). Returns true otherwise.
//
bool TermCheckList::loadScoredTermList(HashTableX *ht, const char *filename) {
	log(LOG_INFO, "Loading %s", filename);

	struct stat st;
	if (stat(filename, &st) != 0) {
		// probably not found
		log(LOG_INFO, "loadScoredTermlist: Unable to stat %s", filename);
		return false;
	}

	std::ifstream file(filename);
	if (!file.is_open()) {
		// stat() succeeding does not guarantee the file is readable. Without
		// this check an unreadable file would be reported as loaded.
		log(LOG_ERROR, "loadScoredTermlist: Unable to open %s", filename);
		return false;
	}

	std::string line;
	while (std::getline(file, line)) {
		// ignore comments & empty lines
		if (line.empty() || line[0] == '#') {
			continue;
		}

		// End of first column / start of second column. If no '|' is found,
		// firstColEnd is npos and so is secondCol.
		const size_t firstColEnd = line.find('|');
		const size_t secondCol = line.find_first_not_of('|', firstColEnd);
		if( firstColEnd == std::string::npos || secondCol == std::string::npos) {
			// invalid format
			log(LOG_ERROR,"Invalid line read from %s: %.*s", filename, (int)line.length(), line.data());
			continue;
		}

		// End of second column / start of third column. A missing second
		// separator propagates npos into thirdCol.
		const size_t secondColEnd = line.find('|', secondCol);
		const size_t thirdCol = line.find_first_not_of('|', secondColEnd);
		if (thirdCol == std::string::npos) {
			// invalid format
			log(LOG_ERROR,"Invalid line read from %s: %.*s", filename, (int)line.length(), line.data());
			continue;
		}

		std::string col2(line, secondCol, secondColEnd - secondCol);
		std::string col3(line, thirdCol);

		// strtol() parses the same syntax as atoi() but has defined behaviour
		// for out-of-range input, so oversized scores can be rejected instead
		// of being silently truncated.
		const long parsedScore = strtol(col3.c_str(), NULL, 10);
		if( parsedScore < 1 || parsedScore > INT32_MAX || col2.length() < 1 || col3.length() < 1 ) {
			log(LOG_ERROR,"Invalid line read from %s: %.*s", filename, (int)line.length(), line.data());
			continue;
		}
		int32_t dwscore = static_cast<int32_t>(parsedScore);

		//log(LOG_ERROR,"read: %s [%" PRId32 "]", col2.c_str(), dwscore);

		int64_t dwid = hash64Lower_utf8_nospaces(col2.data(), col2.length());
		if( !ht->addKey(&dwid, &dwscore) ) {
			log(LOG_ERROR,"Could not add [%.*s] to word list", (int)col2.length(), col2.data());
			return false;
		}
	}

	log(LOG_INFO, "Loaded %s", filename);
	return true;
}
//
// Appends one matched term to the debug buffer as "<prefix><text>", preceded
// by ", " if the buffer already contains data. debbuf_used is advanced by the
// number of characters written.
//
// Nothing is written unless the separator (2), the prefix (2), the text and
// the terminating NUL all fit, so 'prefix' must be exactly two characters
// long for the space accounting to be exact. The caller must guarantee that
// 0 <= debbuf_used < debbuf_size.
//
static void appendDebugTerm(char *debbuf, int32_t &debbuf_used, int32_t debbuf_size, const char *prefix, const char *text, int32_t textLen) {
	// 2=", ", 2=prefix, 1=NUL
	if( debbuf_used+textLen+2+2+1 >= debbuf_size ) {
		return;
	}

	int rc;
	if( debbuf_used ) {
		rc = snprintf(&debbuf[debbuf_used], debbuf_size - debbuf_used, ", ");
		if( rc > 0 ) {
			debbuf_used += rc;
		}
	}

	rc = snprintf(&debbuf[debbuf_used], debbuf_size - debbuf_used, "%s%.*s", prefix, textLen, text);
	if( rc > 0 ) {
		debbuf_used += rc;
	}
}

//
// Scores a tokenized document against the loaded term list.
//
// For every alphanumeric token with a non-zero hash, the token's hash and the
// phrase id starting at that token (if any) are looked up in m_terms. Each
// id is only considered the first time it is seen; 'uniqueTermIds' is used
// to remember ids across tokens (and across calls, if the caller reuses it).
//
// tr               tokens to check
// p                phrase information for 'tr' (phrase id / text per token index)
// uniqueTermIds    hash table of ids already processed; new ids are added
// docScore         incremented by the score of every matching word/phrase
// numUniqueWords   incremented for every matching word
// numUniquePhrases incremented for every matching phrase
// debbuf           optional debug buffer (may be NULL). Matches are appended
//                  as "w:<word>" / "p:<phrase>", separated by ", ". Matches
//                  that do not fit are left out. The buffer is ignored if
//                  debbuf_used is not a valid index into it.
// debbuf_used      number of bytes already used in debbuf; updated on append
// debbuf_size      total size of debbuf
//
// The counters are accumulated into, not reset. Returns false if any of the
// required pointers (p, uniqueTermIds, docScore, numUniqueWords,
// numUniquePhrases) is NULL, true otherwise.
//
bool TermCheckList::getScore(const TokenizerResult &tr, Phrases *p, HashTableX *uniqueTermIds, int32_t *docScore, int32_t *numUniqueWords, int32_t *numUniquePhrases, char *debbuf, int32_t &debbuf_used, int32_t debbuf_size) {
	// p is dereferenced for every processed token, so it is as mandatory as
	// the other pointers.
	if( !p || !uniqueTermIds || !docScore || !numUniqueWords || !numUniquePhrases ) {
		return false;
	}

	// Writing the terminator at debbuf[debbuf_used] is only safe if the index
	// lies inside the buffer. Otherwise run without a debug buffer.
	if( debbuf && (debbuf_used < 0 || debbuf_used >= debbuf_size) ) {
		debbuf = NULL;
	}

	if( debbuf ) {
		debbuf[debbuf_used] = '\0';
	}

	for(size_t i=0; i < tr.size(); i++) {
		const auto &token = tr[i];
		if(!token.is_alfanum || token.token_hash==0) {
			continue;
		}

		// Token text is only needed for tracing and for the debug buffer
		const char *s = NULL;
		int32_t slen = 0;
		if( g_conf.m_logTraceTermCheckList || debbuf ) {
			s = token.token_start;
			slen = token.token_len;
		}

		int64_t termId = token.token_hash;

		// only process if we haven't seen it before
		if ( uniqueTermIds->getSlot( &termId ) < 0 ) {
			// Remember the word. A failure is logged but the word is still scored.
			if ( !uniqueTermIds->addKey(&termId)) {
				log(LOG_ERROR,"Could not add termId to uniqueTermIds hash table");
			}

			int32_t *sc = (int32_t*)m_terms.getValue64(termId);
			if( sc ) {
				logTrace(g_conf.m_logTraceTermCheckList, "Match word %d: %.*s -> %" PRIu64 " (%" PRId64 ") score %" PRId32 ". debbuf_used=%" PRId32 ", debbuf_size=%" PRId32 "", (int)i, slen, s, (uint64_t)termId, (int64_t)(termId & TERMID_MASK), *sc, debbuf_used, debbuf_size);
				(*docScore) += *sc;
				(*numUniqueWords)++;

				if( debbuf ) {
					appendDebugTerm(debbuf, debbuf_used, debbuf_size, "w:", s, slen);
				}
			}
			else {
				//logTrace(g_conf.m_logTraceTermCheckList, "Word %" PRId32 ": %.*s -> %" PRIu64 " (%" PRId64 ")", i, slen, s, (uint64_t)termId, (uint64_t)(termId & TERMID_MASK));
			}
		}
		else {
			//logTrace(g_conf.m_logTraceTermCheckList, "Already seen word %" PRId32 ": %.*s -> %" PRIu64 " (%" PRId64 ")", i, slen, s, (uint64_t)termId, (uint64_t)(termId & TERMID_MASK));
		}

		// A phrase id of 0 means there is no phrase starting at this token
		int64_t phraseId = p->getPhraseId(i);
		if( !phraseId ) {
			// No phrases
			continue;
		}

		// Phrase text is only needed for tracing and for the debug buffer
		int32_t plen=0;
		char pbuf[256]={0};
		if( g_conf.m_logTraceTermCheckList || debbuf ) {
			p->getPhrase(i, tr, pbuf, sizeof(pbuf)-1, &plen);
		}

		if ( uniqueTermIds->getSlot ( &phraseId ) >= 0 ) {
			//logTrace(g_conf.m_logTraceTermCheckList, "Already seen phrase %" PRId32 ": %.*s -> %" PRIu64 " (%" PRId64 ")", i, plen, pbuf, (uint64_t)phraseId, (uint64_t)(phraseId & TERMID_MASK));
			continue;
		}

		// Remember the phrase. A failure is logged but the phrase is still scored.
		if ( !uniqueTermIds->addKey(&phraseId)) {
			log(LOG_ERROR,"Could not add phraseId to uniqueTermIds hash table");
		}

		int32_t *sc = (int32_t*)m_terms.getValue64(phraseId);
		if( sc ) {
			logTrace(g_conf.m_logTraceTermCheckList, "Match phrase %d: %.*s -> %" PRIu64 " (%" PRId64 ") score %" PRId32 ". debbuf_used=%" PRId32 ", debbuf_size=%" PRId32 "", (int)i, plen, pbuf, (uint64_t)phraseId, (int64_t)(phraseId & TERMID_MASK), *sc, debbuf_used, debbuf_size);
			(*docScore) += *sc;
			(*numUniquePhrases)++;

			if( debbuf ) {
				appendDebugTerm(debbuf, debbuf_used, debbuf_size, "p:", pbuf, plen);
			}
		}
		else {
			//logTrace(g_conf.m_logTraceTermCheckList, "Phrase %" PRId32 ": %.*s -> %" PRIu64 " (%" PRId64 ")", i, plen, pbuf, (uint64_t)phraseId, (uint64_t)(phraseId & TERMID_MASK));
		}
	}

	if( debbuf ) {
		debbuf[debbuf_used] = '\0';
	}

	return true;
}