193 lines
6.3 KiB
C++
193 lines
6.3 KiB
C++
#include "PageTemperatureRegistry.h"
|
|
#include "ScopedLock.h"
|
|
#include "ScalingFunctions.h"
|
|
#include "Log.h"
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include <errno.h>
|
|
#include <sys/stat.h>
|
|
#include <math.h>
|
|
#include <float.h> // FLT_EPSILON, DBL_EPSILON
|
|
#include <algorithm>
|
|
|
|
|
|
// Global PageTemperatureRegistry instance defined by this translation unit.
PageTemperatureRegistry g_pageTemperatureRegistry;
|
|
// Held (via ScopedLock) for the whole duration of PageTemperatureRegistry::load().
static GbMutex load_lock;
|
|
|
|
// Name of the temperature data file. It is passed as-is to stat() and to the
// memory-mapped file open call; load() appends ".meta" to it to locate the optional meta file.
static const char filename[] = "page_temperatures.dat";
|
|
|
|
|
|
bool PageTemperatureRegistry::load() {
|
|
ScopedLock sl(load_lock);
|
|
log(LOG_DEBUG, "Loading %s", filename);
|
|
|
|
struct stat st;
|
|
if(stat(filename,&st)!=0) {
|
|
log(LOG_WARN,"fstat(%s) failed with errno=%d (%s)", filename, errno, strerror(errno));
|
|
return false;
|
|
}
|
|
|
|
unsigned new_active_index = 1-active_index;
|
|
if(!mmf[new_active_index].open(filename)) {
|
|
if(errno==ENOENT)
|
|
log(LOG_INFO,"Couldn't open %s, errno=%d (%s)", filename, errno, strerror(errno));
|
|
else
|
|
log(LOG_WARN,"Couldn't open %s, errno=%d (%s)", filename, errno, strerror(errno));
|
|
return false;
|
|
}
|
|
|
|
if(mmf[new_active_index].size()%sizeof(uint64_t)) {
|
|
log(LOG_WARN,"%s size is not a multiple of %zu", filename, sizeof(uint64_t));
|
|
return false;
|
|
}
|
|
|
|
|
|
//ok, is the file sorted as we expect it to be?
|
|
if(mmf[new_active_index].size() >= sizeof(uint64_t)*5) {
|
|
//just probe 5 elements
|
|
size_t c = mmf[new_active_index].size() / sizeof(uint64_t);
|
|
auto e = reinterpret_cast<const uint64_t*>(mmf[new_active_index].start());
|
|
if(e[c/5*0]<e[c/5*1] &&
|
|
e[c/5*1]<e[c/5*2] &&
|
|
e[c/5*2]<e[c/5*3] &&
|
|
e[c/5*3]<e[c-1 ])
|
|
; //excellent
|
|
else {
|
|
log(LOG_WARN,"%s is not sorted. Regenerate or sort it", filename);
|
|
return false;
|
|
}
|
|
}
|
|
|
|
//Default temperature for unregistered pages is a bit tricky.
|
|
//Initially an unregistered page is likely just freshly crawled but an old one. So the average
|
|
//temperature is a good guess. On the other hand when we have crawled most of the internet
|
|
//then an unregistered page indicates a new page and it like has low temperature.
|
|
//There is no obvious correct value.
|
|
|
|
//If there is a .meta file then use the values from that
|
|
bool using_meta = false;
|
|
char meta_filename[1024];
|
|
sprintf(meta_filename,"%s.meta",filename);
|
|
FILE *fp_meta = fopen(meta_filename,"r");
|
|
if(fp_meta) {
|
|
unsigned tmp_min_temperature;
|
|
unsigned tmp_max_temperature;
|
|
unsigned tmp_default_temperature;
|
|
if(fscanf(fp_meta,"%u%u%u",&tmp_min_temperature,&tmp_max_temperature,&tmp_default_temperature)==3) {
|
|
if(tmp_min_temperature<tmp_max_temperature &&
|
|
tmp_default_temperature>=tmp_min_temperature &&
|
|
tmp_default_temperature<=tmp_max_temperature)
|
|
{
|
|
min_temperature = tmp_min_temperature;
|
|
max_temperature = tmp_max_temperature;
|
|
default_temperature = tmp_default_temperature;
|
|
using_meta = true;
|
|
} else
|
|
log(LOG_WARN,"Invalid values in %s", meta_filename);
|
|
}
|
|
fclose(fp_meta);
|
|
}
|
|
|
|
if(!using_meta) {
|
|
//otherwise calculate min/max/avg
|
|
unsigned new_min_temperature = 0x3ffffff;
|
|
unsigned new_max_temperature = 0;
|
|
auto *begin = reinterpret_cast<const uint64_t*>(mmf[new_active_index].start());
|
|
auto end = begin + mmf[new_active_index].size()/sizeof(uint64_t);
|
|
for(auto *e = begin; e<end; e++) {
|
|
//uint64_t docid = *e>>26;
|
|
unsigned temperature = *e&0x3ffffff;
|
|
if(temperature<new_min_temperature) new_min_temperature=temperature;
|
|
if(temperature>new_max_temperature) new_max_temperature=temperature;
|
|
}
|
|
min_temperature = new_min_temperature;
|
|
max_temperature = new_max_temperature;
|
|
default_temperature = (min_temperature+max_temperature)/2;
|
|
}
|
|
|
|
temperature_range_for_scaling = max_temperature-min_temperature;
|
|
|
|
min_temperature_log = min_temperature>0 ? log(min_temperature) : DBL_EPSILON;
|
|
max_temperature_log = log(max_temperature);
|
|
temperature_range_for_scaling_log = log(temperature_range_for_scaling);
|
|
default_temperature_log = log(default_temperature);
|
|
|
|
if(!using_meta)
|
|
log(LOG_WARN, "meta-file %s could not be loaded. Using default temperature of %u which can scew results for new pages", meta_filename, default_temperature);
|
|
|
|
log(LOG_DEBUG, "pagetemp: min_temperature=%u",min_temperature);
|
|
log(LOG_DEBUG, "pagetemp: max_temperature=%u",max_temperature);
|
|
log(LOG_DEBUG, "pagetemp: default_temperature=%u",default_temperature);
|
|
|
|
log(LOG_DEBUG, "%s loaded (%lu items)", filename, (unsigned long)mmf[new_active_index].size()/sizeof(uint64_t));
|
|
|
|
//swap in and done.
|
|
|
|
active_index.store(new_active_index,std::memory_order_release);
|
|
|
|
stat_ino = st.st_ino;
|
|
stat_mtime = st.st_mtime;
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
/**
 * Close both memory-mapped file slots.
 * The min/max/default temperature members are deliberately left unchanged.
 */
void PageTemperatureRegistry::unload() {
	for(unsigned slot=0; slot<2; slot++)
		mmf[slot].close();
	//min/max temperatures are kept as-is
}
|
|
|
|
|
|
void PageTemperatureRegistry::reload_if_needed() {
|
|
struct stat st;
|
|
if(stat(filename,&st)!=0)
|
|
return;
|
|
if(st.st_ino!=stat_ino || st.st_mtime!=stat_mtime)
|
|
load();
|
|
}
|
|
|
|
|
|
unsigned PageTemperatureRegistry::query_page_temperature_internal(uint64_t docid) const {
|
|
return query_page_temperature_internal(docid,default_temperature);
|
|
}
|
|
|
|
|
|
unsigned PageTemperatureRegistry::query_page_temperature_internal(uint64_t docid, unsigned raw_default) const {
|
|
auto ai = active_index.load(std::memory_order_consume);
|
|
auto start = reinterpret_cast<const uint64_t *>(mmf[ai].start());
|
|
auto count = mmf[ai].size()/sizeof(uint64_t);
|
|
auto end = start+count;
|
|
|
|
auto pos = std::lower_bound(start, end, docid<<26);
|
|
if(pos!=end && *pos>>26 == docid) {
|
|
return *pos&0x3ffffff;
|
|
}
|
|
return raw_default;
|
|
}
|
|
|
|
|
|
|
|
bool PageTemperatureRegistry::query_page_temperature(uint64_t docid, double range_min, double range_max, double *temperature) const {
|
|
auto ai = active_index.load(std::memory_order_consume);
|
|
auto start = reinterpret_cast<const uint64_t *>(mmf[ai].start());
|
|
auto count = mmf[ai].size()/sizeof(uint64_t);
|
|
auto end = start+count;
|
|
|
|
auto pos = std::lower_bound(start, end, docid<<26);
|
|
if(pos!=end && *pos>>26 == docid) {
|
|
*temperature = scale_temperature(range_min,range_max,*pos&0x3ffffff);
|
|
return *pos&0x3ffffff;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
double PageTemperatureRegistry::scale_temperature(double range_min, double range_max, unsigned raw_temperature) const {
|
|
double temperature_26bit_log = log(raw_temperature);
|
|
return scale_linear(temperature_26bit_log, min_temperature_log, max_temperature_log, range_min, range_max);
|
|
}
|
|
|
|
double PageTemperatureRegistry::query_default_page_temperature(double range_min, double range_max) const {
|
|
return scale_linear(default_temperature_log, min_temperature_log, max_temperature_log, range_min, range_max);
|
|
}
|