Cache document summaries

Implemented two caches for document summaries (Msg20Reply). The split is based on
how stable the summary is, which depends on highlighting and how the summary was
generated. Max cache age works, but the memory limit is not supported yet.
This commit is contained in:
Ivan Skytte Jørgensen
2016-03-03 16:52:21 +01:00
parent 8f45692633
commit ee4c23f999
8 changed files with 292 additions and 4 deletions

4
Conf.h

@ -353,6 +353,10 @@ class Conf {
int64_t m_clusterdbFileCacheSize;
int64_t m_titledbFileCacheSize;
int64_t m_spiderdbFileCacheSize;
// . limits for the two summary caches (g_stable_summary_cache and
//   g_unstable_summary_cache), set from the matching Parms entries
// . the *Size values are in bytes, the *MaxAge values in milliseconds
int64_t m_stableSummaryCacheSize;
int64_t m_stableSummaryCacheMaxAge;
int64_t m_unstableSummaryCacheSize;
int64_t m_unstableSummaryCacheMaxAge;
//bool m_quickpollCoreOnError;
bool m_useShotgun;

@ -67,6 +67,7 @@ OBJS = UdpSlot.o Rebalance.o \
Punycode.o Version.o \
HighFrequencyTermShortcuts.o \
IPAddressChecks.o \
SummaryCache.o \
# common flags
DEFS = -D_REENTRANT_ -D_CHECK_FORMAT_STRING_ -I.

@ -3,11 +3,16 @@
#ifdef _VALGRIND_
#include <valgrind/memcheck.h>
#endif
#include "SummaryCache.h"
static void gotReplyWrapper20 ( void *state , void *state20 ) ;
static void handleRequest20 ( UdpSlot *slot , int32_t netnice );
static bool gotReplyWrapperxd ( void *state ) ;
static bool sendCachedReply ( Msg20Request *req, const void *cached_summary, size_t cached_summary_len, UdpSlot *slot );
// default constructor, all the work is delegated to constructor()
Msg20::Msg20 ()
{
	constructor();
}
// destructor, all the work is delegated to reset()
Msg20::~Msg20()
{
	reset();
}
@ -356,6 +361,18 @@ void handleRequest20 ( UdpSlot *slot , int32_t netnice ) {
return;
}
// . see if we already have a reply for an identical request in one of the
//   summary caches. the key is a hash over the request fields.
// . the stable cache is tried first, the unstable one only if that misses
int64_t cache_key = req->makeCacheKey();
// on a hit these point into the cache's own copy, which the cache still owns
const void *cached_summary;
size_t cached_summary_len;
if(g_stable_summary_cache.lookup(cache_key, &cached_summary, &cached_summary_len) ||
g_unstable_summary_cache.lookup(cache_key, &cached_summary, &cached_summary_len))
{
log(LOG_DEBUG, "Summary cache hit");
// sendCachedReply() copies the cached bytes before sending them, then we are done
sendCachedReply(req,cached_summary,cached_summary_len,slot);
return;
} else
log(LOG_DEBUG, "Summary cache miss");
// if it's not stored locally that's an error
if ( req->m_docId >= 0 && ! g_titledb.isLocal ( req->m_docId ) ) {
log("query: Got msg20 request for non-local docId %"INT64"",
@ -447,7 +464,7 @@ bool gotReplyWrapperxd ( void *state ) {
}
// error?
if ( g_errno ) { xd->m_reply.sendReply ( xd ); return true; }
if ( g_errno ) { xd->m_reply.sendReply ( req, xd ); return true; }
// this should not block now
Msg20Reply *reply = xd->getMsg20Reply ( );
// sanity check, should not block here now
@ -455,7 +472,7 @@ bool gotReplyWrapperxd ( void *state ) {
// NULL means error, -1 means blocked. on error g_errno should be set
if ( ! reply && ! g_errno ) { char *xx=NULL;*xx=0;}
// send it off. will send an error reply if g_errno is set
return reply->sendReply ( xd );
return reply->sendReply ( req, xd );
}
Msg20Reply::Msg20Reply ( ) {
@ -481,7 +498,7 @@ void Msg20Reply::destructor ( ) {
// . return ptr to the buffer we serialize into
// . return NULL and set g_errno on error
bool Msg20Reply::sendReply ( XmlDoc *xd ) {
bool Msg20Reply::sendReply ( Msg20Request *req, XmlDoc *xd ) {
// get it
UdpSlot *slot = (UdpSlot *)xd->m_slot;
@ -531,6 +548,12 @@ bool Msg20Reply::sendReply ( XmlDoc *xd ) {
color );
// . put the reply into the summary cache (the cache makes its own copy of
//   the "need" bytes at "buf")
// . summaries set from tags and without query term highlighting go into the
//   stable cache, everything else goes into the unstable cache
// . the key is recomputed from the request here
if(m_isDisplaySumSetFromTags && !req->m_highlightQueryTerms)
g_stable_summary_cache.insert(req->makeCacheKey(), buf, need);
else
g_unstable_summary_cache.insert(req->makeCacheKey(), buf, need);
// . del the list at this point, we've copied all the data into reply
// . this will free a non-null State20::m_ps (ParseState) for us
mdelete ( xd , sizeof(XmlDoc) , "xd20" );
@ -541,6 +564,23 @@ bool Msg20Reply::sendReply ( XmlDoc *xd ) {
return true;
}
// . send a reply that was found in one of the summary caches
// . "cached_summary" stays owned by the cache, so the udp server gets a
//   private copy of it
// . "req" is currently unused
// . always returns true. if the copy cannot be allocated an error reply
//   carrying g_errno is sent instead (presumably mmalloc sets g_errno on
//   failure -- confirm)
static bool sendCachedReply ( Msg20Request *req, const void *cached_summary, size_t cached_summary_len, UdpSlot *slot )
{
	//copy the cached summary to a new temporary buffer, so that UDPSlot/Server can free it when possible
	char *buf = (char *)mmalloc ( cached_summary_len , "Msg20Reply" );
	if(!buf) {
		g_udpServer.sendErrorReply ( slot , g_errno ) ;
		return true;
	}
	memcpy(buf,cached_summary,cached_summary_len);
	// . hand the copy over as the allocation too, not just as the message,
	//   otherwise nobody frees it and every cache hit leaks the buffer
	// . NOTE(review): assumes the 3rd/4th arguments of sendReply_ass are
	//   the buffer to free and its size -- confirm against UdpServer.h
	g_udpServer.sendReply_ass ( buf , cached_summary_len , buf , cached_summary_len , slot );
	return true;
}
// . this is destructive on the "buf". it converts offs to ptrs
// . sets m_r to the modified "buf" when done
// . sets g_errno and returns -1 on error, otherwise # of bytes deseril
@ -622,6 +662,39 @@ int32_t Msg20Request::deserialize ( ) {
return (int32_t)sizeof(Msg20Request) + (p - m_buf);
}
// . make a cache key for a request
// . the key is a 64-bit hash over the request fields listed below, so two
//   requests that agree on all of them share a cache entry
// . the query buffer is only part of the key when query term highlighting
//   is requested
// . NOTE(review): confirm that non-highlighted summaries do not depend on
//   the query in some other way, otherwise the query must always be hashed
// . each variable-length buffer is preceded by its size. without that
//   framing the buffers are simply concatenated and different splits of the
//   same bytes (e.g. a url in ubuf vs. the same bytes in linkee) would give
//   the same key
// . the key only lives in memory, so changing the hash input is safe
int64_t Msg20Request::makeCacheKey() const
{
	SafeBuf hash_buffer;
	// fixed-size fields
	hash_buffer.pushLong(m_version);
	hash_buffer.pushLong(m_numSummaryLines);
	hash_buffer.pushLong(m_getHeaderTag);
	hash_buffer.pushLongLong(m_docId);
	hash_buffer.pushLong(m_titleMaxLen);
	hash_buffer.pushLong(m_summaryMaxLen);
	hash_buffer.pushLong(m_summaryMaxNumCharsPerLine);
	hash_buffer.pushLong(m_collnum);
	hash_buffer.pushLong(m_highlightQueryTerms);
	hash_buffer.pushLong(m_getSummaryVector);
	hash_buffer.pushLong(m_showBanned);
	hash_buffer.pushLong(m_includeCachedCopy);
	hash_buffer.pushLong(m_doLinkSpamCheck);
	hash_buffer.pushLong(m_isLinkSpam);
	hash_buffer.pushLong(m_isSiteLinkInfo);
	hash_buffer.pushLong(m_getLinkInfo);
	hash_buffer.pushLong(m_onlyNeedGoodInlinks);
	hash_buffer.pushLong(m_getLinkText);
	// variable-length fields, each one length-prefixed
	if(m_highlightQueryTerms) {
		hash_buffer.pushLong(size_qbuf);
		hash_buffer.safeMemcpy(ptr_qbuf,size_qbuf);
	}
	hash_buffer.pushLong(size_ubuf);
	hash_buffer.safeMemcpy(ptr_ubuf,size_ubuf);
	hash_buffer.pushLong(size_linkee);
	hash_buffer.safeMemcpy(ptr_linkee,size_linkee);
	hash_buffer.pushLong(size_displayMetas);
	hash_buffer.safeMemcpy(ptr_displayMetas,size_displayMetas);
	return hash64(hash_buffer.getBufStart(), hash_buffer.length());
}
int32_t Msg20Reply::getStoredSize ( ) {
int32_t size = (int32_t)sizeof(Msg20Reply);
// add up string buffer sizes

@ -39,6 +39,7 @@ class Msg20Request {
int32_t getStoredSize ( );
char *serialize ( int32_t *sizePtr );
int32_t deserialize ( );
int64_t makeCacheKey() const;
char m_version ; // non-zero default
char m_numSummaryLines ; // non-zero default
@ -121,7 +122,7 @@ public:
int32_t serialize ( char *buf , int32_t bufSize );
bool sendReply ( class XmlDoc *xd ) ;
bool sendReply ( Msg20Request *req, class XmlDoc *xd ) ;
// after calling these, when serialize() is called again it will
// exclude these strings which were "cleared". Used by Msg40 to

@ -6408,6 +6408,62 @@ void Parms::init ( ) {
m->m_group = 0;
m++;
// memory budget in bytes for g_stable_summary_cache, stored in
// g_conf.m_stableSummaryCacheSize and handed to SummaryCache::configure()
// in main2
m->m_title = "stable-summary cache size";
m->m_desc = "How much memory to use for stable summaries, viz. generated from meta tags and the same for all users and queries";
m->m_cgi = "stablesumcachemem";
m->m_xml = "StableSummaryCacheSize";
m->m_off = (char *)&g_conf.m_stableSummaryCacheSize - g;
m->m_type = TYPE_LONG_LONG;
m->m_def = "30000000";
m->m_units = "bytes";
m->m_flags = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_group = 0;
m++;
// max age in milliseconds of an entry in g_stable_summary_cache, stored in
// g_conf.m_stableSummaryCacheMaxAge. the default 86400000 ms is 24 hours.
m->m_title = "stable-summary cache max age";
m->m_desc = "How long to cache stable summaries, in milliseconds";
m->m_cgi = "stablesumcacheage";
m->m_xml = "StableSummaryCacheAge";
m->m_off = offsetof(Conf,m_stableSummaryCacheMaxAge);
m->m_type = TYPE_LONG_LONG;
m->m_def = "86400000";
m->m_units = "milliseconds";
m->m_flags = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_group = 0;
m++;
// memory budget in bytes for g_unstable_summary_cache, stored in
// g_conf.m_unstableSummaryCacheSize
m->m_title = "unstable-summary cache size";
// the description used to say "stable summaries", copied from the entry
// for the stable cache
m->m_desc = "How much memory to use for unstable summaries, viz. generated from content and depends on user and search terms";
m->m_cgi = "unstablesumcachemem";
m->m_xml = "UnstableSummaryCacheSize";
m->m_off = offsetof(Conf,m_unstableSummaryCacheSize);
m->m_type = TYPE_LONG_LONG;
m->m_def = "30000000";
m->m_units = "bytes";
m->m_flags = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_group = 0;
m++;
// max age in milliseconds of an entry in g_unstable_summary_cache, stored in
// g_conf.m_unstableSummaryCacheMaxAge. the default 3600000 ms is 1 hour.
// the title used to read "stable-summary cache max age", which duplicated
// the title of the stable cache's entry
m->m_title = "unstable-summary cache max age";
m->m_desc = "How long to cache unstable summaries, in milliseconds";
m->m_cgi = "unstablesumcacheage";
m->m_xml = "UnstableSummaryCacheAge";
m->m_off = offsetof(Conf,m_unstableSummaryCacheMaxAge);
m->m_type = TYPE_LONG_LONG;
m->m_def = "3600000";
m->m_units = "milliseconds";
m->m_flags = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_group = 0;
m++;
m->m_title = "scan all if not found";
m->m_desc = "Scan all titledb files if rec not found. You should "
"keep this on to avoid corruption. Do not turn it off unless "
@ -6416,6 +6472,7 @@ void Parms::init ( ) {
m->m_off = (char *)&g_conf.m_scanAllIfNotFound - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_units = "";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;

106
SummaryCache.cpp Normal file

@ -0,0 +1,106 @@
#include "SummaryCache.h"
#include "Mem.h"
#include "fctypes.h"
SummaryCache g_stable_summary_cache;
SummaryCache g_unstable_summary_cache;
static const char memory_note[] = "cached_summary";
// Builds an empty cache with small built-in limits; configure() replaces
// them with the real values.
SummaryCache::SummaryCache()
	: m()
	, purge_iter(m.begin())  // equals m.end() as long as the map is empty
	, max_age(1000)          // milliseconds, i.e. 1 second
	, max_memory(1000000)    // bytes, i.e. 1 megabyte
	, memory_used(0)
{
}
void SummaryCache::configure(int64_t max_age_, size_t max_memory_)
{
max_age = max_age_;
max_memory = max_memory_;
}
void SummaryCache::clear()
{
for(std::map<int64_t,Item>::iterator iter = m.begin();
iter!=m.end();
++iter)
mfree(iter->second.data,iter->second.datalen,memory_note);
m.clear();
purge_iter = m.begin();
memory_used = 0;
}
void SummaryCache::insert(int64_t key, const void *data, size_t datalen)
{
purge_step();
std::map<int64_t,Item>::iterator iter = m.find(key);
if(iter!=m.end()) {
//remove the old entry first
if(purge_iter==iter)
++purge_iter;
mfree(iter->second.data,iter->second.datalen,memory_note);
memory_used -= iter->second.datalen;
m.erase(iter);
}
Item item;
item.timestamp = 0; //temporarily, for exception+memoryleak reason
item.data = 0;
item.datalen = 0;
iter = m.insert(std::make_pair(key,item)).first;
void *datacopy = mmalloc(datalen, memory_note);
if(!datacopy) {
m.erase(iter);
return;
}
memcpy(datacopy,data,datalen);
iter->second.data = datacopy;
iter->second.datalen = datalen;
iter->second.timestamp = gettimeofdayInMilliseconds();
memory_used += datalen;
}
bool SummaryCache::lookup(int64_t key, const void **data, size_t *datalen)
{
purge_step();
std::map<int64_t,Item>::iterator iter = m.find(key);
if(iter!=m.end() && iter->second.timestamp+max_age>=gettimeofdayInMilliseconds()) {
*data = iter->second.data;
*datalen = iter->second.datalen;
return true;
} else
return false;
}
void SummaryCache::purge_step()
{
if(purge_iter==m.end())
purge_iter = m.begin();
else {
int64_t now = gettimeofdayInMilliseconds();
if(purge_iter->second.timestamp+max_age<now) {
std::map<int64_t,Item>::iterator iter = purge_iter;
++purge_iter;
mfree(iter->second.data,iter->second.datalen,memory_note);
memory_used -= iter->second.datalen;
m.erase(iter);
} else
++purge_iter;
}
}

42
SummaryCache.h Normal file

@ -0,0 +1,42 @@
#ifndef SUMMARY_CACHE_
#define SUMMARY_CACHE_
#include <inttypes.h>
#include <stddef.h>
#include <map>
// Cache of opaque byte blobs (serialized summaries) keyed by a 64-bit key.
// Entries expire after max_age; expiry is done incrementally, one entry per
// insert()/lookup() call.
class SummaryCache {
// not copyable: declared private and never defined
SummaryCache(const SummaryCache&);
SummaryCache& operator=(const SummaryCache&);
// one cached blob
struct Item {
int64_t timestamp; // time of insertion, from gettimeofdayInMilliseconds()
void *data;        // the cache's own copy of the blob
size_t datalen;    // size of the copy in bytes
};
// note: the constructor initializes purge_iter from m, so m must stay
// declared before purge_iter
std::map<int64_t,Item> m;
// next entry to be examined by purge_step(), m.end() means "wrap around"
std::map<int64_t,Item>::iterator purge_iter;
int64_t max_age;    // max age of an entry in milliseconds
size_t max_memory;  // memory budget in bytes, set via configure()
size_t memory_used; // sum of datalen over all entries
public:
SummaryCache();
~SummaryCache() { clear(); }
// set the limits: max entry age in milliseconds and memory budget in bytes
void configure(int64_t max_age, size_t max_memory);
// free all entries
void clear();
// store a copy of data[0..datalen) under key, replacing any existing entry
void insert(int64_t key, const void *data, size_t datalen);
// returns true and sets *data/*datalen if an entry for key exists and has
// not expired. *data points into the cache and is only usable until the
// entry is purged or replaced.
bool lookup(int64_t key, const void **data, size_t *datalen);
private:
// examine one entry at purge_iter and remove it if it has expired
void purge_step();
};
extern SummaryCache g_stable_summary_cache; //for summaries based on tags and no highlighting
extern SummaryCache g_unstable_summary_cache; //for summaries based on content or with highlighting
#endif

@ -45,6 +45,7 @@
#include "Pos.h"
#include "Title.h"
#include "Speller.h"
#include "SummaryCache.h"
// include all msgs that have request handlers, cuz we register them with g_udp
#include "Msg0.h"
@ -2724,6 +2725,9 @@ int main2 ( int argc , char *argv[] ) {
//if ( ! g_dnsLocal.init( 0 , false ) ) {
// log("db: Dns local client init failed." ); return 1; }
// apply the configured limits to the summary caches. configure() takes the
// max age (milliseconds) first and the memory budget (bytes) second.
g_stable_summary_cache.configure(g_conf.m_stableSummaryCacheMaxAge, g_conf.m_stableSummaryCacheSize);
g_unstable_summary_cache.configure(g_conf.m_unstableSummaryCacheMaxAge, g_conf.m_unstableSummaryCacheSize);
// . then webserver
// . server should listen to a socket and register with g_loop
if ( ! g_httpServer.init( h9->m_httpPort, h9->m_httpsPort ) ) {