mirror of
https://github.com/privacore/open-source-search-engine.git
synced 2025-01-22 02:18:42 -05:00
e515e92dae
It has always been local time since ... forever. We rely on NTP doing its job.
836 lines
24 KiB
C++
836 lines
24 KiB
C++
#include "Proxy.h"
|
|
#include "Process.h"
|
|
#include "Msg13.h"
|
|
#include "Collectiondb.h"
|
|
#include "Dns.h"
|
|
#include "UdpSlot.h"
|
|
#include "XmlDoc.h"
|
|
#include "GbCompress.h"
|
|
#include "HttpServer.h"
|
|
#include "Pages.h"
|
|
#include "HttpRequest.h"
|
|
#include "Stats.h"
|
|
#include "ip.h"
|
|
#include "Conf.h"
|
|
#include "Mem.h"
|
|
#include "Statistics.h"
|
|
#include "Errno.h"
|
|
|
|
|
|
Proxy g_proxy;
|
|
|
|
static void proxyHandlerWrapper ( TcpSocket *s );
|
|
//static void gotReplyWrapperPage( void *state, TcpSocket *s );
|
|
static void gotHttpReplyWrapper ( void *state, UdpSlot *slot ) ;
|
|
//static void gotDataFeedRequestWrapper( void *state );
|
|
static void uncountStripe ( struct StateControl *stC ) ;
|
|
|
|
struct StateControl{
|
|
int32_t m_pageNum;
|
|
TcpSocket *m_s;
|
|
int64_t m_startTime;
|
|
bool m_isQuery;
|
|
uint32_t m_hash;
|
|
int32_t m_hostId;
|
|
int32_t m_raw;
|
|
int32_t m_stripe ;
|
|
int32_t m_numQueryTerms ;
|
|
UdpSlot *m_slot;
|
|
char *m_slotReadBuf;
|
|
int32_t m_slotReadBufMaxSize;
|
|
int32_t m_retries;
|
|
int64_t m_timeout;
|
|
HttpRequest m_hr;
|
|
Host *m_forwardHost;
|
|
};
|
|
|
|
static void freeStateControl ( StateControl *stC );
|
|
|
|
Proxy::Proxy() {
|
|
m_proxyId = -1;
|
|
m_proxyRunning = false;
|
|
for (int32_t i =0; i < MAX_HOSTS; i++)
|
|
m_numOutstanding[i] = 0;
|
|
for ( int32_t i = 0 ; i < MAX_STRIPES ; i++ ) {
|
|
m_termsOutOnStripe [i] = 0;
|
|
m_queriesOutOnStripe [i] = 0;
|
|
m_stripeLastHostId [i] = -1;
|
|
}
|
|
m_nextStripe = 0;
|
|
m_lastHost = 0;
|
|
m_mainHost = 0;
|
|
//m_msg3c = NULL;
|
|
}
|
|
|
|
Proxy::~Proxy() {
|
|
/*
|
|
if(m_msg3c) {
|
|
// free the msg3c
|
|
mdelete(m_msg3c, sizeof(Msg3c), "Proxy-Msg3c");
|
|
delete m_msg3c;
|
|
m_msg3c = NULL;
|
|
}
|
|
*/
|
|
}
|
|
|
|
bool Proxy::initHttpServer ( uint16_t httpPort,
|
|
uint16_t httpsPort ) {
|
|
if ( ! g_httpServer.init( httpPort, httpsPort,
|
|
proxyHandlerWrapper ) ) {
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool Proxy::initProxy ( int32_t proxyId, uint16_t udpPort,
|
|
uint16_t udpPort2,UdpProtocol *dp ) {
|
|
m_proxyId = proxyId;
|
|
|
|
// set this in Hostdb too!
|
|
g_hostdb.m_myHostId = proxyId;
|
|
|
|
m_proxyRunning = true;
|
|
|
|
//We log the http requests, althogh this is directly done by us
|
|
g_conf.m_logHttpRequests = true;
|
|
|
|
// if proxy, always have autosave on so we can save user accouting
|
|
// info regularly for billing feed access. save every 5 minutes.
|
|
g_conf.m_autoSaveFrequency = 5;
|
|
|
|
|
|
//Also have to init pages because we need to know which requests to
|
|
//forward. html/gif's, etc can be taken care here itself.
|
|
g_pages.init ( );
|
|
|
|
if ( ! Msg13::registerHandler () ) return false;
|
|
|
|
// . then dns Distributed client
|
|
// . server should listen to a socket and register with g_loop
|
|
// . Only the distributed cache shall call the dns server.
|
|
if ( ! g_dns.init( g_hostdb.m_myHost->m_dnsClientPort ) ) {
|
|
log("db: Dns distributed client init failed." ); return 1; }
|
|
|
|
if ( ! MsgC::registerHandler() ) return false;
|
|
|
|
//need to init collectiondb too because of addurl
|
|
//set isdump to true because we aren't going to store any data in the
|
|
//collection
|
|
if ( !g_collectiondb.loadAllCollRecs( ) ){ //isDump
|
|
log ("db: collectiondb init failed.");
|
|
return false;
|
|
}
|
|
//init g_msg
|
|
g_msg = "";
|
|
|
|
return true;
|
|
}
|
|
|
|
void proxyHandlerWrapper ( TcpSocket *s ){
|
|
g_proxy.handleRequest (s);
|
|
}
|
|
|
|
bool Proxy::handleRequest (TcpSocket *s){
|
|
|
|
if( !s ) {
|
|
return false;
|
|
}
|
|
|
|
// if we are a spider compression proxy, do not really act like
|
|
// a proxy at all!
|
|
if ( g_hostdb.m_myHost->m_type == HT_SCPROXY ) {
|
|
//httprequest changes the buf
|
|
g_httpServer.requestHandler(s);
|
|
g_msg = "";
|
|
return true;
|
|
}
|
|
|
|
HttpRequest hr;
|
|
|
|
bool status = hr.set ( s->m_readBuf , s->m_readOffset , s ) ;
|
|
if ( ! status ) {
|
|
// log a bad request
|
|
char ipbuf[16];
|
|
log("http: Got bad request from %s: %s",
|
|
iptoa(s->m_ip,ipbuf), mstrerror(g_errno));
|
|
// cancel the g_errno, we'll send a BAD REQUEST reply to them
|
|
g_errno = 0;
|
|
// . this returns false if blocked, true otherwise
|
|
// . this sets g_errno on error
|
|
// . this will destroy(s) if cannot malloc send buffer
|
|
g_httpServer.sendErrorReply ( s , 400, "Bad Request" );
|
|
return false;
|
|
}
|
|
|
|
bool isAdmin = g_conf.isMasterAdmin(s,&hr);
|
|
|
|
int32_t redirLen = hr.getRedirLen() ;
|
|
const char *redir = NULL;
|
|
if(redirLen > 0) redir = hr.getRedir();
|
|
|
|
// redirect everyone away if we should
|
|
if ( !redir &&
|
|
!isAdmin &&
|
|
// . we put this here to redirect all traffic somewhere else
|
|
// . you can set that url in the master controls
|
|
// . it only redirects there if the raw/code/site/sites is NULL
|
|
*g_conf.m_redirect != '\0' &&
|
|
hr.getLong("xml", -1) == -1 &&
|
|
hr.getString("code") == NULL &&
|
|
hr.getString("site") == NULL &&
|
|
hr.getString("sites") == NULL) {
|
|
//direct all non-xml, non admin traffic away.
|
|
redir = g_conf.m_redirect;
|
|
redirLen = strlen(g_conf.m_redirect);
|
|
}
|
|
|
|
// . just requesting a static file, like rants.html or logo.gif?
|
|
// . if so just handle that as a normal html/image file
|
|
int32_t n = g_pages.getDynamicPageNumber ( &hr );
|
|
const char *path = hr.getPath();
|
|
|
|
// . i guess right now the proxy is handling admin pages itself
|
|
// . i am changing this so that if &forward=<hostid> is in the url then
|
|
|
|
// the proxy forwards the control page request to the given hostid
|
|
int32_t forward = hr.getLong("forward",-1);
|
|
|
|
bool handleIt = true;
|
|
if ( forward != -1 ) handleIt = false;
|
|
//if ( n == PAGE_ROOT ) handleIt = false;
|
|
if ( n == PAGE_GET ) handleIt = false;
|
|
if ( n == PAGE_RESULTS ) handleIt = false;
|
|
|
|
// proxy now handles the shell addurl page, the actual request
|
|
// made by the ajax in the shell to add the url has a &id= in it
|
|
// and should go to the backend for adding the url. but we can
|
|
// handle the shell, just the add url page html including the ajax
|
|
// script.
|
|
int32_t cgiId = hr.getLong("id",0);
|
|
if ( n == PAGE_ADDURL && cgiId ) handleIt = false;
|
|
|
|
// our new cached page representation format
|
|
if ( ! strncmp(path,"/?id=" ,5 ) ) handleIt = false;
|
|
|
|
// get the server this socket uses
|
|
TcpServer *tcp = s->m_this;
|
|
int32_t max;
|
|
if ( tcp == &g_httpServer.m_ssltcp ) max = g_conf.m_httpsMaxSockets;
|
|
else max = g_conf.m_httpMaxSockets;
|
|
|
|
|
|
// . page addurl uses the udpserver to send the addurl stuff to one of
|
|
// the hosts, so we need udpserver.
|
|
// . handle the request ourselves if it is not one of these
|
|
// pages and "forward" was not specified in the url cgi fields
|
|
if ( handleIt ) {
|
|
//httprequest changes the buf
|
|
g_httpServer.requestHandler(s);
|
|
g_msg = "";
|
|
return true;
|
|
}
|
|
|
|
// just a safety catch
|
|
if ( max < 20 ) max = 20;
|
|
|
|
// enforce the open socket quota iff not admin and not from intranet
|
|
if ( ! isAdmin && tcp->m_numIncomingUsed >= max &&
|
|
!tcp->closeLeastUsed()) {
|
|
static int32_t s_last = 0;
|
|
static int32_t s_count = 0;
|
|
int32_t now = getTime();
|
|
if ( now - s_last < 5 )
|
|
s_count++;
|
|
else {
|
|
char ipbuf[16];
|
|
log("query: Too many sockets open. Sending 500 "
|
|
"http status code to %s. (msgslogged=%" PRId32")[2]",
|
|
iptoa(s->m_ip,ipbuf),s_count);
|
|
s_count = 0;
|
|
s_last = now;
|
|
}
|
|
Statistics::register_socket_limit_hit();
|
|
return g_httpServer.sendErrorReply ( s , 500 ,
|
|
"Too many sockets open.");
|
|
}
|
|
|
|
bool err2 = false;
|
|
if ( err2 ) {
|
|
hadError2:
|
|
g_errno = ENOMEM;
|
|
log("proxy: new(%i): %s",(int32_t)sizeof(StateControl),
|
|
mstrerror(g_errno));
|
|
g_msg = " (error: out of memory.)";
|
|
printRequest(s, &hr);
|
|
int32_t bs;
|
|
bool st;
|
|
st=g_httpServer.sendErrorReply(s,500,mstrerror(g_errno),&bs);
|
|
return st;
|
|
}
|
|
|
|
// if we get here that means we've got something to forward.
|
|
StateControl *stC;
|
|
try { stC = new (StateControl) ; }
|
|
// return true and set g_errno if couldn't make a new File class
|
|
catch(std::bad_alloc&) {
|
|
goto hadError2;
|
|
}
|
|
mnew ( stC, sizeof(StateControl), "Proxy");
|
|
|
|
// make a copy of this now
|
|
if ( ! stC->m_hr.copy ( &hr ) ) {
|
|
mdelete(stC,sizeof(StateControl),"Proxy");
|
|
delete(stC);
|
|
goto hadError2;
|
|
}
|
|
|
|
// reset to -1 in case freeStateControl is called
|
|
stC->m_hostId = -1;
|
|
stC->m_slot = NULL;
|
|
|
|
// support &xml=1 to indicate xml output is wanted
|
|
stC->m_raw = hr.getLong ( "xml", 0 );
|
|
|
|
stC->m_s = s;
|
|
|
|
stC->m_pageNum = n;
|
|
|
|
stC->m_startTime = gettimeofdayInMilliseconds();
|
|
|
|
stC->m_isQuery = false;
|
|
|
|
//check if we've got a query
|
|
if ( n == PAGE_RESULTS )
|
|
stC->m_isQuery = true;
|
|
|
|
stC->m_hash = hash32( hr.getRequest(), hr.getRequestLen() );
|
|
|
|
// assume we are not doing a search query (stripe load balancing)
|
|
stC->m_stripe = -1;
|
|
|
|
// we need to know how many terms (excluding synonyms)
|
|
// so we can do stripe load balancing by number of query terms.
|
|
// i.e. sending 3 queries of only one term each is about the
|
|
// same as sending one larger query to a single stripe.
|
|
const char *qs = hr.getString("q",NULL);
|
|
Query q;
|
|
if (qs) {
|
|
q.set(qs, langUnknown, 1.0, 1.0, NULL, false, hr.getBool("highfreqtermcache", true), ABS_MAX_QUERY_TERMS);
|
|
}
|
|
// clear g_errno in case Query::set() set it
|
|
g_errno = 0;
|
|
// save it. might be zero!
|
|
stC->m_numQueryTerms = q.getNumTerms();
|
|
|
|
|
|
Host *h = NULL;
|
|
|
|
//if we want the main page, or if it is addurl, cannot send addurl to
|
|
// another host if turing is on. Pick 1 host and keep sending to it
|
|
if ( n == PAGE_REINDEX ||
|
|
n == PAGE_INJECT ||
|
|
n == PAGE_ADDURL ||
|
|
//n == PAGE_ROOT || // FOR DEBUG!!
|
|
//n == PAGE_RESULTS || // FOR DEBUG!!
|
|
n == PAGE_SITEDB )
|
|
// get host #0
|
|
h = g_hostdb.getHost ( 0 );
|
|
else
|
|
h = pickBestHost ( stC );
|
|
|
|
stC->m_retries = 0;
|
|
|
|
stC->m_forwardHost = h;
|
|
|
|
// . TODO: make both this and Multicast.cpp use a getTimeout() function
|
|
// . default timeout to 8 seconds
|
|
stC->m_timeout = 8 * 1000;
|
|
// set the timeout
|
|
int32_t firstResult = hr.getLong("s", 0);
|
|
int32_t docsWanted = hr.getLong("n", 10);
|
|
// how many docsids request? first 4 bytes of request.
|
|
int32_t rr = hr.getLong("rerank",-1);
|
|
// . how many milliseconds of waiting before we re-route?
|
|
// . 100 ms per doc wanted, but if they all end up
|
|
// clustering then docsWanted is no indication of the
|
|
// actual number of titleRecs (or title keys) read
|
|
// . it may take a while to do dup removal on 1 million docs
|
|
int64_t wait = 5000 + 100 * (docsWanted+firstResult);
|
|
// those big UOR queries should not get re-routed all the time
|
|
wait += 1000 * stC->m_numQueryTerms;
|
|
// a min of 8 seconds is good
|
|
if ( wait < 8000 ) wait = 8000;
|
|
// seems like buzz is hammering the cluster and 0x39's are
|
|
// timing out too much because of huge title recs taking
|
|
// forever with Msg20
|
|
//if ( wait < 120000 ) wait = 120000;
|
|
// never re-route if it has a rerank, those take forever
|
|
if ( rr >= 0 ) wait = 3000 * 1000;
|
|
// set it
|
|
stC->m_timeout = wait;
|
|
|
|
// for now the tool and free and you don't have to login
|
|
return forwardRequest ( stC );
|
|
}
|
|
|
|
|
|
bool Proxy::forwardRequest ( StateControl *stC ) {
|
|
|
|
// this was a function arg... now it is in "stC"
|
|
Host *h = stC->m_forwardHost;
|
|
|
|
stC->m_hostId = h->m_hostId;
|
|
|
|
TcpSocket *s = stC->m_s;
|
|
|
|
// if sending to the temporary network, add one to port
|
|
int32_t port = h->getInternalHttpPort();
|
|
if ( g_conf.m_useTmpCluster ) port += 1;
|
|
|
|
// put ip at end of request
|
|
char *req = s->m_readBuf;
|
|
int32_t reqSize = s->m_readOffset;
|
|
// but then TcpServer.cpp leaves some room for a \0 and ip
|
|
char *p = req + reqSize;
|
|
// NULL terminate it
|
|
*p = '\0';
|
|
p += 1;
|
|
// then add in ip
|
|
*(int32_t *)p = s->m_ip;
|
|
p += 4;
|
|
|
|
bool isQCProxy = (g_hostdb.m_myHost->m_type & HT_QCPROXY);
|
|
|
|
// . alter the request buffer
|
|
// . set the please compress reply flag
|
|
if ( isQCProxy && *req == 'G' ) *req = 'Z';
|
|
|
|
// update size
|
|
reqSize = p - req;
|
|
// sanity check
|
|
if ( reqSize > s->m_readBufSize ) { g_process.shutdownAbort(true);}
|
|
|
|
// sanity check
|
|
if ( h->m_isProxy ) { g_process.shutdownAbort(true); }
|
|
|
|
// if we are a QUERY COMPRESSION proxy send to the specified address
|
|
int32_t dstIp = h->m_ip;
|
|
int32_t dstPort = h->m_port;
|
|
int32_t dstId = h->m_hostId;
|
|
if ( isQCProxy ) {
|
|
dstIp = g_hostdb.m_myHost->m_forwardIp;
|
|
dstPort = g_hostdb.m_myHost->m_forwardPort;
|
|
dstId = -1;
|
|
}
|
|
|
|
// . let's use the udp server instead because it quickly switches
|
|
// to using eth1 if eth0 does two or more resends without an ACK,
|
|
// and vice versa. this ensure that if a network switch fails then
|
|
// we won't notice it besides a possible one-time 100ms delay.
|
|
// . additionally, we can now accept tcp requests for admin pages
|
|
// even if such requests come from the proxy ip! because now they
|
|
// will just have to be from our ssh tunnel!!
|
|
// . returns false and sets g_errno on error, true on success
|
|
// . after resending the request 4 times with no ACK recv'd, call
|
|
// it a EUDPTIMEDOUT error and deal with that below...
|
|
bool status = g_udpServer.sendRequest(req, reqSize, msg_type_fd, dstIp, dstPort, dstId, NULL, stC, gotHttpReplyWrapper, stC->m_timeout, 0, NULL, 4);
|
|
|
|
// if no error, return false, we blocked
|
|
if ( status ) return false;
|
|
|
|
//if not, we've got an error
|
|
g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
|
|
freeStateControl(stC);
|
|
|
|
return true;
|
|
}
|
|
|
|
void gotHttpReplyWrapper ( void *state, UdpSlot *slot ) { // TcpSocket *s ){
|
|
g_proxy.gotReplyPage(state,slot);
|
|
}
|
|
|
|
void Proxy::gotReplyPage ( void *state, UdpSlot *slot ) {
|
|
|
|
StateControl *stC = (StateControl *) state;
|
|
|
|
char *reply = slot->m_readBuf;
|
|
int32_t size = slot->m_readBufSize;
|
|
|
|
char *req = slot->m_sendBufAlloc;
|
|
|
|
// . AND this is what we forwaded to a host in the flock
|
|
// . we can free this because it reference the tcp buffer
|
|
slot->m_sendBufAlloc = NULL;
|
|
|
|
// . try another host if this one times out
|
|
// . if it is dead before we send to it then it will not ACK our
|
|
// requests, we will resend 10 times within about 300 ms and then
|
|
// we will get slot->m_errno set to EUDPTIMEDOUT
|
|
// . it will also set the errno to EUDPTIMEDOUT if the timeout we
|
|
// gave sendRequest() above is reached.
|
|
if ( slot->getErrno() == EUDPTIMEDOUT && //stC->m_forward < 0 &&
|
|
// try this thrice i guess... hopefully we won't pick the same
|
|
// host we did before!
|
|
++stC->m_retries <= 3 ) {
|
|
// reduce the query load counts
|
|
uncountStripe ( stC );
|
|
// pick another host! should NEVER return NULL
|
|
Host *h = pickBestHost ( stC );
|
|
log("proxy: hostid #%" PRId32" timed out. req=%s Rerouting "
|
|
"forward request "
|
|
"to hostid #%" PRId32" instead.",stC->m_hostId,
|
|
req,//stC->m_s->m_readBuf,
|
|
h->m_hostId);
|
|
// . try a resend!
|
|
// . it will block or it will call sendErrorReply
|
|
stC->m_forwardHost = h;
|
|
forwardRequest ( stC );//, h );
|
|
// all done
|
|
return;
|
|
}
|
|
|
|
// save it so we can free "reply" when state is destroyed
|
|
stC->m_slot = slot;
|
|
|
|
// save reply so we can free it when this state is freed
|
|
// no i am just transferring into the socket's sendbuf now
|
|
stC->m_slotReadBuf = NULL;
|
|
|
|
// should we uncompress the reply?
|
|
bool doUncompress = ( req[0] == 'Z' );
|
|
// do not allow regular proxy to uncompress it though!
|
|
if ( ! (g_hostdb.m_myHost->m_type & HT_QCPROXY ) ) doUncompress=false;
|
|
|
|
// sanity check
|
|
if ( slot->m_readBufSize < 0 ) { g_process.shutdownAbort(true); }
|
|
|
|
// don't let udp server free the reply, we forward this to end user
|
|
slot->m_readBuf = NULL;
|
|
slot->m_readBufSize = 0;
|
|
slot->m_readBufMaxSize = 0;
|
|
|
|
int64_t nowms = gettimeofdayInMilliseconds();
|
|
|
|
if ( size == 0 ){
|
|
log (LOG_WARN,"query: Proxy: Lost the request");
|
|
g_errno = EBADREQUEST;
|
|
hadError:
|
|
log(LOG_WARN,"proxy: error=%s req=%s",mstrerror(g_errno),req);
|
|
g_httpServer.sendErrorReply(stC->m_s,500,mstrerror(g_errno));
|
|
freeStateControl(stC);
|
|
return;
|
|
}
|
|
|
|
uint64_t took = nowms - stC->m_startTime;
|
|
|
|
// if reply was compressed then uncompress it
|
|
if ( doUncompress ) {
|
|
// sanity check
|
|
if ( size < 12 ) { g_process.shutdownAbort(true); }
|
|
// parse it up
|
|
unsigned char *p = (unsigned char *)reply;
|
|
// get the sizes
|
|
int32_t need = *(int32_t *)p; p += 4; // uncompressed total size
|
|
int32_t size1 = *(int32_t *)p; p += 4; // size of compressed mime
|
|
int32_t size2 = *(int32_t *)p; p += 4; // size of compressed content
|
|
// note it
|
|
//logf(LOG_DEBUG,"proxy: uncompressing from %" PRId32" to %" PRId32,
|
|
// size1+size2+12,need);
|
|
// make the decompressed buf
|
|
unsigned char *dbuf = (unsigned char *)mmalloc ( need,"pdbuf");
|
|
unsigned char *dend = dbuf + need;
|
|
if ( ! dbuf ) goto hadError;
|
|
unsigned char *dptr = dbuf;
|
|
|
|
uint32_t bytes1 = dend - dptr;
|
|
// ucompress the http mime
|
|
int err1 = gbuncompress (dptr , &bytes1, p , size1 );
|
|
p += size1;
|
|
dptr += bytes1;
|
|
|
|
if ( size2 ) {
|
|
uint32_t bytes2 = dend - dptr;
|
|
// uncompress the http content
|
|
int err2 = gbuncompress ( dptr , &bytes2, p , size2 );
|
|
p += size2;
|
|
dptr += bytes2;
|
|
if ( err2 != Z_OK || err1 != Z_OK ) {
|
|
g_errno = EUNCOMPRESSERROR;
|
|
goto hadError;
|
|
}
|
|
}
|
|
|
|
// sanity check
|
|
if ( dptr - dbuf != need ) { g_process.shutdownAbort(true); }
|
|
|
|
// free original compressed reply
|
|
mfree ( reply , size , "origreply");
|
|
|
|
// now re-set these to the uncompressed mime/pagecontent
|
|
reply = (char *)dbuf;
|
|
size = need;
|
|
}
|
|
|
|
|
|
// if we are a regular proxy forwarding a compressed reply to a
|
|
// query compression proxy, then the reply is compressed, just leave
|
|
// it alone
|
|
if ( ( req[0] == 'Z' ) && (g_hostdb.m_myHost->m_type & HT_PROXY ) ) {
|
|
//now should be able to print
|
|
HttpRequest r;
|
|
r.set(stC->m_s->m_readBuf, stC->m_s->m_readOffset, stC->m_s);
|
|
// log the request
|
|
printRequest(stC->m_s, &r, took, NULL,0);//content,contentLen);
|
|
// add stat for stats graph
|
|
if ( stC->m_isQuery ) {
|
|
/// @todo ALC use proper query language
|
|
Statistics::register_query_time(stC->m_numQueryTerms, langUnknown, 0, (gettimeofdayInMilliseconds() - stC->m_startTime));
|
|
}
|
|
|
|
// let tcp server free it when done
|
|
g_httpServer.m_tcp.sendMsg( stC->m_s, reply, size, size, size, NULL, NULL );
|
|
|
|
// free mem
|
|
freeStateControl(stC);
|
|
return;
|
|
}
|
|
|
|
HttpMime mime;
|
|
// re-store original mime from uncompressed mime
|
|
mime.set ( reply, size, NULL);
|
|
int32_t httpStatus = mime.getHttpStatus();
|
|
if ( httpStatus != 200 )
|
|
g_msg = " (error: unknown.)";
|
|
|
|
if ( stC->m_isQuery && httpStatus == 200 ){
|
|
/// @todo ALC use proper query language
|
|
Statistics::register_query_time(stC->m_numQueryTerms, langUnknown, 0, (gettimeofdayInMilliseconds() - stC->m_startTime));
|
|
} else if ( stC->m_isQuery && httpStatus != 200 ) {
|
|
/// @todo ALC use proper error code if possible
|
|
Statistics::register_query_time(stC->m_numQueryTerms, langUnknown, EINTERNALERROR, (gettimeofdayInMilliseconds() - stC->m_startTime));
|
|
}
|
|
|
|
//now should be able to print
|
|
HttpRequest r;
|
|
r.set(stC->m_s->m_readBuf, stC->m_s->m_readOffset, stC->m_s);
|
|
|
|
char *content = reply + mime.getMimeLen();
|
|
int32_t contentLen = size - mime.getMimeLen();
|
|
|
|
printRequest(stC->m_s, &r, took, content,contentLen);
|
|
|
|
// . add the login bar to all pages we send back
|
|
// . we could also use to automatically update copyright years
|
|
// and add any common elements to every page...
|
|
// . make a new reply to send back...
|
|
// . it may free the old "reply" or it may set newReply=reply...
|
|
int32_t newReplySize = size;
|
|
char *newReply = reply;
|
|
|
|
// make sure it is HTTP/1.0 not HTTP/1.1
|
|
if ( reply[0] == 'H' &&
|
|
reply[1] == 'T' &&
|
|
reply[2] == 'T' &&
|
|
reply[3] == 'P' &&
|
|
reply[4] == '/' &&
|
|
reply[5] == '1' &&
|
|
reply[6] == '.' &&
|
|
reply[7] == '1' )
|
|
reply[7] = '0';
|
|
|
|
// . try this one instead
|
|
// . returns false if blocked
|
|
TcpServer *tcp = &g_httpServer.m_tcp;
|
|
|
|
// are we using ssl?
|
|
if ( stC->m_s->m_ssl ) {
|
|
tcp = &g_httpServer.m_ssltcp;
|
|
}
|
|
|
|
tcp->sendMsg( stC->m_s, newReply, newReplySize, newReplySize, newReplySize, NULL, NULL );
|
|
|
|
freeStateControl(stC);
|
|
}
|
|
|
|
|
|
void freeStateControl ( StateControl *stC ){
|
|
if ( ! stC ) return;
|
|
|
|
if ( stC->m_hostId >= 0 ) {
|
|
g_proxy.m_numOutstanding[stC->m_hostId]--;
|
|
uncountStripe ( stC );
|
|
}
|
|
|
|
// free the reply buffer
|
|
if ( stC->m_slot ) {
|
|
// save reply so we can free it when this state is freed
|
|
char *reply = stC->m_slotReadBuf;
|
|
int32_t size = stC->m_slotReadBufMaxSize;
|
|
if ( reply ) mfree ( reply , size , "proxy" );
|
|
// do not double free!
|
|
stC->m_slotReadBuf = NULL;
|
|
}
|
|
|
|
mdelete(stC,sizeof(StateControl),"Proxy");
|
|
delete(stC);
|
|
}
|
|
|
|
void uncountStripe ( StateControl *stC ) {
|
|
// if stripe is -1, it was not a search query request
|
|
int32_t stripe = stC->m_stripe;
|
|
if ( stripe < 0 ) return;
|
|
// a more refined load balancing act
|
|
g_proxy.m_termsOutOnStripe[stripe] -= stC->m_numQueryTerms;
|
|
// dec this too
|
|
g_proxy.m_queriesOutOnStripe[stripe]--;
|
|
}
|
|
|
|
|
|
// . now do stripe balancing
|
|
// . this prevents one machine from receving all the Msg39 requests while its
|
|
// twin gets none
|
|
Host *Proxy::pickBestHost( StateControl *stC ) {
|
|
|
|
// sanity check, for m_stripeLastHostId array size, which is only 8 now
|
|
int32_t numStripes = g_hostdb.getNumStripes();
|
|
if ( numStripes > MAX_STRIPES ) { g_process.shutdownAbort(true); }
|
|
|
|
// see which stripes have non-dead hosts!
|
|
char stripeDead[MAX_STRIPES];
|
|
bool allDead = true;
|
|
memset ( stripeDead , 1 , MAX_STRIPES );
|
|
int32_t nh = g_hostdb.getNumHosts();
|
|
for ( int32_t i = 0 ; i < nh ; i++ ) {
|
|
Host *h = g_hostdb.getHost(i);
|
|
if ( g_hostdb.isDead ( h ) ) continue;
|
|
// hey, we are not all dead!
|
|
allDead = false;
|
|
// clear it
|
|
stripeDead [ h->m_stripe ] = 0;
|
|
}
|
|
|
|
// start at this stripe
|
|
int32_t ns = m_nextStripe;
|
|
|
|
int32_t min ;
|
|
int32_t minns = -1;
|
|
for ( int32_t i = 0 ; i < numStripes ; i++ ) {
|
|
// get stripe number
|
|
if ( ++ns >= numStripes ) ns = 0;
|
|
// skip if whole stripe is dead, and other stripes are not dead
|
|
if ( stripeDead[ns] && ! allDead ) continue;
|
|
// how loaded is this stripe?
|
|
int32_t termsOut = m_termsOutOnStripe[ns];
|
|
// skip if his load is tied or higher than our current winner
|
|
if ( minns != -1 && termsOut >= min ) continue;
|
|
// got a new winner
|
|
minns = ns;
|
|
min = termsOut;
|
|
}
|
|
|
|
// sanity check
|
|
if ( minns == -1 ) { g_process.shutdownAbort(true); }
|
|
|
|
// rotate the prefered next stripe
|
|
if ( ++m_nextStripe >= numStripes ) m_nextStripe = 0;
|
|
|
|
// find the next host in line for stripe #minns
|
|
int32_t bestHostId = m_stripeLastHostId[minns];
|
|
// count iterations
|
|
//int32_t count = 0;
|
|
loop:
|
|
// inc it, wrap it
|
|
if ( ++bestHostId >= g_hostdb.getNumHosts() ) bestHostId = 0;
|
|
|
|
// skip if dead
|
|
if ( g_hostdb.isDead( bestHostId ) && ! allDead ) goto loop;
|
|
|
|
// get the host
|
|
Host *h = g_hostdb.getHost ( bestHostId );
|
|
|
|
// saity check
|
|
if ( h->m_isProxy ) { g_process.shutdownAbort(true); }
|
|
|
|
// advance until it is from the least-loaded stripe
|
|
if ( h->m_stripe != minns ) goto loop;
|
|
|
|
|
|
// add query terms to it for load balancing purposes
|
|
m_termsOutOnStripe[minns] += stC->m_numQueryTerms;
|
|
// inc this too
|
|
m_queriesOutOnStripe[minns]++;
|
|
// save it
|
|
m_stripeLastHostId[minns] = bestHostId;
|
|
|
|
// store ino into state so we can reduce the count when we get a reply
|
|
stC->m_stripe = minns;
|
|
//stC->m_numQueryTerms = numQueryTerms;
|
|
|
|
// return it
|
|
return g_hostdb.getHost( bestHostId );
|
|
}
|
|
|
|
|
|
void Proxy::printRequest(TcpSocket *s, HttpRequest *r,
|
|
uint64_t took ,
|
|
char *content,
|
|
int32_t contentLen ) {
|
|
//LOG THE REQUEST
|
|
|
|
// get time format: 7/23/1971 10:45:32
|
|
time_t tt = getTime();
|
|
struct tm tm_buf;
|
|
struct tm *timeStruct = localtime_r(&tt,&tm_buf);
|
|
char bufTime[64];
|
|
strftime ( bufTime , 63 , "%b %d %T", timeStruct);
|
|
//char *ref = r->getReferer ();
|
|
|
|
// if autobanned and we should not log, return now
|
|
if (g_msg&&!g_conf.m_logAutobannedQueries && strstr(g_msg,"autoban")){
|
|
g_msg = "";
|
|
return;
|
|
}
|
|
|
|
char *req = s->m_readBuf;
|
|
//int32_t reqLen = s->m_readOffset;
|
|
|
|
char ipbuf[16];
|
|
logf (LOG_INFO,"http: %s %s %s %s",
|
|
bufTime, iptoa(s->m_ip,ipbuf), req,//r->getRequest(),
|
|
g_msg);
|
|
|
|
|
|
//reset g_msg
|
|
g_msg = "";
|
|
if ( (int32_t)took < g_conf.m_logQueryTimeThreshold ) return;
|
|
|
|
if ( ! g_conf.m_logQueryReply || ! content || contentLen <= 0 ) {
|
|
logf (LOG_INFO,"http: Took %" PRIu64" ms "
|
|
"(len=%" PRId32" bytes) "
|
|
"for request %s",
|
|
took, contentLen, r->getRequest());
|
|
return;
|
|
}
|
|
|
|
|
|
// copy into buf
|
|
char *p = (char *)mmalloc ( contentLen+1,"proxycont");
|
|
if ( ! p ) return;
|
|
|
|
for ( int32_t i = 0 ; i < contentLen ; i++ ) {
|
|
if ( content[i] && ! is_binary_a(content[i]) ) {
|
|
p[i]=content[i]; continue; }
|
|
// fix 0's and binary stuff
|
|
p[i]='?';
|
|
}
|
|
// null terminate
|
|
p[contentLen]=0;
|
|
|
|
logf (LOG_INFO,"http: Took %" PRIu64" ms "
|
|
"(len=%" PRId32" bytes) "
|
|
"for request %s reply=%s",
|
|
took, contentLen, r->getRequest(),content);
|
|
|
|
mfree ( p , contentLen+1, "proxycont");
|
|
}
|