Merge branch 'ia-zak' into testing

Conflicts:
	main.cpp
This commit is contained in:
Matt
2015-09-25 08:24:12 -06:00
47 changed files with 650 additions and 264 deletions

@ -33,7 +33,7 @@ BigFile::~BigFile () {
//#define O_DIRECT 040000
BigFile::BigFile () {
m_permissions = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH ;
//m_permissions = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH ;
m_flags = O_RDWR ; // | O_DIRECT;
m_usePartFiles = true;
// NULLify all ptrs to files
@ -289,7 +289,7 @@ bool BigFile::open ( int flags ,
m_flags = flags;
//m_pc = pc;
m_permissions = permissions;
//m_permissions = permissions;
m_isClosing = false;
// this is true except when parsing big warc files
m_usePartFiles = true;//usePartFiles;
@ -363,7 +363,7 @@ int BigFile::getfd ( int32_t n , bool forReading ) { // , int64_t *vfd ) {
}
// open it if not opened
if ( ! f->calledOpen() ) {
if ( ! f->open ( m_flags , m_permissions ) ) {
if ( ! f->open ( m_flags , getFileCreationFlags() ) ) {
log("disk: Failed to open file part #%"INT32".",n);
return -1;
}

@ -353,7 +353,7 @@ class BigFile {
SafeBuf m_newBaseFilenameDir ;//[256];
int32_t m_permissions;
//int32_t m_permissions;
int32_t m_flags;
// determined in open() override

@ -630,10 +630,11 @@ bool Collectiondb::addNewColl ( char *coll ,
// MDW: create the new directory
retry22:
if ( ::mkdir ( dname ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH ) ) {
if ( ::mkdir ( dname ,
getDirCreationFlags() ) ) {
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH ) ) {
// valgrind?
if ( errno == EINTR ) goto retry22;
g_errno = errno;
@ -1398,10 +1399,11 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
log("admin: Trying to create collection %s but "
"directory %s already exists on disk.",cr->m_coll,dname);
}
if ( ::mkdir ( dname ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH ) ) {
if ( ::mkdir ( dname ,
getDirCreationFlags() ) ) {
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH ) ) {
// valgrind?
//if ( errno == EINTR ) goto retry22;
//g_errno = errno;

@ -9,6 +9,24 @@
Conf g_conf;
static bool s_setUmask = false;;
mode_t getFileCreationFlags() {
if ( ! s_setUmask ) {
s_setUmask = true;
umask ( 0 );
}
return S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH ;
}
mode_t getDirCreationFlags() {
if ( ! s_setUmask ) {
s_setUmask = true;
umask ( 0 );
}
return S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH ;
}
Conf::Conf ( ) {
m_save = true;
m_doingCommandLine = false;

7
Conf.h

@ -43,6 +43,9 @@
#define MAX_GEOCODERS 4
mode_t getFileCreationFlags();
mode_t getDirCreationFlags ();
class Conf {
public:
@ -180,7 +183,9 @@ class Conf {
//bool m_tagdbUseSeals;
//int32_t m_tagdbMinFilesToMerge;
//bool m_tagdbSaveCache;
//bool m_makeAllFilesGroupWritable;
// catdb parameters
int32_t m_catdbMaxTreeMem;
//int32_t m_catdbMaxDiskPageCacheMem;

@ -238,7 +238,10 @@ bool File::open ( int flags , int permissions ) {
}
// save these in case we need to reopen in getfd()
m_flags = flags;
m_permissions = permissions;
//m_permissions = permissions;
// just override and use system settings so we can get the group
// writable/readable/executable bits if set that way in g_conf
//m_permissions = getFileCreationFlags();
m_calledOpen = true;
// sanity check
//int32_t ss = 0;
@ -668,7 +671,7 @@ int File::getfd () {
if ( fd == -1 ) {
t1 = gettimeofdayInMilliseconds();
retry7:
fd = ::open ( getFilename() , m_flags , m_permissions );
fd = ::open ( getFilename() , m_flags,getFileCreationFlags());
// valgrind
if ( fd == -1 && errno == EINTR ) goto retry7;
// 0 means stdout, right? why am i seeing it get assigned???
@ -676,7 +679,7 @@ int File::getfd () {
log("disk: Got fd of 0 when opening %s.",
getFilename());
if ( fd == 0 )
fd = ::open ( getFilename(), m_flags , m_permissions );
fd=::open(getFilename(),m_flags,getFileCreationFlags());
if ( fd == 0 )
log("disk: Got fd of 0 when opening2 %s.",
getFilename());

2
File.h

@ -193,7 +193,7 @@ class File {
// save the permission and flag sets in case of re-opening
int m_flags;
int m_permissions;
//int m_permissions;
char m_calledOpen;
char m_calledSet;

@ -623,8 +623,10 @@ bool HashTableX::save ( char *dir ,
char s[1024];
sprintf ( s , "%s/%s", dir , filename );
int fd = ::open ( s ,
O_RDWR | O_CREAT | O_TRUNC , S_IRUSR | S_IWUSR |
S_IRGRP | S_IWGRP | S_IROTH);
O_RDWR | O_CREAT | O_TRUNC ,
getFileCreationFlags() );
// S_IRUSR | S_IWUSR |
// S_IRGRP | S_IWGRP | S_IROTH);
if ( fd < 0 ) {
//m_saveErrno = errno;
return log("db: Could not open %s for writing: %s.",

@ -2005,8 +2005,9 @@ bool Hostdb::saveHostsConf ( ) {
sprintf ( filename, "%shosts.conf", m_dir );
log ( LOG_INFO, "conf: Writing hosts.conf file to: %s",
filename );
int32_t fd = open ( filename, O_CREAT|O_WRONLY|O_TRUNC,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH );
int32_t fd = open ( filename, O_CREAT|O_WRONLY|O_TRUNC ,
getFileCreationFlags() );
// S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH );
if ( !fd ) {
log ( "conf: Failed to open %s for writing.", filename );
return false;

@ -211,6 +211,7 @@ class Host {
int64_t m_lastPing;
char m_tmpBuf[4];
int16_t m_tmpCount;
// . first time we sent an unanswered ping request to this host
// . used so we can determine when to send an email alert

@ -1007,7 +1007,10 @@ void Images::thumbStart_r ( bool amThread ) {
// Open/Create temporary file to store image to
int fhndl;
if( (fhndl = open( in, O_RDWR+O_CREAT, S_IWUSR+S_IRUSR )) < 0 ) {
if( (fhndl = open( in, O_RDWR+O_CREAT ,
getFileCreationFlags()
// // S_IWUSR+S_IRUSR
)) < 0 ) {
log( "image: Could not open file, %s, for writing: %s - %d.",
in, mstrerror( m_errno ), fhndl );
m_imgDataSize = 0;

@ -145,7 +145,7 @@ bool Language::convertLatin1DictToUTF8( char *infile ){
// then open a new one for appending
int fdw = open ( ff ,
O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fdw < 0 ){
return log("lang: Could not open for %s "
"writing: %s.",ff, strerror(errno));
@ -2763,7 +2763,7 @@ bool Language::makeWordFiles ( int32_t numWordsToDump , int32_t numWordsPerPhras
// then open a new one for appending
fds[i] = open ( ff ,
O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fds[i] < 0 )
return log("lang: Could not open %s for writing: "
"%s.",ff, strerror(errno));
@ -3146,7 +3146,7 @@ bool Language::makePopFiles ( int32_t numWordsToDump , int32_t numWordsPerPhrase
// then open a new one for appending
fds[i] = open ( ff ,
O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fds[i] < 0 )
return log("lang: Could not open %s for writing: "
"%s.",ff, strerror(errno));
@ -3683,7 +3683,7 @@ bool Language::makeQueryFiles ( ) {
// then open a new one for appending
int fdw = open ( ff ,
O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fdw < 0 ){
return log("lang: Could not open for %s "
"writing: %s.",ff, strerror(errno));
@ -3874,7 +3874,7 @@ bool Language::makeWikiFiles( ) {
// then open a new one for appending
int fdw = open ( ff ,
O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fdw < 0 ){
log("lang: Could not open for %s "
"writing: %s.",ff, strerror(errno));
@ -4250,7 +4250,7 @@ bool Language::gotTermFreqs( StateDict *st ){
// then open a new one for appending
fd = open ( ff ,
O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fd < 0 ){
log("lang: Could not open %s for writing: "
"%s.",ff, strerror(errno));
@ -4338,7 +4338,7 @@ bool StateAff::openAffinityFile( ){
unlink ( ff );
// then open a new one for appending
m_fdw = open ( ff , O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( m_fdw < 0 ){
log("lang: Could not open for %s "
"writing: %s.",ff, strerror(errno));
@ -4537,7 +4537,7 @@ bool Language::cleanDictFile ( ) {
// then open a new one for appending
int fdw = open ( ff ,
O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fdw < 0 ){
return log("lang: Could not open for %s "
"writing: %s.",ff, strerror(errno));
@ -4590,7 +4590,7 @@ bool Language::makePhonet( char *infile){
// then open a new one for appending
fdw = open ( outfile ,
O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fdw < 0 )
return log("lang: Could not open %s for writing: "
"%s.", outfile, strerror(errno));
@ -4711,7 +4711,7 @@ bool Language::genTopPopFile ( char *infile ){
// then open a new one for appending
fdw = open ( outfile ,
O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fdw < 0 )
return log("lang: Could not open %s for writing: "
"%s.", outfile, strerror(errno));
@ -4761,7 +4761,8 @@ bool Language::genDistributedPopFile ( char *infile, uint32_t myHash ){
// then open a new one for appending
fdw = open ( outfile ,
O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fdw < 0 )
return log("lang: Could not open %s for writing: "
"%s.", outfile, strerror(errno));
@ -4848,7 +4849,8 @@ int32_t Language::spellcheckDict(){
// then open a new one for appending
fdw = open ( outfile ,
O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fdw < 0 )
return log("lang: Could not open %s for writing: "
"%s.", outfile, strerror(errno));

10
Log.cpp

@ -132,8 +132,9 @@ bool Log::init ( char *filename ) {
// open it for appending.
// create with -rw-rw-r-- permissions if it's not there.
m_fd = open ( m_filename ,
O_APPEND | O_CREAT | O_RDWR ,
S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH );
O_APPEND | O_CREAT | O_RDWR ,
getFileCreationFlags() );
// S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH );
if ( m_fd >= 0 ) return true;
// bitch to stderr and return false on error
fprintf(stderr,"could not open log file %s for appending\n",
@ -422,8 +423,9 @@ bool Log::makeNewLogFile ( ) {
// open it for appending.
// create with -rw-rw-r-- permissions if it's not there.
m_fd = open ( m_filename ,
O_APPEND | O_CREAT | O_RDWR ,
S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH );
O_APPEND | O_CREAT | O_RDWR ,
getFileCreationFlags() );
// S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH );
if ( m_fd >= 0 ) return true;
// bitch to stderr and return false on error
fprintf(stderr,"could not open new log file %s for appending\n",

@ -1386,9 +1386,9 @@ bool Loop::runLoop ( ) {
if ( m_shutdown == 2 ) {
//log(0,"Thread is saving & shutting down urgently.");
//while ( 1 == 1 ) sleep (50000);
log("loop: Resuming despite thread crash.");
m_shutdown = 0;
goto BIGLOOP;
//log("loop: Resuming despite thread crash.");
//m_shutdown = 0;
//goto BIGLOOP;
}
// otherwise, thread did not save, so we must do it
log ( LOG_INIT ,"loop: Saving and shutting down urgently.");

@ -2356,7 +2356,7 @@ bool getTestSpideredDate ( Url *u , int32_t *origSpideredDate , char *testDir )
bool addTestSpideredDate ( Url *u , int32_t spideredTime , char *testDir ) {
// ensure dir exists
::mkdir(testDir,S_IRWXU);
::mkdir(testDir,getDirCreationFlags());
// set this
int64_t uh64 = hash64(u->getUrl(),u->getUrlLen());

@ -57,8 +57,9 @@ void handleRequest ( UdpSlot *slot , int32_t netnice ) {
return;
}
int32_t fd = open ( filename , O_RDONLY,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH );
int32_t fd = open ( filename , O_RDONLY ,
getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH );
if ( ! fd ) {
log(LOG_DEBUG, "logviewer: Failed to open %s for reading: ",
filename);

@ -399,7 +399,9 @@ void handleRequest20 ( UdpSlot *slot , int32_t netnice ) {
// sanity check, the size include the \0
if ( req->m_collnum < 0 ) {
log("query: Got empty collection in msg20 handler. FIX!");
char *xx =NULL; *xx = 0;
g_udpServer.sendErrorReply ( slot , ENOTFOUND );
return;
//char *xx =NULL; *xx = 0;
}
// if it's not stored locally that's an error
if ( req->m_docId >= 0 && ! g_titledb.isLocal ( req->m_docId ) ) {

@ -1434,8 +1434,9 @@ bool saveAddsInProgress ( char *prefix ) {
sprintf ( filename , "%s%saddsinprogress.saving",
g_hostdb.m_dir , prefix );
int32_t fd = open ( filename, O_RDWR | O_CREAT | O_TRUNC ,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH );
int32_t fd = open ( filename, O_RDWR | O_CREAT | O_TRUNC ,
getFileCreationFlags() );
// S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH );
if ( fd < 0 ) {
log ("build: Failed to open %s for writing: %s",
filename,strerror(errno));

@ -696,7 +696,7 @@ bool Msg40::federatedLoop ( ) {
// and mult based on index size
numDocIdSplits *= mult;
// prevent going OOM for type:article AND html
//if ( numDocIdSplits < 5 ) numDocIdSplits = 5;
if ( numDocIdSplits < 5 ) numDocIdSplits = 5;
//}
if ( cr ) mr.m_maxQueryTerms = cr->m_maxQueryTerms;
@ -2437,6 +2437,9 @@ bool Msg40::gotSummary ( ) {
for ( int32_t i = 0 ; dedupPercent && i < m_numReplies ; i++ ) {
// skip if already invisible
if ( m_msg3a.m_clusterLevels[i] != CR_OK ) continue;
// Skip if invalid
if ( m_msg20[i]->m_errno ) continue;
// start with the first docid we have not yet checked!
//int32_t m = oldNumContiguous;
// get it
@ -2455,6 +2458,8 @@ bool Msg40::gotSummary ( ) {
// skip if already invisible
if ( *level != CR_OK ) continue;
// get it
if ( m_msg20[m]->m_errno ) continue;
Msg20Reply *mrm = m_msg20[m]->m_r;
// do not dedup CT_STATUS results, those are
// spider reply "documents" that indicate the last

@ -75,6 +75,13 @@ void setInjectionRequestFromParms ( TcpSocket *sock ,
int32_t def = atoll(m->m_def);
*ii = (char)hr->getLong(m->m_cgi,def);
}
else if ( m->m_type == TYPE_IP ) {
char *ii = (char *)((char *)ir + m->m_off);
char *is = hr->getString(m->m_cgi,NULL);
*(int32_t *)ii = 0; // default ip to 0
// otherwise, set the ip
if ( is ) *(int32_t *)ii = atoip(is);
}
// if unsupported let developer know
else { char *xx=NULL;*xx=0; }
}
@ -124,6 +131,53 @@ Host *getHostToHandleInjection ( char *url ) {
Host *group = g_hostdb.getShard ( shardNum );
int32_t hostNum = docId % g_hostdb.m_numHostsPerShard;
Host *host = &group[hostNum];
bool isWarcInjection = false;
int32_t ulen = gbstrlen(url);
if ( ulen > 10 && strcmp(url+ulen-8,".warc.gz") == 0 )
isWarcInjection = true;
if ( ulen > 10 && strcmp(url+ulen-5,".warc") == 0 )
isWarcInjection = true;
if ( ! isWarcInjection ) return host;
// warc files end up calling XmlDoc::indexWarcOrArc() which spawns
// a msg7 injection request for each doc in the warc/arc file
// so let's do load balancing differently for them so one host
// doesn't end up doing a bunch of wget/gunzips on warc files
// thereby bottlenecking the cluster. get the first hostid that
// we have not sent a msg7 injection request to that is still out
for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
Host *h = g_hostdb.getHost(i);
h->m_tmpCount = 0;
}
for ( UdpSlot *slot = g_udpServer.m_head2 ;
slot ;
slot = slot->m_next2 ) {
// skip if not injection request
if ( slot->m_msgType != 0x07 ) continue;
//if ( ! slot->m_weInitiated ) continue;
// if we did not initiate the injection request, i.e. if
// it is to us, skip it
if ( ! slot->m_callback ) continue;
// who is it from?
int32_t hostId = slot->m_hostId;
if ( hostId < 0 ) continue;
Host *h = g_hostdb.getHost ( hostId );
if ( ! h ) continue;
h->m_tmpCount++;
}
int32_t min = 999999;
Host *minh = NULL;
for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
Host *h = g_hostdb.getHost(i);
if ( h->m_tmpCount == 0 ) return h;
if ( h->m_tmpCount >= min ) continue;
min = h->m_tmpCount;
minh = h;
}
if ( minh ) return minh;
// how can this happen?
return host;
}
@ -608,7 +662,7 @@ void sendUdpReply7 ( void *state ) {
uint32_t statColor = 0xccffcc;
if(xd->m_indexCode) {
statColor = 0x4e99e9;
statColor = 0xaaddaa;//0x4e99e9;
}
g_stats.addStat_r ( xd->m_rawUtf8ContentSize,
xd->m_injectStartTime,
@ -645,7 +699,6 @@ void sendUdpReply7 ( void *state ) {
void handleRequest7 ( UdpSlot *slot , int32_t netnice ) {
InjectionRequest *ir = (InjectionRequest *)slot->m_readBuf;
// now just supply the first guy's char ** and size ptr

@ -49,9 +49,18 @@ class StateStatsdb {
static time_t genDate( char *date, int32_t dateLen ) ;
static void sendReply ( void *st ) ;
static bool s_graphInUse = false;
// . returns false if blocked, otherwise true
// . sets g_errno on error
bool sendPageGraph ( TcpSocket *s, HttpRequest *r ) {
if ( s_graphInUse ) {
char *msg = "stats graph calculating for another user. "
"Try again later.";
g_httpServer.sendErrorReply(s,500,msg);
return true;
}
char *cgi;
int32_t cgiLen;
@ -121,7 +130,6 @@ bool sendPageGraph ( TcpSocket *s, HttpRequest *r ) {
st->m_endDate = st->m_endDateR;
}
g_statsdb.addDocsIndexed();
//
// this is no longer a gif, but an html graph in g_statsdb.m_sb
//
@ -130,8 +138,10 @@ bool sendPageGraph ( TcpSocket *s, HttpRequest *r ) {
st->m_samples ,
&st->m_sb2 ,
st ,
sendReply ) )
sendReply ) ) {
s_graphInUse = true;
return false;
}
// if we didn't block call it ourselves directly
sendReply ( st );
@ -186,6 +196,8 @@ void genStatsGraphTable(SafeBuf *buf, StateStatsdb *st) {
void sendReply ( void *state ) {
s_graphInUse = false;
StateStatsdb *st = (StateStatsdb *)state;
if ( g_errno ) {

@ -11297,7 +11297,7 @@ void Parms::init ( ) {
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
@ -12415,12 +12415,30 @@ void Parms::init ( ) {
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_flags = PF_API;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_group = 0;
m++;
/*
m->m_title = "files group writable";
m->m_desc = "Make all created files group writable? If you have "
"multiple user accounts starting Gigablast processes you "
"will want the files to be group writable. You will "
"need to make sure you run gigablast under the "
"primary group you want to use for gigablast administration.";
m->m_cgi = "afgw";
m->m_off = (char *)&g_conf.m_makeAllFilesGroupWritable - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_API;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_group = 0;
m++;
*/
m->m_title = "max spider read threads";
@ -15107,6 +15125,19 @@ void Parms::init ( ) {
m->m_off = (char *)&ir.m_hopCount - (char *)&ir;
m++;
m->m_title = "url IP";
m->m_desc = "Use this IP when injecting the document. Do not use or "
"set to 0.0.0.0, if unknown. If provided, it will save an IP "
"lookup.";
m->m_cgi = "urlip";
m->m_obj = OBJ_IR;
m->m_type = TYPE_IP;
m->m_def = "0.0.0.0";
m->m_flags = PF_API;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&ir.m_injectDocIp - (char *)&ir;
m++;
m->m_title = "last spider time";
m->m_desc = "Override last time spidered";
m->m_cgi = "lastspidered";
@ -15213,7 +15244,10 @@ void Parms::init ( ) {
"Separate MIME from actual content with two returns. "
"At least put a single space in here if you want to "
"inject empty content, otherwise the content will "
"be downloaded from the url.";
"be downloaded from the url. This is because the "
"page injection form always submits the content text area "
"even if it is empty, which should signify that the "
"content should be downloaded.";
m->m_cgi = "content";
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHARPTR;
@ -22490,6 +22524,7 @@ bool Parm::printVal ( SafeBuf *sb , collnum_t collnum , int32_t occNum ) {
return sb->safePrintf("CMD");
if ( m_type == TYPE_IP )
// may print 0.0.0.0
return sb->safePrintf("%s",iptoa(*(int32_t *)val) );
log("parms: missing parm type!!");

@ -6775,6 +6775,8 @@ void PosdbTable::intersectLists10_r ( ) {
nwp[mink] = NULL;
// avoid breach of core below now
if ( mptr < mptrEnd ) goto mergeMore;
// wrap it up here since done merging
miniMergedEnd[j] = mptr;
}
// breach?

@ -1862,7 +1862,7 @@ Profiler::printRealTimeInfo(SafeBuf *sb,
ff.safePrintf("%strash/profile.txt",g_hostdb.m_dir);
char *filename = ff.getBufStart();
unlink ( filename );
int fd = open ( filename , O_RDWR | O_CREAT , S_IRWXU );
int fd = open ( filename , O_RDWR | O_CREAT , getFileCreationFlags() );
if ( fd < 0 ) {
sb->safePrintf("FAILED TO OPEN %s for writing: %s"
,ff.getBufStart(),mstrerror(errno));
@ -2090,7 +2090,7 @@ Profiler::printRealTimeInfo(SafeBuf *sb,
ff.reset();
ff.safePrintf("%strash/qp.txt",g_hostdb.m_dir);
filename = ff.getBufStart();
fd = open ( filename , O_RDWR | O_CREAT , S_IRWXU );
//fd = open ( filename , O_RDWR | O_CREAT , S_IRWXU );
if ( fd < 0 ) {
sb->safePrintf("FAILED TO OPEN %s for writing: %s"
,ff.getBufStart(),strerror(errno));

24
Rdb.cpp

@ -374,16 +374,16 @@ bool Rdb::updateToRebuildFiles ( Rdb *rdb2 , char *coll ) {
char dstDir[256];
// make the trash dir if not there
sprintf ( dstDir , "%s/trash/" , g_hostdb.m_dir );
int32_t status = ::mkdir ( dstDir ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH ) ;
int32_t status = ::mkdir ( dstDir , getDirCreationFlags() );
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH ) ;
// we have to create it
sprintf ( dstDir , "%s/trash/rebuilt%"UINT32"/" , g_hostdb.m_dir , t );
status = ::mkdir ( dstDir ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH ) ;
status = ::mkdir ( dstDir , getDirCreationFlags() );
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH ) ;
if ( status && errno != EEXIST ) {
g_errno = errno;
return log("repair: Could not mkdir(%s): %s",dstDir,
@ -643,10 +643,10 @@ bool Rdb::deleteAllRecs ( collnum_t collnum ) {
bool makeTrashDir() {
char trash[1024];
sprintf(trash, "%strash/",g_hostdb.m_dir);
if ( ::mkdir ( trash,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH ) == -1 ) {
if ( ::mkdir ( trash , getDirCreationFlags() ) ) {
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH ) == -1 ) {
if ( errno != EEXIST ) {
log("dir: mkdir %s had error: %s",
trash,mstrerror(errno));

@ -165,10 +165,10 @@ bool RdbBase::init ( char *dir ,
}
// make a special "cat" dir for it if we need to
sprintf ( tmp , "%s%s" , dir , dbname );
int32_t status = ::mkdir ( tmp ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH );
int32_t status = ::mkdir ( tmp , getDirCreationFlags() );
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH );
if ( status == -1 && errno != EEXIST && errno )
return log("db: Failed to make directory %s: %s.",
tmp,mstrerror(errno));
@ -186,9 +186,9 @@ bool RdbBase::init ( char *dir ,
// make a special "cat" dir for it if we need to
sprintf ( tmp , "%scat" , dir );
if ( ::mkdir ( tmp ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH ) == -1 && errno != EEXIST )
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH ) == -1 && errno != EEXIST )
return log("db: Failed to make directory %s: %s.",
tmp,mstrerror(errno));
}
@ -202,9 +202,9 @@ bool RdbBase::init ( char *dir ,
// make a special "stats" dir for it if necessary
sprintf ( tmp , "%sstats" , dir );
if ( ::mkdir ( tmp ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH ) == -1 && errno != EEXIST )
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH ) == -1 && errno != EEXIST )
return log( "db: Failed to make directory %s: %s.",
tmp, mstrerror( errno ) );
}
@ -218,9 +218,9 @@ bool RdbBase::init ( char *dir ,
// make a special "stats" dir for it if necessary
sprintf ( tmp , "%saccess" , dir );
if ( ::mkdir ( tmp ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH ) == -1 && errno != EEXIST )
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH ) == -1 && errno != EEXIST )
return log( "db: Failed to make directory %s: %s.",
tmp, mstrerror( errno ) );
}
@ -234,9 +234,9 @@ bool RdbBase::init ( char *dir ,
// make a special "stats" dir for it if necessary
sprintf ( tmp , "%ssyncdb" , dir );
if ( ::mkdir ( tmp ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH ) == -1 && errno != EEXIST )
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH ) == -1 && errno != EEXIST )
return log( "db: Failed to make directory %s: %s.",
tmp, mstrerror( errno ) );
}

@ -2060,8 +2060,10 @@ bool RdbBuckets::fastSave_r() {
char s[1024];
sprintf ( s , "%s/%s-buckets-saving.dat", m_dir , m_dbname );
int fd = ::open ( s ,
O_RDWR | O_CREAT | O_TRUNC , S_IRUSR | S_IWUSR |
S_IRGRP | S_IWGRP | S_IROTH);
O_RDWR | O_CREAT | O_TRUNC ,
getFileCreationFlags() );
// S_IRUSR | S_IWUSR |
// S_IRGRP | S_IWGRP | S_IROTH);
if ( fd < 0 ) {
m_saveErrno = errno;
return log("db: Could not open %s for writing: %s.",

@ -1474,7 +1474,7 @@ bool RdbCache::save_r ( ) {
//f.set ( g_hostdb.m_dir , filename );
// open the file
//if ( ! f.open ( O_RDWR | O_CREAT ) )
int fd = open ( filename , O_RDWR | O_CREAT , S_IRWXU );
int fd = open ( filename , O_RDWR | O_CREAT , getFileCreationFlags() );
if ( fd < 0 )
return log("db: Had opening file to save cache to: %s.",
mstrerror(errno));

@ -405,12 +405,15 @@ bool RdbDump::dumpTree ( bool recall ) {
m_totalNegDumped += m_numNegRecs;
// . check the list we got from the tree for problems
// . ensures keys are ordered from lowest to highest as well
#ifdef GBSANITYCHECK
log("dump: verifying list before dumping");
m_list->checkList_r ( false , // removeNegRecs?
false , // sleep on problem?
m_rdb->m_rdbId );
#endif
//#ifdef GBSANITYCHECK
if ( g_conf.m_verifyWrites ) {
char *s = "none";
if ( m_rdb ) s = getDbnameFromId(m_rdb->m_rdbId);
log("dump: verifying list before dumping (rdb=%s)",s);
m_list->checkList_r ( false , // removeNegRecs?
false , // sleep on problem?
m_rdb->m_rdbId );
}
// if list is empty, we're done!
if ( status && m_list->isEmpty() ) {
// consider that a rollover?
@ -486,15 +489,15 @@ bool RdbDump::dumpList ( RdbList *list , int32_t niceness , bool recall ) {
if ( m_list->isEmpty() ) return true;
// we're now in dump mode again
m_isDumping = true;
#ifdef GBSANITYCHECK
//#ifdef GBSANITYCHECK
// don't check list if we're dumping an unordered list from tree!
if ( m_orderedDump ) {
if ( g_conf.m_verifyWrites && m_orderedDump ) {
m_list->checkList_r ( false /*removedNegRecs?*/ );
// print list stats
log("dump: sk=%s ",KEYSTR(m_list->m_startKey,m_ks));
log("dump: ek=%s ",KEYSTR(m_list->m_endKey,m_ks));
// log("dump: sk=%s ",KEYSTR(m_list->m_startKey,m_ks));
// log("dump: ek=%s ",KEYSTR(m_list->m_endKey,m_ks));
}
#endif
//#endif
// before calling RdbMap::addList(), always reset list ptr
// since we no longer call this in RdbMap::addList() so we don't
@ -525,8 +528,10 @@ bool RdbDump::dumpList ( RdbList *list , int32_t niceness , bool recall ) {
}
}
if ( m_ks==18 ) {
m_list->checkList_r(false,false,RDB_POSDB);
if ( g_conf.m_verifyWrites ) {
char rdbId = 0;
if ( m_rdb ) rdbId = m_rdb->m_rdbId;
m_list->checkList_r(false,false,rdbId);//RDB_POSDB);
m_list->resetListPtr();
}

@ -693,9 +693,9 @@ bool RdbList::checkList_r ( bool removeNegRecs , bool sleepOnProblem ,
return false;
}
if ( m_useHalfKeys && m_ks == 12 ) // m_ks != 18 && m_ks != 24 )
return checkIndexList_r ( removeNegRecs ,
sleepOnProblem );
// if ( m_useHalfKeys && m_ks == 12 ) // m_ks != 18 && m_ks != 24 )
// return checkIndexList_r ( removeNegRecs ,
// sleepOnProblem );
//log("m_list=%"INT32"",(int32_t)m_list);
//key_t oldk;
@ -721,6 +721,10 @@ bool RdbList::checkList_r ( bool removeNegRecs , bool sleepOnProblem ,
if ( KEYCMP(acceptable,KEYMIN(),m_ks)==0 )
KEYSET ( acceptable , m_endKey , m_ks );
char k[MAX_KEY_BYTES];
static int32_t th = 0;
if ( ! th ) th = hash64Lower_a ( "roottitles" , 10 );
while ( ! isExhausted() ) {
//key_t k = getCurrentKey();
getCurrentKey( k );
@ -734,6 +738,43 @@ bool RdbList::checkList_r ( bool removeNegRecs , bool sleepOnProblem ,
*(int32_t *)data > 100000000 ) ) {
char *xx = NULL; *xx = 0; }
}
// tagrec?
if ( rdbId == RDB_TAGDB && ! KEYNEG(k) ) {
//TagRec *gr = (TagRec *)getCurrentRec();
//Tag *tag = gr->getFirstTag ( );
//for ( ; tag ; tag = gr->getNextTag ( tag ) ) {
Tag *tag = (Tag *)getCurrentRec();
if ( tag->m_type == th ) {
char *tdata = tag->getTagData();
int32_t tsize = tag->getTagDataSize();
// core if tag val is not \0 terminated
if ( tsize > 0 && tdata[tsize-1]!='\0' ) {
log("db: bad root title tag");
char *xx=NULL;*xx=0; }
}
}
if ( rdbId == RDB_SPIDERDB && ! KEYNEG(k) &&
getCurrentDataSize() > 0 ) {
//char *data = getCurrentData();
char *rec = getCurrentRec();
// bad url in spider request?
if ( g_spiderdb.isSpiderRequest ( (key128_t *)rec ) ){
SpiderRequest *sr = (SpiderRequest *)rec;
if ( strncmp(sr->m_url,"http",4) != 0 ) {
log("db: spider req url");
char *xx=NULL;*xx=0;
}
}
}
// title bad uncompress size?
if ( rdbId == RDB_TITLEDB && ! KEYNEG(k) ) {
char *rec = getCurrentRec();
int32_t usize = *(int32_t *)(rec+12+4);
if ( usize <= 0 ) {
log("db: bad titlerec uncompress size");
char *xx=NULL;*xx=0;
}
}
// debug msg
// pause if it's google
//if ((((k.n0) >> 1) & 0x0000003fffffffffLL) == 70166155664)

@ -2467,8 +2467,8 @@ void threadDoneWrapper ( void *state , ThreadEntry *t ) {
THIS->m_dbname,mstrerror(g_errno));
else
// log it
log("db: Done saving %s/%s-saved.dat",
THIS->m_dir,THIS->m_dbname);
log("db: Done saving %s/%s-saved.dat (wrote %"INT64" bytes)",
THIS->m_dir,THIS->m_dbname,THIS->m_bytesWritten);
// . call callback
if ( THIS->m_callback ) THIS->m_callback ( THIS->m_state );
}
@ -2488,13 +2488,29 @@ bool RdbTree::fastSave_r() {
char s[1024];
sprintf ( s , "%s/%s-saving.dat", m_dir , m_dbname );
int fd = ::open ( s ,
O_RDWR | O_CREAT | O_TRUNC , S_IRUSR | S_IWUSR |
S_IRGRP | S_IWGRP | S_IROTH);
O_RDWR | O_CREAT | O_TRUNC ,
getFileCreationFlags() );
// S_IRUSR | S_IWUSR |
// S_IRGRP | S_IWGRP | S_IROTH);
if ( fd < 0 ) {
m_saveErrno = errno;
return log("db: Could not open %s for writing: %s.",
s,mstrerror(errno));
}
redo:
// verify the tree
if ( g_conf.m_verifyWrites ) {
log("db: verify writes is enabled, checking tree before "
"saving.");
if ( ! checkTree( false , true ) ) {
log("db: fixing tree and re-checking");
fixTree ( );
goto redo;
}
}
// clear our own errno
errno = 0;
// . save the header

@ -431,7 +431,7 @@ bool SafeBuf::reserve(int32_t i , char *label, bool clearIt ) {
//buffer size.
bool SafeBuf::reserve2x(int32_t i, char *label) {
//watch out for overflow!
if((m_capacity << 1) + i < 0) return false;
if((m_capacity << 1) + i < m_capacity) return false;
if(i + m_length >= m_capacity)
return reserve(m_capacity + i,label);
else return true;
@ -449,8 +449,9 @@ int32_t SafeBuf::save ( char *fullFilename ) {
int32_t SafeBuf::dumpToFile(char *filename ) {
retry22:
int32_t fd = open ( filename , O_CREAT | O_WRONLY | O_TRUNC,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH );
int32_t fd = open ( filename , O_CREAT | O_WRONLY | O_TRUNC ,
getFileCreationFlags() );
//S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH );
if ( fd < 0 ) {
// valgrind
if ( errno == EINTR ) goto retry22;
@ -484,8 +485,9 @@ int32_t SafeBuf::safeSave (char *filename ) {
fn.safePrintf( "%s.saving",filename );
int32_t fd = open ( fn.getBufStart() ,
O_CREAT | O_WRONLY | O_TRUNC,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH );
O_CREAT | O_WRONLY | O_TRUNC ,
getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH );
if ( fd < 0 ) {
// valgrind
if ( errno == EINTR ) goto retry22;
@ -571,8 +573,8 @@ int32_t SafeBuf::fillFromFile(char *filename) {
reserve(results.st_size+1);
retry:
int32_t fd = open ( filename , O_RDONLY,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH );
int32_t fd = open ( filename , O_RDONLY , getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH );
if ( ! fd ) {
// valgrind
if ( errno == EINTR ) goto retry;

@ -1805,7 +1805,8 @@ bool Speller::createUnifiedDict (){
// then open a new one for appending
int fdw = open ( ff ,
O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
getFileCreationFlags());
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fdw < 0 ){
return log("lang: Could not open for %s "
"writing: %s.",ff, strerror(errno));

@ -4785,7 +4785,8 @@ bool SpiderColl::scanListForWinners ( ) {
// firstip in the record!
if ( sreq->m_firstIp != firstIp ) {
log("spider: request %s firstip does not match "
"firstip in key",sreq->m_url);
"firstip in key collnum=%i",sreq->m_url,
(int)m_collnum);
log("spider: ip1=%s",iptoa(sreq->m_firstIp));
log("spider: ip2=%s",iptoa(firstIp));
continue;
@ -11504,6 +11505,16 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
// never repeated after that!
errCode != EBADIP &&
// assume diffbot is temporarily experiencing errs
// but the crawl, if recurring, should retry these
// at a later point
errCode != EDIFFBOTUNABLETOAPPLYRULES &&
errCode != EDIFFBOTCOULDNOTPARSE &&
errCode != EDIFFBOTCOULDNOTDOWNLOAD &&
errCode != EDIFFBOTINVALIDAPI &&
errCode != EDIFFBOTVERSIONREQ &&
errCode != EDIFFBOTURLPROCESSERROR &&
errCode != EDIFFBOTTOKENEXPIRED &&
errCode != EDIFFBOTUNKNOWNERROR &&
errCode != EDIFFBOTINTERNALERROR &&
// if diffbot received empty content when d'lding
errCode != EDIFFBOTEMPTYCONTENT &&

@ -65,21 +65,19 @@ static Label s_labels[] = {
// . 300MB/s is max read rate regardless to stop graph shrinkage
// . use 1KB as the min resolution per pixel
// . stored in Bps so use 1/1000 as scalar to get into KBps
{ GRAPH_QUANTITY,200,"disk_read",1,"%.0f MBps",1.0/(1000.0*1000.0),0x000000,
"disk read"},
{ GRAPH_QUANTITY,200,"disk_read",1,"%.0f MBps",1.0/(1000.0*1000.0),0x000000,"disk read"},
// . 300MB/s is max write rate regardless to stop graph shrinkage
// . use 1KB as the min resolution per pixel
// . stored in Bps so use 1/1000 as scalar to get into KBps
{GRAPH_QUANTITY,200,"disk_write",1,"%.0f Mbps",1.0/(1000.0*1000.0), 0xff0000,
"disk write"},
{GRAPH_QUANTITY,200,"disk_write",1,"%.0f Mbps",1.0/(1000.0*1000.0), 0xff0000, "disk write"},
// . 20 is the max dps regardless to stop graph shrinkage
// . use .03 qps as the min resolution per pixel
{GRAPH_OPS,20,"parse_doc", .005,"%.1f dps" , 1.0 , 0x00fea915,"parsed doc" },
{GRAPH_QUANTITY_PER_OP,1000,"docs_per_second", .005,"%.1f docs" , .001 , 0x1F2F5C,"docs per second" },
{GRAPH_QUANTITY_PER_OP,-1,"docs_per_second", .1,"%.1f docs per second" , -1 , 0x1F2F5C,"*successfully* indexed docs per second" },
// . use .1 * 1000 docs as the min resolution per pixel
// . max = -1, means dynamic size the ymax!
@ -88,7 +86,7 @@ static Label s_labels[] = {
// . make it 2M now not 50M. seems like it is per pixel and theres
// like 1000 pixels vertically. but we need to autoscale it
// eventually
{GRAPH_QUANTITY,2000000.0,"docs_indexed", .1,"%.0fK docs" , .001 , 0x00cc0099,"docs indexed" }
{GRAPH_QUANTITY,-1,"docs_indexed", .1,"%.0f docs" , -1, 0x00cc0099,"docs indexed" }
//{ "termlist_intersect",0x0000ff00},
@ -122,6 +120,7 @@ Label *Statsdb::getLabel ( int32_t labelHash ) {
return *label;
}
Statsdb::Statsdb ( ) {
m_init = false;
m_disabled = true;
@ -246,6 +245,8 @@ void flushStatsWrapper ( int fd , void *state ) {
void Statsdb::addDocsIndexed ( ) {
if ( ! isClockInSync() ) return;
if ( g_hostdb.hasDeadHost() ) return;
// only host #0 needs this
if ( g_hostdb.m_hostId != 0 ) return;
@ -270,18 +271,23 @@ void Statsdb::addDocsIndexed ( ) {
// divide by # of groups
total /= g_hostdb.getNumHostsPerShard();
// skip if no change
if ( total == s_lastTotal ) return;
int32_t docsIndexedInInterval = total - s_lastTotal;
float docsPerSecond = docsIndexedInInterval / (float)interval;
s_lastTotal = total;
log("build: total docs indexed: %f. docs per second %f %i %i", (float)total, docsPerSecond, docsIndexedInInterval, interval);
// add it if changed though
int64_t nowms = gettimeofdayInMillisecondsGlobal();
addStat ( MAX_NICENESS,"docs_indexed", nowms, nowms, (float)total );
addStat ( MAX_NICENESS,"docs_per_second", nowms, nowms, docsPerSecond );
// Prevent a datapoint which adds all of the docs indexed to date.
if( s_lastTotal != 0 ) {
addStat ( MAX_NICENESS,"docs_per_second", nowms, nowms, docsPerSecond );
}
s_lastTotal = total;
}
// . m_key bitmap in statsdb:
@ -896,12 +902,13 @@ char *Statsdb::plotGraph ( char *pstart ,
bool needMax = true;
float ymin = 0.0;
float ymax = 0.0;
float yscalar = label->m_yscalar;
char *p = pstart;
for ( ; p < pend ; p += 12 ) {
// breathe
QUICKPOLL ( m_niceness );
if ( m_gw.getLength() > 10000000 ) break;
// get the y
float y2 = *(float *)(p+4);
// get color of this point
@ -909,7 +916,8 @@ char *Statsdb::plotGraph ( char *pstart ,
// stop if not us
if ( gh != graphHash ) continue;
// put into scaled space right away
y2 = y2 * label->m_yscalar;
if (label->m_yscalar >= 0)
y2 = y2 * label->m_yscalar;
// . limit y to absolute max
// . these units should be scaled as well!
if ( y2 > label->m_absYMax && label->m_absYMax > 0.0 )
@ -922,13 +930,21 @@ char *Statsdb::plotGraph ( char *pstart ,
}
// force to zero for now
ymin = 0.0;
//ymin = 0.0;
// . and force to ymax for now as well
// . -1 indicates dynamic though!
if ( label->m_absYMax > 0.0 ) ymax = label->m_absYMax;
// add a 20% ceiling
else ymax *= 1.20;
// else ymax *= 1.20;
if( label->m_yscalar <= 0 ) {
if(ymax == ymin) {
yscalar = 0;
} else {
yscalar = (float)DY2 / (ymax - ymin);
}
}
// return that!
char *retp = p;
@ -951,7 +967,7 @@ char *Statsdb::plotGraph ( char *pstart ,
// . pad y range if total range is small
// . only do this for certain types of stats, like qps and disk i/o
if ( ourDiff < minDiff ) {
if ( label->m_yscalar >=0 && ourDiff < minDiff ) {
float pad = (minDiff - ourDiff) / 2;
// pad it out
ymin -= pad ;
@ -981,16 +997,23 @@ char *Statsdb::plotGraph ( char *pstart ,
for ( ; p < pend ; ) {
// breathe
QUICKPOLL ( m_niceness );
if ( m_gw.getLength() > 10000000 ) break;
// first is x pixel pos
int32_t x2 = *(int32_t *)p; p += 4;
// then y pos
float y2 = *(float *)p; p += 4;
// scale it right away
y2 *= label->m_yscalar;
if(label->m_yscalar < 0) {
y2 = (y2 - ymin) * yscalar;
}
else {
y2 *= yscalar;
}
// adjust
if ( y2 > ymax ) y2 = ymax;
if ( y2 < 0 ) y2 = 0;
// then graphHash
int32_t gh = *(int32_t *)p; p += 4;
@ -1003,8 +1026,10 @@ char *Statsdb::plotGraph ( char *pstart ,
float y1 = lasty;
// normalize y into pixel space
y2 = ((float)DY2 * (y2 - ymin)) / (ymax-ymin);
if(label->m_yscalar >= 0 && ymax != ymin) {
y2 = ((float)DY2 * (y2 - ymin)) / (ymax-ymin);
}
// set lasts for next iteration of this loop
lastx = x2;
lasty = y2;
@ -1073,13 +1098,20 @@ char *Statsdb::plotGraph ( char *pstart ,
}
float lastZ = -1;
for ( float z = ymin ; z < ymax ; z += deltaz ) {
// breathe
QUICKPOLL ( m_niceness );
// draw it
drawHR ( z , ymin , ymax , m_gw , label , zoff , color );
if(z == lastZ) break;
lastZ = z;
//if ( m_gw.getLength() > 10000000 ) break;
}
if ( m_gw.getLength() > 10000000 )
log("statsdb: graph too big");
return retp;
//#endif
@ -1158,7 +1190,7 @@ void Statsdb::drawHR ( float z ,
"font-size:14px;"
"min-height:20px;"
"min-width:3px;\""
" class=\"color-%"XINT32"\";"
" class=\"color-%"XINT32"\""
">%s</div>\n"
, (int32_t)(m_bx)
, (int32_t)z2 +m_by
@ -1194,6 +1226,13 @@ bool Statsdb::processList ( ) {
m_done = true;
}
// HACK: the user can request all of the events, it can
// become quite large. so limit to 100 mb right now.
if( m_sb3.length() > 100000000) {
log("statsdb: truncating statsdb results.");
m_done = true;
}
//
// all these points are accumulated into 1-second buckets
@ -1590,7 +1629,7 @@ void Statsdb::drawLine3 ( SafeBuf &sb ,
"z-index:-5;"
"min-height:%"INT32"px;"
"min-width:%"INT32"px;\""
"class=\"color-%"XINT32"\"></div>\n"
" class=\"color-%"XINT32"\"></div>\n"
, x1 + m_bx
, (fy1 - width/2) + m_by
, color
@ -1599,3 +1638,5 @@ void Statsdb::drawLine3 ( SafeBuf &sb ,
, color
);
}

@ -2289,11 +2289,11 @@ void TcpServer::destroySocket ( TcpSocket *s ) {
sb.safeTruncateEllipsis(s->m_sendBuf,
s->m_sendBufSize,
200);
sb.safePrintf(" readbuf=");
sb.safePrintf(" bytesread=%i readbuf=",(int)s->m_readOffset);
if ( s->m_readBuf )
sb.safeTruncateEllipsis(s->m_readBuf,
s->m_readBufSize,
200);
s->m_readOffset,
2000);
log("%s",sb.getBufStart());
}

@ -323,7 +323,7 @@ bool Threads::init ( ) {
if ( ! g_threads.registerType ( INTERSECT_THREAD,max,200) )
return log("thread: Failed to register thread type." );
// filter thread spawned to call popen() to filter an http reply
if ( ! g_threads.registerType ( FILTER_THREAD , 1/*maxThreads*/,300) )
if ( ! g_threads.registerType ( FILTER_THREAD, 2/*maxThreads*/,300) )
return log("thread: Failed to register thread type." );
// RdbTree uses this to save itself
if ( ! g_threads.registerType ( SAVETREE_THREAD,1/*maxThreads*/,100) )

@ -286,6 +286,7 @@ bool UdpServer::init ( uint16_t port, UdpProtocol *proto, int32_t niceness,
// no requests waiting yet
m_requestsInWaiting = 0;
// special count
m_msg07sInWaiting = 0;
m_msg10sInWaiting = 0;
m_msgc1sInWaiting = 0;
//m_msgDsInWaiting = 0;
@ -1005,7 +1006,7 @@ UdpSlot *UdpServer::getBestSlotToSend ( int64_t now ) {
UdpSlot *maxi = NULL;
int32_t score;
//UdpSlot *slot;
// . we send dgrams with the lowest "score" first
// . we send dgrams with the lowest "score" first
// . the "score" is just number of ACKs you're waiting for
// . that way transmissions that are the most caught up to their ACKs
// are considered faster so we send to them first
@ -1482,6 +1483,9 @@ int32_t UdpServer::readSock_ass ( UdpSlot **slotPtr , int64_t now ) {
// rate, these are pretty lightweight. msg 0x10 reply gen times
// are VERY low. MDW
bool getSlot = true;
if ( msgType == 0x07 && m_msg07sInWaiting >= 100 )
getSlot = false;
if ( msgType == 0x10 && m_msg10sInWaiting >= 50 )
getSlot = false;
// crawl update info from Spider.cpp
@ -1671,6 +1675,7 @@ int32_t UdpServer::readSock_ass ( UdpSlot **slotPtr , int64_t now ) {
// if we connected to a request slot, count it
m_requestsInWaiting++;
// special count
if ( msgType == 0x07 ) m_msg07sInWaiting++;
if ( msgType == 0x10 ) m_msg10sInWaiting++;
if ( msgType == 0xc1 ) m_msgc1sInWaiting++;
//if ( msgType == 0xd ) m_msgDsInWaiting++;
@ -3122,6 +3127,7 @@ void UdpServer::destroySlot ( UdpSlot *slot ) {
// one less request in waiting
m_requestsInWaiting--;
// special count
if ( slot->m_msgType == 0x07 ) m_msg07sInWaiting--;
if ( slot->m_msgType == 0x10 ) m_msg10sInWaiting--;
if ( slot->m_msgType == 0xc1 ) m_msgc1sInWaiting--;
//if ( slot->m_msgType == 0xd ) m_msgDsInWaiting--;

@ -390,6 +390,7 @@ class UdpServer {
int32_t m_requestsInWaiting;
// like m_requestsInWaiting but requests which spawn other requests
int32_t m_msg07sInWaiting;
int32_t m_msg10sInWaiting;
int32_t m_msgc1sInWaiting;
//int32_t m_msgDsInWaiting;

@ -1258,6 +1258,12 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
utf8Content = m_mime.getContent();
}
// use this to avoid ip lookup if it is not zero
if ( forcedIp ) {
m_ip = forcedIp;
m_ipValid = true;
}
// sometimes they supply the content they want! like when zaks'
// injects pages from PageInject.cpp
if ( utf8Content ) {
@ -1290,11 +1296,6 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
// use this ip as well for now to avoid ip lookup
//m_ip = atoip("127.0.0.1");
//m_ipValid = true;
// use this to avoid ip lookup if it is not zero
if ( forcedIp ) {
m_ip = forcedIp;
m_ipValid = true;
}
// do not need robots.txt then
m_isAllowed = true;
m_isAllowedValid = true;
@ -12077,11 +12078,25 @@ XmlDoc **XmlDoc::getRootXmlDoc ( int32_t maxCacheAge ) {
mnew ( m_rootDoc , sizeof(XmlDoc),"xmldoc3");
// if we had the title rec, set from that
if ( *rtr ) {
m_rootDoc->set2 ( m_rootTitleRec ,
m_rootTitleRecSize , // maxSize ,
cr->m_coll ,
NULL , // pbuf
m_niceness );
if ( ! m_rootDoc->set2 ( m_rootTitleRec ,
m_rootTitleRecSize , // maxSize ,
cr->m_coll ,
NULL , // pbuf
m_niceness ) ) {
// it was corrupted... delete this
// possibly printed
// " uncompress uncompressed size=..." bad uncompress
log("build: rootdoc set2 failed");
mdelete ( m_rootDoc , sizeof(XmlDoc) , "xdnuke");
delete ( m_rootDoc );
// call it empty for now, we don't want to return
// NULL with g_errno set because it could stop
// the whole indexing pipeline
m_rootDoc = NULL;
m_rootDocValid = true;
return &m_rootDoc;
//return NULL;
}
}
// . otherwise, set the url and download it on demand
// . this junk copied from the contactDoc->* stuff below
@ -16948,8 +16963,9 @@ char **XmlDoc::getHttpReply2 ( ) {
bool isInjecting = getIsInjecting();
if ( ! isInjecting && m_sreqValid && m_sreq.m_hopCount == 0 )
r->m_isRootSeedUrl = 1;
if ( ! isInjecting && m_hopCountValid && m_hopCount == 0 )
r->m_isRootSeedUrl = 1;
// only if it was a seed for now... so comment out
// if ( ! isInjecting && m_hopCountValid && m_hopCount == 0 )
// r->m_isRootSeedUrl = 1;
// sanity check
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
@ -17814,6 +17830,91 @@ Url **XmlDoc::getCanonicalRedirUrl ( ) {
return &m_canonicalRedirUrlPtr;
}
// returns false if none found
bool setMetaRedirUrlFromTag ( char *p , Url *metaRedirUrl , char niceness ,
Url *cu ) {
// limit scan
char *limit = p + 30;
// skip whitespace
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
// must be a num
if ( ! is_digit(*p) ) return false;
// init delay
int32_t delay = atol ( p );
// ignore long delays
if ( delay >= 10 ) return false;
// now find the semicolon, if any
for ( ; *p && p < limit && *p != ';' ; p++ );
// must have semicolon
if ( *p != ';' ) return false;
// skip it
p++;
// skip whitespace some more
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
// must have URL
if ( strncasecmp(p,"URL",3) ) return false;
// skip that
p += 3;
// skip white space
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
// then an equal sign
if ( *p != '=' ) return false;
// skip equal sign
p++;
// them maybe more whitespace
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
// an optional quote
if ( *p == '\"' ) p++;
// can also be a single quote!
if ( *p == '\'' ) p++;
// set the url start
char *url = p;
// now advance to next quote or space or >
for ( ; *p && !is_wspace_a(*p) &&
*p !='\'' &&
*p !='\"' &&
*p !='>' ;
p++);
// that is the end
char *urlEnd = p;
// get size
int32_t usize = urlEnd - url;
// skip if too big
if ( usize > 1024 ) {
log("build: meta redirurl of %"INT32" bytes too big",usize);
return false;
}
// get our current utl
//Url *cu = getCurrentUrl();
// decode what we got
char decoded[MAX_URL_LEN];
// convert &amp; to "&"
int32_t decBytes = htmlDecode(decoded,url,usize,false,niceness);
decoded[decBytes]='\0';
// . then the url
// . set the url to the one in the redirect tag
// . but if the http-equiv meta redirect url starts with a '?'
// then just replace our cgi with that one
if ( *url == '?' ) {
char foob[MAX_URL_LEN*2];
char *pf = foob;
int32_t cuBytes = cu->getPathEnd() - cu->getUrl();
gbmemcpy(foob,cu->getUrl(),cuBytes);
pf += cuBytes;
gbmemcpy ( pf , decoded , decBytes );
pf += decBytes;
*pf = '\0';
metaRedirUrl->set(foob);
}
// . otherwise, append it right on
// . use "url" as the base Url
// . it may be the original url or the one we redirected to
// . redirUrl is set to the original at the top
else
// addWWW = false, stripSessId=true
metaRedirUrl->set(cu,decoded,decBytes,false,true);
return true;
}
// scan document for <meta http-equiv="refresh" content="0;URL=xxx">
@ -17840,6 +17941,14 @@ Url **XmlDoc::getMetaRedirUrl ( ) {
if ( cr->m_recycleContent || m_recycleContent )
return &m_metaRedirUrlPtr;
// will this work in here?
//uint8_t *ct = getContentType();
//if ( ! ct ) return NULL;
Url *cu = getCurrentUrl();
bool gotOne = false;
// advance a bit, we are initially looking for the 'v' char
p += 10;
// begin the string matching loop
@ -17879,91 +17988,64 @@ Url **XmlDoc::getMetaRedirUrl ( ) {
p += 8;
// skip possible quote
if ( *p == '\"' ) p++;
// limit scan
limit = p + 30;
// skip whitespace
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
// must be a num
if ( ! is_digit(*p) ) continue;
// init delay
int32_t delay = atol ( p );
// ignore int32_t delays
if ( delay >= 10 ) continue;
// now find the semicolon, if any
for ( ; *p && p < limit && *p != ';' ; p++ );
// must have semicolon
if ( *p != ';' ) continue;
// skip it
p++;
// skip whitespace some more
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
// must have URL
if ( strncasecmp(p,"URL",3) ) continue;
// skip that
p += 3;
// skip white space
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
// then an equal sign
if ( *p != '=' ) continue;
// skip equal sign
p++;
// them maybe more whitespace
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
// an optional quote
if ( *p == '\"' ) p++;
// can also be a single quote!
if ( *p == '\'' ) p++;
// set the url start
char *url = p;
// now advance to next quote or space or >
for ( ; *p && !is_wspace_a(*p) &&
*p !='\'' &&
*p !='\"' &&
*p !='>' ;
p++);
// that is the end
char *urlEnd = p;
// get size
int32_t usize = urlEnd - url;
// skip if too big
if ( usize > 1024 ) {
log("build: meta redirurl of %"INT32" bytes too big",usize);
// PARSE OUT THE URL
Url dummy;
if ( ! setMetaRedirUrlFromTag ( p , &dummy , m_niceness ,cu))
continue;
gotOne = true;
break;
}
if ( ! gotOne )
return &m_metaRedirUrlPtr;
// to fix issue with scripts containing
// document.write('<meta http-equiv="Refresh" content="0;URL=http://ww
// we have to get the Xml. we can't call getXml() because of
// recursion bugs so just do it directly here
Xml xml;
if ( ! xml.set ( m_httpReply ,
m_httpReplySize - 1, // make it a length
false , // ownData?
0 , // allocSize
false , // pure xml?
m_version ,
false , // setParentsArg?
m_niceness ,
// assume html since getContentType() is recursive
// on us.
CT_HTML ) ) // *ct ) )
// return NULL on error with g_errno set
return NULL;
XmlNode *nodes = xml.getNodes();
int32_t n = xml.getNumNodes();
// find the first meta summary node
for ( int32_t i = 0 ; i < n ; i++ ) {
// continue if not a meta tag
if ( nodes[i].m_nodeId != 68 ) continue;
// only get content for <meta http-equiv=..>
int32_t tagLen;
char *tag ;
tag = xml.getString ( i , "http-equiv" , &tagLen );
// skip if empty
if ( ! tag || tagLen <= 0 ) continue;
// if not a refresh, skip it
if ( strncasecmp ( tag , "refresh", 7 ) ) continue;
// get the content
tag = xml.getString ( i ,"content", &tagLen );
// skip if empty
if ( ! tag || tagLen <= 0 ) continue;
// PARSE OUT THE URL
if (!setMetaRedirUrlFromTag(p,&m_metaRedirUrl,m_niceness,cu) )
continue;
}
// get our current utl
Url *cu = getCurrentUrl();
// decode what we got
char decoded[MAX_URL_LEN];
// convert &amp; to "&"
int32_t decBytes = htmlDecode(decoded,url,usize,false,m_niceness);
decoded[decBytes]='\0';
// . then the url
// . set the url to the one in the redirect tag
// . but if the http-equiv meta redirect url starts with a '?'
// then just replace our cgi with that one
if ( *url == '?' ) {
char foob[MAX_URL_LEN*2];
char *pf = foob;
int32_t cuBytes = cu->getPathEnd() - cu->getUrl();
gbmemcpy(foob,cu->getUrl(),cuBytes);
pf += cuBytes;
gbmemcpy ( pf , decoded , decBytes );
pf += decBytes;
*pf = '\0';
m_metaRedirUrl.set(foob);
}
// . otherwise, append it right on
// . use "url" as the base Url
// . it may be the original url or the one we redirected to
// . redirUrl is set to the original at the top
else
// addWWW = false, stripSessId=true
m_metaRedirUrl.set(cu,decoded,decBytes,false,true);
// set it
m_metaRedirUrlPtr = &m_metaRedirUrl;
// return it
break;
return &m_metaRedirUrlPtr;
}
// nothing found
return &m_metaRedirUrlPtr;
}
@ -18566,7 +18648,7 @@ void XmlDoc::filterStart_r ( bool amThread ) {
errno = 0;
// open the input file
retry11:
int fd = open ( in , O_WRONLY | O_CREAT , S_IRWXU );
int fd = open ( in , O_WRONLY | O_CREAT , getFileCreationFlags() );
if ( fd < 0 ) {
// valgrind
if ( errno == EINTR ) goto retry11;
@ -34730,7 +34812,7 @@ int gbcompress7 ( unsigned char *dest ,
errno = 0;
// open the input file
retry11:
int fd = open ( in , O_WRONLY | O_CREAT , S_IRWXU );
int fd = open ( in , O_WRONLY | O_CREAT , getFileCreationFlags() );
if ( fd < 0 ) {
// valgrind
if ( errno == EINTR ) goto retry11;
@ -39371,6 +39453,12 @@ char **XmlDoc::getRootTitleBuf ( ) {
// sanity check, must include the null ni the size
if ( m_rootTitleBufSize > 0 &&
m_rootTitleBuf [ m_rootTitleBufSize - 1 ] ) {
log("build: bad root titlebuf size not end in null char for "
"collnum=%i",(int)m_collnum);
ptr_rootTitleBuf = NULL;
size_rootTitleBuf = 0;
m_rootTitleBufValid = true;
return (char **)&m_rootTitleBuf;
char *xx=NULL;*xx=0;
//m_rootTitleBuf [ m_rootTitleBufSize - 1 ] = '\0';
//m_rootTitleBufSize++;

@ -2618,4 +2618,3 @@ bool verifyUtf8 ( char *txt ) {
int32_t tlen = gbstrlen(txt);
return verifyUtf8(txt,tlen);
}

@ -236,7 +236,7 @@ int filterContent ( char *buf , int32_t n , int32_t mimeLen , char ctype , int32
//fprintf(stderr,"in=%s\n",in);
int fd = open ( in , O_CREAT | O_RDWR , S_IRWXU );
int fd = open ( in , O_CREAT | O_RDWR , S_IRWXU | S_IRWXG );
if ( fd < 0 ) {
fprintf(stderr,"gbfilter: open: %s\n",strerror(errno));
return -1;

@ -5022,7 +5022,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
if ( ! f.doesExist() ) target = "gb";
sprintf(tmp,
"scp " // blowfish is faster
"scp " // blowfish is faster
"%s%s "
"%s:%s/gb.installed%s",
dir,
@ -16849,7 +16849,7 @@ void dumpCachedRecs (char *coll,int32_t startFileNum,int32_t numFiles,bool inclu
int32_t filenum = 0;
char filename[64];
sprintf(filename, "%s-%"INT32".ddmp", coll, filenum);
int FD = open(filename, O_CREAT|O_WRONLY, S_IROTH);
//int FD = open(filename, O_CREAT|O_WRONLY, S_IROTH);
int32_t numDumped = 0;
uint32_t bytesDumped = 0;
loop:
@ -17016,7 +17016,7 @@ void dumpCachedRecs (char *coll,int32_t startFileNum,int32_t numFiles,bool inclu
filenum++;
sprintf(filename, "%s-%"INT32".ddmp", coll, filenum);
close(FD);
FD = open(filename, O_CREAT|O_WRONLY, S_IROTH);
//FD = open(filename, O_CREAT|O_WRONLY, S_IROTH);
bytesDumped = 0;
fprintf(stderr, "Started new file: %s. starts at docId: %"INT64".\n",filename, lastDocId);
}

8
qa.cpp

@ -248,10 +248,10 @@ void makeQADir ( ) {
char dir[1024];
snprintf(dir,1000,"%sqa",g_hostdb.m_dir);
log("mkdir mkdir %s",dir);
int32_t status = ::mkdir ( dir ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH );
int32_t status = ::mkdir ( dir ,getDirCreationFlags() );
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH );
if ( status == -1 && errno != EEXIST && errno )
log("qa: Failed to make directory %s: %s.",
dir,mstrerror(errno));

@ -14,6 +14,8 @@ import time
import flask
import signal, os
import random
from itertools import repeat
app = flask.Flask(__name__)
app.secret_key = 'oaisj84alwsdkjhf9238u'
@ -118,7 +120,8 @@ def injectItem(item, db, mode):
postVars = {'url':'http://archive.org/download/%s/%s' %
(item,ff['name']),
'metadata':json.dumps(itemMetadata),
'c':'ait'}
'c':'ait',
'spiderlinks':0}
start = time.time()
if mode == 'production':
try:
@ -315,21 +318,39 @@ def main():
if len(sys.argv) == 3:
if sys.argv[1] == 'force':
itemName = sys.argv[2]
db = getDb()
injectItem(itemName, db, 'production')
sys.exit(0)
if sys.argv[1] == 'forcefile':
global staleTime
staleTime = datetime.timedelta(0,0,0)
from multiprocessing.pool import ThreadPool
fileName = sys.argv[2]
items = filter(lambda x: x, open(fileName, 'r').read().split('\n'))
pool = ThreadPool(processes=len(items))
#print zip(files, repeat(getDb(), len(files)), repeat('production', len(files)))
def injectItemTupleWrapper(itemName):
db = getDb()
ret = injectItem(itemName, db, 'production')
db.close()
return ret
answer = pool.map(injectItemTupleWrapper, items)
sys.exit(0)
if sys.argv[1] == 'run':
threads = int(sys.argv[2])
runInjects(threads)
# else:
# #getPage(3)
# from multiprocessing.pool import ThreadPool
# pool = ThreadPool(processes=150)
# pool.map(getPage, xrange(1,1300))
def runInjects(threads, mode='production'):
from multiprocessing.pool import ThreadPool
pool = ThreadPool(processes=threads)
try:
from itertools import repeat
maxPages = 1300
answer = pool.map(getPage, zip(xrange(1,maxPages), repeat(mode, maxPages)))
except (KeyboardInterrupt, SystemExit):

@ -100,7 +100,7 @@ def getSplitTime():
def copyToTwins(fname):
def copyToTwins(fname, backToFront=False):
fh = open(fname, 'r')
ret = {}
hosts = []
@ -117,23 +117,25 @@ def copyToTwins(fname):
continue
#print directory, ip1, note
step = len(hosts)/2
hostPlex = {}
someIp = None
cmds = []
for hostId, dnsPort, httpsPort, httpPort, udbPort,ip1, ip2, directory, note in hosts[:step]:
if ip1 not in hostPlex:
hostPlex[ip1] = []
someIp = ip1
hostPlex[ip1].append('scp -r %s:%s* %s:%s. ' % (ip1, directory, (hosts[hostId + step][5]), (hosts[hostId + step][7])))
backHostId, backDnsPort, backHttpsPort, backHttpPort, backUdbPort,backIp1, backIp2, backDirectory, backNote = hosts[hostId + step]
if note != directory:
print 'oh looks like you overlooked host %s' % hostId
if backNote != backDirectory:
print 'oh looks like you overlooked host %s' % backHostId
if backToFront:
cmd = 'scp -r %s:%s* %s:%s. &' % (backIp1, backDirectory, ip1, directory )
else:
cmd = 'scp -r %s:%s* %s:%s. &' % (ip1, directory, backIp1, backDirectory)
cmds.append(cmd)
#print 'scp -r %s:%s* %s:%s. &' % (ip1, directory, (hosts[hostId + step][5]), (hosts[hostId + step][7]))
while len(hostPlex[someIp]) > 0:
cmd = []
for cmd in cmds:
print cmd
for ip in hostPlex.iterkeys():
cmd.append(hostPlex[ip].pop())
#print hostPlex[ip].pop()
print '&\n'.join(cmd), ';'
def testDiskSpeed(host, directory):

Binary file not shown.