Merge branch 'diffbot-testing' of github.com:gigablast/open-source-search-engine into diffbot-testing

Conflicts:
	Errno.cpp
	Errno.h
Matt
2016-01-11 15:30:53 -08:00
91 changed files with 7643 additions and 827 deletions

@ -33,7 +33,7 @@ BigFile::~BigFile () {
//#define O_DIRECT 040000
BigFile::BigFile () {
m_permissions = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH ;
//m_permissions = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH ;
m_flags = O_RDWR ; // | O_DIRECT;
m_usePartFiles = true;
// NULLify all ptrs to files
@ -289,7 +289,7 @@ bool BigFile::open ( int flags ,
m_flags = flags;
//m_pc = pc;
m_permissions = permissions;
//m_permissions = permissions;
m_isClosing = false;
// this is true except when parsing big warc files
m_usePartFiles = true;//usePartFiles;
@ -363,7 +363,7 @@ int BigFile::getfd ( int32_t n , bool forReading ) { // , int64_t *vfd ) {
}
// open it if not opened
if ( ! f->calledOpen() ) {
if ( ! f->open ( m_flags , m_permissions ) ) {
if ( ! f->open ( m_flags , getFileCreationFlags() ) ) {
log("disk: Failed to open file part #%"INT32".",n);
return -1;
}
@ -1481,6 +1481,15 @@ bool BigFile::chopHead ( int32_t part ,
return unlinkRename ( NULL, part, true, callback, state );
}
class UnlinkRenameState {
public:
char m_oldFilename [ 1024 ];
char m_newFilename [ 1024 ];
int m_fd;
File *m_file;
collnum_t m_collnum;
// set by the thread when it closes the fd (used below)
bool m_closedIt;
};
static void *renameWrapper_r ( void *state , ThreadEntry *t ) ;
static void *unlinkWrapper_r ( void *state , ThreadEntry *t ) ;
static void doneRenameWrapper ( void *state , ThreadEntry *t ) ;
@ -1604,6 +1613,38 @@ bool BigFile::unlinkRename ( // non-NULL for renames, NULL for unlinks
// save callback for when all parts are unlinked or renamed
m_callback = callback;
m_state = state;
#ifdef FIXBUG
// now use a special state in case RdbBase gets nuked
// because the collection gets deleted in the middle of this
UnlinkRenameState stackUr;
char *st =(char *)mmalloc( sizeof(UnlinkRenameState),"ulrnst");
UnlinkRenameState *urs = (UnlinkRenameState *)st;
if ( ! urs ) {
log("disk: failed to alloc unlinkrename state. "
"skipping thread.");
urs = &stackUr;
}
urs->m_fd = m_fd;
urs->m_collnum = collnum; // can we supply this now?
urs->m_file = this;
urs->m_closedIt = false;
makeFilename_r ( m_baseFilename.getBufStart() ,
NULL ,
i ,
urs->m_oldFilename ,
1024 );
// rename also takes the new name
if ( ! m_isUnlink )
makeFilename_r ( m_newBaseFilename.getBufStart() ,
m_newBaseFilenameDir.getBufStart(),
i ,
urs->m_newFilename ,
1024 );
if ( urs == &stackUr )
goto skipThread;
#endif
// . we spawn the thread here now
// . returns true on successful spawning
// . we can't make a disk thread cuz Threads.cpp checks its
@ -1668,6 +1709,30 @@ bool BigFile::unlinkRename ( // non-NULL for renames, NULL for unlinks
}
void *renameWrapper_r ( void *state , ThreadEntry *t ) {
#ifdef FIXBUG
UnlinkRenameState *urs = (UnlinkRenameState *)state;
if ( ::rename ( urs->m_oldFilename , urs->m_newFilename ) ) {
// reset errno and return true if file does not exist
if ( errno == ENOENT ) {
log("disk: file %s does not exist.",oldFilename);
errno = 0;
}
// otherwise, it's a more serious error i guess
else log("disk: rename %s to %s: %s",
urs->m_oldFilename,urs->m_newFilename,mstrerror(errno));
return NULL;
}
// we must close the file descriptor in the thread otherwise the
// file will not actually be renamed in this thread
//f->close1_r();
// we can't call f->close1_r() because f might have been deleted
// because the collection was deleted.
if ( close1ByFd_r( urs->m_fd) )
urs->m_closedIt = true;
return NULL;
#endif
// extract our class
File *f = (File *)state;
// . by getting the inode in the cache space the call to f->close()
@ -1721,6 +1786,16 @@ void *renameWrapper_r ( void *state , ThreadEntry *t ) {
}
void *unlinkWrapper_r ( void *state , ThreadEntry *t ) {
#ifdef FIXBUG
UnlinkRenameState *urs = (UnlinkRenameState *)state;
::unlink ( urs->m_oldFilename );
// we can't call f->close1_r() because f might have been deleted
// because the collection was deleted.
if ( close1ByFd_r( urs->m_fd) )
urs->m_closedIt = true;
return NULL;
#endif
// get ourselves
File *f = (File *)state;
// . by getting the inode in the cache space the call to delete(f)
@ -1742,6 +1817,25 @@ void *unlinkWrapper_r ( void *state , ThreadEntry *t ) {
}
void doneRenameWrapper ( void *state , ThreadEntry *t ) {
#ifdef FIXBUG
// if collection got nuked, then file will be invalid
// so when we nuke a collection we scan all threads for unlink/rename
// operations that reference files from the collection being nuked and
// set their m_collectionGotNuked flag to true
UnlinkRenameState *urs = (UnlinkRenameState *)state;
File *f = urs->m_file;
collnum_t cn = urs->m_collnum;
RdbBase *base = getRdbBase ( cn );
mfree ( urs , sizeof(UnlinkRenameState), "ulrnst" );
if ( ! base ) { // urs->m_collectionGotNuked ) {
log("bigfile: captured rename on nuked collection %i",(int)cn);
g_unlinkRenameThreads--;
return;
}
#endif
#ifndef FIXBUG
// extract our class
File *f = (File *)state;
#endif
// . finish the close
@ -1795,6 +1889,24 @@ void doneRenameWrapper ( void *state , ThreadEntry *t ) {
}
void doneUnlinkWrapper ( void *state , ThreadEntry *t ) {
#ifdef FIXBUG
// if collection got nuked, then file will be invalid
// so when we nuke a collection we scan all threads for unlink/rename
// operations that reference files from the collection being nuked and
// set their m_collectionGotNuked flag to true
UnlinkRenameState *urs = (UnlinkRenameState *)state;
File *f = urs->m_file;
collnum_t cn = urs->m_collnum;
RdbBase *base = getRdbBase ( cn );
mfree ( urs , sizeof(UnlinkRenameState), "ulrnst" );
if ( ! base ) { // urs->m_collectionGotNuked ) {
log("bigfile: captured unlink on nuked collection %i",(int)cn);
g_unlinkRenameThreads--;
return;
}
#endif
#ifndef FIXBUG
// extract our class
File *f = (File *)state;
#endif
// finish the close

@ -353,7 +353,7 @@ class BigFile {
SafeBuf m_newBaseFilenameDir ;//[256];
int32_t m_permissions;
//int32_t m_permissions;
int32_t m_flags;
// determined in open() override

@ -333,6 +333,9 @@ bool Collectiondb::addExistingColl ( char *coll, collnum_t collnum ) {
if ( cr->m_isCustomCrawl ) {
cr->m_getLinkInfo = false;
cr->m_computeSiteNumInlinks = false;
// limit each shard to 5 spiders per collection to prevent
// people from spidering the whole web and hogging resources
cr->m_maxNumSpiders = 5;
}
// we need to compile the regular expressions or update the url
@ -633,10 +636,11 @@ bool Collectiondb::addNewColl ( char *coll ,
// MDW: create the new directory
retry22:
if ( ::mkdir ( dname ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH ) ) {
if ( ::mkdir ( dname ,
getDirCreationFlags() ) ) {
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH ) ) {
// valgrind?
if ( errno == EINTR ) goto retry22;
g_errno = errno;
@ -1401,10 +1405,11 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
log("admin: Trying to create collection %s but "
"directory %s already exists on disk.",cr->m_coll,dname);
}
if ( ::mkdir ( dname ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH ) ) {
if ( ::mkdir ( dname ,
getDirCreationFlags() ) ) {
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH ) ) {
// valgrind?
//if ( errno == EINTR ) goto retry22;
//g_errno = errno;
@ -1971,6 +1976,29 @@ bool CollectionRec::load ( char *coll , int32_t i ) {
// it is binary now
gbmemcpy ( &m_localCrawlInfo , sb.getBufStart(),sb.length() );
// if it had corrupted data from saving corrupted mem zero it out
CrawlInfo *stats = &m_localCrawlInfo;
// point to the stats for that host
int64_t *ss = (int64_t *)stats;
// are stats crazy?
bool crazy = false;
for ( int32_t j = 0 ; j < NUMCRAWLSTATS ; j++ ) {
// crazy stat?
if ( *ss > 1000000000LL ||
*ss < -1000000000LL ) {
crazy = true;
break;
}
ss++;
}
if ( m_localCrawlInfo.m_collnum != m_collnum )
crazy = true;
if ( crazy ) {
log("coll: had crazy spider stats for coll %s. zeroing out.",
m_coll);
m_localCrawlInfo.reset();
}
if ( ! g_conf.m_doingCommandLine && ! g_collectiondb.m_initializing )
log("coll: Loaded %s (%"INT32") local hasurlsready=%"INT32"",
@ -3787,12 +3815,30 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
i++;
}
// don't bother re-spidering old pages if hopcount == maxhopcount
// and "only process new urls" is true, because then we don't need
// to harvest outlinks from them.
if ( m_diffbotOnlyProcessIfNewUrl && m_diffbotMaxHops > 0 &&
// only crawls, not bulk jobs
m_isCustomCrawl == 1 ) {
m_regExs[i].purge();
m_regExs[i].safePrintf("isindexed && hopcount==%"INT32,
m_diffbotMaxHops );
m_spiderPriorities [i] = 14;
m_spiderFreqs [i] = 0.0;
m_maxSpidersPerRule [i] = 0; // turn off spiders
m_harvestLinks [i] = false;
i++;
}
// diffbot needs to retry even on 500 or 404 errors since sometimes
// a seed url gets a 500 error mistakenly and it halts the crawl.
// so take out "!hastmperror".
m_regExs[i].set("errorcount>=1 && !hastmperror");
m_spiderPriorities [i] = 15;
m_spiderFreqs [i] = 0.0;
m_maxSpidersPerRule [i] = 0; // turn off spiders if not tmp error
m_spiderPriorities [i] = 14;
m_spiderFreqs [i] = 0.0416; // every hour
//m_maxSpidersPerRule [i] = 0; // turn off spiders if not tmp error
i++;
// and for docs that have errors respider once every 5 hours

@ -494,6 +494,7 @@ class CollectionRec {
char m_useSimplifiedRedirects ;
char m_useIfModifiedSince ;
char m_useTimeAxis ;
char m_indexWarcs;
char m_buildVecFromCont ;
int32_t m_maxPercentSimilarPublishDate;
char m_useSimilarityPublishDate;

@ -9,6 +9,25 @@
Conf g_conf;
static bool s_setUmask = false;
mode_t getFileCreationFlags() {
if ( ! s_setUmask ) {
s_setUmask = true;
umask ( 0 );
}
return S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH ;
}
mode_t getDirCreationFlags() {
if ( ! s_setUmask ) {
s_setUmask = true;
umask ( 0 );
}
return S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH |
S_IXUSR | S_IXGRP;
}
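A note on the two helpers above: because the umask is cleared to 0 on first use, the mode bits they return are applied to newly created files and directories exactly as given, i.e. rw-rw-r-- for files and rwxrwxr-- for directories. A minimal sketch of the calling pattern this commit adopts everywhere (the filename here is hypothetical):

    // hypothetical example of the open() pattern used throughout this commit
    int fd = ::open ( "example.dat" ,
                      O_RDWR | O_CREAT | O_TRUNC ,
                      getFileCreationFlags() ); // created as rw-rw-r--
    if ( fd < 0 )
        log("db: open example.dat: %s.",mstrerror(errno));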
Conf::Conf ( ) {
m_save = true;
m_doingCommandLine = false;

Conf.h

@ -43,6 +43,9 @@
#define MAX_GEOCODERS 4
mode_t getFileCreationFlags();
mode_t getDirCreationFlags ();
class Conf {
public:
@ -180,7 +183,9 @@ class Conf {
//bool m_tagdbUseSeals;
//int32_t m_tagdbMinFilesToMerge;
//bool m_tagdbSaveCache;
//bool m_makeAllFilesGroupWritable;
// catdb parameters
int32_t m_catdbMaxTreeMem;
//int32_t m_catdbMaxDiskPageCacheMem;

@ -2470,7 +2470,8 @@ Host *Dns::getResponsibleHost ( key_t key ) {
// get the hostNum that should handle this
int32_t hostId = key.n1 % hostdb->getNumHosts();
// return it if it is alive
if ( ! hostdb->isDead ( hostId ) ) return hostdb->getHost ( hostId );
Host* h = hostdb->getHost ( hostId );
if ( h->m_spiderEnabled && ! hostdb->isDead ( hostId ) ) return h;
// how many are up?
int32_t numAlive = hostdb->getNumHostsAlive();
// NULL if none
@ -2482,6 +2483,7 @@ Host *Dns::getResponsibleHost ( key_t key ) {
for ( int32_t i = 0 ; i < hostdb->m_numHosts ; i++ ) {
// get the ith host
Host *host = &hostdb->m_hosts[i];
if ( !host->m_spiderEnabled ) continue;
// skip him if he is dead
if ( hostdb->isDead ( host ) ) continue;
// count it if alive, continue if not our number

@ -196,6 +196,7 @@ case EDNSERROR : return "DNS lookup error";
case ETHREADSDISABLED:return "Threads Disabled";
case EMALFORMEDQUERY: return "Malformed query";
case ESHARDDOWN: return "One or more shards are down";
case EDOCWARC: return "Doc is WARC or ARC and support is disabled";
case EDIFFBOTREQUESTTIMEDOUTTHIRDPARTY: return "Diffbot request of third-party content timed out";
}
// if the remote error bit is clear it must be a regular errno

@ -201,6 +201,8 @@ enum {
ETHREADSDISABLED,
EMALFORMEDQUERY,
ESHARDDOWN,
EDOCWARC,
EWRONGSHARD,
EDIFFBOTREQUESTTIMEDOUTTHIRDPARTY
};
#endif

@ -238,7 +238,10 @@ bool File::open ( int flags , int permissions ) {
}
// save these in case we need to reopen in getfd()
m_flags = flags;
m_permissions = permissions;
//m_permissions = permissions;
// just override and use system settings so we can get the group
// writable/readable/executable bits if set that way in g_conf
//m_permissions = getFileCreationFlags();
m_calledOpen = true;
// sanity check
//int32_t ss = 0;
@ -668,7 +671,7 @@ int File::getfd () {
if ( fd == -1 ) {
t1 = gettimeofdayInMilliseconds();
retry7:
fd = ::open ( getFilename() , m_flags , m_permissions );
fd = ::open ( getFilename() , m_flags,getFileCreationFlags());
// valgrind
if ( fd == -1 && errno == EINTR ) goto retry7;
// 0 means stdout, right? why am i seeing it get assigned???
@ -676,7 +679,7 @@ int File::getfd () {
log("disk: Got fd of 0 when opening %s.",
getFilename());
if ( fd == 0 )
fd = ::open ( getFilename(), m_flags , m_permissions );
fd=::open(getFilename(),m_flags,getFileCreationFlags());
if ( fd == 0 )
log("disk: Got fd of 0 when opening2 %s.",
getFilename());

File.h

@ -193,7 +193,7 @@ class File {
// save the permission and flag sets in case of re-opening
int m_flags;
int m_permissions;
//int m_permissions;
char m_calledOpen;
char m_calledSet;

@ -623,8 +623,10 @@ bool HashTableX::save ( char *dir ,
char s[1024];
sprintf ( s , "%s/%s", dir , filename );
int fd = ::open ( s ,
O_RDWR | O_CREAT | O_TRUNC , S_IRUSR | S_IWUSR |
S_IRGRP | S_IWGRP | S_IROTH);
O_RDWR | O_CREAT | O_TRUNC ,
getFileCreationFlags() );
// S_IRUSR | S_IWUSR |
// S_IRGRP | S_IWGRP | S_IROTH);
if ( fd < 0 ) {
//m_saveErrno = errno;
return log("db: Could not open %s for writing: %s.",

@ -691,16 +691,26 @@ bool Hostdb::init ( int32_t hostIdArg , char *netName ,
//skip:
h->m_queryEnabled = true;
h->m_spiderEnabled = true;
// check for something after the working dir
h->m_note[0] = '\0';
if ( *p != '\n' ) {
// save the note
char *n = p;
while ( *n && *n != '\n' && n < pend ) n++;
int32_t noteSize = n - p;
if ( noteSize > 127 ) noteSize = 127;
gbmemcpy(h->m_note, p, noteSize);
h->m_note[noteSize] = '\0'; // NULL terminate for the strstr calls below
*p++ = '\0'; // NULL terminate for atoip
if(strstr(h->m_note, "noquery")) {
h->m_queryEnabled = false;
}
if(strstr(h->m_note, "nospider")) {
h->m_spiderEnabled = false;
}
}
else
*p = '\0';
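For reference, the note is free-form text trailing the working directory on a host's hosts.conf line, matched only by the strstr() calls above, so a line ending like this (other columns elided; values hypothetical) leaves the host answering queries but excluded from spidering:

    5 ... 192.168.1.5 192.168.1.5 /home/gb/ nospider

A note containing both "noquery" and "nospider" disables the host for both roles.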
@ -1642,6 +1652,56 @@ Host *Hostdb::getLiveHostInShard ( int32_t shardNum ) {
return &shard[0];
}
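// scan a shard round-robin for a host that has spidering enabled;
// if every host in the shard is marked nospider, log and crash,
// since the caller has nowhere to route the spider request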
int32_t Hostdb::getHostIdWithSpideringEnabled ( uint32_t shardNum ) {
Host *hosts = g_hostdb.getShard ( shardNum);
int32_t numHosts = g_hostdb.getNumHostsPerShard();
int32_t hostNum = 0;
int32_t numTried = 0;
while( !hosts [ hostNum ].m_spiderEnabled && numTried < numHosts ) {
hostNum = (hostNum+1) % numHosts;
numTried++;
}
if( !hosts [ hostNum ].m_spiderEnabled) {
log("build: cannot spider when entire shard has nospider enabled");
char *xx = NULL; *xx = 0;
}
return hosts [ hostNum ].m_hostId ;
}
// if niceness 0 can't pick noquery host.
// if niceness 1 can't pick nospider host.
Host *Hostdb::getLeastLoadedInShard ( uint32_t shardNum , char niceness ) {
int32_t minOutstandingRequests = 0x7fffffff;
int32_t minOutstandingRequestsIndex = -1;
Host *shard = getShard ( shardNum );
Host *bestDead = NULL;
for(int32_t i = 0; i < m_numHostsPerShard; i++) {
Host *hh = &shard[i];
// don't pick a 'no spider' host if niceness is 1
if ( niceness > 0 && ! hh->m_spiderEnabled ) continue;
// don't pick a 'no query' host if niceness is 0
if ( niceness == 0 && ! hh->m_queryEnabled ) continue;
if ( ! bestDead ) bestDead = hh;
if(isDead(hh)) continue;
// log("host %"INT32 " numOutstanding is %"INT32, hh->m_hostId,
// hh->m_pingInfo.m_udpSlotsInUseIncoming);
if ( hh->m_pingInfo.m_udpSlotsInUseIncoming >
minOutstandingRequests )
continue;
minOutstandingRequests =hh->m_pingInfo.m_udpSlotsInUseIncoming;
minOutstandingRequestsIndex = i;
}
// we should never return a nospider/noquery host depending on
// the niceness, so return bestDead
if(minOutstandingRequestsIndex == -1) return bestDead;//shard;
return &shard[minOutstandingRequestsIndex];
}
// if all are dead just return host #0
Host *Hostdb::getFirstAliveHost ( ) {
for ( int32_t i = 0 ; i < m_numHosts ; i++ )
@ -1990,8 +2050,9 @@ bool Hostdb::saveHostsConf ( ) {
sprintf ( filename, "%shosts.conf", m_dir );
log ( LOG_INFO, "conf: Writing hosts.conf file to: %s",
filename );
int32_t fd = open ( filename, O_CREAT|O_WRONLY|O_TRUNC,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH );
int32_t fd = open ( filename, O_CREAT|O_WRONLY|O_TRUNC ,
getFileCreationFlags() );
// S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH );
if ( fd < 0 ) {
log ( "conf: Failed to open %s for writing.", filename );
return false;

@ -211,6 +211,7 @@ class Host {
int64_t m_lastPing;
char m_tmpBuf[4];
int16_t m_tmpCount;
// . first time we sent an unanswered ping request to this host
// . used so we can determine when to send an email alert
@ -337,6 +338,10 @@ class Host {
int32_t m_lastTryError;
int32_t m_lastTryTime;
bool m_spiderEnabled;
bool m_queryEnabled;
//char m_requestBuf[MAX_PING_SIZE];
PingInfo m_pingInfo;//RequestBuf;
};
@ -445,6 +450,8 @@ class Hostdb {
//Host *getLiveHostInGroup ( int32_t groupId );
Host *getLiveHostInShard ( int32_t shardNum );
Host *getLeastLoadedInShard ( uint32_t shardNum , char niceness );
int32_t getHostIdWithSpideringEnabled ( uint32_t shardNum );
// in the entire cluster. return host #0 if its alive, otherwise
// host #1, etc.
@ -464,6 +471,7 @@ class Hostdb {
return &m_hosts[shardNum * m_numHostsPerShard];
};
//Host *getGroupFromGroupId ( uint32_t gid ) {
// return getGroup ( gid );
//};

@ -1778,8 +1778,8 @@ bool HttpServer::sendSuccessReply ( TcpSocket *s , char format, char *addMsg) {
else now = getTimeLocal();
// . buffer for the MIME request and brief html err msg
// . NOTE: ctime appends a \n to the time, so we don't need to
char msg[1024];
SafeBuf sb(msg,1024,0,false);
char msg[1524];
SafeBuf sb(msg,1524,0,false);
char *tt = asctime(gmtime ( &now ));
tt [ gbstrlen(tt) - 1 ] = '\0';
@ -1838,7 +1838,7 @@ bool HttpServer::sendSuccessReply ( TcpSocket *s , char format, char *addMsg) {
// use this new function that will compress the reply now if the
// request was a ZET instead of a GET
return sendReply2 ( msg , sb.length() , NULL , 0 , s );
return sendReply2 ( sb.getBufStart(), sb.length() , NULL , 0 , s );
}
bool HttpServer::sendErrorReply ( GigablastRequest *gr ) {
@ -1851,8 +1851,8 @@ bool HttpServer::sendErrorReply ( GigablastRequest *gr ) {
else now = getTimeLocal();
int32_t format = gr->m_hr.getReplyFormat();
char msg[1024];
SafeBuf sb(msg,1024,0,false);
char msg[1524];
SafeBuf sb(msg,1524,0,false);
char *tt = asctime(gmtime ( &now ));
tt [ gbstrlen(tt) - 1 ] = '\0';
@ -1904,7 +1904,7 @@ bool HttpServer::sendErrorReply ( GigablastRequest *gr ) {
// use this new function that will compress the reply now if the
// request was a ZET instead of a GET
return sendReply2 ( msg , sb.length() , NULL , 0 , gr->m_socket );
return sendReply2 ( sb.getBufStart(),sb.length(),NULL,0,gr->m_socket );
}
// . send an error reply, like "HTTP/1.1 404 Not Found"
@ -1931,8 +1931,8 @@ bool HttpServer::sendErrorReply ( TcpSocket *s , int32_t error , char *errmsg ,
// . buffer for the MIME request and brief html err msg
// . NOTE: ctime appends a \n to the time, so we don't need to
char msg[1024];
SafeBuf sb(msg,1024,0,false);
char msg[1524];
SafeBuf sb(msg,1524,0,false);
// if it's a 404, redirect to home page
/*
if ( error == 404 )
@ -2000,8 +2000,8 @@ bool HttpServer::sendErrorReply ( TcpSocket *s , int32_t error , char *errmsg ,
// record it
if ( bytesSent ) *bytesSent = sb.length();//sendBufSize;
// use this new function that will compress the reply now if the
// request was a ZET instead of a GET
return sendReply2 ( msg , sb.length() , NULL , 0 , s );
// request was a ZET instead of a GET mdw
return sendReply2 ( sb.getBufStart() , sb.length() , NULL , 0 , s );
/*
// . this returns false if blocked, true otherwise

@ -1007,7 +1007,10 @@ void Images::thumbStart_r ( bool amThread ) {
// Open/Create temporary file to store image to
int fhndl;
if( (fhndl = open( in, O_RDWR+O_CREAT, S_IWUSR+S_IRUSR )) < 0 ) {
if( (fhndl = open( in, O_RDWR+O_CREAT ,
getFileCreationFlags()
// S_IWUSR+S_IRUSR
)) < 0 ) {
log( "image: Could not open file, %s, for writing: %s - %d.",
in, mstrerror( m_errno ), fhndl );
m_imgDataSize = 0;

@ -512,3 +512,66 @@ bool endsInCurly ( char *s , int32_t slen ) {
if ( e >= m && *e == '}' ) return true;
return false;
}
// Accepts a json string which has a top level object and a "key":val pair
// return false unless jsonStr has the new key:val
bool Json::prependKey(SafeBuf& jsonStr, char* keyVal) {
int32_t ndx = jsonStr.indexOf('{');
// no object? try array? fail for now
if( ndx == -1 || ndx == jsonStr.length() - 1 ) return false;
ndx++; //the insert pos
if(ndx == jsonStr.length()) return false;
// find if the object had any other keys
int32_t jsonStrLen = jsonStr.length();
int32_t i = ndx;
while(i < jsonStrLen && isspace(jsonStr[i])) i++;
if( i == jsonStrLen ) return false;
if (jsonStr[i] != '}') {
jsonStr.insert(",\n", i);
} //else we are the only item, no comma
return jsonStr.insert(keyVal, ndx);
}
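A quick usage sketch for prependKey(); the SafeBuf must already hold a top-level object, and the key/value literal below is illustrative only:

    SafeBuf js;
    js.safePrintf("{\"title\":\"x\"}");
    bool ok = Json::prependKey ( js , "\"gbssUrl\":\"http://a.com/\"" );
    // on success js now holds: {"gbssUrl":"http://a.com/",
    //                           "title":"x"}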
// bool Json::printToString(SafeBuf& out, JsonItem* ji = NULL) {
// if(!ji) ji = getFirstItem();
// for ( ; ji ; ji = ji->m_next ) {
// switch (ji->m_type) {
// case JT_NULL:
// out.safeMemcpy("null", 4);
// break;
// case JT_NUMBER:
// int32_t vl;
// char* v = ji->getValueAsString(&vl);
// out.safeMemcpy(v, vl);
// break;
// case JT_STRING:
// int32_t vl;
// char* v = ji->getValueAsString(&vl);
// out.pushChar('"');
// out.safeMemcpy(v, vl);
// out.pushChar('"');
// break;
// case JT_ARRAY:
// // wha? really? I would've thought this would contain
// // jsonitems and not a string
// safeMemcpy(ji->m_valueArray, ji->m_valueArray);
// break;
// case JT_OBJECT:
// out.pushChar('{');
// out.safeMemcpy(v, vl);
// out.pushChar("\"");
// break;
// }
// }
// out->
// }

Json.h

@ -24,6 +24,7 @@ class JsonItem {
class JsonItem *m_next,*m_prev;
class JsonItem *m_parent;//child;
// the JT_* values above
int m_type;
@ -43,7 +44,6 @@ class JsonItem {
char *m_valueArray;
// for JT_String
int32_t getValueLen() { return m_valueLen; };
@ -78,6 +78,8 @@ class Json {
JsonItem *parseJsonStringIntoJsonItems ( char *json , int32_t niceness );
bool printToString(SafeBuf& out);
JsonItem *getFirstItem ( ) ;
JsonItem *getItem ( char *name );
@ -86,6 +88,9 @@ class Json {
Json() { m_stackPtr = 0; m_prev = NULL; };
static bool prependKey(SafeBuf& jsonString, char* newKey);
SafeBuf m_sb;
JsonItem *m_stack[MAXJSONPARENTS];
int32_t m_stackPtr;

@ -145,7 +145,7 @@ bool Language::convertLatin1DictToUTF8( char *infile ){
// then open a new one for appending
int fdw = open ( ff ,
O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fdw < 0 ){
return log("lang: Could not open for %s "
"writing: %s.",ff, strerror(errno));
@ -2763,7 +2763,7 @@ bool Language::makeWordFiles ( int32_t numWordsToDump , int32_t numWordsPerPhras
// then open a new one for appending
fds[i] = open ( ff ,
O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fds[i] < 0 )
return log("lang: Could not open %s for writing: "
"%s.",ff, strerror(errno));
@ -3146,7 +3146,7 @@ bool Language::makePopFiles ( int32_t numWordsToDump , int32_t numWordsPerPhrase
// then open a new one for appending
fds[i] = open ( ff ,
O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fds[i] < 0 )
return log("lang: Could not open %s for writing: "
"%s.",ff, strerror(errno));
@ -3683,7 +3683,7 @@ bool Language::makeQueryFiles ( ) {
// then open a new one for appending
int fdw = open ( ff ,
O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fdw < 0 ){
return log("lang: Could not open for %s "
"writing: %s.",ff, strerror(errno));
@ -3874,7 +3874,7 @@ bool Language::makeWikiFiles( ) {
// then open a new one for appending
int fdw = open ( ff ,
O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fdw < 0 ){
log("lang: Could not open for %s "
"writing: %s.",ff, strerror(errno));
@ -4250,7 +4250,7 @@ bool Language::gotTermFreqs( StateDict *st ){
// then open a new one for appending
fd = open ( ff ,
O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fd < 0 ){
log("lang: Could not open %s for writing: "
"%s.",ff, strerror(errno));
@ -4338,7 +4338,7 @@ bool StateAff::openAffinityFile( ){
unlink ( ff );
// then open a new one for appending
m_fdw = open ( ff , O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( m_fdw < 0 ){
log("lang: Could not open for %s "
"writing: %s.",ff, strerror(errno));
@ -4537,7 +4537,7 @@ bool Language::cleanDictFile ( ) {
// then open a new one for appending
int fdw = open ( ff ,
O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fdw < 0 ){
return log("lang: Could not open for %s "
"writing: %s.",ff, strerror(errno));
@ -4590,7 +4590,7 @@ bool Language::makePhonet( char *infile){
// then open a new one for appending
fdw = open ( outfile ,
O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fdw < 0 )
return log("lang: Could not open %s for writing: "
"%s.", outfile, strerror(errno));
@ -4711,7 +4711,7 @@ bool Language::genTopPopFile ( char *infile ){
// then open a new one for appending
fdw = open ( outfile ,
O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fdw < 0 )
return log("lang: Could not open %s for writing: "
"%s.", outfile, strerror(errno));
@ -4761,7 +4761,8 @@ bool Language::genDistributedPopFile ( char *infile, uint32_t myHash ){
// then open a new one for appending
fdw = open ( outfile ,
O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fdw < 0 )
return log("lang: Could not open %s for writing: "
"%s.", outfile, strerror(errno));
@ -4848,7 +4849,8 @@ int32_t Language::spellcheckDict(){
// then open a new one for appending
fdw = open ( outfile ,
O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fdw < 0 )
return log("lang: Could not open %s for writing: "
"%s.", outfile, strerror(errno));

@ -961,7 +961,7 @@ static bool s_isLangTag(char *str) {
static uint8_t s_getCountryFromSpec(char *str) {
char code[6];
memset(code, 6, 0);
memset(code, 0, 6);
gbmemcpy(code, str, s_wordLen(str));
for(int x = 0; x < 6; x++)
if(code[x] > 'A' && code[x] < 'Z') code[x] -= ('A' - 'a');

@ -603,6 +603,10 @@ bool getLinkInfo ( SafeBuf *reqBuf ,
Host *hosts = g_hostdb.getShard ( shardNum); // Group ( groupId );
if ( hostNum >= numHosts ) { char *xx = NULL; *xx = 0; }
int32_t hostId = hosts [ hostNum ].m_hostId ;
if( !hosts [ hostNum ].m_spiderEnabled) {
hostId = g_hostdb.getHostIdWithSpideringEnabled ( shardNum );
}
// . serialize the string buffers
// . use Msg25Request::m_buf[MAX_NEEDED]
@ -665,7 +669,16 @@ static void sendReplyWrapper ( void *state ) {
// sanity
if ( req->m_udpSlot != slot2 ) { char *xx=NULL;*xx=0;}
// if in table, nuke it
g_lineTable.removeKey ( &req->m_siteHash64 );
// but only if it was in SITE mode, not PAGE. we've lost our
// table entry like this before.
// TODO: if this still doesn't work then ensure the stored 'req'
// is the same!
if ( req->m_mode == MODE_SITELINKINFO ) {
g_lineTable.removeKey ( &req->m_siteHash64 );
if ( g_conf.m_logDebugLinkInfo )
log("linkdb: removing sitehash64=%"INT64"",
req->m_siteHash64);
}
nextLink:
@ -746,6 +759,7 @@ void handleRequest25 ( UdpSlot *slot , int32_t netnice ) {
if ( head->m_next )
req->m_next = head->m_next;
head->m_next = req;
req->m_waitingInLine = 1;
// note it for debugging
log("build: msg25 request waiting in line for %s "
"udpslot=0x%"PTRFMT"",
@ -755,6 +769,8 @@ void handleRequest25 ( UdpSlot *slot , int32_t netnice ) {
return;
}
req->m_waitingInLine = 0;
// make a new Msg25
Msg25 *m25;
try { m25 = new ( Msg25 ); }

@ -76,6 +76,15 @@ public:
int32_t m_ourHostHash32 ;
int32_t m_ourDomHash32 ;
uint8_t m_waitingInLine:1;
uint8_t m_reserved1:1;
uint8_t m_reserved2:1;
uint8_t m_reserved3:1;
uint8_t m_reserved4:1;
uint8_t m_reserved5:1;
uint8_t m_reserved6:1;
uint8_t m_reserved7:1;
// new stuff
int32_t m_siteHash32;
int64_t m_siteHash64;

Log.cpp

@ -132,8 +132,9 @@ bool Log::init ( char *filename ) {
// open it for appending.
// create with -rw-rw-r-- permissions if it's not there.
m_fd = open ( m_filename ,
O_APPEND | O_CREAT | O_RDWR ,
S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH );
O_APPEND | O_CREAT | O_RDWR ,
getFileCreationFlags() );
// S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH );
if ( m_fd >= 0 ) return true;
// bitch to stderr and return false on error
fprintf(stderr,"could not open log file %s for appending\n",
@ -422,8 +423,9 @@ bool Log::makeNewLogFile ( ) {
// open it for appending.
// create with -rw-rw-r-- permissions if it's not there.
m_fd = open ( m_filename ,
O_APPEND | O_CREAT | O_RDWR ,
S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH );
O_APPEND | O_CREAT | O_RDWR ,
getFileCreationFlags() );
// S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH );
if ( m_fd >= 0 ) return true;
// bitch to stderr and return false on error
fprintf(stderr,"could not open new log file %s for appending\n",

@ -1014,7 +1014,7 @@ void printStackTrace ( int signum , siginfo_t *info , void *ptr ) {
// right now only works for 32 bit
//if ( arch != 32 ) return;
logf(LOG_DEBUG,"gb: seg fault. printing stack trace. use "
logf(LOG_DEBUG,"gb: Printing stack trace. use "
"'addr2line -e gb' to decode the hex below.");
if ( g_inMemFunction ) {
@ -1035,6 +1035,16 @@ void printStackTrace ( int signum , siginfo_t *info , void *ptr ) {
//,ba
//,g_profiler.getFnName(ba,0));
);
#ifdef INLINEDECODE
char cmd[256];
sprintf(cmd,"addr2line -e gb 0x%"XINT64" > ./tmpout"
,(uint64_t)s_bt[i]);
gbsystem ( cmd );
char obuf[1024];
SafeBuf fb (obuf,1024);
fb.load("./tmpout");
log("stack: %s",fb.getBufStart());
#endif
}
}
@ -1171,7 +1181,8 @@ void sigvtalrmHandler ( int x , siginfo_t *info , void *y ) {
//g_inSigHandler = true;
// NOT SAFE for pthreads cuz we're in sig handler
#ifndef PTHREADS
log("loop: missed quickpoll");
log("loop: missed quickpoll. Dumping stack.");
printStackTrace( x , info , y );
#endif
//g_inSigHandler = false;
// seems to core a lot in gbcompress() we need to
@ -1183,15 +1194,19 @@ void sigvtalrmHandler ( int x , siginfo_t *info , void *y ) {
}
// if it has been a while since heartbeat (> 10000ms) dump core so
// we can see where the process was... that is a missed quick poll?
// we can see where the process was... we are in a long niceness 0
// function or a niceness 1 function without a quickpoll, so that
// heartbeatWrapper() function never gets called.
if ( g_process.m_lastHeartbeatApprox == 0 ) return;
if ( g_conf.m_maxHeartbeatDelay <= 0 ) return;
if ( g_nowApprox - g_process.m_lastHeartbeatApprox >
g_conf.m_maxHeartbeatDelay ) {
#ifndef PTHREADS
logf(LOG_DEBUG,"gb: CPU seems blocked. Forcing core.");
logf(LOG_DEBUG,"gb: CPU seems blocked. Dumping stack.");
printStackTrace( x , info , y );
#endif
//char *xx=NULL; *xx=0;
}
//logf(LOG_DEBUG, "xxx now: %"INT64"! approx: %"INT64"", g_now, g_nowApprox);
@ -2708,6 +2723,32 @@ void Loop::enableTimer() {
}
FILE* gbpopen(char* cmd) {
// Block everything from interrupting this system call because
// if there is an alarm or a child thread crashes (pdftohtml)
// then this will hang forever.
// We should actually write our own popen so that we do
// fork, close all fds in the child, then exec.
// These child processes can hold open the http server and
// prevent a new gb from running even after it has died.
g_loop.disableTimer();
sigset_t oldSigs;
sigset_t sigs;
sigfillset ( &sigs );
if ( sigprocmask ( SIG_BLOCK , &sigs, &oldSigs ) < 0 ) {
log("build: had error blocking signals for popen");
}
FILE* fh = popen(cmd, "r");
if ( sigprocmask ( SIG_SETMASK , &oldSigs, NULL ) < 0 ) {
log("build: had error unblocking signals for popen");
}
g_loop.enableTimer();
return fh;
}
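A usage sketch for gbpopen(); the command line is hypothetical (pdftohtml is named above only as an example of a child that can crash), and the pipe is closed with pclose() just as with plain popen():

    char cmd[256];
    snprintf ( cmd , 256 , "pdftohtml -i -stdout %s" , "/tmp/doc.pdf" );
    FILE *fh = gbpopen ( cmd );
    if ( fh ) {
        char line[1024];
        while ( fgets ( line , 1024 , fh ) ) {
            // consume the converter's output line by line
        }
        pclose ( fh );
    }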
//calling with a 0 niceness will turn off the timer interrupt

Loop.h

@ -18,7 +18,9 @@
#define QUERYPRIORITYWEIGHT 16
#define QUICKPOLL_INTERVAL 10
int gbsystem(char *cmd ) ;
int gbsystem(char *cmd);
FILE* gbpopen(char* cmd);
#define sleep(a) { char *xx=NULL;*xx=0; }
//#define sleep(a) logf(LOG_INFO,"sleep: sleep");

@ -67,7 +67,7 @@ OBJS = UdpSlot.o Rebalance.o \
Dates.o Sections.o SiteGetter.o Syncdb.o qa.o \
Placedb.o Address.o Test.o GeoIP.o GeoIPCity.o Synonyms.o \
Cachedb.o Monitordb.o dlstubs.o PageCrawlBot.o Json.o PageBasic.o \
Version.o
Punycode.o Version.o
CHECKFORMATSTRING = -D_CHECK_FORMAT_STRING_
@ -407,7 +407,7 @@ Linkdb.o:
# final gigabit generation in here:
Msg40.o:
$(CC) $(DEFS) $(CPPFLAGS) -O3 -c $*.cpp
$(CC) $(DEFS) $(CPPFLAGS) -O2 -c $*.cpp
seo.o:
$(CC) $(DEFS) $(CPPFLAGS) -O3 -c $*.cpp
@ -788,14 +788,5 @@ install-pkgs-local:
warcinjector:
-rm -r /home/zak/.pex/build/inject-*
-rm -r /home/zak/.pex/install/inject-*
cd script && pex -v . requests pyopenssl ndg-httpsclient pyasn1 multiprocessing flask -e inject -o warc-inject --inherit-path --no-wheel
cd script && pex -v . gevent gevent-socketio requests pyopenssl ndg-httpsclient pyasn1 multiprocessing -e inject -o warc-inject --inherit-path --no-wheel
#pex -v inject requests pyopenssl ndg-httpsclient pyasn1 multiprocessing flask -e inject:main -o script/warc-inject -f '/home/zak/repos/open-source-search-engine/script' --inherit-path --no-wheel
#pex -v inject requests pyopenssl ndg-httpsclient pyasn1 multiprocessing flask -e inject:main -o script/warc-inject -f '/home/zak/repos/open-source-search-engine/script' --inherit-path --no-wheel
# pex -r requests -r pyopenssl -r ndg-httpsclient -r pyasn1 -r multiprocessing -e inject.inject:main -o script/warc-inject -f '/home/zak/repos/open-source-search-engine/script/' --inherit-path --no-wheel

@ -1736,6 +1736,75 @@ bool Matches::negTermsFound ( ) {
}
*/
bool Matches::docHasQueryTerms(int32_t totalInlinks) {
// Loop through all matches keeping a count of query term matches
// from link text.
// If a match is not from a link text max it out.
// Tally up the matched terms vs number of matches
// if only one or two link text matches out of > 10 then
// return false indicating that the doc does not
// have the term
if(m_numMatches == 0) {
// if there is no query and no matches then short circuit
return true;
}
int32_t qterms = 1024;
int32_t tmpBuf[qterms];
int32_t *numMatches = tmpBuf;
if(qterms < m_q->m_numTerms) {
qterms = m_q->m_numTerms;
numMatches = (int32_t *)mmalloc(qterms * sizeof(int32_t),
"matchesAnomaly");
}
memset(numMatches, 0, qterms * sizeof(int32_t));
for ( int32_t i = 0 ; i < m_numMatches ; i++ ) {
// get the match
Match *m = &m_matches[i];
if(m->m_flags & MF_LINK) {
numMatches[m->m_qwordNum]++;
continue;
}
numMatches[m->m_qwordNum] = m_numMatches;
// log("match flag %x wordnum %"INT32 " totalinlinks:%"INT32,
// m->m_flags, m->m_wordNum, totalInlinks);
}
// Assume the best, since we're really only after anomalous link text
// at this point.
bool hasTerms = true;
int32_t nqt = m_q->m_numTerms;
for ( int32_t i = 0 ; i < nqt ; i++ ) {
QueryTerm *qt = &m_q->m_qterms[i];
// skip if ignored *in certain ways only*
if ( ! isMatchableTerm ( qt ) ) {
continue;
}
// get the word it is from
QueryWord *qw = qt->m_qword;
// It is a match if it matched something other than link text
// or it matched at least 1 link text and there aren't many link texts
// or it matched more than 2 link texts and there are many link texts
hasTerms &= ((numMatches[qw->m_wordNum] >= m_numMatches) ||
(numMatches[qw->m_wordNum] > 0 && totalInlinks < 10) ||
(numMatches[qw->m_wordNum] > 2 && totalInlinks > 10));
}
if (numMatches != tmpBuf) {
mfree(numMatches, qterms * sizeof(int32_t), "matchesAnomaly");
}
return hasTerms;
}
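To make those three clauses concrete, a worked case with hypothetical numbers: say m_numMatches = 15 and a query term matched exactly once, in link text, on a page with totalInlinks = 15. Then numMatches[w] is 1 and every clause fails (1 < 15, the inlink count is not under 10, and 1 is not greater than 2), so hasTerms goes false and the lone link-text match is treated as anomalous.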
MatchOffsets::MatchOffsets() {
reset();
}
@ -1804,6 +1873,7 @@ bool MatchOffsets::set(Xml * xml, Words *words, Matches *matches,
return true;
}
int32_t MatchOffsets::getStoredSize() {
return m_numMatches * 5
+ 4 //numMatches

@ -148,6 +148,7 @@ class Matches {
//int32_t getTermsFound ( bool *hadPhrases , bool *hadWords );
uint32_t getTermsFound2(bool *hadPhrases, bool *hadWords);
//bool negTermsFound ( );
bool docHasQueryTerms(int32_t totalInlinks);
// used internally and by PageGet.cpp
bool isMatchableTerm ( class QueryTerm *qt );//, int32_t i );

@ -1718,7 +1718,7 @@ void Mem::gbfree ( void *ptr , int size , const char *note ) {
int32_t slot = g_mem.getMemSlot ( ptr );
if ( slot < 0 ) {
log(LOG_LOGIC,"mem: could not find slot (note=%s)",note);
log(LOG_LOGIC,"mem: FIXME!!!");
//log(LOG_LOGIC,"mem: FIXME!!!");
// return for now so procog does not core all the time!
return;
//char *xx = NULL; *xx = 0;

@ -282,6 +282,12 @@ bool Msg13::forwardRequest ( ) {
int32_t nh = g_hostdb.m_numHosts;
int32_t hostId = hash32h(((uint32_t)r->m_firstIp >> 8), 0) % nh;
if((uint32_t)r->m_firstIp >> 8 == 0) {
// If the first IP is not set for the request then we don't
// want to hammer the first host with spidering enabled.
hostId = hash32n ( r->ptr_url ) % nh;
}
// avoid host #0 for diffbot hack which is dropping some requests
// because of the streaming bug methinks
if ( hostId == 0 && nh >= 2 && g_conf.m_diffbotMsg13Hack )
@ -295,12 +301,22 @@ bool Msg13::forwardRequest ( ) {
// get that host
//h = g_hostdb.getProxy ( hostId );;
h = g_hostdb.getHost ( hostId );
// stop if he is alive
if ( ! g_hostdb.isDead ( h ) ) break;
// Get the other one in shard instead of getting the first
// one we find sequentially because that makes the load
// imbalanced to the lowest host with spidering enabled.
if(!h->m_spiderEnabled) {
h = g_hostdb.getHost(g_hostdb.getHostIdWithSpideringEnabled(
h->m_hostId));
}
// stop if he is alive and able to spider
if ( h->m_spiderEnabled && ! g_hostdb.isDead ( h ) ) break;
// get the next otherwise
if ( ++hostId >= nh ) hostId = 0;
}
hostId = 0; // HACK!!
// forward it to self if we are the spider proxy!!!
@ -2364,7 +2380,7 @@ bool getTestSpideredDate ( Url *u , int32_t *origSpideredDate , char *testDir )
bool addTestSpideredDate ( Url *u , int32_t spideredTime , char *testDir ) {
// ensure dir exists
::mkdir(testDir,S_IRWXU);
::mkdir(testDir,getDirCreationFlags());
// set this
int64_t uh64 = hash64(u->getUrl(),u->getUrlLen());

@ -57,8 +57,9 @@ void handleRequest ( UdpSlot *slot , int32_t netnice ) {
return;
}
int32_t fd = open ( filename , O_RDONLY,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH );
int32_t fd = open ( filename , O_RDONLY ,
getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH );
if ( fd < 0 ) {
log(LOG_DEBUG, "logviewer: Failed to open %s for reading: ",
filename);

@ -177,6 +177,12 @@ bool Msg20::getSummary ( Msg20Request *req ) {
int32_t timeout = 9999999; // 10 million seconds, basically inf.
if ( req->m_niceness == 0 ) timeout = 20;
// for diffbot make timeout super long so we aren't tripped up
// by dead hosts that aren't really dead.
// CollectionRec *cr = g_collectiondb.getRec ( req->m_collnum );
// if ( cr && cr->m_isCustomCrawl && req->m_niceness == 0 )
// timeout = 300;
// get our group
int32_t allNumHosts = hostdb->getNumHostsPerShard();
Host *allHosts = hostdb->getShard ( shardNum );//getGroup(groupId );
@ -189,13 +195,29 @@ bool Msg20::getSummary ( Msg20Request *req ) {
Host *hh = &allHosts[i];
// skip if dead
if ( g_hostdb.isDead(hh) ) continue;
// Respect no-spider, no-query directives from hosts.conf
if ( !req->m_getLinkInfo && ! hh->m_queryEnabled ) continue;
if ( req->m_getLinkInfo && ! hh->m_spiderEnabled ) continue;
// add it if alive
cand[nc++] = hh;
}
// if none alive, make them all candidates then
bool allDead = (nc == 0);
for ( int32_t i = 0 ; allDead && i < allNumHosts ; i++ )
for ( int32_t i = 0 ; allDead && i < allNumHosts ; i++ ) {
// NEVER add a noquery host to the candidate list, even
// if the query host is dead
if ( ! allHosts[i].m_queryEnabled ) continue;
cand[nc++] = &allHosts[i];
}
if ( nc == 0 ) {
log("msg20: error sending mcast: no queryable hosts "
"availble to handle summary generation");
g_errno = EBADENGINEER;
m_gotReply = true;
return true;
}
// route based on docid region, not parity, because we want to hit
// the urldb page cache as much as possible

@ -157,46 +157,13 @@ bool Msg22::getTitleRec ( Msg22Request *r ,
if ( hostNum >= numHosts ) { char *xx = NULL; *xx = 0; }
firstHostId = hosts [ hostNum ].m_hostId ;
*/
Host *firstHost ;
// if niceness 0 can't pick noquery host.
// if niceness 1 can't pick nospider host.
firstHost = g_hostdb.getLeastLoadedInShard ( shardNum, r->m_niceness );
int32_t firstHostId = firstHost->m_hostId;
// get our group
int32_t allNumHosts = g_hostdb.getNumHostsPerShard();
Host *allHosts = g_hostdb.getShard ( shardNum );//Group ( groupId );
// put all alive hosts in this array
Host *cand[32];
int64_t nc = 0;
for ( int32_t i = 0 ; i < allNumHosts ; i++ ) {
// get that host
Host *hh = &allHosts[i];
// skip if dead
if ( g_hostdb.isDead(hh) ) continue;
// add it if alive
cand[nc++] = hh;
}
// if none alive, make them all candidates then
bool allDead = (nc == 0);
for ( int32_t i = 0 ; allDead && i < allNumHosts ; i++ )
cand[nc++] = &allHosts[i];
// route based on docid region, not parity, because we want to hit
// the urldb page cache as much as possible
int64_t sectionWidth =((128LL*1024*1024)/nc)+1;//(DOCID_MASK/nc)+1LL;
// we mod by 1MB since tied scores resort to sorting by docid
// so we don't want to overload the host responsible for the lowest
// range of docids. CAUTION: do this for msg22 too!
// in this way we should still ensure a pretty good biased urldb
// cache...
// . TODO: fix the urldb cache preload logic
int32_t hostNum = (docId % (128LL*1024*1024)) / sectionWidth;
if ( hostNum < 0 ) hostNum = 0; // watch out for negative docids
if ( hostNum >= nc ) { char *xx = NULL; *xx = 0; }
int32_t firstHostId = cand [ hostNum ]->m_hostId ;
// while this prevents tfndb seeks, it also causes bottlenecks
// if one host is particularly slow, because load balancing is
// bypassed.
//if ( ! g_conf.m_useBiasedTfndb ) firstHostId = -1;
// flag it
m_outstanding = true;
r->m_inUse = 1;

@ -1115,6 +1115,8 @@ bool Msg3::doneScanning ( ) {
m_lists[i].getListSize() ,
0 ); // timestamp. 0 = now
QUICKPOLL(m_niceness);
// if from our 'page' cache, no need to constrain
if ( ! m_lists[i].constrain ( m_startKey ,
m_constrainKey , // m_endKey

@ -470,6 +470,12 @@ bool Msg3a::gotCacheReply ( ) {
for ( int32_t i = 0; i < m_numHosts ; i++ ) { // m_indexdbSplit; i++ ) {
// get that host
Host *h = g_hostdb.getHost(i);
if(!h->m_queryEnabled) {
m_numReplies++;
continue;
}
// if not a full split, just round robin the group, i am not
// going to sweat over performance on non-fully split indexes
// because they suck really bad anyway compared to full
@ -701,10 +707,12 @@ bool Msg3a::gotAllShardReplies ( ) {
// bad reply?
if ( ! mr || replySize < 29 ) {
m_skippedShards++;
log(LOG_LOGIC,"query: msg3a: Bad reply (size=%i) from "
"host #%"INT32". Dead? Timeout? OOM?"
,(int)replySize
,i);
if(g_hostdb.getHost(i)->m_queryEnabled) {
log(LOG_LOGIC,"query: msg3a: Bad reply (size=%i) from "
"host #%"INT32". Dead? Timeout? OOM?"
,(int)replySize
,i);
}
m_reply [i] = NULL;
m_replyMaxSize[i] = 0;
// it might have been timd out, just ignore it!!

@ -1434,8 +1434,9 @@ bool saveAddsInProgress ( char *prefix ) {
sprintf ( filename , "%s%saddsinprogress.saving",
g_hostdb.m_dir , prefix );
int32_t fd = open ( filename, O_RDWR | O_CREAT | O_TRUNC ,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH );
int32_t fd = open ( filename, O_RDWR | O_CREAT | O_TRUNC ,
getFileCreationFlags() );
// S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH );
if ( fd < 0 ) {
log ("build: Failed to open %s for writing: %s",
filename,strerror(errno));
@ -1460,6 +1461,12 @@ bool saveAddsInProgress ( char *prefix ) {
// 4 bytes is how much of the total buffer is used, including
// those 4 bytes.
if ( used == 4 ) continue;
// test it
if ( used <= 4 || used > 300000000 ) { // > 300MB????
log("msg4: saving addsinprogress. bad bucket "
"used size of %"INT32,used);
continue;
}
// the buf itself
write ( fd , s_hostBufs[i] , used );
}
@ -1473,6 +1480,20 @@ bool saveAddsInProgress ( char *prefix ) {
if ( ! slot->m_callback ) continue;
// skip if got reply
if ( slot->m_readBuf ) continue;
// if not sending something, skip
if ( ! slot->m_sendBuf ) continue;
// test it
int32_t used = *(int32_t *)slot->m_sendBuf;
if ( used <= 4 || used > 300000000 ) { // > 300MB????
log("msg4: saving addsinprogress. bad slot "
"used size of %"INT32,used);
continue;
}
if ( used != slot->m_sendBufSize ) {
log("msg4: saving addsinprogress. bad used size of "
"%"INT32" != %"INT32,used,slot->m_sendBufSize);
continue;
}
// write hostid sent to
write ( fd , &slot->m_hostId , 4 );
// write that
@ -1510,6 +1531,9 @@ bool saveAddsInProgress ( char *prefix ) {
g_hostdb.m_dir , prefix );
::rename ( filename , newFilename );
log(LOG_INFO,"build: Renamed %s to %s",filename,newFilename);
return true;
}
@ -1577,12 +1601,12 @@ bool loadAddsInProgress ( char *prefix ) {
p += 4;
if ( numHostBufs != s_numHostBufs ) {
g_errno = EBADENGINEER;
return log("build: addsinprogress.dat has wrong number of "
"host bufs.");
log("build: addsinprogress.dat has wrong number of "
"host bufs.");
}
// deserialize each hostbuf
for ( int32_t i = 0 ; i < s_numHostBufs ; i++ ) {
for ( int32_t i = 0 ; i < numHostBufs ; i++ ) {
// break if nothing left to read
if ( p >= pend ) break;
// USED size of the buf
@ -1595,6 +1619,8 @@ bool loadAddsInProgress ( char *prefix ) {
s_hostBufSizes[i] = 0;
continue;
}
if ( used < 4 || used > 300000000 )
return log("msg4: bad used bytes in bucket 1");
// malloc the min buf size
int32_t allocSize = MAXHOSTBUFSIZE;
if ( allocSize < used ) allocSize = used;
@ -1620,6 +1646,12 @@ bool loadAddsInProgress ( char *prefix ) {
log("build: file %s is bad.",filename);
char *xx = NULL; *xx = 0;
}
if ( i >= s_numHostBufs ) {
mfree ( buf , allocSize ,"hostbuf");
log("build: skipping host buf #%"INT32,i);
continue;
}
// set the array
s_hostBufs [i] = buf;
s_hostBufSizes [i] = allocSize;
@ -1635,15 +1667,12 @@ bool loadAddsInProgress ( char *prefix ) {
p += 4;
// get host
Host *h = g_hostdb.getHost(hostId);
// must be there
if ( ! h ) {
close (fd);
return log("build: bad msg4 hostid %"INT32"",hostId);
}
// host many bytes
int32_t numBytes;
read ( fd , (char *)&numBytes , 4 );
p += 4;
if ( numBytes < 4 || numBytes > 300000000 )
return log("msg4: bad used bytes in slot 1");
// allocate buffer
char *buf = (char *)mmalloc ( numBytes , "msg4loadbuf");
if ( ! buf ) {
@ -1657,6 +1686,14 @@ bool loadAddsInProgress ( char *prefix ) {
return log("build: bad msg4 buf read");
}
p += numBytes;
// must be there
if ( ! h ) {
//close (fd);
log("build: bad msg4 hostid %"INT32" nb=%"INT32,
hostId,nb);
mfree ( buf , numBytes,"hostbuf");
continue;
}
// send it!
if ( ! g_udpServer.sendRequest ( buf ,
numBytes ,

@ -83,7 +83,7 @@ static bool gotSummaryWrapper ( void *state );
bool isSubDom(char *s , int32_t len);
Msg40::Msg40() {
m_firstTime = true;
m_calledFacets = false;
m_doneWithLookup = false;
m_socketHadError = 0;
m_buf = NULL;
@ -109,6 +109,8 @@ Msg40::Msg40() {
m_printCount = 0;
//m_numGigabitInfos = 0;
m_numCollsToSearch = 0;
m_numMsg20sIn = 0;
m_numMsg20sOut = 0;
}
#define MAX2 50
@ -1427,8 +1429,12 @@ bool Msg40::launchMsg20s ( bool recalled ) {
// hard limit
if ( m_numRequests-m_numReplies >= maxOut ) break;
// do not launch another until m_printi comes back because
// all summaries are bottlenecked on printing him out now
// all summaries are bottlenecked on printing him out now.
if ( m_si->m_streamResults &&
// must have at least one outstanding summary guy
// otherwise we can return true below and cause
// the stream to truncate results in gotSummary()
//m_numReplies < m_numRequests &&
i >= m_printi + MAX_OUTSTANDING_MSG20S - 1 )
break;
@ -1499,8 +1505,21 @@ bool Msg40::launchMsg20s ( bool recalled ) {
// if to a dead host, skip it
int64_t docId = m_msg3a.m_docIds[i];
uint32_t shardNum = g_hostdb.getShardNumFromDocId ( docId );
if ( g_hostdb.isShardDead ( shardNum ) ) {
log("msg40: skipping summary lookup #%"INT32" of "
// get the collection rec
CollectionRec *cr = g_collectiondb.getRec(m_firstCollnum);
// if shard is dead then do not send to it if not crawlbot
if ( g_hostdb.isShardDead ( shardNum ) &&
cr &&
// diffbot urls.csv downloads often encounter dead
// hosts that are not really dead, so wait for it
! cr->m_isCustomCrawl &&
// this is causing us to truncate streamed results
// too early when we have false positives that a
// host is dead because the server is locking up
// periodically
! m_si->m_streamResults ) {
log("msg40: skipping summary "
"lookup #%"INT32" of "
"docid %"INT64" for dead shard #%"INT32""
, i
, docId
@ -1547,8 +1566,6 @@ bool Msg40::launchMsg20s ( bool recalled ) {
// keep for-loops shorter with this
//if ( i > m_maxiLaunched ) m_maxiLaunched = i;
// get the collection rec
CollectionRec *cr =g_collectiondb.getRec(m_firstCollnum);
//getRec(m_si->m_coll2,m_si->m_collLen2);
if ( ! cr ) {
log("msg40: missing coll");
@ -1737,7 +1754,7 @@ Msg20 *Msg40::getAvailMsg20 ( ) {
if ( m_msg20[i]->m_launched ) continue;
return m_msg20[i];
}
// how can this happen???
// how can this happen??? THIS HAPPENED
char *xx=NULL;*xx=0;
return NULL;
}
@ -1762,7 +1779,7 @@ bool gotSummaryWrapper ( void *state ) {
THIS->m_numReplies,
THIS->m_msg3a.m_numDocIds);
// it returns false if we're still awaiting replies
if ( ! THIS->gotSummary ( ) ) return false;
if ( ! THIS->m_calledFacets && ! THIS->gotSummary ( ) ) return false;
// lookup facets
if ( THIS->m_si &&
! THIS->m_si->m_streamResults &&
@ -2215,12 +2232,11 @@ bool Msg40::gotSummary ( ) {
complete:
// . ok, now i wait for everybody.
// . ok, now i wait for all msg20s (getsummary) to come back in.
// . TODO: evaluate if this hurts us
if ( m_numReplies < m_numRequests )
return false;
// if streaming results, we are done
if ( m_si && m_si->m_streamResults ) {
// unless waiting for last transmit to complete
@ -2444,6 +2460,9 @@ bool Msg40::gotSummary ( ) {
for ( int32_t i = 0 ; dedupPercent && i < m_numReplies ; i++ ) {
// skip if already invisible
if ( m_msg3a.m_clusterLevels[i] != CR_OK ) continue;
// Skip if invalid
if ( m_msg20[i]->m_errno ) continue;
// start with the first docid we have not yet checked!
//int32_t m = oldNumContiguous;
// get it
@ -2462,6 +2481,8 @@ bool Msg40::gotSummary ( ) {
// skip if already invisible
if ( *level != CR_OK ) continue;
// get it
if ( m_msg20[m]->m_errno ) continue;
Msg20Reply *mrm = m_msg20[m]->m_r;
// do not dedup CT_STATUS results, those are
// spider reply "documents" that indicate the last
@ -6280,8 +6301,8 @@ bool Msg40::lookupFacets ( ) {
if ( m_doneWithLookup ) return true;
if ( m_firstTime ) {
m_firstTime = false;
if ( !m_calledFacets ) {
m_calledFacets = true;
m_numMsg20sOut = 0;
m_numMsg20sIn = 0;
m_j = 0;

@ -223,7 +223,7 @@ class Msg40 {
bool m_doneWithLookup;
HashTableX m_facetTextTable;
SafeBuf m_facetTextBuf;
bool m_firstTime;
bool m_calledFacets;
int32_t m_omitCount;
bool printFacetTables ( class SafeBuf *sb ) ;

@ -88,7 +88,7 @@ bool MsgC::getIp(char *hostname , int32_t hostnameLen ,
if ( g_dns.isInCache ( key , ip ) ) {
if ( *ip == 3 ) { char *xx=NULL;*xx=0; }
// debug msg
//log("dns::getIp: %s (key=%"UINT64") has ip=%s in cache!!!",
//log(LOG_DEBUG, "dns::getIp: %s (key=%"UINT64") has ip=%s in cache!!!",
// tmp,key.n0,iptoa(*ip));
return true;
}

@ -607,6 +607,11 @@ loop:
// debug msg
//log("Multicast:: no hosts left to send to");
g_errno = ENOHOSTS; return false; }
// log("build: msg %x sent to host %"INT32 " first hostId is %"INT32
// " oustanding msgs %"INT32,
// m_msgType, i, firstHostId, m_hostPtrs[i]->m_numOutstandingRequests);
// . send to this guy, if we haven't yet
// . returns false and sets g_errno on error
// . if it returns true, we sent ok, so we should return true

@ -3570,6 +3570,10 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
" <a href=/v3/crawl/download/%s_urls.csv>"
"new csv format</a>"
" <a href=/search?q=gbsortby"
"int%%3AgbssSpiderTime&n=50&c=%s>"
"last 50 download attempts</a>"
"</td>"
"</tr>"
@ -3645,6 +3649,10 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
// urls.csv new format v3
, cr->m_coll
// last 50 downloaded urls
, cr->m_coll
// latest objects in html
, cr->m_coll
, rand64

@ -483,7 +483,7 @@ bool processLoop ( void *state ) {
"><td>"
//"<font face=times,sans-serif color=black size=-1>"
"<span style=\"%s\">"
"This is Gigablast's cached page of </span>"
"This is <a href=/>Gigablast<a>'s cached page of </span>"
"<a href=\"%s\" style=\"%s\">%s</a>"
"" , styleTitle, f->getUrl(), styleLink,
f->getUrl() );

@ -200,7 +200,7 @@ skipReplaceHost:
//"<td><b>resends sent</td>"
//"<td><b>errors recvd</td>"
"<td><b>try agains recvd</b></td>"
"<td><b>try agains sent</b></td>"
"<td><a href=\"/admin/hosts?c=%s&sort=3\">"
"<b>dgrams resent</b></a></td>"
@ -630,6 +630,15 @@ skipReplaceHost:
if ( !(flags & PFLAG_MERGEMODE0) )
fb.safePrintf ( "y");
if ( format == FORMAT_HTML && !h->m_spiderEnabled) {
fb.safePrintf("<span title=\"Spider Disabled\" style=\"text-decoration:line-through;\">S</span>");
}
if ( format == FORMAT_HTML && !h->m_queryEnabled) {
fb.safePrintf("<span title=\"Query Disabled\" style=\"text-decoration:line-through;\">Q</span>");
}
// clear it if it is us, this is invalid
if ( ! h->m_gotPingReply ) {
fb.reset();
@ -758,6 +767,13 @@ skipReplaceHost:
sb.safePrintf("\t\t<note>%s</note>\n",
h->m_note );
sb.safePrintf("\t\t<spider>%"INT32"</spider>\n",
(int32_t)h->m_spiderEnabled );
sb.safePrintf("\t\t<query>%"INT32"</query>\n",
(int32_t)h->m_queryEnabled );
sb.safePrintf("\t</host>\n");
continue;
@ -859,6 +875,14 @@ skipReplaceHost:
sb.safePrintf("\t\t\"note\":\"%s\"\n",
h->m_note );
sb.safePrintf("\t\t\"spider\":\"%"INT32"\"\n",
(int32_t)h->m_spiderEnabled );
sb.safePrintf("\t\t\"query\":\"%"INT32"\"\n",
(int32_t)h->m_queryEnabled );
sb.safePrintf("\t},\n");
continue;
@ -1313,12 +1337,14 @@ skipReplaceHost:
*/
"<tr class=poo>"
"<td>try agains recvd</td>"
"<td>try agains sent</td>"
"<td>How many ETRYAGAIN errors "
"were received in response to a "
"has this host sent out? they are sent out some times "
"in response to a "
"request to add data. Usually because the host's memory "
"is full and it is dumping its data to disk. This number "
"can be high if the host if failing to dump the data "
"can be relatively high if the host if failing to dump "
"the data "
"to disk because of some malfunction, and it can therefore "
"bottleneck the entire cluster."
"</td>"

@ -131,6 +131,53 @@ Host *getHostToHandleInjection ( char *url ) {
Host *group = g_hostdb.getShard ( shardNum );
int32_t hostNum = docId % g_hostdb.m_numHostsPerShard;
Host *host = &group[hostNum];
bool isWarcInjection = false;
int32_t ulen = gbstrlen(url);
if ( ulen > 10 && strcmp(url+ulen-8,".warc.gz") == 0 )
isWarcInjection = true;
if ( ulen > 10 && strcmp(url+ulen-5,".warc") == 0 )
isWarcInjection = true;
if ( ! isWarcInjection ) return host;
// warc files end up calling XmlDoc::indexWarcOrArc() which spawns
// a msg7 injection request for each doc in the warc/arc file
// so let's do load balancing differently for them so one host
// doesn't end up doing a bunch of wget/gunzips on warc files
// thereby bottlenecking the cluster. pick the first host that has
// no msg7 injection request from us still outstanding
for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
Host *h = g_hostdb.getHost(i);
h->m_tmpCount = 0;
}
for ( UdpSlot *slot = g_udpServer.m_head2 ;
slot ;
slot = slot->m_next2 ) {
// skip if not injection request
if ( slot->m_msgType != 0x07 ) continue;
//if ( ! slot->m_weInitiated ) continue;
// if we did not initiate the injection request, i.e. if
// it is to us, skip it
if ( ! slot->m_callback ) continue;
// who is it from?
int32_t hostId = slot->m_hostId;
if ( hostId < 0 ) continue;
Host *h = g_hostdb.getHost ( hostId );
if ( ! h ) continue;
h->m_tmpCount++;
}
int32_t min = 999999;
Host *minh = NULL;
for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
Host *h = g_hostdb.getHost(i);
if ( h->m_tmpCount == 0 ) return h;
if ( h->m_tmpCount >= min ) continue;
min = h->m_tmpCount;
minh = h;
}
if ( minh ) return minh;
// how can this happen?
return host;
}
@ -182,6 +229,9 @@ bool Msg7::sendInjectionRequestToHost ( InjectionRequest *ir ,
return log("inject: url too big.");
}
// hack fix core
if ( ir->size_metadata == 0 ) ir->ptr_metadata = NULL;
int32_t sirSize = 0;
char *sir = serializeMsg2 ( ir ,
sizeof(InjectionRequest),
@ -615,7 +665,7 @@ void sendUdpReply7 ( void *state ) {
uint32_t statColor = 0xccffcc;
if(xd->m_indexCode) {
statColor = 0x4e99e9;
statColor = 0xaaddaa;//0x4e99e9;
}
g_stats.addStat_r ( xd->m_rawUtf8ContentSize,
xd->m_injectStartTime,
@ -652,11 +702,29 @@ void sendUdpReply7 ( void *state ) {
void handleRequest7 ( UdpSlot *slot , int32_t netnice ) {
InjectionRequest *ir = (InjectionRequest *)slot->m_readBuf;
// now just supply the first guy's char ** and size ptr
deserializeMsg2 ( &ir->ptr_url, &ir->size_url );
if ( ! deserializeMsg2 ( &ir->ptr_url, &ir->size_url ) ) {
log("inject: error deserializing inject request from "
"host ip %s port %i",iptoa(slot->m_ip),(int)slot->m_port);
g_errno = EBADREQUEST;
g_udpServer.sendErrorReply(slot,g_errno);
//g_corruptCount++;
return;
}
// the url can be like xyz.com. so need to do another corruption
// test for ia
if ( ! ir->ptr_url ) { // || strncmp(ir->ptr_url,"http",4) != 0 ) {
//log("inject: trying to inject NULL or non http url.");
log("inject: trying to inject NULL url.");
g_errno = EBADURL;
//g_corruptCount++;
g_udpServer.sendErrorReply(slot,g_errno);
return;
}
CollectionRec *cr = g_collectiondb.getRec ( ir->m_collnum );
if ( ! cr ) {
@ -692,6 +760,10 @@ void handleRequest7 ( UdpSlot *slot , int32_t netnice ) {
s_injectHead = xd;
s_injectTail = xd;
}
if(ir->ptr_content && ir->ptr_content[ir->size_content - 1]) {
// XmlDoc expects this buffer to be null terminated.
char *xx=NULL;*xx=0;
}
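// (illustrative note) senders are expected to ship that trailing
// null byte, mirroring how sendPageAddUrl() sets
// size_url = gbstrlen(url)+1 so the null travels with the request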
if ( ! xd->injectDoc ( ir->ptr_url , // m_injectUrlBuf.getBufStart() ,
cr ,
@ -722,7 +794,8 @@ void handleRequest7 ( UdpSlot *slot , int32_t netnice ) {
ir->m_injectDocIp ,
ir->ptr_contentDelim,
ir->ptr_metadata,
ir->size_metadata
ir->size_metadata,
ir->size_content - 1 // there should be a null in that last byte
) )
// we blocked...
return;

@ -240,7 +240,7 @@ bool showLine ( SafeBuf *sb , char *s , int32_t len ) {
return sb->brify ( s , len ,
0 , // niceness
80 , // cols
8000 , // cols
"<br>",
false ); // isHtml?
}

@ -600,6 +600,14 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
// ! cr->m_isCustomCrawl )
// si->m_docsWanted = maxpp;
// BUT if it is a custom diffbot crawl with no &stream=1 option,
// then to prevent a results page of 1.6GB, limit it here
if ( si->m_docsWanted > 1000 && ! si->m_streamResults ) {
si->m_docsWanted = 1000;
log("query: limiting query %s without &stream=1 option to "
"%"INT32" results.",st->m_si.m_displayQuery,1000);
}
st->m_numDocIds = si->m_docsWanted;
// watch out for cowboys
@ -5008,26 +5016,31 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
// print the URL
//
////////////
StackBuf(tmpBuf);
char* displayUrl = Url::getDisplayUrl(url, &tmpBuf);
uint32_t displayUrlLen = tmpBuf.length();
// hack off the http:// if any for displaying it on screen
if ( urlLen > 8 && strncmp ( url , "http://" , 7 )==0 ) {
url += 7; urlLen -= 7; }
if ( displayUrlLen > 8 && strncmp ( displayUrl , "http://" , 7 )==0 ) {
displayUrl += 7; displayUrlLen -= 7; }
// . remove trailing /
// . only remove from root urls in case user cuts and
// pastes it for link: search
if ( url [ urlLen - 1 ] == '/' ) {
if ( displayUrl [ displayUrlLen - 1 ] == '/' ) {
// see if any other slash before us
int32_t j;
for ( j = urlLen - 2 ; j >= 0 ; j-- )
if ( url[j] == '/' ) break;
for ( j = displayUrlLen - 2 ; j >= 0 ; j-- )
if ( displayUrl[j] == '/' ) break;
// if there wasn't, we must have been a root url
// so hack off the last slash
if ( j < 0 ) urlLen--;
if ( j < 0 ) displayUrlLen--;
}
if ( si->m_format == FORMAT_HTML ) {
sb->safePrintf ("<font color=gray>" );
//sb->htmlEncode ( url , gbstrlen(url) , false );
// 20 for the date after it
sb->safeTruncateEllipsis ( url , 50 ); // cols - 30 );
sb->safeTruncateEllipsis ( displayUrl , 50 ); // cols - 30 );
// turn off the color
sb->safePrintf ( "</font>\n" );
}
@ -5058,12 +5071,12 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
if ( si->m_format == FORMAT_XML ) {
sb->safePrintf("\t\t<url><![CDATA[");
sb->safeMemcpy ( url , urlLen );
sb->safeMemcpy ( displayUrl , displayUrlLen );
sb->safePrintf("]]></url>\n");
}
if ( si->m_format == FORMAT_JSON ) {
sb->safePrintf("\t\t\"url\":\"");
sb->jsonEncode ( url , urlLen );
sb->jsonEncode ( displayUrl , displayUrlLen );
sb->safePrintf("\",\n");
}
@ -5717,10 +5730,12 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
*/
if ( mr->size_metadataBuf && si->m_format == FORMAT_JSON) {
sb->safePrintf("\t\t\"metadata\":");
sb->safeMemcpy(mr->ptr_metadataBuf, mr->size_metadataBuf);
sb->pushChar(',');
sb->safePrintf("\t\t\"metadata\":[");
//sb->safeMemcpy(mr->ptr_metadataBuf, mr->size_metadataBuf);
sb->safeStrcpy(mr->ptr_metadataBuf);
// without this \n we seem to lose our ] i guess it gets
// backed up over
sb->safePrintf("],\n");
}

@ -2523,7 +2523,17 @@ bool sendPageAddUrl ( TcpSocket *sock , HttpRequest *hr ) {
ir->ptr_url = hr->getString("u",NULL);
if ( ! ir->ptr_url ) ir->ptr_url = hr->getString("url",NULL);
if ( ! ir->ptr_url ) ir->ptr_url = hr->getString("urls",NULL);
if ( ! ir->ptr_url ) {
g_errno = EBADURL;
doneInjectingWrapper3 ( st1 );
return true;
}
// include \0 in size
ir->size_url = gbstrlen(ir->ptr_url)+1;
// get back a short reply so we can show the status code easily
ir->m_shortReply = 1;

@ -7,6 +7,7 @@
#include "Dns.h"
#include "SafeBuf.h"
#include "Msg13.h"
#include "Linkdb.h" // Msg25Request
static void printTcpTable (SafeBuf *p,char *title,TcpServer *server);
static void printUdpTable (SafeBuf *p,char *title,UdpServer *server,
@ -554,6 +555,62 @@ void printUdpTable ( SafeBuf *p, char *title, UdpServer *server ,
if ( msgType == 0x50 ) desc = "get root quality";
if ( msgType == 0x25 ) desc = "get link info";
if ( msgType == 0xfd ) desc = "proxy forward";
char *req = NULL;
int32_t reqSize = 0;
if ( s->m_callback ) {
req = s->m_sendBuf;
reqSize = s->m_sendBufSize;
}
// are we receiving the request?
else {
req = s->m_readBuf;
reqSize = s->m_readBufSize;
// if not completely read in yet...
if ( s->hasDgramsToRead ())
req = NULL;
}
SafeBuf tmp;
char *altText = "";
// MSG25
if ( req && msgType == 0x25 ) {
Msg25Request *mr = (Msg25Request *)req;
// it doesn't hurt if we call Msg25Request::deserialize
// again if it has already been called
mr->deserialize();
if ( mr->m_mode == 2 ) { // MODE_SITELINKINFO ) {
tmp.safePrintf(" title=\""
"getting site link info for "
"%s "
"in collnum %i.\n"
"sitehash64=%"UINT64" "
"waitinginline=%i"
"\""
,mr->ptr_site
,(int)mr->m_collnum
,mr->m_siteHash64
,(int)mr->m_waitingInLine
);
desc = "getting site link info";
}
else {
tmp.safePrintf(" title=\""
"getting page link info for "
"%s "
"in collnum %i."
"\""
,mr->ptr_url
,(int)mr->m_collnum
);
desc = "getting page link info";
}
}
if ( tmp.getLength() )
altText = tmp.getBufStart();
p->safePrintf ( "<tr bgcolor=#%s>"
"<td>%s</td>" // age
@ -609,12 +666,14 @@ void printUdpTable ( SafeBuf *p, char *title, UdpServer *server ,
if ( ! s->m_callback ) toFrom = "from";
//"<td><a href=http://%s:%hu/cgi/15.cgi>%"INT32"</a></td>"
p->safePrintf ( "<td>0x%hhx</td>" // msgtype
"<td><nobr>%s</nobr></td>" // desc
"<td%s><nobr>"
"%s</nobr></td>" // desc
"<td><nobr>%s <a href=http://%s:%hu/"
"admin/sockets?"
"c=%s>%s</a></nobr></td>"
"<td>%s%"INT32"%s</td>" , // niceness
s->m_msgType ,
altText,
desc,
//iptoa(s->m_ip) ,
//s->m_port ,

@ -49,9 +49,18 @@ class StateStatsdb {
static time_t genDate( char *date, int32_t dateLen ) ;
static void sendReply ( void *st ) ;
static bool s_graphInUse = false;
// . returns false if blocked, otherwise true
// . sets g_errno on error
bool sendPageGraph ( TcpSocket *s, HttpRequest *r ) {
if ( s_graphInUse ) {
char *msg = "stats graph calculating for another user. "
"Try again later.";
g_httpServer.sendErrorReply(s,500,msg);
return true;
}
char *cgi;
int32_t cgiLen;
@ -121,7 +130,6 @@ bool sendPageGraph ( TcpSocket *s, HttpRequest *r ) {
st->m_endDate = st->m_endDateR;
}
g_statsdb.addDocsIndexed();
//
// this is no longer a gif, but an html graph in g_statsdb.m_sb
//
@ -130,8 +138,10 @@ bool sendPageGraph ( TcpSocket *s, HttpRequest *r ) {
st->m_samples ,
&st->m_sb2 ,
st ,
sendReply ) )
sendReply ) ) {
s_graphInUse = true;
return false;
}
// if we didn't block call it ourselves directly
sendReply ( st );
@ -139,6 +149,15 @@ bool sendPageGraph ( TcpSocket *s, HttpRequest *r ) {
return true;
}
void genStatsDataset(SafeBuf *buf, StateStatsdb *st) {
if ( ! g_conf.m_useStatsdb ) {
buf->safePrintf("{\"error\":\"statsdb disabled\"}\n" );
return;
}
}
static void writeControls ( SafeBuf *buf, StateStatsdb *st ) ;
void genStatsGraphTable(SafeBuf *buf, StateStatsdb *st) {
if ( ! g_conf.m_useStatsdb )
@ -186,6 +205,8 @@ void genStatsGraphTable(SafeBuf *buf, StateStatsdb *st) {
void sendReply ( void *state ) {
s_graphInUse = false;
StateStatsdb *st = (StateStatsdb *)state;
if ( g_errno ) {
@ -196,6 +217,10 @@ void sendReply ( void *state ) {
TcpSocket *s = st->m_socket;
if(st->m_request.getLong("json", 0)) {
//xxxxxxxxxxxxxxxxxxxxxxxxx
}
if(st->m_request.getLong("justgraph", 0)) {
SafeBuf buf( 1024*32 , "tmpbuf0" );
genStatsGraphTable(&buf, st);

@ -6800,7 +6800,7 @@ void Parms::init ( ) {
m->m_off = (char *)&cr.m_maxSearchResultsPerQuery - x;
m->m_type = TYPE_LONG;
m->m_def = "100";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_flags = 0;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
@ -10548,7 +10548,7 @@ void Parms::init ( ) {
m->m_off = (char *)&g_conf.m_maxHeartbeatDelay - g;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_flags = PF_CLONE; // PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
@ -12401,12 +12401,31 @@ void Parms::init ( ) {
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_flags = PF_API;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_group = 0;
m++;
/*
m->m_title = "files group writable";
m->m_desc = "Make all created files group writable? If you have "
"multiple user accounts starting Gigablast processes you "
"will want the files to be group writable. You will "
"need to make sure you run gigablast under the "
"primary group you want to use for gigablast administration.";
m->m_cgi = "afgw";
m->m_off = (char *)&g_conf.m_makeAllFilesGroupWritable - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_API;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_group = 0;
m++;
*/
m->m_title = "verify disk writes";
m->m_desc = "Read what was written in a verification step. Decreases "
"performance, but may help fight disk corruption mostly on "
@ -16655,6 +16674,21 @@ void Parms::init ( ) {
m->m_flags = PF_CLONE;
m++;
m->m_title = "index warc or arc files";
m->m_desc = "If this is true Gigablast will index .warc and .arc "
"files by injecting the pages contained in them as if they "
"were spidered with the content in the .warc or .arc file. "
"The spidered time will be taken from the archive file "
"as well.";
m->m_cgi = "indexwarcs";
m->m_off = (char *)&cr.m_indexWarcs - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m++;
/*
m->m_title = "add url enabled";
m->m_desc = "If this is enabled others can add "
@ -21338,9 +21372,23 @@ void tryToCallCallbacks ( ) {
if ( pn->m_calledCallback ) continue;
// should we call the callback?
bool callIt = false;
// 8 seconds is enough to wait for all replies to come in
if ( now - pn->m_startTime > 8 ) callIt = true;
if ( pn->m_numReplies >= pn->m_numRequests ) callIt = true;
// sometimes we don't launch any requests to update parms
// because we are jammed up. same logic as we use for
// freeing the pn below.
if ( pn->m_numGoodReplies < pn->m_numHostsTotal )
callIt = false;
// 8 seconds is enough to wait for all replies to come in.
// a host might be dead, so we need this here lest the
// underlying page handler (i.e. sendPageCrawlbot()) never
// get called if a host is dead. if you are updating some
// parms you want the page to return.
if ( now - pn->m_startTime > 8 &&
! callIt &&
g_hostdb.hasDeadHost() )
callIt = true;
if ( ! callIt ) continue;
// callback is NULL for updating parms like spiderRoundNum
// in Spider.cpp
@ -21475,6 +21523,8 @@ bool Parms::doParmSendingLoop ( ) {
if ( ! s_headNode ) return true;
if ( g_isDumpingRdbFromMain ) return true;
if ( s_inLoop ) return true;
s_inLoop = true;
@ -21551,8 +21601,8 @@ bool Parms::doParmSendingLoop ( ) {
}
// debug log
log(LOG_INFO,"parms: sending parm request "
"to hostid %"INT32"",h->m_hostId);
log(LOG_INFO,"parms: sending parm request id %i "
"to hostid %"INT32"",(int)pn->m_parmId,h->m_hostId);
// count it
pn->m_numRequests++;
@ -22946,6 +22996,14 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {
"\"temporary\" errors like DNS timeouts."
"</td></tr>"
"<tr class=poo><td>errorcode==32880</td>"
"<td>"
"If the last time it was spidered it had this "
"numeric error code. See the error codes in "
"Errno.cpp. In this particular example 32880 is "
"for EBADURL."
"</td></tr>"
"<tr class=poo><td>hastmperror</td>"
"<td>"
"This is true if the last spider attempt resulted "

@ -6019,7 +6019,6 @@ void PosdbTable::intersectLists10_r ( ) {
#define RINGBUFSIZE 4096
//#define RINGBUFSIZE 1024
unsigned char ringBuf[RINGBUFSIZE+10];
unsigned char *ringBufEnd = ringBuf + RINGBUFSIZE;
// for overflow conditions in loops below
ringBuf[RINGBUFSIZE+0] = 0xff;
ringBuf[RINGBUFSIZE+1] = 0xff;
@ -6363,18 +6362,7 @@ void PosdbTable::intersectLists10_r ( ) {
// for 'search engine'. it might save time!
// reset ring buf. make all slots 0xff. should be 1000 cycles or so.
for ( int32_t *rb = (int32_t *)ringBuf ; ; ) {
rb[0] = 0xffffffff;
rb[1] = 0xffffffff;
rb[2] = 0xffffffff;
rb[3] = 0xffffffff;
rb[4] = 0xffffffff;
rb[5] = 0xffffffff;
rb[6] = 0xffffffff;
rb[7] = 0xffffffff;
rb += 8;
if ( rb >= (int32_t *)ringBufEnd ) break;
}
memset ( ringBuf, 0xff, RINGBUFSIZE );
// now to speed up 'time enough for love' query which does not
// have many super high scoring guys on top we need a more restrictive

@ -885,6 +885,9 @@ void hdtempWrapper ( int fd , void *state ) {
// or if we haven't waited long enough
if ( now < s_nextTime ) return;
// see if this fixes the missed heartbeats
//return;
// set it
g_process.m_threadOut = true;
// . call thread to call popen
@ -968,7 +971,11 @@ float getDiskUsage ( int64_t *diskAvail ) {
char cmd[10048];
char out[1024];
sprintf(out,"%sdiskusage",g_hostdb.m_dir);
snprintf(cmd,10000,"df -ka %s | tail -1 | "
snprintf(cmd,10000,
// "ulimit -v 25000 ; "
// "ulimit -t 30 ; "
// "ulimit -a; "
"df -ka %s | tail -1 | "
"awk '{print $4\" \"$5}' > %s",
g_hostdb.m_dir,
out);
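// illustrative note (not part of this commit): with typical df -ka
// output such as
//   /dev/sda1 961302540 371019480 541430124 41% /
// the awk pipeline above leaves "541430124 41%" in the diskusage
// file, i.e. KB available followed by percent used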
@ -982,7 +989,9 @@ float getDiskUsage ( int64_t *diskAvail ) {
return -1.0; // unknown
}
// this will happen if you don't upgrade glibc to 2.2.4-32 or above
if ( err != 0 ) {
// for some reason it returns no mem but the file is ok.
// something to do with being in a thread?
if ( err != 0 && errno != ENOMEM ) {
log("build: Call to system(\"%s\") had error: %s",
cmd,mstrerror(errno));
return -1.0; // unknown
@ -1175,8 +1184,12 @@ void heartbeatWrapper ( int fd , void *state ) {
// check the "cat /proc/<pid>/status | grep SigQ" output
// to see if its overflowed. hopefully i will fix this by
// queue the signals myself in Loop.cpp.
log("db: missed heartbeat by %"INT64" ms. Num elapsed alarms = "
"%"INT32"", elapsed-100,(int32_t)(g_numAlarms - s_lastNumAlarms));
log("db: missed calling niceness 0 heartbeatWrapper "
"function by %"INT64" ms. Either you need a quickpoll "
"somewhere or a niceness 0 function is taking too long. "
"Num elapsed alarms = "
"%"INT32"", elapsed-100,(int32_t)(g_numAlarms -
s_lastNumAlarms));
s_last = now;
s_lastNumAlarms = g_numAlarms;
@ -1524,21 +1537,32 @@ bool Process::shutdown2 ( ) {
static bool s_printed = false;
// wait for all threads to return
//int32_t n = g_threads.getNumThreadsOutOrQueued() ;
int32_t n = g_threads.getNumWriteThreadsOut();
waitLoop:
// wait for all 'write' threads to be done. they can be done
// and just waiting for a join, in which case we won't count them.
int32_t n = g_threads.getNumActiveWriteUnlinkRenameThreadsOut();
// we can't wait for the write thread if we had a seg fault, but
// do print a msg in the log
if ( n != 0 && m_urgent ) {
log(LOG_INFO,"gb: Has %"INT32" write/unlink/rename "
"threads active. Waiting.",n);
sleep(1);
goto waitLoop;
}
if ( n != 0 && ! m_urgent ) {
log(LOG_INFO,"gb: Has %"INT32" write threads out. Waiting for "
log(LOG_INFO,"gb: Has %"INT32" write/unlink/rename "
"threads out. Waiting for "
"them to finish.",n);
return false;
}
else if ( ! s_printed && ! m_urgent ) {
s_printed = true;
log(LOG_INFO,"gb: No write threads out.");
log(LOG_INFO,"gb: No write/unlink/rename threads active.");
}
// disable all spidering
// we can exit while spiders are in the queue because
// if they are in the middle of being added they will be
@ -1650,11 +1674,18 @@ bool Process::shutdown2 ( ) {
// urgent means we need to dump core, SEGV or something
if ( m_urgent ) {
// log it
log("gb: Dumping core after saving.");
// at least destroy the page caches that have shared memory
// because they seem to not clean it up
resetPageCaches();
if ( g_threads.amThread() ) {
uint64_t tid = (uint64_t)getpidtid();
log("gb: calling abort from thread with tid of "
"%"UINT64" (thread)",tid);
}
else {
pid_t pid = getpid();
log("gb: calling abort from main process "
"with pid of %"UINT64" (main process)",
(uint64_t)pid);
}
// let's ensure our core file can dump
struct rlimit lim;
@ -1662,9 +1693,48 @@ bool Process::shutdown2 ( ) {
if ( setrlimit(RLIMIT_CORE,&lim) )
log("gb: setrlimit: %s.", mstrerror(errno) );
// if we are in this code then we are the main process
// and not a thread.
// see if this makes it so we always dump core again.
// joins with all threads, too.
log("gb: Joining with all threads");
g_threads.killAllThreads();
// log it
log("gb: Dumping core after saving.");
// at least destroy the page caches that have shared memory
// because they seem to not clean it up
//resetPageCaches();
// use the default segmentation fault handler which should
// dump core rather than call abort() which doesn't always
// work, perhaps because of threads doing something
int signum = SIGSEGV;
signal(signum, SIG_DFL);
kill(getpid(), signum);
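// the pattern in isolation (illustrative): restoring SIG_DFL then
// re-raising lets the kernel's default action write the core file,
// which a plain abort() did not reliably do with threads running:
//   signal(SIGSEGV, SIG_DFL);
//   kill(getpid(), SIGSEGV);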
// this is the trick: it will trigger the core dump by
// calling the original SIGSEGV handler.
//int signum = SIGSEGV;
//signal(signum, SIG_DFL);
//kill(getpid(), signum);
// try resetting the SEGV sig handle to default. when
// we return it should call the default handler.
// struct sigaction sa;
// sigemptyset (&sa.sa_mask);
// sa.sa_flags = SA_RESETHAND;
// sa.sa_sigaction = NULL;
// sigaction ( SIGSEGV, &sa, 0 ) ;
// return true;
// . force an abnormal termination which will cause a core dump
// . do not dump core on SIGHUP signals any more though
abort();
//abort();
// return from this signal handler so we can execute
// original SIGSEGV handler right afterwards
// default handler should be called after we return now
// keep compiler happy
return true;
}
@ -1674,6 +1744,12 @@ bool Process::shutdown2 ( ) {
// cleanup threads, this also launches them too
g_threads.timedCleanUp(0x7fffffff,MAX_NICENESS);
// there's no write/unlink/rename threads active,
// so just kill the remaining threads and join
// with them so we can try to get a proper exit status code
log("gb: Joining with all threads");
g_threads.killAllThreads();
// wait for all threads to complete...
//int32_t n = g_threads.getNumThreadsOutOrQueued() ;
//if ( n > 0 )

@ -1866,7 +1866,7 @@ Profiler::printRealTimeInfo(SafeBuf *sb,
ff.safePrintf("%strash/profile.txt",g_hostdb.m_dir);
char *filename = ff.getBufStart();
unlink ( filename );
int fd = open ( filename , O_RDWR | O_CREAT , S_IRWXU );
int fd = open ( filename , O_RDWR | O_CREAT , getFileCreationFlags() );
if ( fd < 0 ) {
sb->safePrintf("FAILED TO OPEN %s for writing: %s"
,ff.getBufStart(),mstrerror(errno));
@ -2094,7 +2094,7 @@ Profiler::printRealTimeInfo(SafeBuf *sb,
ff.reset();
ff.safePrintf("%strash/qp.txt",g_hostdb.m_dir);
filename = ff.getBufStart();
fd = open ( filename , O_RDWR | O_CREAT , S_IRWXU );
fd = open ( filename , O_RDWR | O_CREAT , getFileCreationFlags() );
//fd = open ( filename , O_RDWR | O_CREAT , S_IRWXU );
if ( fd < 0 ) {
sb->safePrintf("FAILED TO OPEN %s for writing: %s"
,ff.getBufStart(),strerror(errno));

268
Punycode.cpp Normal file

@ -0,0 +1,268 @@
#include "Punycode.h"
#include <string.h>
/* #include "punycode.h" */
/*** Bootstring parameters for Punycode ***/
enum { base = 36, tmin = 1, tmax = 26, skew = 38, damp = 700,
initial_bias = 72, initial_n = 0x80, delimiter = 0x2D };
/* basic(cp) tests whether cp is a basic code point: */
#define basic(cp) ((punycode_uint)(cp) < 0x80)
/* delim(cp) tests whether cp is a delimiter: */
#define delim(cp) ((cp) == delimiter)
/* decode_digit(cp) returns the numeric value of a basic code */
/* point (for use in representing integers) in the range 0 to */
/* base-1, or base if cp does not represent a value. */
static punycode_uint decode_digit(punycode_uint cp)
{
return cp - 48 < 10 ? cp - 22 : cp - 65 < 26 ? cp - 65 :
cp - 97 < 26 ? cp - 97 : base;
}
/* encode_digit(d,flag) returns the basic code point whose value */
/* (when used for representing integers) is d, which needs to be in */
/* the range 0 to base-1. The lowercase form is used unless flag is */
/* nonzero, in which case the uppercase form is used. The behavior */
/* is undefined if flag is nonzero and digit d has no uppercase form. */
static char encode_digit(punycode_uint d, int flag)
{
return d + 22 + 75 * (d < 26) - ((flag != 0) << 5);
/* 0..25 map to ASCII a..z or A..Z */
/* 26..35 map to ASCII 0..9 */
}
/* flagged(bcp) tests whether a basic code point is flagged */
/* (uppercase). The behavior is undefined if bcp is not a */
/* basic code point. */
#define flagged(bcp) ((punycode_uint)(bcp) - 65 < 26)
/* encode_basic(bcp,flag) forces a basic code point to lowercase */
/* if flag is zero, uppercase if flag is nonzero, and returns */
/* the resulting code point. The code point is unchanged if it */
/* is caseless. The behavior is undefined if bcp is not a basic */
/* code point. */
static char encode_basic(punycode_uint bcp, int flag)
{
bcp -= (bcp - 97 < 26) << 5;
return bcp + ((!flag && (bcp - 65 < 26)) << 5);
}
/*** Platform-specific constants ***/
/* maxint is the maximum value of a punycode_uint variable: */
static const punycode_uint maxint = -1;
/* Because maxint is unsigned, -1 becomes the maximum value. */
/*** Bias adaptation function ***/
static punycode_uint adapt(
punycode_uint delta, punycode_uint numpoints, int firsttime )
{
punycode_uint k;
delta = firsttime ? delta / damp : delta >> 1;
/* delta >> 1 is a faster way of doing delta / 2 */
delta += delta / numpoints;
for (k = 0; delta > ((base - tmin) * tmax) / 2; k += base) {
delta /= base - tmin;
}
return k + (base - tmin + 1) * delta / (delta + skew);
}
/*** Main encode function ***/
enum punycode_status punycode_encode(
size_t input_length_orig,
const punycode_uint input[],
const unsigned char case_flags[],
size_t *output_length,
char output[] )
{
punycode_uint input_length, n, delta, h, b, bias, j, m, q, k, t;
size_t out, max_out;
/* The Punycode spec assumes that the input length is the same type */
/* of integer as a code point, so we need to convert the size_t to */
/* a punycode_uint, which could overflow. */
if (input_length_orig > maxint) return punycode_overflow;
input_length = (punycode_uint) input_length_orig;
/* Initialize the state: */
n = initial_n;
delta = 0;
out = 0;
max_out = *output_length;
bias = initial_bias;
/* Handle the basic code points: */
for (j = 0; j < input_length; ++j) {
if (basic(input[j])) {
if (max_out - out < 2) return punycode_big_output;
output[out++] = case_flags ?
encode_basic(input[j], case_flags[j]) : (char) input[j];
}
/* else if (input[j] < n) return punycode_bad_input; */
/* (not needed for Punycode with unsigned code points) */
}
h = b = (punycode_uint) out;
/* cannot overflow because out <= input_length <= maxint */
/* h is the number of code points that have been handled, b is the */
/* number of basic code points, and out is the number of ASCII code */
/* points that have been output. */
if (b > 0) output[out++] = delimiter;
/* Main encoding loop: */
while (h < input_length) {
/* All non-basic code points < n have been */
/* handled already. Find the next larger one: */
for (m = maxint, j = 0; j < input_length; ++j) {
/* if (basic(input[j])) continue; */
/* (not needed for Punycode) */
if (input[j] >= n && input[j] < m) m = input[j];
}
/* Increase delta enough to advance the decoder's */
/* <n,i> state to <m,0>, but guard against overflow: */
if (m - n > (maxint - delta) / (h + 1)) return punycode_overflow;
delta += (m - n) * (h + 1);
n = m;
for (j = 0; j < input_length; ++j) {
/* Punycode does not need to check whether input[j] is basic: */
if (input[j] < n /* || basic(input[j]) */ ) {
if (++delta == 0) return punycode_overflow;
}
if (input[j] == n) {
/* Represent delta as a generalized variable-length integer: */
for (q = delta, k = base; ; k += base) {
if (out >= max_out) return punycode_big_output;
t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */
k >= bias + tmax ? tmax : k - bias;
if (q < t) break;
output[out++] = encode_digit(t + (q - t) % (base - t), 0);
q = (q - t) / (base - t);
}
output[out++] = encode_digit(q, case_flags && case_flags[j]);
bias = adapt(delta, h + 1, h == b);
delta = 0;
++h;
}
}
++delta, ++n;
}
*output_length = out;
return punycode_success;
}
/*** Main decode function ***/
enum punycode_status punycode_decode(
size_t input_length,
const char input[],
size_t *output_length,
punycode_uint output[],
unsigned char case_flags[] )
{
punycode_uint n, out, i, max_out, bias, oldi, w, k, digit, t;
size_t b, j, in;
/* Initialize the state: */
n = initial_n;
out = i = 0;
max_out = *output_length > maxint ? maxint
: (punycode_uint) *output_length;
bias = initial_bias;
/* Handle the basic code points: Let b be the number of input code */
/* points before the last delimiter, or 0 if there is none, then */
/* copy the first b code points to the output. */
for (b = j = 0; j < input_length; ++j) if (delim(input[j])) b = j;
if (b > max_out) return punycode_big_output;
for (j = 0; j < b; ++j) {
if (case_flags) case_flags[out] = flagged(input[j]);
if (!basic(input[j])) return punycode_bad_input;
output[out++] = input[j];
}
/* Main decoding loop: Start just after the last delimiter if any */
/* basic code points were copied; start at the beginning otherwise. */
for (in = b > 0 ? b + 1 : 0; in < input_length; ++out) {
/* in is the index of the next ASCII code point to be consumed, */
/* and out is the number of code points in the output array. */
/* Decode a generalized variable-length integer into delta, */
/* which gets added to i. The overflow checking is easier */
/* if we increase i as we go, then subtract off its starting */
/* value at the end to obtain delta. */
for (oldi = i, w = 1, k = base; ; k += base) {
if (in >= input_length) return punycode_bad_input;
digit = decode_digit(input[in++]);
if (digit >= base) return punycode_bad_input;
if (digit > (maxint - i) / w) return punycode_overflow;
i += digit * w;
t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */
k >= bias + tmax ? tmax : k - bias;
if (digit < t) break;
if (w > maxint / (base - t)) return punycode_overflow;
w *= (base - t);
}
bias = adapt(i - oldi, out + 1, oldi == 0);
/* i was supposed to wrap around from out+1 to 0, */
/* incrementing n each time, so we'll fix that now: */
if (i / (out + 1) > maxint - n) return punycode_overflow;
n += i / (out + 1);
i %= (out + 1);
/* Insert n at position i of the output: */
/* not needed for Punycode: */
/* if (basic(n)) return punycode_bad_input; */
if (out >= max_out) return punycode_big_output;
if (case_flags) {
memmove(case_flags + i + 1, case_flags + i, out - i);
/* Case of last ASCII code point determines case flag: */
case_flags[i] = flagged(input[in - 1]);
}
memmove(output + i + 1, output + i, (out - i) * sizeof *output);
output[i++] = n;
}
*output_length = (size_t) out;
/* cannot overflow because out <= old value of *output_length */
return punycode_success;
}

154
Punycode.h Normal file

@ -0,0 +1,154 @@
/*
punycode-sample.c 2.0.0 (2004-Mar-21-Sun)
http://www.nicemice.net/idn/
Adam M. Costello
http://www.nicemice.net/amc/
This is ANSI C code (C89) implementing Punycode 1.0.x.
*/
#include <limits.h>
#include <stddef.h>
enum punycode_status {
punycode_success = 0,
punycode_bad_input = 1, /* Input is invalid. */
punycode_big_output = 2, /* Output would exceed the space provided. */
punycode_overflow = 3 /* Wider integers needed to process input. */
};
/* punycode_uint needs to be unsigned and needs to be */
/* at least 26 bits wide. The particular type can be */
/* specified by defining PUNYCODE_UINT, otherwise a */
/* suitable type will be chosen automatically. */
#ifdef PUNYCODE_UINT
typedef PUNYCODE_UINT punycode_uint;
#elif UINT_MAX >= (1 << 26) - 1
typedef unsigned int punycode_uint;
#else
typedef unsigned long punycode_uint;
#endif
enum punycode_status punycode_encode(
size_t, /* input_length */
const punycode_uint [], /* input */
const unsigned char [], /* case_flags */
size_t *, /* output_length */
char [] /* output */
);
/*
punycode_encode() converts a sequence of code points (presumed to be
Unicode code points) to Punycode.
Input arguments (to be supplied by the caller):
input_length
The number of code points in the input array and the number
of flags in the case_flags array.
input
An array of code points. They are presumed to be Unicode
code points, but that is not strictly necessary. The
array contains code points, not code units. UTF-16 uses
code units D800 through DFFF to refer to code points
10000..10FFFF. The code points D800..DFFF do not occur in
any valid Unicode string. The code points that can occur in
Unicode strings (0..D7FF and E000..10FFFF) are also called
Unicode scalar values.
case_flags
A null pointer or an array of boolean values parallel to
the input array. Nonzero (true, flagged) suggests that the
corresponding Unicode character be forced to uppercase after
being decoded (if possible), and zero (false, unflagged)
suggests that it be forced to lowercase (if possible).
ASCII code points (0..7F) are encoded literally, except that
ASCII letters are forced to uppercase or lowercase according
to the corresponding case flags. If case_flags is a null
pointer then ASCII letters are left as they are, and other
code points are treated as unflagged.
Output arguments (to be filled in by the function):
output
An array of ASCII code points. It is *not* null-terminated;
it will contain zeros if and only if the input contains
zeros. (Of course the caller can leave room for a
terminator and add one if needed.)
Input/output arguments (to be supplied by the caller and overwritten
by the function):
output_length
The caller passes in the maximum number of ASCII code points
that it can receive. On successful return it will contain
the number of ASCII code points actually output.
Return value:
Can be any of the punycode_status values defined above except
punycode_bad_input. If not punycode_success, then output_size
and output might contain garbage.
*/
enum punycode_status punycode_decode(
size_t, /* input_length */
const char [], /* input */
size_t *, /* output_length */
punycode_uint [], /* output */
unsigned char [] /* case_flags */
);
/*
punycode_decode() converts Punycode to a sequence of code points
(presumed to be Unicode code points).
Input arguments (to be supplied by the caller):
input_length
The number of ASCII code points in the input array.
input
An array of ASCII code points (0..7F).
Output arguments (to be filled in by the function):
output
An array of code points like the input argument of
punycode_encode() (see above).
case_flags
A null pointer (if the flags are not needed by the caller)
or an array of boolean values parallel to the output array.
Nonzero (true, flagged) suggests that the corresponding
Unicode character be forced to uppercase by the caller (if
possible), and zero (false, unflagged) suggests that it
be forced to lowercase (if possible). ASCII code points
(0..7F) are output already in the proper case, but their
flags will be set appropriately so that applying the flags
would be harmless.
Input/output arguments (to be supplied by the caller and overwritten
by the function):
output_length
The caller passes in the maximum number of code points
that it can receive into the output array (which is also
the maximum number of flags that it can receive into the
case_flags array, if case_flags is not a null pointer). On
successful return it will contain the number of code points
actually output (which is also the number of flags actually
output, if case_flags is not a null pointer). The decoder
will never need to output more code points than the number
of ASCII code points in the input, because of the way the
encoding is defined. The number of code points output
cannot exceed the maximum possible value of a punycode_uint,
even if the supplied output_length is greater than that.
Return value:
Can be any of the punycode_status values defined above. If not
punycode_success, then output_length, output, and case_flags
might contain garbage.
*/
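A quick self-test of the API documented above (an illustrative sketch, not
part of this commit; "bücher" is the classic RFC 3492 / IDNA example whose
encoded tail is bcher-kva, as in xn--bcher-kva):
#include <stdio.h>
#include "Punycode.h"
int main ( ) {
/* the six code points of "bücher"; 0xFC is u with umlaut */
punycode_uint in[6] = { 'b', 0xFC, 'c', 'h', 'e', 'r' };
char out[64];
size_t outlen = sizeof(out) - 1; /* leave room for our own terminator */
if ( punycode_encode ( 6, in, NULL, &outlen, out ) != punycode_success )
return 1;
out[outlen] = '\0'; /* the API does not null-terminate */
printf ( "%s\n", out ); /* expected: bcher-kva */
return 0;
}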

@ -3212,7 +3212,7 @@ bool Query::setQWords ( char boolFlag ,
// no punct, alnum only
if ( words.isPunct(i) ) continue;
// skip if not a stop word
if ( ! bits.m_bits[i] & D_IS_STOPWORD ) continue;
if ( ! (bits.m_bits[i] & D_IS_STOPWORD) ) continue;
// continue if you can still pair across prev punct word
if ( bits.m_bits[i-1] & D_CAN_PAIR_ACROSS ) continue;
// otherwise, we can now start a phrase

70
Rdb.cpp

@ -374,16 +374,16 @@ bool Rdb::updateToRebuildFiles ( Rdb *rdb2 , char *coll ) {
char dstDir[256];
// make the trash dir if not there
sprintf ( dstDir , "%s/trash/" , g_hostdb.m_dir );
int32_t status = ::mkdir ( dstDir ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH ) ;
int32_t status = ::mkdir ( dstDir , getDirCreationFlags() );
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH ) ;
// we have to create it
sprintf ( dstDir , "%s/trash/rebuilt%"UINT32"/" , g_hostdb.m_dir , t );
status = ::mkdir ( dstDir ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH ) ;
status = ::mkdir ( dstDir , getDirCreationFlags() );
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH ) ;
if ( status && errno != EEXIST ) {
g_errno = errno;
return log("repair: Could not mkdir(%s): %s",dstDir,
@ -643,10 +643,10 @@ bool Rdb::deleteAllRecs ( collnum_t collnum ) {
bool makeTrashDir() {
char trash[1024];
sprintf(trash, "%strash/",g_hostdb.m_dir);
if ( ::mkdir ( trash,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH ) == -1 ) {
if ( ::mkdir ( trash , getDirCreationFlags() ) ) {
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH ) == -1 ) {
if ( errno != EEXIST ) {
log("dir: mkdir %s had error: %s",
trash,mstrerror(errno));
@ -1424,10 +1424,12 @@ bool Rdb::gotTokenForDump ( ) {
RdbBucket *b = m_buckets.m_buckets[i];
collnum_t cn = b->getCollnum();
int32_t nk = b->getNumKeys();
for ( int32_t j = 0 ; j < nk; j++ ) {
cr = g_collectiondb.m_recs[cn];
if ( cr ) cr->m_treeCount++;
}
// for ( int32_t j = 0 ; j < nk; j++ ) {
// cr = g_collectiondb.m_recs[cn];
// if ( cr ) cr->m_treeCount++;
// }
cr = g_collectiondb.m_recs[cn];
if ( cr ) cr->m_treeCount += nk;
}
}
@ -1542,6 +1544,20 @@ bool Rdb::dumpCollLoop ( ) {
"available secondary id for titledb: %s." ,
mstrerror(g_errno) );
}
// if we add too many files then we can not merge, because the merge op
// needs to add a file and it calls addNewFile() too
static int32_t s_flag = 0;
if ( base->m_numFiles + 1 >= MAX_RDB_FILES ) {
if ( s_flag < 10 )
log("db: could not dump tree to disk for cn="
"%i %s because it has %"INT32" files on disk. "
"Need to wait for merge operation.",
(int)m_dumpCollnum,m_dbname,base->m_numFiles);
s_flag++;
goto loop;
}
// this file must not exist already, we are dumping the tree into it
m_fn = base->addNewFile ( id2 ) ;
if ( m_fn < 0 ) return log(LOG_LOGIC,"db: rdb: Failed to add new file "
@ -1797,6 +1813,8 @@ void attemptMergeAll2 ( ) {
tryLoop:
QUICKPOLL(niceness);
// if a collection got deleted, reset this to 0
if ( s_lastCollnum >= g_collectiondb.m_numRecs ) {
s_lastCollnum = 0;
@ -1836,6 +1854,26 @@ void attemptMergeAll2 ( ) {
if ( base && base->attemptMerge(niceness,force,true) )
return;
// also try to merge on rdbs being rebuilt
base = cr->getBasePtr(RDB2_POSDB2);
if ( base && base->attemptMerge(niceness,force,true) )
return;
base = cr->getBasePtr(RDB2_TITLEDB2);
if ( base && base->attemptMerge(niceness,force,true) )
return;
base = cr->getBasePtr(RDB2_TAGDB2);
if ( base && base->attemptMerge(niceness,force,true) )
return;
base = cr->getBasePtr(RDB2_LINKDB2);
if ( base && base->attemptMerge(niceness,force,true) )
return;
base = cr->getBasePtr(RDB2_SPIDERDB2);
if ( base && base->attemptMerge(niceness,force,true) )
return;
base = cr->getBasePtr(RDB2_CLUSTERDB2);
if ( base && base->attemptMerge(niceness,force,true) )
return;
// try next collection
s_lastCollnum++;

@ -165,10 +165,10 @@ bool RdbBase::init ( char *dir ,
}
// make a special "cat" dir for it if we need to
sprintf ( tmp , "%s%s" , dir , dbname );
int32_t status = ::mkdir ( tmp ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH );
int32_t status = ::mkdir ( tmp , getDirCreationFlags() );
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH );
if ( status == -1 && errno != EEXIST && errno )
return log("db: Failed to make directory %s: %s.",
tmp,mstrerror(errno));
@ -186,9 +186,9 @@ bool RdbBase::init ( char *dir ,
// make a special "cat" dir for it if we need to
sprintf ( tmp , "%scat" , dir );
if ( ::mkdir ( tmp ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH ) == -1 && errno != EEXIST )
getDirCreationFlags() ) == -1 && errno != EEXIST )
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH ) == -1 && errno != EEXIST )
return log("db: Failed to make directory %s: %s.",
tmp,mstrerror(errno));
}
@ -202,9 +202,9 @@ bool RdbBase::init ( char *dir ,
// make a special "stats" dir for it if necessary
sprintf ( tmp , "%sstats" , dir );
if ( ::mkdir ( tmp ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH ) == -1 && errno != EEXIST )
getDirCreationFlags() ) == -1 && errno != EEXIST )
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH ) == -1 && errno != EEXIST )
return log( "db: Failed to make directory %s: %s.",
tmp, mstrerror( errno ) );
}
@ -218,9 +218,9 @@ bool RdbBase::init ( char *dir ,
// make a special "stats" dir for it if necessary
sprintf ( tmp , "%saccess" , dir );
if ( ::mkdir ( tmp ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH ) == -1 && errno != EEXIST )
getDirCreationFlags() ) == -1 && errno != EEXIST )
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH ) == -1 && errno != EEXIST )
return log( "db: Failed to make directory %s: %s.",
tmp, mstrerror( errno ) );
}
@ -234,9 +234,9 @@ bool RdbBase::init ( char *dir ,
// make a special "stats" dir for it if necessary
sprintf ( tmp , "%ssyncdb" , dir );
if ( ::mkdir ( tmp ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH ) == -1 && errno != EEXIST )
getDirCreationFlags() ) == -1 && errno != EEXIST )
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH ) == -1 && errno != EEXIST )
return log( "db: Failed to make directory %s: %s.",
tmp, mstrerror( errno ) );
}
@ -1607,8 +1607,8 @@ bool RdbBase::attemptMerge ( int32_t niceness, bool forceMergeAll, bool doLog ,
if ( ! m_mergeUrgent && numFiles - 14 >= m_minToMerge ) {
m_mergeUrgent = true;
if ( doLog )
log(LOG_INFO,"merge: Entering urgent merge mode for %s.",
m_dbname);
log(LOG_INFO,"merge: Entering urgent merge mode for %s "
"coll=%s.", m_dbname,m_coll);
g_numUrgentMerges++;
}
@ -1811,7 +1811,8 @@ void RdbBase::gotTokenForMerge ( ) {
m_mergeUrgent = true;
if ( m_doLog )
log(LOG_INFO,
"merge: Entering urgent merge mode for %s.", m_dbname);
"merge: Entering urgent merge mode (2) for %s coll=%s.",
m_dbname,m_coll);
g_numUrgentMerges++;
}
// tfndb has his own merge class since titledb merges write tfndb recs
@ -1892,8 +1893,13 @@ void RdbBase::gotTokenForMerge ( ) {
// sanity check
if ( n <= 1 ) {
log(LOG_LOGIC,"merge: attemptMerge: Resuming. bad "
"engineer");
"engineer for %s coll=%s",m_dbname,m_coll);
//g_msg35.releaseToken();
if ( m_mergeUrgent ) {
log("merge: leaving urgent merge mode");
g_numUrgentMerges--;
m_mergeUrgent = false;
}
return false;
}
// make a log note

@ -338,10 +338,10 @@ class RdbBase {
// . older files are listed first (lower fileIds)
// . filenames should include the directory (full filenames)
// . TODO: RdbMgr should control what rdb gets merged?
BigFile *m_files [ MAX_RDB_FILES ];
int32_t m_fileIds [ MAX_RDB_FILES ];
int32_t m_fileIds2 [ MAX_RDB_FILES ]; // for titledb/tfndb linking
RdbMap *m_maps [ MAX_RDB_FILES ];
BigFile *m_files [ MAX_RDB_FILES+1 ];
int32_t m_fileIds [ MAX_RDB_FILES+1 ];
int32_t m_fileIds2 [ MAX_RDB_FILES+1 ]; // for titledb/tfndb linking
RdbMap *m_maps [ MAX_RDB_FILES+1 ];
int32_t m_numFiles;
// this class contains a ptr to us

@ -2060,8 +2060,10 @@ bool RdbBuckets::fastSave_r() {
char s[1024];
sprintf ( s , "%s/%s-buckets-saving.dat", m_dir , m_dbname );
int fd = ::open ( s ,
O_RDWR | O_CREAT | O_TRUNC , S_IRUSR | S_IWUSR |
S_IRGRP | S_IWGRP | S_IROTH);
O_RDWR | O_CREAT | O_TRUNC ,
getFileCreationFlags() );
// S_IRUSR | S_IWUSR |
// S_IRGRP | S_IWGRP | S_IROTH);
if ( fd < 0 ) {
m_saveErrno = errno;
return log("db: Could not open %s for writing: %s.",

@ -1484,7 +1484,7 @@ bool RdbCache::save_r ( ) {
//f.set ( g_hostdb.m_dir , filename );
// open the file
//if ( ! f.open ( O_RDWR | O_CREAT ) )
int fd = open ( filename , O_RDWR | O_CREAT , S_IRWXU );
int fd = open ( filename , O_RDWR | O_CREAT , getFileCreationFlags() );
if ( fd < 0 )
return log("db: Had opening file to save cache to: %s.",
mstrerror(errno));

@ -1340,6 +1340,7 @@ bool RdbList::constrain ( char *startKey ,
// ensure we our first key is 12 bytes if m_useHalfKeys is true
if ( m_useHalfKeys && isHalfBitOn ( m_list ) ) {
g_errno = ECORRUPTDATA;
g_numCorrupt++;
return log("db: First key is 6 bytes. Corrupt data "
"file.");
}
@ -1347,12 +1348,14 @@ bool RdbList::constrain ( char *startKey ,
// sanity. hint key should be full key
if ( m_ks == 18 && hintKey && (hintKey[0]&0x06)){
g_errno = ECORRUPTDATA;
g_numCorrupt++;
return log("db: Hint key is corrupt.");
//char *xx=NULL;*xx=0;}
}
if ( hintOffset > m_listSize ) { //char *xx=NULL;*xx=0; }
g_errno = ECORRUPTDATA;
g_numCorrupt++;
return log("db: Hint offset %"INT32" > %"INT32" is corrupt."
,hintOffset,
m_listSize);
@ -1418,6 +1421,7 @@ bool RdbList::constrain ( char *startKey ,
m_listPtrHi = savelistPtrHi ;
m_listPtrLo = savelistPtrLo ;
g_errno = ECORRUPTDATA;
g_numCorrupt++;
return log("db: Got record size of %"INT32" < 0. "
"Corrupt data file.",recSize);
}
@ -1525,13 +1529,16 @@ bool RdbList::constrain ( char *startKey ,
if ( minRecSizes < 0 ) maxPtr = m_listEnd;
// size of last rec we read in the list
int32_t size = -1 ;
// char *savedp = p;
// if ( savedp == (char *)0x001 ) { char *xx=NULL;*xx=0;}
// advance until endKey or minRecSizes kicks us out
//while ( p < m_listEnd && getKey(p) <= endKey && p < maxPtr ) {
while ( p < m_listEnd ) {
QUICKPOLL(niceness);
getKey(p,k);
if ( KEYCMP(k,endKey,m_ks)>0 ) break;
if ( p >= maxPtr ) break;
// only break out if we've set the size AND are >= maxPtr
if ( p >= maxPtr && size > 0 ) break;
size = getRecSize ( p );
// watch out for corruption, let Msg5 fix it
if ( size < 0 ) {
@ -1540,6 +1547,7 @@ bool RdbList::constrain ( char *startKey ,
m_listPtrLo = savelistPtrLo;
m_listPtr = savelist;
g_errno = ECORRUPTDATA;
g_numCorrupt++;
return log("db: Corrupt record size of %"INT32" "
"bytes in %s.",size,filename);
}
@ -1559,6 +1567,7 @@ bool RdbList::constrain ( char *startKey ,
m_listPtrLo = savelistPtrLo;
m_listPtr = savelist;
g_errno = ECORRUPTDATA;
g_numCorrupt++;
return log("db: Corrupt record size of %"INT32" "
"bytes in %s.",size,filename);
}
@ -1580,6 +1589,7 @@ bool RdbList::constrain ( char *startKey ,
m_listPtrLo = savelistPtrLo;
m_listPtr = savelist;
g_errno = ECORRUPTDATA;
g_numCorrupt++;
return log("db: Corrupt record size of %"INT32" "
"bytes in %s.",size,filename);
}
@ -1587,17 +1597,23 @@ bool RdbList::constrain ( char *startKey ,
//endKey = getKey ( p - size );
getKey(p-size,endKey);
}
// bitch if size is -1 still
if ( size == -1 ) {
log("db: Corruption. Encountered bad endkey in %s.",filename);
char *xx=NULL;*xx=0;
m_list = savelist;
m_listPtrHi = savelistPtrHi;
m_listPtrLo = savelistPtrLo;
m_listPtr = savelist;
g_errno = ECORRUPTDATA;
g_numCorrupt++;
return false;
}
// cut the tail
m_listEnd = p;
m_listSize = m_listEnd - m_list;
// bitch if size is -1 still
if ( size == -1 ) {
log("db: Encountered bad endkey in %s. listSize=%"INT32"",
filename,m_listSize);
char *xx=NULL;*xx=0;
}
// otherwise store the last key if size is not -1
else if ( m_listSize > 0 ) {
if ( m_listSize > 0 ) {
//m_lastKey = getKey ( p - size );
getKey(p-size,m_lastKey);
m_lastKeyIsValid = true;

@ -2488,8 +2488,10 @@ bool RdbTree::fastSave_r() {
char s[1024];
sprintf ( s , "%s/%s-saving.dat", m_dir , m_dbname );
int fd = ::open ( s ,
O_RDWR | O_CREAT | O_TRUNC , S_IRUSR | S_IWUSR |
S_IRGRP | S_IWGRP | S_IROTH);
O_RDWR | O_CREAT | O_TRUNC ,
getFileCreationFlags() );
// S_IRUSR | S_IWUSR |
// S_IRGRP | S_IWGRP | S_IROTH);
if ( fd < 0 ) {
m_saveErrno = errno;
return log("db: Could not open %s for writing: %s.",

@ -198,6 +198,15 @@ bool SafeBuf::safeMemcpy ( Words *w , int32_t a , int32_t b ) {
return safeMemcpy ( p , pend - p );
}
char* SafeBuf::pushStr (char* str, uint32_t len) {
int32_t initLen = m_length;
bool status = safeMemcpy ( str , len );
status &= nullTerm();
m_length++; //count the null so it isn't overwritten
if(!status) return NULL;
return m_buf + initLen;
}
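A minimal usage sketch (illustrative, not part of this commit):
static void pushStrExample ( ) {
SafeBuf sb;
// copies "hello" plus a terminating null; the null is counted
// in m_length so a later push will not overwrite it
char *p = sb.pushStr ( (char *)"hello" , 5 );
if ( p ) log("test: pushed str = %s",p);
}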
bool SafeBuf::pushPtr ( void *ptr ) {
if ( m_length + (int32_t)sizeof(char *) > m_capacity )
if(!reserve(sizeof(char *)))//2*m_capacity + 1))
@ -431,7 +440,7 @@ bool SafeBuf::reserve(int32_t i , char *label, bool clearIt ) {
//buffer size.
bool SafeBuf::reserve2x(int32_t i, char *label) {
//watch out for overflow!
if((m_capacity << 1) + i < 0) return false;
if((m_capacity << 1) + i < m_capacity) return false;
if(i + m_length >= m_capacity)
return reserve(m_capacity + i,label);
else return true;
@ -449,8 +458,9 @@ int32_t SafeBuf::save ( char *fullFilename ) {
int32_t SafeBuf::dumpToFile(char *filename ) {
retry22:
int32_t fd = open ( filename , O_CREAT | O_WRONLY | O_TRUNC,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH );
int32_t fd = open ( filename , O_CREAT | O_WRONLY | O_TRUNC ,
getFileCreationFlags() );
//S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH );
if ( fd < 0 ) {
// valgrind
if ( errno == EINTR ) goto retry22;
@ -484,8 +494,9 @@ int32_t SafeBuf::safeSave (char *filename ) {
fn.safePrintf( "%s.saving",filename );
int32_t fd = open ( fn.getBufStart() ,
O_CREAT | O_WRONLY | O_TRUNC,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH );
O_CREAT | O_WRONLY | O_TRUNC ,
getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH );
if ( fd < 0 ) {
// valgrind
if ( errno == EINTR ) goto retry22;
@ -571,8 +582,8 @@ int32_t SafeBuf::fillFromFile(char *filename) {
reserve(results.st_size+1);
retry:
int32_t fd = open ( filename , O_RDONLY,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH );
int32_t fd = open ( filename , O_RDONLY , getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH );
if ( fd < 0 ) {
// valgrind
if ( errno == EINTR ) goto retry;
@ -862,6 +873,22 @@ bool SafeBuf::utf8Encode2(char *s, int32_t len, bool encodeHTML,int32_t nicenes
return htmlEncode(m_length-tmp,niceness);
}
bool SafeBuf::utf32Encode(UChar32* codePoints, int32_t cpLen) {
if(m_encoding != csUTF8) return safePrintf("FIXME %s:%i", __FILE__, __LINE__);
int32_t need = 0;
for(int32_t i = 0; i < cpLen;i++) need += utf8Size(codePoints[i]);
if(!reserve(need)) return false;
for(int32_t i = 0; i < cpLen;i++) {
m_length += ::utf8Encode(codePoints[i], m_buf + m_length);
}
return true;
}
/*
bool SafeBuf::utf32Encode(UChar32 c) {
if(!reserve2x(8)) return false;
@ -3666,3 +3693,12 @@ bool SafeBuf::hasDigits() {
if ( is_digit(m_buf[i]) ) return true;
return false;
}
int32_t SafeBuf::indexOf(char c) {
char* p = m_buf;
char* pend = m_buf + m_length;
while (p < pend && *p != c) p++;
if (p == pend) return -1;
return p - m_buf;
}
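A minimal usage sketch (illustrative, not part of this commit):
static void indexOfExample ( ) {
SafeBuf sb;
sb.safePrintf("key=value");
int32_t pos = sb.indexOf('='); // 3 here; returns -1 when absent
log("test: found '=' at offset %"INT32"",pos);
}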

@ -259,6 +259,7 @@ public:
int32_t niceness=0);
bool latin1Encode(char *s, int32_t len, bool htmlEncode=false,
int32_t niceness=0);
bool utf32Encode(UChar32* codePoints, int32_t cpLen);
//bool utf16Encode(UChar *s, int32_t len, bool htmlEncode=false);
//bool utf16Encode(char *s, int32_t len, bool htmlEncode=false) {
// return utf16Encode((UChar*)s, len>>1, htmlEncode); };
@ -327,6 +328,7 @@ public:
return true;
};
int32_t indexOf(char c);
bool safeCdataMemcpy(char *s, int32_t len);
bool pushChar (char i) {
@ -346,6 +348,7 @@ public:
// hack off trailing 0's
bool printFloatPretty ( float f ) ;
char* pushStr (char* str, uint32_t len);
bool pushPtr ( void *ptr );
bool pushLong (int32_t i);
bool pushLongLong (int64_t i);

@ -1805,7 +1805,8 @@ bool Speller::createUnifiedDict (){
// then open a new one for appending
int fdw = open ( ff ,
O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
getFileCreationFlags());
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fdw < 0 ){
return log("lang: Could not open for %s "
"writing: %s.",ff, strerror(errno));

@ -2759,6 +2759,7 @@ int32_t SpiderColl::getNextIpFromWaitingTree ( ) {
// remove all his keys just because we restarted and think he
// is alive even though we have gotten no ping from him.
//if ( hp->m_numPingRequests > 0 )
removeFromTree:
// these operations should fail if writes have been disabled
// and because the trees/tables for spidercache are saving
// in Process.cpp's g_spiderCache::save() call
@ -2793,7 +2794,15 @@ int32_t SpiderColl::getNextIpFromWaitingTree ( ) {
m_waitingTreeKeyValid = true;
m_scanningIp = firstIp;
// sanity
if ( firstIp == 0 || firstIp == -1 ) { char *xx=NULL;*xx=0; }
if ( firstIp == 0 || firstIp == -1 ) {
//char *xx=NULL;*xx=0; }
log("spider: removing corrupt spiderreq firstip of %"INT32
" from waiting tree collnum=%i",
firstIp,(int)m_collnum);
goto removeFromTree;
}
// avoid corruption
// we set this to true when done
//m_isReadDone = false;
// compute the best request from spiderdb list, not valid yet
@ -2877,6 +2886,7 @@ void SpiderColl::populateWaitingTreeFromSpiderdb ( bool reentry ) {
if ( m_deleteMyself ) { char *xx=NULL;*xx=0; }
// skip if spiders off
if ( ! m_cr->m_spideringEnabled ) return;
if ( ! g_hostdb.getMyHost( )->m_spiderEnabled ) return;
// skip if udp table is full
if ( g_udpServer.getNumUsedSlotsIncoming() >= MAXUDPSLOTS ) return;
// if entering for the first time, we need to read list from spiderdb
@ -3160,6 +3170,8 @@ void SpiderColl::populateDoledbFromWaitingTree ( ) { // bool reentry ) {
// since addSpiderRequest() calls addToWaitingTree() which then calls
// this.
if ( ! g_conf.m_spideringEnabled ) return;
if ( ! g_hostdb.getMyHost( )->m_spiderEnabled ) return;
// skip if udp table is full
if ( g_udpServer.getNumUsedSlotsIncoming() >= MAXUDPSLOTS ) return;
@ -4106,6 +4118,20 @@ bool SpiderColl::scanListForWinners ( ) {
//srep = NULL;
continue;
}
// ignore these to fix diffbot's malformed url bug
if ( tmp->m_errCode == 32880 &&
// and is before about dec 18th 2015
tmp->m_spideredTime < 1450488447 )
continue;
// ignore these to fix diffbot's ebadtitlerec error
// 'bad cached document'.
// ignore them so we can respider the urls and
// the new logic in xmldoc.cpp can ignore them.
// i fixed xmldoc.cpp to index these status docs.
if ( tmp->m_errCode == 32792 &&
// and is before about dec 22nd 2015
tmp->m_spideredTime < 1450897197 )
continue;
// bad langid?
if ( ! getLanguageAbbr (tmp->m_langId) ) {
log("spider: got corrupt 4 spiderReply in "
@ -4268,7 +4294,18 @@ bool SpiderColl::scanListForWinners ( ) {
m_lastCBlockIp = cblock;
// only add firstip if manually added and not fake
// if ( uh48 == 272628060426254 )
// log("spider: got special seed");
// #undef sleep
// if ( uh48 == 272628060426254 ) {
// log("spider: got special seed");
// bool flag = true;
// sleepLoop:
// sleep(1);
// if ( flag ) goto sleepLoop;
// }
// #define sleep(a) { char *xx=NULL;*xx=0; }
//
// just calculating page counts? if the url filters are based
@ -5889,6 +5926,8 @@ uint64_t SpiderColl::getSpiderTimeMS ( SpiderRequest *sreq,
if ( ! srep && sreq->m_isInjecting ) return spiderTimeMS;
if ( ! srep && sreq->m_isPageReindex ) return spiderTimeMS;
//log("spider: getting spider time %"INT64, spiderTimeMS);
// to avoid hammering an ip, get last time we spidered it...
int64_t lastMS ;
lastMS = m_lastDownloadCache.getLongLong ( m_collnum ,
@ -6073,6 +6112,8 @@ bool isAssignedToUs ( int32_t firstIp ) {
// . ignore lower 8 bits of ip since one guy often owns a whole block!
//int32_t hostId=(((uint32_t)firstIp) >> 8) % g_hostdb.getNumHosts();
if( !g_hostdb.getMyHost()->m_spiderEnabled ) return false;
// get our group
//Host *group = g_hostdb.getMyGroup();
Host *shard = g_hostdb.getMyShard();
@ -6097,22 +6138,30 @@ bool isAssignedToUs ( int32_t firstIp ) {
int32_t i = ((uint32_t)h64) % hpg;
Host *h = &shard[i];
// return that if alive
if ( ! g_hostdb.isDead(h) ) return (h->m_hostId == g_hostdb.m_hostId);
if ( ! g_hostdb.isDead(h) && h->m_spiderEnabled) {
return (h->m_hostId == g_hostdb.m_hostId);
}
// . select another otherwise
// . put all alive in an array now
Host *alive[64];
int32_t upc = 0;
for ( int32_t j = 0 ; j < hpg ; j++ ) {
Host *h = &shard[i];
Host *h = &shard[j];
if ( g_hostdb.isDead(h) ) continue;
if( ! h->m_spiderEnabled ) continue;
alive[upc++] = h;
}
// if none, that is bad! return the first one that we wanted to
if ( upc == 0 ) return (h->m_hostId == g_hostdb.m_hostId);
if ( upc == 0 ) {
log("spider: no hosts can handle spider request for ip=%s", iptoa(firstIp));
return false;
//return (h->m_hostId == g_hostdb.m_hostId);
}
// select from the good ones now
i = ((uint32_t)firstIp) % hpg;
i = ((uint32_t)firstIp) % upc;
// get that
h = &shard[i];
h = alive[i]; //&shard[i];
// guaranteed to be alive... kinda
return (h->m_hostId == g_hostdb.m_hostId);
}
@ -6217,7 +6266,11 @@ void SpiderLoop::startLoop ( ) {
// in case host when dead.
// now that we only send the info on startup and if changed,
// let's move back down to 1 second
if ( !g_loop.registerSleepCallback(3000,
// . make it 20 seconds because handlerequestc1 is always on
// profiler when we have thousands of collections
// . let's try 10 seconds so as not to think a job is done when
// it is not
if ( !g_loop.registerSleepCallback(10000,
this,
updateAllCrawlInfosSleepWrapper))
log("build: failed to register updatecrawlinfowrapper");
@ -6232,6 +6285,8 @@ void doneSleepingWrapperSL ( int fd , void *state ) {
// if spidering disabled then do not do this crap
if ( ! g_conf.m_spideringEnabled ) return;
if ( ! g_hostdb.getMyHost( )->m_spiderEnabled ) return;
//if ( ! g_conf.m_webSpideringEnabled ) return;
// or if trying to exit
if ( g_process.m_mode == EXIT_MODE ) return;
@ -6250,6 +6305,8 @@ void doneSleepingWrapperSL ( int fd , void *state ) {
return;
}
//if ( g_hostdb.hasDeadHost() ) return;
static int32_t s_count = -1;
// count these calls
s_count++;
@ -6299,6 +6356,7 @@ void doneSleepingWrapperSL ( int fd , void *state ) {
// if ( ! cr ) continue;
// skip if not enabled
if ( ! crp->m_spideringEnabled ) continue;
// get it
//SpiderColl *sc = cr->m_spiderColl;
SpiderColl *sc = g_spiderCache.getSpiderColl(crp->m_collnum);
@ -6694,6 +6752,8 @@ void SpiderLoop::spiderDoledUrls ( ) {
// must be spidering to dole out
if ( ! g_conf.m_spideringEnabled ) return;
if ( ! g_hostdb.getMyHost( )->m_spiderEnabled ) return;
// or if trying to exit
if ( g_process.m_mode == EXIT_MODE ) return;
// if we don't have all the url counts from all hosts, then wait.
@ -7543,7 +7603,7 @@ bool SpiderLoop::gotDoledbList2 ( ) {
// note it
if ( (g_corruptCount % 1000) == 0 )
log("spider: got corrupt doledb record. ignoring. "
"pls fix!!!");
"pls fix!!! cn=%i",(int)m_collnum);
g_corruptCount++;
// skip for now....!! what is causing this???
m_list.skipCurrentRecord();
@ -8278,7 +8338,13 @@ bool SpiderLoop::spiderUrl2 ( ) {
// count it as a hit
//g_stats.m_spiderUrlsHit++;
// sanity check
if (m_sreq->m_priority <= -1 ) { char *xx=NULL;*xx=0; }
if (m_sreq->m_priority <= -1 ) {
log("spider: fixing bogus spider req priority of %i for "
"url %s",
(int)m_sreq->m_priority,m_sreq->m_url);
m_sreq->m_priority = 0;
//char *xx=NULL;*xx=0;
}
//if(m_sreq->m_priority >= MAX_SPIDER_PRIORITIES){char *xx=NULL;*xx=0;}
// update this
m_sc->m_outstandingSpiders[(unsigned char)m_sreq->m_priority]++;
@ -9588,7 +9654,10 @@ bool printList ( State11 *st ) {
if ( list->getCurrentRecSize() <= 16 ) { char *xx=NULL;*xx=0;}
// sanity check. requests ONLY in doledb
if ( ! g_spiderdb.isSpiderRequest ( (key128_t *)rec )) {
char*xx=NULL;*xx=0;}
log("spider: not printing spiderreply");
continue;
//char*xx=NULL;*xx=0;
}
// get the spider rec, encapsed in the data of the doledb rec
SpiderRequest *sreq = (SpiderRequest *)rec;
// print it into sbTable
@ -11428,7 +11497,7 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
if ( langId >= 0 ) { // if ( srep ) {
// this is NULL on corruption
lang = getLanguageAbbr ( langId );//srep->m_langId );
langLen = gbstrlen(lang);
if (lang) langLen = gbstrlen(lang);
}
// . get parent language in the request
@ -12919,6 +12988,37 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
// skip fast
p += 10;
p = strstr(s, "&&");
// if no "&&" follows then this rule is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// EBADURL malformed url is ... 32880
if ( *p=='e' && strncmp(p,"errorcode",9) == 0 ) {
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// skip for msg20
if ( isForMsg20 ) continue;
// reply based
if ( ! srep ) continue;
// shortcut
int32_t a = srep->m_errCode;
// get the error code value from the rule
int32_t b = atoi(s);
// compare
if ( sign == SIGN_EQ && a != b ) continue;
if ( sign == SIGN_NE && a == b ) continue;
if ( sign == SIGN_GT && a <= b ) continue;
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
// skip fast
p += 9;
p = strstr(s, "&&");
// if no "&&" follows then this rule is a match
if ( ! p ) return i;
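Each rule term repeats the same compare-and-continue pattern: parse the right-hand value with atoi(), test it against the reply field under the parsed sign, and on a match either return the rule number or skip past the next "&&". The comparison dispatch, as a standalone sketch (helper name illustrative; SIGN_* are the constants used above):

// Sketch: the numeric comparison used by url-filter rule terms.
static bool ruleTermMatches ( int32_t sign , int32_t a , int32_t b ) {
	switch ( sign ) {
	case SIGN_EQ: return a == b;
	case SIGN_NE: return a != b;
	case SIGN_GT: return a >  b;
	case SIGN_LT: return a <  b;
	case SIGN_GE: return a >= b;
	case SIGN_LE: return a <= b;
	default:      return false; // unknown sign: no match
	}
}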
@ -13810,6 +13910,8 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// . TODO: do not update on error???
for ( ; ptr < end ; ptr++ ) {
QUICKPOLL ( slot->m_niceness );
// get collnum
collnum_t collnum = (collnum_t)(ptr->m_collnum);
@ -13875,6 +13977,12 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// loop over
for ( int32_t x = 0 ; x < g_collectiondb.m_numRecs ; x++ ) {
QUICKPOLL ( slot->m_niceness );
// a niceness 0 routine could have nuked it?
if ( x >= g_collectiondb.m_numRecs )
break;
CollectionRec *cr = g_collectiondb.m_recs[x];
if ( ! cr ) continue;
@ -13897,20 +14005,35 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
if ( ! cia ) continue;
for ( int32_t k = 0 ; k < g_hostdb.m_numHosts; k++ ) {
QUICKPOLL ( slot->m_niceness );
// get the CrawlInfo for the ith host
CrawlInfo *stats = &cia[k];
// point to the stats for that host
int64_t *ss = (int64_t *)stats;
int64_t *gs = (int64_t *)gi;
// add each hosts counts into the global accumulators
// are stats crazy?
bool crazy = false;
for ( int32_t j = 0 ; j < NUMCRAWLSTATS ; j++ ) {
*gs = *gs + *ss;
// crazy stat?
if ( *ss > 1000000000LL ||
*ss < -1000000000LL )
*ss < -1000000000LL ) {
log("spider: crazy stats %"INT64" "
"from host #%"INT32" coll=%s",
"from host #%"INT32" coll=%s. "
"ignoring.",
*ss,k,cr->m_coll);
crazy = true;
break;
}
ss++;
}
// reset ptr to accumulate
ss = (int64_t *)stats;
for ( int32_t j = 0 ; j < NUMCRAWLSTATS ; j++ ) {
// do not accumulate if corrupted.
// probably mem got corrupted and it saved
// to disk.
if ( crazy ) break;
*gs = *gs + *ss;
gs++;
ss++;
}
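The loop above is a two-pass guard: scan a host's counters first, and only fold them into the global totals when none are out of range, so a single corrupt save cannot poison the cluster-wide stats. The same pattern as a minimal sketch (helper name illustrative; the one-billion bound mirrors the code above):

// Sketch: validate-then-accumulate to keep corrupt stats out of totals.
static bool accumulateStats ( int64_t *global , const int64_t *host ,
			      int32_t n ) {
	for ( int32_t j = 0 ; j < n ; j++ )
		if ( host[j] > 1000000000LL || host[j] < -1000000000LL )
			return false; // crazy stat: skip this host entirely
	for ( int32_t j = 0 ; j < n ; j++ )
		global[j] += host[j];
	return true;
}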
@ -14177,7 +14300,7 @@ void handleRequestc1 ( UdpSlot *slot , int32_t niceness ) {
for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
QUICKPOLL(MAX_NICENESS);
QUICKPOLL(slot->m_niceness);
CollectionRec *cr = g_collectiondb.m_recs[i];
if ( ! cr ) continue;
@ -14370,10 +14493,27 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , int32_t *status ) {
uint32_t now = (uint32_t)getTimeGlobal();
// hit crawl round max? this could be SP_ROUNDDONE and it doesn't
// get converted to SP_MAXROUNDS until we call spiderDoledUrls()
// so fix the crawlbot nightly smoke test by setting this here
// to SP_MAXROUNDS.
// smoketest msg = FAIL: testCrawlRounds (__main__.TestRepeatCrawl)
// self.assertEqual(j['jobs'][0]['jobStatus']['status'],1,msg=self.name
// AssertionError: 4 != 1 : 1227151934RepeatCrawlself.
// assertEqual(j['jobs'][0]['jobStatus']['status'],1,msg=self.name)
int32_t spiderStatus = cx->m_spiderStatus;
if ( spiderStatus == SP_ROUNDDONE &&
cx->m_maxCrawlRounds > 0 &&
cx->m_isCustomCrawl &&
cx->m_spiderRoundNum >= cx->m_maxCrawlRounds )
spiderStatus = SP_MAXROUNDS;
// try to fix crawlbot nightly test complaining about job status
// for TestRepeatCrawlWithMaxToCrawl
if ( (cx->m_spiderStatus == SP_MAXTOCRAWL ||
cx->m_spiderStatus == SP_MAXTOPROCESS ) &&
if ( (spiderStatus == SP_MAXTOCRAWL ||
spiderStatus == SP_MAXTOPROCESS ) &&
cx->m_collectiveRespiderFrequency > 0.0 &&
now < cx->m_spiderRoundStartTime &&
cx->m_spiderRoundNum >= cx->m_maxCrawlRounds ) {
@ -14384,7 +14524,7 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , int32_t *status ) {
// . 0 means not to RE-crawl
// . indicate if we are WAITING for next round...
if ( cx->m_spiderStatus == SP_MAXTOCRAWL &&
if ( spiderStatus == SP_MAXTOCRAWL &&
cx->m_collectiveRespiderFrequency > 0.0 &&
now < cx->m_spiderRoundStartTime ) {
*status = SP_ROUNDDONE;
@ -14395,7 +14535,7 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , int32_t *status ) {
now));
}
if ( cx->m_spiderStatus == SP_MAXTOPROCESS &&
if ( spiderStatus == SP_MAXTOPROCESS &&
cx->m_collectiveRespiderFrequency > 0.0 &&
now < cx->m_spiderRoundStartTime ) {
*status = SP_ROUNDDONE;
@ -14407,19 +14547,19 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , int32_t *status ) {
}
if ( cx->m_spiderStatus == SP_MAXTOCRAWL ) {
if ( spiderStatus == SP_MAXTOCRAWL ) {
*status = SP_MAXTOCRAWL;
return msg->safePrintf ( "Job has reached maxToCrawl "
"limit." );
}
if ( cx->m_spiderStatus == SP_MAXTOPROCESS ) {
if ( spiderStatus == SP_MAXTOPROCESS ) {
*status = SP_MAXTOPROCESS;
return msg->safePrintf ( "Job has reached maxToProcess "
"limit." );
}
if ( cx->m_spiderStatus == SP_MAXROUNDS ) {
if ( spiderStatus == SP_MAXROUNDS ) {
*status = SP_MAXROUNDS;
return msg->safePrintf ( "Job has reached maxRounds "
"limit." );
@ -14453,7 +14593,7 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , int32_t *status ) {
// return msg->safePrintf("Crawl is waiting for urls.");
//}
if ( cx->m_spiderStatus == SP_INITIALIZING ) {
if ( spiderStatus == SP_INITIALIZING ) {
*status = SP_INITIALIZING;
return msg->safePrintf("Job is initializing.");
}
@ -14479,7 +14619,7 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , int32_t *status ) {
"repeat is scheduled.");
}
if ( cx->m_spiderStatus == SP_ROUNDDONE && ! cx->m_isCustomCrawl ) {
if ( spiderStatus == SP_ROUNDDONE && ! cx->m_isCustomCrawl ) {
*status = SP_ROUNDDONE;
return msg->safePrintf ( "Nothing currently "
"available to spider. "
@ -14502,7 +14642,7 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , int32_t *status ) {
}
if ( cx->m_spiderStatus == SP_ROUNDDONE ) {
if ( spiderStatus == SP_ROUNDDONE ) {
*status = SP_ROUNDDONE;
return msg->safePrintf ( "Job round completed.");
}
@ -14755,12 +14895,21 @@ bool SpiderRequest::isCorrupt ( ) {
}
// sanity check. check for http(s)://
if ( m_url[0] != 'h' &&
// might be a docid from a pagereindex.cpp
! is_digit(m_url[0]) ) {
if ( m_url[0] == 'h' )
return false;
// might be a docid from a pagereindex.cpp
if ( ! is_digit(m_url[0]) ) {
log("spider: got corrupt 1 spiderRequest");
return true;
}
// if it is a digit\0 it is ok, not corrupt
if ( ! m_url[1] )
return false;
// if it is not a digit after the first digit, that is bad
if ( ! is_digit(m_url[1]) ) {
log("spider: got corrupt 2 spiderRequest");
return true;
}
return false;
}
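With the relaxed check only the first two bytes matter: a leading 'h' passes (http/https), and a leading digit must be followed by another digit or the terminating NUL. Illustrative inputs (hypothetical, not from the source):

//   "https://example.com/"  -> not corrupt (starts with 'h')
//   "123456789"             -> not corrupt (docid from pagereindex)
//   "7"                     -> not corrupt (digit then NUL)
//   "7x89"                  -> corrupt    (digit then non-digit)
//   "ftp://example.com/"    -> corrupt    (neither 'h' nor a digit)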

@ -143,7 +143,10 @@ bool buildProxyTable ( ) {
*s = '\0';
log("buf: %s for %s",msg,p);
*s = c;
return false;
//return false;
// advance p
p = s;
continue;
}
// convert it
@ -706,6 +709,7 @@ void handleRequest54 ( UdpSlot *udpSlot , int32_t niceness ) {
int32_t hslot = s_loadTable.getSlot ( &urlIp );
// scan all proxies that have this urlip outstanding
for ( int32_t i = hslot ; i >= 0 ; i = s_loadTable.getNextSlot(i,&urlIp)){
QUICKPOLL(niceness);
// get the bucket
LoadBucket *lb;
lb = (LoadBucket *)s_loadTable.getValueFromSlot(i);
@ -736,6 +740,7 @@ void handleRequest54 ( UdpSlot *udpSlot , int32_t niceness ) {
// get the min of the counts
int32_t minCount = 999999;
for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
QUICKPOLL(niceness);
// skip empty slots
if ( ! s_iptab.m_flags[i] ) continue;
// get the spider proxy
@ -824,6 +829,7 @@ void handleRequest54 ( UdpSlot *udpSlot , int32_t niceness ) {
int32_t slotCount = s_iptab.getNumSlots();
// . now find the best proxy with the minCount
for ( int32_t i = start ; ; i++ ) {
QUICKPOLL(niceness);
// scan all slots in hash table, then stop
if ( slotCount-- <= 0 ) break;
// wrap around to zero if we hit the end
@ -896,8 +902,8 @@ void handleRequest54 ( UdpSlot *udpSlot , int32_t niceness ) {
static int32_t s_lbid = 0;
// add it now, iff not for passing to diffbot backend
if ( preq->m_opCode != OP_GETPROXYFORDIFFBOT ) {
s_loadTable.addKey ( &urlIp , &bb );
bb.m_id = s_lbid++;
s_loadTable.addKey ( &urlIp , &bb );
// winner count update
winnersp->m_timesUsed++;
}
@ -931,12 +937,29 @@ void handleRequest54 ( UdpSlot *udpSlot , int32_t niceness ) {
// and the loadbucket id
//*(int32_t *)p = bb.m_id; p += 4;
//int32_t sanityCount = 0;//s_loadTable.getNumSlots();
// top:
// with dup keys we end up with long chains of crap and this
// takes forever. so just flush the whole thing every 2 minutes AND
// when 10000+ entries are in there
static time_t s_lastTime = 0;
time_t now = nowms / 1000;
if ( s_lastTime == 0 ) s_lastTime = now;
time_t elapsed = now - s_lastTime;
if ( elapsed > 120 && s_loadTable.getNumSlots() > 10000 ) {
log("sproxy: flushing %i entries from proxy loadtable that "
"have accumulated since %i seconds ago",
(int)s_loadTable.m_numSlotsUsed,(int)elapsed);
s_loadTable.clear();
// only do this once every couple of minutes
s_lastTime = now;
}
int32_t sanityCount = 0;//s_loadTable.getNumSlots();
// top:
// now remove old entries from the load table. entries that
// have completed and have a download end time more than 10 mins ago
for ( int32_t i = 0 ; i < s_loadTable.getNumSlots() ; i++ ) {
// have completed and have a download end time more than 10 mins ago.
for ( int32_t i = s_loadTable.getNumSlots() - 1 ; i >= 0 ; i-- ) {
QUICKPOLL(niceness);
// skip if empty
if ( ! s_loadTable.m_flags[i] ) continue;
// get the bucket
@ -948,8 +971,8 @@ void handleRequest54 ( UdpSlot *udpSlot , int32_t niceness ) {
// < 10 mins? now it's < 15 seconds to prevent clogging.
if ( took < LOADPOINT_EXPIRE_MS ) continue;
// 100 at a time
//if ( sanityCount++ > 100 ) break;
// 100 at a time so we don't slam cpu
if ( sanityCount++ > 100 ) break;
// ok, its too old, nuke it to save memory
s_loadTable.removeSlot(i);
@ -957,7 +980,7 @@ void handleRequest54 ( UdpSlot *udpSlot , int32_t niceness ) {
// miss out on analyzing any keys if we just keep looping here
// should we? TODO: figure it out. if we miss a few it's not
// a big deal.
i--;
//i--;
//goto top;
}
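Scanning downward is what lets the old i-- re-check go away: removeSlot() can only reshuffle entries within a probe chain, and re-visiting a shifted entry is harmless here because the expiry test is idempotent. The general shape, as a sketch (isExpired() stands in for the LOADPOINT_EXPIRE_MS test above):

// Sketch: delete-while-iterating safely by walking indices downward.
for ( int32_t i = tab.getNumSlots() - 1 ; i >= 0 ; i-- ) {
	if ( ! tab.m_flags[i] ) continue;          // empty slot
	if ( ! isExpired ( &tab , i ) ) continue;  // still live
	// worst case a chained entry shifts into a lower slot and
	// gets tested twice, which is harmless
	tab.removeSlot ( i );
}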

@ -65,21 +65,19 @@ static Label s_labels[] = {
// . 300MB/s is max read rate regardless to stop graph shrinkage
// . use 1KB as the min resolution per pixel
// . stored in Bps so use 1/1000 as scalar to get into KBps
{ GRAPH_QUANTITY,200,"disk_read",1,"%.0f MBps",1.0/(1000.0*1000.0),0x000000,
"disk read"},
{ GRAPH_QUANTITY,200,"disk_read",1,"%.0f MBps",1.0/(1000.0*1000.0),0x000000,"disk read"},
// . 300MB/s is max write rate regardless to stop graph shrinkage
// . use 1KB as the min resolution per pixel
// . stored in Bps so use 1/1000 as scalar to get into KBps
{GRAPH_QUANTITY,200,"disk_write",1,"%.0f Mbps",1.0/(1000.0*1000.0), 0xff0000,
"disk write"},
{GRAPH_QUANTITY,200,"disk_write",1,"%.0f MBps",1.0/(1000.0*1000.0), 0xff0000, "disk write"},
// . 20 is the max dps regardless to stop graph shrinkage
// . use .03 qps as the min resolution per pixel
{GRAPH_OPS,20,"parse_doc", .005,"%.1f dps" , 1.0 , 0x00fea915,"parsed doc" },
{GRAPH_QUANTITY_PER_OP,1000,"docs_per_second", .005,"%.1f docs" , .001 , 0x1F2F5C,"docs per second" },
{GRAPH_QUANTITY_PER_OP,-1,"docs_per_second", .1,"%.1f docs per second" , -1 , 0x1F2F5C,"*successfully* indexed docs per second" },
// . use .1 * 1000 docs as the min resolution per pixel
// . max = -1, means dynamic size the ymax!
@ -88,7 +86,7 @@ static Label s_labels[] = {
// . make it 2M now not 50M. seems like it is per pixel and theres
// like 1000 pixels vertically. but we need to autoscale it
// eventually
{GRAPH_QUANTITY,2000000.0,"docs_indexed", .1,"%.0fK docs" , .001 , 0x00cc0099,"docs indexed" }
{GRAPH_QUANTITY,-1,"docs_indexed", .1,"%.0f docs" , -1, 0x00cc0099,"docs indexed" }
//{ "termlist_intersect",0x0000ff00},
@ -122,6 +120,7 @@ Label *Statsdb::getLabel ( int32_t labelHash ) {
return *label;
}
Statsdb::Statsdb ( ) {
m_init = false;
m_disabled = true;
@ -246,6 +245,8 @@ void flushStatsWrapper ( int fd , void *state ) {
void Statsdb::addDocsIndexed ( ) {
if ( ! isClockInSync() ) return;
if ( g_hostdb.hasDeadHost() ) return;
// only host #0 needs this
if ( g_hostdb.m_hostId != 0 ) return;
@ -270,18 +271,23 @@ void Statsdb::addDocsIndexed ( ) {
// divide by # of groups
total /= g_hostdb.getNumHostsPerShard();
// skip if no change
if ( total == s_lastTotal ) return;
int32_t docsIndexedInInterval = total - s_lastTotal;
float docsPerSecond = docsIndexedInInterval / (float)interval;
s_lastTotal = total;
log("build: total docs indexed: %f. docs per second %f %i %i", (float)total, docsPerSecond, docsIndexedInInterval, interval);
// add it if changed though
int64_t nowms = gettimeofdayInMillisecondsGlobal();
addStat ( MAX_NICENESS,"docs_indexed", nowms, nowms, (float)total );
addStat ( MAX_NICENESS,"docs_per_second", nowms, nowms, docsPerSecond );
// Prevent a datapoint which adds all of the docs indexed to date.
if( s_lastTotal != 0 ) {
addStat ( MAX_NICENESS,"docs_per_second", nowms, nowms, docsPerSecond );
}
s_lastTotal = total;
}
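The rate itself is just the delta between successive replicated totals divided by the sampling interval. Worked numbers (illustrative):

// e.g. two samples 30 seconds apart:
//   s_lastTotal = 120,000   total = 120,450   interval = 30
//   docsIndexedInInterval = 450
//   docsPerSecond         = 450 / 30 = 15.0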
// . m_key bitmap in statsdb:
@ -896,12 +902,13 @@ char *Statsdb::plotGraph ( char *pstart ,
bool needMax = true;
float ymin = 0.0;
float ymax = 0.0;
float yscalar = label->m_yscalar;
char *p = pstart;
for ( ; p < pend ; p += 12 ) {
// breathe
QUICKPOLL ( m_niceness );
if ( m_gw.getLength() > 10000000 ) break;
// get the y
float y2 = *(float *)(p+4);
// get color of this point
@ -909,7 +916,8 @@ char *Statsdb::plotGraph ( char *pstart ,
// stop if not us
if ( gh != graphHash ) continue;
// put into scaled space right away
y2 = y2 * label->m_yscalar;
if (label->m_yscalar >= 0)
y2 = y2 * label->m_yscalar;
// . limit y to absolute max
// . these units should be scaled as well!
if ( y2 > label->m_absYMax && label->m_absYMax > 0.0 )
@ -922,13 +930,21 @@ char *Statsdb::plotGraph ( char *pstart ,
}
// force to zero for now
ymin = 0.0;
//ymin = 0.0;
// . and force to ymax for now as well
// . -1 indicates dynamic though!
if ( label->m_absYMax > 0.0 ) ymax = label->m_absYMax;
// add a 20% ceiling
else ymax *= 1.20;
// else ymax *= 1.20;
if( label->m_yscalar <= 0 ) {
if(ymax == ymin) {
yscalar = 0;
} else {
yscalar = (float)DY2 / (ymax - ymin);
}
}
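// With m_yscalar at -1 the axis now autoscales: the effective scalar is
// DY2/(ymax-ymin), mapping the observed data range onto the fixed pixel
// height. Worked numbers (illustrative):
//   DY2 = 600 px, observed range 0..150 docs/sec
//   yscalar = 600 / (150 - 0) = 4 px per unit
//   a point at y = 75 lands at (75 - 0) * 4 = 300 px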
// return that!
char *retp = p;
@ -951,7 +967,7 @@ char *Statsdb::plotGraph ( char *pstart ,
// . pad y range if total range is small
// . only do this for certain types of stats, like qps and disk i/o
if ( ourDiff < minDiff ) {
if ( label->m_yscalar >=0 && ourDiff < minDiff ) {
float pad = (minDiff - ourDiff) / 2;
// pad it out
ymin -= pad ;
@ -981,16 +997,23 @@ char *Statsdb::plotGraph ( char *pstart ,
for ( ; p < pend ; ) {
// breathe
QUICKPOLL ( m_niceness );
if ( m_gw.getLength() > 10000000 ) break;
// first is x pixel pos
int32_t x2 = *(int32_t *)p; p += 4;
// then y pos
float y2 = *(float *)p; p += 4;
// scale it right away
y2 *= label->m_yscalar;
if(label->m_yscalar < 0) {
y2 = (y2 - ymin) * yscalar;
}
else {
y2 *= yscalar;
}
// adjust
if ( y2 > ymax ) y2 = ymax;
if ( y2 < 0 ) y2 = 0;
// then graphHash
int32_t gh = *(int32_t *)p; p += 4;
@ -1003,8 +1026,10 @@ char *Statsdb::plotGraph ( char *pstart ,
float y1 = lasty;
// normalize y into pixel space
y2 = ((float)DY2 * (y2 - ymin)) / (ymax-ymin);
if(label->m_yscalar >= 0 && ymax != ymin) {
y2 = ((float)DY2 * (y2 - ymin)) / (ymax-ymin);
}
// set lasts for next iteration of this loop
lastx = x2;
lasty = y2;
@ -1073,13 +1098,20 @@ char *Statsdb::plotGraph ( char *pstart ,
}
float lastZ = -1;
for ( float z = ymin ; z < ymax ; z += deltaz ) {
// breathe
QUICKPOLL ( m_niceness );
// draw it
drawHR ( z , ymin , ymax , m_gw , label , zoff , color );
if(z == lastZ) break;
lastZ = z;
//if ( m_gw.getLength() > 10000000 ) break;
}
if ( m_gw.getLength() > 10000000 )
log("statsdb: graph too big");
return retp;
//#endif
@ -1158,7 +1190,7 @@ void Statsdb::drawHR ( float z ,
"font-size:14px;"
"min-height:20px;"
"min-width:3px;\""
" class=\"color-%"XINT32"\";"
" class=\"color-%"XINT32"\""
">%s</div>\n"
, (int32_t)(m_bx)
, (int32_t)z2 +m_by
@ -1194,6 +1226,13 @@ bool Statsdb::processList ( ) {
m_done = true;
}
// HACK: the user can request all of the events and the result can
// become quite large, so limit it to 100MB for now.
if( m_sb3.length() > 100000000) {
log("statsdb: truncating statsdb results.");
m_done = true;
}
//
// all these points are accumulated into 1-second buckets
@ -1590,7 +1629,7 @@ void Statsdb::drawLine3 ( SafeBuf &sb ,
"z-index:-5;"
"min-height:%"INT32"px;"
"min-width:%"INT32"px;\""
"class=\"color-%"XINT32"\"></div>\n"
" class=\"color-%"XINT32"\"></div>\n"
, x1 + m_bx
, (fy1 - width/2) + m_by
, color
@ -1599,3 +1638,5 @@ void Statsdb::drawLine3 ( SafeBuf &sb ,
, color
);
}

@ -2803,24 +2803,15 @@ bool Msg8a::launchGetRequests ( ) {
//uint32_t gid = g_hostdb.getGroupId ( m_rdbId , &startKey , true );
//Host *group = g_hostdb.getGroup ( gid );
int32_t shardNum = getShardNum ( m_rdbId , &startKey );//, true );
Host *group = g_hostdb.getShard ( shardNum );
//int32_t numTwins = g_hostdb.getNumHostsPerShard();
// use top byte!
uint8_t *sks = (uint8_t *)&startKey;
uint8_t top = sks[sizeof(TAGDB_KEY)-1];
//int32_t hostNum = 0;
//if ( numTwins == 2 && (top & 0x80) ) hostNum = 1;
// TODO: fix this!
//if ( numTwins >= 3 ) { char *xx=NULL;*xx=0; }
// support more than 2 stripes now...
int32_t hostNum = top % g_hostdb.getNumHostsPerShard();
int32_t hostId = group[hostNum].m_hostId;
Host *firstHost ;
// if niceness 0 can't pick noquery host.
// if niceness 1 can't pick nospider host.
firstHost = g_hostdb.getLeastLoadedInShard ( shardNum , m_niceness );
int32_t firstHostId = firstHost->m_hostId;
// . launch this request, even if to ourselves
// . TODO: just use msg0!!
bool status = m->getList ( hostId , // hostId
bool status = m->getList ( firstHostId , // hostId
0 , // ip
0 , // port
0 , // maxCacheAge
@ -2837,7 +2828,7 @@ bool Msg8a::launchGetRequests ( ) {
true , // error correction?
true , // include tree?
true , // doMerge?
-1 , // firstHostId
firstHostId , // firstHostId
0 , // startFileNum
-1 , // numFiles
3600*24*365 );// timeout

@ -2918,6 +2918,67 @@ int TcpServer::sslHandshake ( TcpSocket *s ) {
SSL_set_connect_state(s->m_ssl);
}
// . set hostname for SNI (Server Name Indication)
// . can test with page parser on the test page: https://sni.velox.ch/
// . we can parse the mime reliably here because we are the ones
// that created the request, so we know it should be standardish.
if ( s->m_sendBuf && ! s->m_readBuf ) {
// grab hostname from the mime
// skip first line
char *p = s->m_sendBuf;
char *pend = p + s->m_sendBufSize;
if ( p+10 >= pend )
goto skipSNI;
bool gotIt = false;
if ( p[0] == 'G' && p[1] == 'E' && p[2] == 'T' && p[3]==' ' )
gotIt = true;
if ( p[0] == 'P' && p[1] == 'O' && p[2] == 'S' && p[3]=='T' &&
p[4] == ' ' )
gotIt = true;
// need to start with "GET " or "POST "
if ( ! gotIt )
goto skipSNI;
scanMimeSomeMore:
// skip to the first \r, indicating end of line
for ( ; p < pend && *p != '\r' ; p++ );
// if we couldn't find it, then there's no Host: directive
if ( p == pend )
goto skipSNI;
// skip \r\n
if ( *p == '\r' )
p++;
if ( p == pend )
goto skipSNI;
if ( *p == '\n' )
p++;
if ( p == pend )
goto skipSNI;
// end of mime (\r\n\r\n)
if ( p+2<pend && p[0] == '\r' && p[1] == '\n' )
goto skipSNI;
// is it host:?
if ( p+6 >= pend )
goto skipSNI;
if ( strncasecmp(p,"Host:",5) )
goto scanMimeSomeMore;
p += 5;
if ( p<pend && *p == ' ' ) p++;
if ( p<pend && *p == ' ' ) p++;
char *hostname = p;
// find end of line
for ( ; p<pend && *p != '\r' ; p++ );
if ( p == pend )
goto skipSNI;
// temp null
char c = *p;
*p = '\0';
/// @todo what if we can't set TLS servername extension?
SSL_set_tlsext_host_name(s->m_ssl, hostname );
// replace the \0 with original char
*p = c;
}
skipSNI:
// SSL_connect() calls malloc()
g_inMemFunction = true;
int r = SSL_connect(s->m_ssl);

@ -320,7 +320,7 @@ bool Threads::init ( ) {
// i raised since global specs new servers have 2 (hyperthreaded?) cpus
int32_t max = g_conf.m_maxCpuThreads;
if ( max < 1 ) max = 1;
if ( ! g_threads.registerType ( INTERSECT_THREAD,max,200) )
if ( ! g_threads.registerType ( INTERSECT_THREAD,max,10) )
return log("thread: Failed to register thread type." );
// filter thread spawned to call popen() to filter an http reply
if ( ! g_threads.registerType ( FILTER_THREAD, 2/*maxThreads*/,300) )
@ -334,10 +334,10 @@ bool Threads::init ( ) {
// it was taking forever to go one at a time through the unlink
// thread queue. seemed like a 1 second space between unlinks.
// 1/23/2014
if ( ! g_threads.registerType ( UNLINK_THREAD,30/*maxThreads*/,3000) )
if ( ! g_threads.registerType ( UNLINK_THREAD,5/*maxThreads*/,3000) )
return log("thread: Failed to register thread type." );
// generic multipurpose
if ( ! g_threads.registerType (GENERIC_THREAD,100/*maxThreads*/,100) )
if ( ! g_threads.registerType (GENERIC_THREAD,20/*maxThreads*/,100) )
return log("thread: Failed to register thread type." );
// for call SSL_accept() which blocks for 10ms even when socket
// is non-blocking...
@ -435,6 +435,13 @@ int32_t Threads::getNumWriteThreadsOut() {
return m_threadQueues[DISK_THREAD].getNumWriteThreadsOut();
}
int32_t Threads::getNumActiveWriteUnlinkRenameThreadsOut() {
// this does not count threads that are done and just awaiting a join
int32_t n = m_threadQueues[DISK_THREAD].getNumWriteThreadsOut();
n += m_threadQueues[UNLINK_THREAD].getNumActiveThreadsOut();
return n;
}
// . returns false (and may set errno) if failed to launch a thread
// . returns true if thread added to queue successfully
// . may be launched instantly or later depending on # of threads in the queue
@ -853,6 +860,19 @@ bool ThreadQueue::init ( char threadType, int32_t maxThreads, int32_t maxEntries
return true;
}
int32_t ThreadQueue::getNumActiveThreadsOut() {
int32_t n = 0;
for ( int32_t i = 0 ; i < m_maxEntries ; i++ ) {
ThreadEntry *e = &m_entries[i];
if ( ! e->m_isOccupied ) continue;
if ( ! e->m_isLaunched ) continue;
// if it is done and just waiting for a join, do not count
if ( e->m_isDone ) continue;
n++;
}
return n;
}
int32_t ThreadQueue::getNumThreadsOutOrQueued() {
// MDW: we also need to count threads that are returned but need their
// callback called so, in the case of RdbDump, the rdblist that was written
@ -1108,6 +1128,7 @@ int32_t Threads::timedCleanUp (int32_t maxTime, int32_t niceness) {
return 0;
if ( ! m_needsCleanup ) return 0;
//if ( g_inSigHandler ) return 0;
int64_t startTime = gettimeofdayInMillisecondsLocal();
int64_t took = 0;
@ -1299,7 +1320,15 @@ bool ThreadQueue::timedCleanUp ( int32_t maxNiceness ) {
// . join up with that thread
// . damn, sometimes he can block forever on his
// call to sigqueue(),
int64_t startTime = gettimeofdayInMillisecondsLocal();
int64_t took;
int32_t status = pthread_join ( t->m_joinTid , NULL );
took = gettimeofdayInMillisecondsLocal() - startTime;
if ( took > 50 ) {
log("threads: pthread_join took %i ms",
(int)took);
}
if ( status != 0 ) {
log("threads: pthread_join %"INT64" = %s (%"INT32")",
(int64_t)t->m_joinTid,mstrerror(status),
@ -2088,7 +2117,8 @@ bool ThreadQueue::launchThread2 ( ) {
if ( m_threadType != DISK_THREAD ) {
// if one thread of this type is already out, forget it
if ( m_launchedHead ) return false;
// then we can't have 100 GENERIC THREADS!!! with this...
//if ( m_launchedHead ) return false;
// first try niceness 0 queue
ThreadEntry **bestHeadPtr = &m_waitHead0;
ThreadEntry **bestTailPtr = &m_waitTail0;
@ -3315,3 +3345,23 @@ void Threads::printState() {
}
}
}
void ThreadQueue::killAllThreads ( ) {
for ( int32_t i = 0 ; i < m_maxEntries ; i++ ) {
ThreadEntry *e = &m_entries[i];
if ( ! e->m_isOccupied ) continue;
if ( ! e->m_isLaunched ) continue;
log("threads: killling thread id %i",(int)e->m_joinTid);
pthread_kill ( e->m_joinTid , SIGKILL );
log("threads: joining with thread id %i",(int)e->m_joinTid);
pthread_join ( e->m_joinTid , NULL );
}
}
void Threads::killAllThreads ( ) {
log("threads: killing all threads");
for ( int32_t j = 0 ; j < m_numQueues ; j++ ) {
ThreadQueue *tq = &m_threadQueues[j];
tq->killAllThreads();
}
}
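One caveat worth flagging on killAllThreads(): signal dispositions are process-wide and SIGKILL cannot be caught, so pthread_kill(tid, SIGKILL) will take down the entire process rather than just the one thread. That is presumably acceptable since this only runs on the way down, but the subsequent pthread_join() calls will likely never execute.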

@ -161,6 +161,7 @@ class ThreadQueue {
int32_t getNumThreadsOutOrQueued();
int32_t getNumWriteThreadsOut() ;
int32_t getNumActiveThreadsOut() ;
// . for adding an entry
@ -196,6 +197,8 @@ class ThreadQueue {
void suspendLowPriorityThreads();
void resumeLowPriorityThreads();
void killAllThreads();
// this is true if low priority threads are temporarily suspended
bool m_isLowPrioritySuspended ;
@ -246,6 +249,8 @@ class Threads {
bool areThreadsDisabled() { return m_disabled; };
bool areThreadsEnabled () { return ! m_disabled; };
void killAllThreads();
// . returns false and sets errno if thread launch failed
// . returns true on success
// . when thread is done a signal will be put on the g_loop's
@ -301,6 +306,8 @@ class Threads {
int32_t getNumThreadsOutOrQueued();
int32_t getNumWriteThreadsOut() ;
int32_t getNumActiveWriteUnlinkRenameThreadsOut() ;
// counts the high/low priority (niceness <= 0) threads
//int64_t m_hiLaunched;
//int64_t m_hiReturned;

@ -286,6 +286,7 @@ bool UdpServer::init ( uint16_t port, UdpProtocol *proto, int32_t niceness,
// no requests waiting yet
m_requestsInWaiting = 0;
// special count
m_msg07sInWaiting = 0;
m_msg10sInWaiting = 0;
m_msgc1sInWaiting = 0;
//m_msgDsInWaiting = 0;
@ -1005,7 +1006,7 @@ UdpSlot *UdpServer::getBestSlotToSend ( int64_t now ) {
UdpSlot *maxi = NULL;
int32_t score;
//UdpSlot *slot;
// . we send dgrams with the lowest "score" first
// . we send dgrams with the lowest "score" first
// . the "score" is just number of ACKs you're waiting for
// . that way transmissions that are the most caught up to their ACKs
// are considered faster so we send to them first
@ -1482,6 +1483,9 @@ int32_t UdpServer::readSock_ass ( UdpSlot **slotPtr , int64_t now ) {
// rate, these are pretty lightweight. msg 0x10 reply gen times
// are VERY low. MDW
bool getSlot = true;
if ( msgType == 0x07 && m_msg07sInWaiting >= 100 )
getSlot = false;
if ( msgType == 0x10 && m_msg10sInWaiting >= 50 )
getSlot = false;
// crawl update info from Spider.cpp
@ -1671,6 +1675,7 @@ int32_t UdpServer::readSock_ass ( UdpSlot **slotPtr , int64_t now ) {
// if we connected to a request slot, count it
m_requestsInWaiting++;
// special count
if ( msgType == 0x07 ) m_msg07sInWaiting++;
if ( msgType == 0x10 ) m_msg10sInWaiting++;
if ( msgType == 0xc1 ) m_msgc1sInWaiting++;
//if ( msgType == 0xd ) m_msgDsInWaiting++;
@ -3122,6 +3127,7 @@ void UdpServer::destroySlot ( UdpSlot *slot ) {
// one less request in waiting
m_requestsInWaiting--;
// special count
if ( slot->m_msgType == 0x07 ) m_msg07sInWaiting--;
if ( slot->m_msgType == 0x10 ) m_msg10sInWaiting--;
if ( slot->m_msgType == 0xc1 ) m_msgc1sInWaiting--;
//if ( slot->m_msgType == 0xd ) m_msgDsInWaiting--;

@ -390,6 +390,7 @@ class UdpServer {
int32_t m_requestsInWaiting;
// like m_requestsInWaiting but requests which spawn other requests
int32_t m_msg07sInWaiting;
int32_t m_msg10sInWaiting;
int32_t m_msgc1sInWaiting;
//int32_t m_msgDsInWaiting;

@ -1280,6 +1280,8 @@ bool UdpSlot::readDatagramOrAck ( int sock ,
}
// handle acks
if ( m_proto->isAck ( peek , peekSize ) ) {
// if ack for msg4 core to test its save stuff
//if ( m_msgType == 0x04 ) { char *xx=NULL;*xx=0; }
readAck ( sock, dgramNum , now );
// keep stats
if ( m_host ) m_host->m_dgramsFrom++;

@ -10,6 +10,12 @@
#include "UdpProtocol.h"
#include "Hostdb.h"
// i'm seeing some networks not liking big dgrams, so
// lets go super small. we won't be able to send back
// huge msgs unfortunately, so we'll have to fix that
// a different way later.
#define SMALLDGRAMS
// . we want to avoid the overhead of IP level fragmentation
// . so for an MTU of 1500 we got 28 bytes overhead (IP and UDP headers)
// . later we can try large DGRAM_SIZE values to see if faster
@ -19,9 +25,9 @@
//#define DGRAM_SIZE 7500
//#define DGRAM_SIZE ((1500-28)*5)
// this was the most stable size, but now, 4/8/04, i'm trying bigger...
#ifdef _SMALLDGRAMS_
#ifdef SMALLDGRAMS
// newspaperarchive machines need this smaller size
#define DGRAM_SIZE (1500-28)
#define DGRAM_SIZE (1500-28-10)
#else
// . here's the new size, 4/8/04, about 20x bigger
// . only use this for our machines
@ -30,10 +36,11 @@
// . let's see if smaller dgrams fix the ping spike problem on gk0c
// . this is in addition to lower the ack windows from 12 to 4
#define DGRAM_SIZE 16400
#endif
// . the 45k dgram doesn't travel well over the internet, and javier needs
// to do that for the "interface client" code
#define DGRAM_SIZE_INTERNET (1500-28)
#endif
#define DGRAM_SIZE_INTERNET (1500-28-10)
// i'd like to have less dgram to decrease interrupts and
// to decrease the MAX_DGRAMS define which decrease UdpSlot size
@ -76,10 +83,11 @@
// raised from 50MB to 80MB so Msg13 compression proxy can send back big replies > 5MB
// raised from 80MB to 180MB since we could be sending back a Msg95Reply
// which is a list of QueryChanges. 3/29/13.
#define MAX_DGRAMS (((180*1024*1024) / DGRAM_SIZE_LB) + 1)
//#define MAX_DGRAMS (((180*1024*1024) / DGRAM_SIZE_LB) + 1)
#define MAX_DGRAMS (((80*1024*1024) / DGRAM_SIZE) + 1)
//#endif
#define MAX_ABSDOCLEN ((MAX_DGRAMS * DGRAM_SIZE_LB)-50000)
#define MAX_ABSDOCLEN ((MAX_DGRAMS * DGRAM_SIZE)-50000)
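Rough arithmetic on the limits with SMALLDGRAMS defined:

// DGRAM_SIZE    = 1500 - 28 - 10              =  1,462 bytes
// MAX_DGRAMS    = (80*1024*1024) / 1462 + 1   = 57,378 dgrams
// MAX_ABSDOCLEN = 57,378 * 1462 - 50,000      ~= 83.8 MB max doc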
// . the max size of an incoming request for a hot udp server
// . we cannot call malloc so it must fit in here

@ -66,15 +66,26 @@ static int utf8_sane[] = {
// how many bytes is char pointed to by p?
inline char getUtf8CharSize ( uint8_t *p ) {
return bytes_in_utf8_code[*p];
uint8_t c = *p;
if(c<128)
return 1;
else
return bytes_in_utf8_code[c];
}
inline char getUtf8CharSize ( char *p ) {
return bytes_in_utf8_code[*(uint8_t *)p];
uint8_t c = (uint8_t)*p;
if(c<128)
return 1;
else
return bytes_in_utf8_code[c];
}
inline char getUtf8CharSize ( uint8_t c ) {
return bytes_in_utf8_code[c];
if(c<128)
return 1;
else
return bytes_in_utf8_code[c];
}
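The added branch short-circuits the common case: any byte under 128 is a complete one-byte character, so only genuine lead bytes consult the table. Illustrative lead-byte sizes:

// 'a'  (0x61)                  -> 1 byte (ASCII fast path)
// 0xC3 (e.g. 'é' = C3 A9)      -> 2 bytes
// 0xE4 (e.g. '中' = E4 B8 AD)  -> 3 bytes
// 0xF0 (plane-1 chars, emoji)  -> 4 bytes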
inline char getUtf8CharSize2 ( uint8_t *p ) {

Url.cpp (252 changed lines)

@ -5,6 +5,8 @@
#include "Errno.h"
#include "HashTable.h"
#include "Speller.h"
#include "Punycode.h"
#include "Unicode.h"
static void print_string ( char *s , int32_t len );
@ -137,7 +139,7 @@ void Url::set (Url *baseUrl,char *s,int32_t len,bool addWWW,bool stripSessionId,
// . i know sun.com has urls like "http://sun.com/;$sessionid=123ABC$"
// . url should be ENCODED PROPERLY for this to work properly
void Url::set ( char *t , int32_t tlen , bool addWWW , bool stripSessionId ,
bool stripPound , bool stripCommonFile ,
bool stripPound , bool stripCommonFile ,
int32_t titleRecVersion ) {
reset();
// debug
@ -157,11 +159,163 @@ void Url::set ( char *t , int32_t tlen , bool addWWW , bool stripSessionId ,
while ( tlen > 0 && !is_alnum_a(*t) && *t!='-' && *t!='/'){t++;tlen--;}
// . stop t at first space or binary char
// . url should be in encoded form!
int32_t i ;
int32_t i = 0;
int32_t nonAsciiPos = -1;
for ( i = 0 ; i < tlen ; i++ ) {
if ( ! is_ascii(t[i]) ) break; // no non-ascii chars allowed
if ( is_wspace_a(t[i]) ) break; // no spaces allowed
if ( ! is_ascii(t[i]) ) {
// Sometimes the length with the null is passed in,
// so ignore nulls FIXME?
if( t[i] ) nonAsciiPos = i;
break; // no non-ascii chars allowed
}
}
if(nonAsciiPos != -1) {
// Try turning utf8 and latin1 encodings into punycode.
// All labels (between dots) in the domain are encoded
// separately. We don't support encoded tlds, but they are
// not widespread yet.
// If it is a non ascii domain it needs to take the form
// xn--<punycoded label>.xn--<punycoded label>.../
char tmp = t[tlen];
if(t[tlen]) t[tlen] = 0;
log(LOG_DEBUG, "build: attempting to decode unicode url %s pos at %"INT32, t, nonAsciiPos);
if(tmp) t[tlen] = tmp;
char encoded [ MAX_URL_LEN ];
size_t encodedLen = MAX_URL_LEN;
char *encodedDomStart = encoded;
char *p = t;
char *pend = t+tlen;
// Find the start of the domain
if(tlen > 7 && strncmp(p, "http://", 7) == 0) p += 7;
else if(tlen > 8 && strncmp(p, "https://", 8) == 0) p += 8;
gbmemcpy(encodedDomStart, t, p-t);
encodedDomStart += p-t;
while(p < pend && *p != '/') {
char *labelStart = p;
uint32_t tmpBuf[MAX_URL_LEN];
int32_t tmpLen = 0;
while(p < pend && *p != '.' && *p != '/') p++;
int32_t labelLen = p - labelStart;
bool tryLatin1 = false;
// For utf8 urls
p = labelStart;
bool labelIsAscii = true;
// Convert the domain to code points and copy it to
// tmpbuf to be punycoded
for(;p-labelStart<labelLen;
p += utf8Size(tmpBuf[tmpLen]), tmpLen++) {
labelIsAscii &= is_ascii(*p);
tmpBuf[tmpLen] = utf8Decode(p);
if(!tmpBuf[tmpLen]) { // invalid char?
tryLatin1 = true;
break;
}
}
if(labelIsAscii) {
if(labelStart[labelLen] == '.') {
labelLen++;
p++;
}
gbmemcpy(encodedDomStart, labelStart, labelLen);
encodedDomStart += labelLen;
continue;
}
if( tryLatin1 ) {
// For latin1 urls
tmpLen = 0;
for(;tmpLen<labelLen;tmpLen++) {
tmpBuf[tmpLen] = labelStart[tmpLen];
}
}
gbmemcpy(encodedDomStart, "xn--", 4);
encodedDomStart += 4;
punycode_status status ;
status = punycode_encode(tmpLen,
tmpBuf,
NULL,
&encodedLen,
encodedDomStart);
if ( status != 0 ) {
// Give up? try again?
log("build: Bad Engineer, failed to "
"punycode international url %s", t);
return;
}
// We should check that what we encoded are valid url
// characters: no spaces, etc.
// FIXME: should we exclude just the bad chars? I've
// seen plenty of urls with a newline in the middle.
// Just discard the whole chunk for now.
bool badUrlChars = false;
for(uint32_t i=0;i<encodedLen;i++) {
if(is_wspace_a(encodedDomStart[i])){
badUrlChars = true;
break;
}
}
if(encodedLen == 0 || badUrlChars) {
encodedDomStart -= 4; //don't need the xn--
p++;
} else {
encodedDomStart += encodedLen;
*encodedDomStart++ = *p++; // Copy in the . or the /
}
}
// p now points to the end of the domain
// encodedDomStart now points to the first free space in encoded string
// Now copy the rest of the url in. Watch out for non-ascii chars
// truncate the url, and keep it under max url length
uint32_t newUrlLen = encodedDomStart - encoded;
while(p < pend) {
if ( ! *p ) break; // null?
if(!is_ascii(*p)) {
//break;
// url encode utf8 characters now
char cs = getUtf8CharSize(p);
// bad utf8 char?
if ( cs <= 1 ) break;
// too long?
if ( newUrlLen + 12 >= MAX_URL_LEN )
break;
char stored = urlEncode ( &encoded[newUrlLen],
12 ,
p ,
cs );
p += cs;
newUrlLen += stored;
continue;
}
if(is_wspace_a(*p)) break;
if(newUrlLen >= MAX_URL_LEN) break;
encoded[newUrlLen++] = *p++;
}
//gbmemcpy(encodedDomStart, p, restOfUrlLen);
encoded[newUrlLen] = '\0';
return this->set(encoded, newUrlLen, addWWW, stripSessionId,
stripPound, stripCommonFile, titleRecVersion);
}
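// To make the mapping concrete: each non-ASCII label is punycoded on
// its own and prefixed with "xn--", while anything past the domain is
// percent-encoded as UTF-8 instead. Standard IDN example (hostname
// hypothetical, not from the test set below):
//   bücher.example/päge  ->  xn--bcher-kva.example/p%C3%A4ge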
// truncate length to the first occurrence of an unacceptable char
tlen = i;
// . decode characters that should not have been encoded
@ -955,6 +1109,10 @@ char *Url::getPathComponent ( int32_t num , int32_t *clen ) {
// return pc + pclen;
//}
bool Url::isHostWWW ( ) {
if ( m_hlen < 4 ) return false;
if ( m_host[0] != 'w' ) return false;
@ -2380,3 +2538,91 @@ bool Url::hasMediaExtension ( ) {
return false;
}
uint32_t Url::unitTests() {
char* urls[] = {
"http://www.fas.org/blog/ssp/2009/08/securing-venezuela\032s-arsenals.php",
"http://topbeskæring.dk/velkommen",
"www.Alliancefrançaise.nu",
"française.Alliance.nu",
"française.Alliance.nu/asdf",
"http://française.Alliance.nu/asdf",
"http://française.Alliance.nu/",
"幸运.龍.com",
"幸运.龍.com/asdf/运/abc",
"幸运.龍.com/asdf",
"http://幸运.龍.com/asdf",
"http://Беларуская.org/Акадэмічная",
"https://hi.Български.com",
"https://fakedomain.中文.org/asdf",
"https://gigablast.com/abc/文/efg",
"https://gigablast.com/?q=文",
"http://www.example.сайт",
"http://genocidearchiverwanda.org.rw/index.php/Category:Official_Communiqués",
"http://www.example.com/xn--fooled-you-into-trying-to-decode-this",
"http://www.example.сайт/xn--fooled-you-into-trying-to-decode-this",
"http://腕時計通販.jp/",
// Lets check some bad urls too:
"https://pypi.python\n\n\t\t\t\t.org/packages/source/p/pyramid/pyramid-1.5.tar.gz#md5=8747658dcbab709a9c491e43d3b0d58b"
};
StackBuf(sb);
uint32_t len = sizeof(urls) / sizeof(char*);
for(uint32_t i = 0; i < len; i++) {
Url u;
u.set(urls[i], strlen(urls[i]));
log("build:%s normalized to %s, printed to %s ",
urls[i], u.getUrl(), Url::getDisplayUrl(u.getUrl(), &sb));
sb.reset();
}
//FIXME: need to return an error if there is a problem
return 0;
}
char* Url::getDisplayUrl(char* url, SafeBuf* sb) {
char* found;
char* labelCursor = url;
if((found = strstr(labelCursor, "xn--"))) {
sb->safeMemcpy(url, found - url);
char* p = url;
char* pend = url + gbstrlen(url);
if(strncmp(p, "http://", 7) == 0) p += 7;
else if(strncmp(p, "https://", 8) == 0) p += 8;
while(p < pend && *p != '/') p++;
char* domEnd = p;
do {
if(found > domEnd) {
// Don't even look if it is past the domain
break;
}
char* encodedStart = found + 4;
uint32_t decoded [ MAX_URL_LEN];
size_t decodedLen = MAX_URL_LEN - 1 ;
char* labelEnd = encodedStart;
while( labelEnd < domEnd && *labelEnd != '/' && *labelEnd != '.' )
labelEnd++;
punycode_status status = punycode_decode(labelEnd - encodedStart,
encodedStart,
&decodedLen,
decoded, NULL);
if(status != 0) {
log("build: Bad Engineer, failed to depunycode international url %s", url);
sb->safePrintf("%s", url);
return url;
}
sb->utf32Encode(decoded, decodedLen);
//sb->pushChar(*labelEnd);
labelCursor = labelEnd;
} while((found = strstr(labelCursor, "xn--")));
}
// Copy in the rest
sb->safePrintf("%s", labelCursor);
sb->nullTerm();
return sb->getBufStart();
}
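A hypothetical round-trip through the display decoder (hostname from the encoding example above, not from the test set):

SafeBuf sb;
char u[] = "http://xn--bcher-kva.example/";
char *shown = Url::getDisplayUrl ( u , &sb );
// shown now reads "http://bücher.example/"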

Url.h (4 changed lines)

@ -232,6 +232,7 @@ public:
// this is private
bool isSpam ( char *s , int32_t slen ) ;
// . detects crazy repetitive urls like this:
// http://www.pittsburghlive.com:8000/x/tribune-review/opinion/
// steigerwald/letters/send/archive/letters/send/archive/bish/
@ -244,6 +245,9 @@ public:
// is probably more accurate than this function.
bool isLinkLoop();
static uint32_t unitTests();
static char* getDisplayUrl(char* url, SafeBuf* sb);
// private:
char m_url[MAX_URL_LEN]; // the normalized url

File diff suppressed because it is too large.

@ -475,7 +475,7 @@ class XmlDoc {
key_t *doledbKey ,
char *coll ,
class SafeBuf *pbuf ,
int32_t niceness ,
int32_t niceness ,
char *utf8Content = NULL ,
bool deleteFromIndex = false ,
int32_t forcedIp = 0 ,
@ -483,9 +483,11 @@ class XmlDoc {
uint32_t spideredTime = 0 , // time_t
bool contentHasMime = false ,
// for container docs, what is the separator of subdocs?
char *contentDelim = NULL,
char *metadata = NULL,
uint32_t metadataLen = 0) ;
char *contentDelim = NULL,
char *metadata = NULL,
uint32_t metadataLen = 0,
// for injected docs we have the recv buffer size; don't exceed that
int32_t payloadLen = -1) ;
// we now call this right away rather than at download time!
int32_t getSpideredTime();
@ -513,7 +515,9 @@ class XmlDoc {
bool indexDoc2 ( );
bool isContainerDoc ( );
bool indexContainerDoc ( );
bool indexWarcOrArc ( char ct ) ;
bool readMoreWarc();
bool indexWarcOrArc ( ) ;
key_t *getTitleRecKey() ;
//char *getSkipIndexing ( );
char *prepareToMakeTitleRec ( ) ;
@ -521,6 +525,7 @@ class XmlDoc {
bool setTitleRecBuf ( SafeBuf *buf , int64_t docId, int64_t uh48 );
// sets m_titleRecBuf/m_titleRecBufValid/m_titleRecKey[Valid]
SafeBuf *getTitleRecBuf ( );
bool appendNewMetaInfo ( SafeBuf *metaList , bool forDelete ) ;
SafeBuf *getSpiderStatusDocMetaList ( class SpiderReply *reply ,
bool forDelete ) ;
SafeBuf *getSpiderStatusDocMetaList2 ( class SpiderReply *reply ) ;
@ -705,7 +710,7 @@ class XmlDoc {
char **getExpandedUtf8Content ( ) ;
char **getUtf8Content ( ) ;
// we download large files to a file on disk, like warcs and arcs
BigFile *getUtf8ContentInFile ( int64_t *fileSizeArg );
FILE *getUtf8ContentInFile ( );
int32_t *getContentHash32 ( ) ;
int32_t *getContentHashJson32 ( ) ;
//int32_t *getTagHash32 ( ) ;
@ -768,6 +773,8 @@ class XmlDoc {
uint64_t m_ipStartTime;
uint64_t m_ipEndTime;
bool m_updatedMetaData;
void copyFromOldDoc ( class XmlDoc *od ) ;
class SpiderReply *getFakeSpiderReply ( );
@ -813,6 +820,7 @@ class XmlDoc {
int32_t getBoostFromSiteNumInlinks ( int32_t inlinks ) ;
bool hashSpiderReply (class SpiderReply *reply ,class HashTableX *tt) ;
bool hashMetaTags ( class HashTableX *table ) ;
bool hashMetaData ( class HashTableX *table ) ;
bool hashIsClean ( class HashTableX *table ) ;
bool hashZipCodes ( class HashTableX *table ) ;
bool hashMetaZip ( class HashTableX *table ) ;
@ -1067,6 +1075,7 @@ class XmlDoc {
int32_t m_addedSpiderRequestSize;
int32_t m_addedSpiderReplySize;
int32_t m_addedStatusDocSize;
int64_t m_addedStatusDocId;
SafeBuf m_metaList2;
SafeBuf m_zbuf;
@ -1084,12 +1093,16 @@ class XmlDoc {
int32_t m_warcError ;
int32_t m_arcError ;
bool m_doneInjectingWarc ;
bool m_doneInjectingArc ;
int64_t m_fileOff ;
int64_t m_bytesStreamed;
char *m_fileBuf ;
int32_t m_fileBufAllocSize;
bool m_registeredWgetReadCallback;
char *m_fptr ;
char *m_fptrEnd ;
FILE* m_pipe;
BigFile m_file;
int64_t m_fileSize;
FileState m_fileState;
@ -2401,7 +2414,6 @@ class XmlDoc {
bool m_setFromDocId;
bool m_freeLinkInfo1;
bool m_freeLinkInfo2;
bool m_contentInjected;
bool m_recycleContent;
@ -2470,7 +2482,8 @@ class XmlDoc {
// for container docs consisting of subdocs to inject
char *contentDelim = NULL,
char* metadata = NULL,
uint32_t metadataLen = 0);
uint32_t metadataLen = 0,
int32_t payloadLen = -1);
bool injectLinks ( HashTableX *linkDedupTable ,

@ -2515,7 +2515,7 @@ int32_t deserializeMsg ( int32_t baseSize ,
return baseSize + (p - stringBuf);//getStringBuf());
}
void deserializeMsg2 ( char **firstStrPtr , // ptr_url
bool deserializeMsg2 ( char **firstStrPtr , // ptr_url
int32_t *firstSizeParm ) { // size_url
int nptrs=((char *)firstSizeParm-(char *)firstStrPtr)/sizeof(char *);
// point to our string buffer
@ -2531,7 +2531,7 @@ void deserializeMsg2 ( char **firstStrPtr , // ptr_url
// make it NULL if size is 0 though
if ( *sizePtr == 0 ) *strPtr = NULL;
// sanity check
if ( *sizePtr < 0 ) { char *xx = NULL; *xx =0; }
if ( *sizePtr < 0 ) return false;//{ char *xx = NULL; *xx =0; }
// advance our destination ptr
p += *sizePtr;
// advance both ptrs to next string
@ -2540,6 +2540,7 @@ void deserializeMsg2 ( char **firstStrPtr , // ptr_url
}
// return how many bytes we processed
//return baseSize + (p - stringBuf);//getStringBuf());
return true;
}
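Since deserializeMsg2() now reports failure instead of crashing on a negative size, its callers presumably need to start checking the boolean return and treat false as a corrupt message.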
// print it to stdout for debugging Dates.cpp
@ -2618,4 +2619,3 @@ bool verifyUtf8 ( char *txt ) {
int32_t tlen = gbstrlen(txt);
return verifyUtf8(txt,tlen);
}

@ -237,7 +237,7 @@ bool saveTimeAdjustment ( ) ;
#define is_hspace_a(c) g_map_is_hspace[(unsigned char)c]
#define is_ascii(c) g_map_is_ascii[(unsigned char)c]
#define is_ascii9(c) g_map_is_ascii[(unsigned char)c]
#define is_ascii3(c) g_map_is_ascii3[(unsigned char)c]
#define is_ascii3(c) ((unsigned char)c<128 || g_map_is_ascii3[(unsigned char)c])
#define is_punct_a(c) g_map_is_punct[(unsigned char)c]
#define is_alnum_a(c) g_map_is_alnum[(unsigned char)c]
#define is_alpha_a(c) g_map_is_alpha[(unsigned char)c]
@ -627,6 +627,6 @@ int32_t deserializeMsg ( int32_t baseSize ,
char **firstStrPtr ,
char *stringBuf ) ;
void deserializeMsg2 ( char **firstStrPtr , int32_t *firstSizeParm );
bool deserializeMsg2 ( char **firstStrPtr , int32_t *firstSizeParm );
#endif

@ -236,7 +236,7 @@ int filterContent ( char *buf , int32_t n , int32_t mimeLen , char ctype , int32
//fprintf(stderr,"in=%s\n",in);
int fd = open ( in , O_CREAT | O_RDWR , S_IRWXU );
int fd = open ( in , O_CREAT | O_RDWR , S_IRWXU | S_IRWXG );
if ( fd < 0 ) {
fprintf(stderr,"gbfilter: open: %s\n",strerror(errno));
return -1;

main.cpp (144 changed lines)

@ -289,7 +289,7 @@ bool summaryTest1 ( char *rec, int32_t listSize, char *coll , int64_t docId ,
// time a big write, read and then seeks
bool thrutest ( char *testdir , int64_t fileSize ) ;
void seektest ( char *testdir , int32_t numThreads , int32_t maxReadSize ,
char *filename );
char *filename , bool doSeqWriteThread );
bool pingTest ( int32_t hid , uint16_t clientPort );
bool memTest();
@ -810,17 +810,21 @@ int main2 ( int argc , char *argv[] ) {
"parser speed tests\n\n"
*/
/*
"thrutest [dir] [fileSize]\n\tdisk write/read speed "
"test\n\n"
"thrutest [dir] [fileSize]\n\tdisk sequential "
"write then read speed tests.\n\n"
"seektest [dir] [numThreads] [maxReadSize] "
"[filename]\n"
"\tdisk seek speed test\n\n"
"\tdisk access speed test. (IOps)\n\n"
"rwtest [dir] [numThreads] [maxReadSize] "
"[filename]\n"
"\tdisk read access speed test while sequentially "
"writing. Simulates Gigablast while spidering and "
"querying nicely.\n\n"
"memtest\n"
"\t Test how much memory we can use\n\n"
*/
/*
// Quality Tests
@ -1390,7 +1394,20 @@ int main2 ( int argc , char *argv[] ) {
if ( cmdarg+2 < argc ) numThreads = atol(argv[cmdarg+2]);
if ( cmdarg+3 < argc ) maxReadSize = atoll1(argv[cmdarg+3]);
if ( cmdarg+4 < argc ) filename = argv[cmdarg+4];
seektest ( testdir , numThreads , maxReadSize , filename );
seektest ( testdir , numThreads , maxReadSize ,filename,false);
return 0;
}
// gb rwtest <testdir> <numThreads> <maxReadSize>
if ( strcmp ( cmd , "rwtest" ) == 0 ) {
char *testdir = "/tmp/";
int32_t numThreads = 20; //30;
int64_t maxReadSize = 20000;
char *filename = NULL;
if ( cmdarg+1 < argc ) testdir = argv[cmdarg+1];
if ( cmdarg+2 < argc ) numThreads = atol(argv[cmdarg+2]);
if ( cmdarg+3 < argc ) maxReadSize = atoll1(argv[cmdarg+3]);
if ( cmdarg+4 < argc ) filename = argv[cmdarg+4];
seektest ( testdir , numThreads , maxReadSize,filename,true);
return 0;
}
@ -2572,6 +2589,13 @@ int main2 ( int argc , char *argv[] ) {
false );// sendtoproxies
}
if ( strcmp ( cmd , "unittest" ) == 0 ) {
if ( cmdarg + 1 >= argc ) exit(1);
if(strcmp("url", argv[cmdarg+1]) == 0) {
exit(Url::unitTests());
}
}
// gb startclassifier coll ruleset [hostId]
/*
if ( strcmp ( cmd , "startclassifier" ) == 0 ) {
@ -4936,7 +4960,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
// ensure directory is there, if
// not then make it
"ssh %s 'mkdir %s' ; "
"scp -r %s %s:%s"
"scp -p -r %s %s:%s"
, ipStr
, h2->m_dir
@ -5022,7 +5046,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
if ( ! f.doesExist() ) target = "gb";
sprintf(tmp,
"scp -c arcfour " // blowfish is faster
"scp -p " // blowfish is faster
"%s%s "
"%s:%s/gb.installed%s",
dir,
@ -5058,7 +5082,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
// don't copy to ourselves
//if ( h2->m_hostId == h->m_hostId ) continue;
sprintf(tmp,
"scp "
"scp -p "
"%sgb.new "
"%s:%s/tmpgb.installed &",
dir,
@ -5071,7 +5095,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
// don't copy to ourselves
//if ( h2->m_hostId == h->m_hostId ) continue;
sprintf(tmp,
"scp %sgb.conf %shosts.conf %s:%s %s",
"scp -p %sgb.conf %shosts.conf %s:%s %s",
dir ,
dir ,
//h->m_hostId ,
@ -5453,7 +5477,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
}
*/
sprintf(tmp,
"scp "
"scp -p "
"%scatdb/content.rdf.u8 "
"%s:%scatdb/content.rdf.u8",
dir,
@ -5462,7 +5486,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"scp "
"scp -p "
"%scatdb/structure.rdf.u8 "
"%s:%scatdb/structure.rdf.u8",
dir,
@ -5471,7 +5495,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"scp "
"scp -p "
"%scatdb/gbdmoz.structure.dat "
"%s:%scatdb/gbdmoz.structure.dat",
dir,
@ -5480,7 +5504,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"scp "
"scp -p "
"%scatdb/gbdmoz.content.dat "
"%s:%scatdb/gbdmoz.content.dat",
dir,
@ -5503,7 +5527,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
// don't copy to ourselves
if ( h2->m_hostId == 0 ) continue;
sprintf(tmp,
"scp "
"scp -p "
"%scatdb/content.rdf.u8.new "
"%s:%scatdb/content.rdf.u8.new",
dir,
@ -5512,7 +5536,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"scp "
"scp -p "
"%scatdb/structure.rdf.u8.new "
"%s:%scatdb/structure.rdf.u8.new",
dir,
@ -5521,7 +5545,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"scp "
"scp -p "
"%scatdb/gbdmoz.structure.dat.new "
"%s:%scatdb/gbdmoz.structure.dat.new",
dir,
@ -5530,7 +5554,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"scp "
"scp -p "
"%scatdb/gbdmoz.content.dat.new "
"%s:%scatdb/gbdmoz.content.dat.new",
dir,
@ -5539,7 +5563,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"scp "
"scp -p "
"%scatdb/gbdmoz.content.dat.new.diff "
"%s:%scatdb/gbdmoz.content.dat.new.diff",
dir,
@ -6384,6 +6408,7 @@ void dumpTitledb (char *coll,int32_t startFileNum,int32_t numFiles,bool includeT
bool justPrintSentences,
bool justPrintWords ) {
g_isDumpingRdbFromMain = 1;
if (!ucInit(g_hostdb.m_dir, true)) {
log("Unicode initialization failed!");
return;
@ -6903,6 +6928,8 @@ void dumpDoledb (char *coll,int32_t startFileNum,int32_t numFiles,bool includeTr
printf("\n");
// must be a request -- for now, for stats
if ( ! g_spiderdb.isSpiderRequest((key128_t *)srec) ) {
// error!
continue;
char *xx=NULL;*xx=0; }
// cast it
SpiderRequest *sreq = (SpiderRequest *)srec;
@ -11642,17 +11669,19 @@ static BigFile s_f;
static int32_t s_numThreads = 0;
static int64_t s_maxReadSize = 1;
static int64_t s_startTime = 0;
static bool s_doSeqWriteThread;
//#define MAX_READ_SIZE (2000000)
#include <sys/types.h>
#include <sys/wait.h>
void seektest ( char *testdir, int32_t numThreads, int32_t maxReadSize ,
char *filename ) {
char *filename , bool doSeqWriteThread ) {
g_loop.init();
g_threads.init();
s_numThreads = numThreads;
s_maxReadSize = maxReadSize;
s_doSeqWriteThread = doSeqWriteThread;
if ( s_maxReadSize <= 0 ) s_maxReadSize = 1;
//if ( s_maxReadSize > MAX_READ_SIZE ) s_maxReadSize = MAX_READ_SIZE;
@ -11689,7 +11718,7 @@ void seektest ( char *testdir, int32_t numThreads, int32_t maxReadSize ,
"exist. Use ./gb thrutest ... to create speedtest* files.");
return;
skip:
s_f.open ( O_RDONLY );
s_f.open ( O_RDWR );
s_filesize = s_f.getFileSize();
log ( LOG_INIT, "admin: file size = %"INT64".",s_filesize);
// always block
@ -11719,6 +11748,30 @@ skip:
//s_lock = 1;
//pthread_t tid1 ; //, tid2;
//g_conf.m_logDebugThread = 1;
// garbage collection on SSDs seems to be triggered by writes, so
// to keep writes from hurting read times, do this:
g_conf.m_flushWrites = 1;
// disable linux file cache
// system("echo 1 > /proc/sys/vm/drop_caches");
// -o sync TOTALLY WORKS!!!
// mount with -o sync to disable write page caching on linux
// disable on-disk write cache
// system("sudo hdparm -W 0 /dev/sda2");
// system("sudo hdparm -W 0 /dev/sdb1");
// system("sudo hdparm -W 0 /dev/sdc1");
// system("sudo hdparm -W 0 /dev/sdd1");
// disable read-ahead
// system("sudo hdparm -A 0 /dev/sda2");
// system("sudo hdparm -A 0 /dev/sdb1");
// system("sudo hdparm -A 0 /dev/sdc1");
// system("sudo hdparm -A 0 /dev/sdd1");
// set time
s_startTime = gettimeofdayInMilliseconds_force();
@ -11771,6 +11824,7 @@ void *startUp ( void *state , ThreadEntry *t ) {
// fprintf(stderr,"Threads::startUp: setpriority: failed\n");
// exit(-1);
//}
// read buf
//char buf [ MAX_READ_SIZE ];
#undef malloc
@ -11782,13 +11836,25 @@ void *startUp ( void *state , ThreadEntry *t ) {
}
// we got ourselves
s_launched++;
char *s = "reads";
if ( id == 0 && s_doSeqWriteThread )
s = "writes";
// msg
fprintf(stderr,"id=%"INT32" launched. Performing 100000 reads.\n",id);
fprintf(stderr,"threadid=%"INT32" launched. "
"Performing 100000 %s.\n",id,s);
// #undef sleep
// if ( id == 0 ) sleep(1000);
// #define sleep(a) { char *xx=NULL;*xx=0; }
// wait for lock to be unleashed
//while ( s_launched != s_numThreads ) usleep(10);
// now do a stupid loop
//int32_t j, off , size;
int64_t off , size;
int64_t seqOff = 0;
for ( int32_t i = 0 ; i < 100000 ; i++ ) {
uint64_t r = rand();
r <<= 32 ;
@ -11802,7 +11868,13 @@ void *startUp ( void *state , ThreadEntry *t ) {
int64_t start = gettimeofdayInMilliseconds_force();
//fprintf(stderr,"%"INT32") i=%"INT32" start\n",id,i );
//pread ( s_fd1 , buf , size , off );
s_f.read ( buf , size , off );
if ( id == 0 && s_doSeqWriteThread )
s_f.write ( buf , size , seqOff );
else
s_f.read ( buf , size , off );
seqOff += size;
if ( seqOff + size > s_filesize )
seqOff = 0;
//fprintf(stderr,"%"INT32") i=%"INT32" done\n",id,i );
int64_t now = gettimeofdayInMilliseconds_force();
#undef usleep
@ -11811,13 +11883,25 @@ void *startUp ( void *state , ThreadEntry *t ) {
s_count++;
float sps = (float)((float)s_count * 1000.0) /
(float)(now - s_startTime);
fprintf(stderr,"count=%"INT32" off=%012"INT64" size=%"INT32" time=%"INT32"ms "
"(%.2f seeks/sec)\n",
int64_t poff = off;
char *str = "seeks";
if ( id == 0 && s_doSeqWriteThread ) {
poff = seqOff;
str = "writes";
}
fprintf(stderr,"threadid=%i "
"count=%"INT32" "
"off=%012"INT64" "
"size=%"INT32" "
"time=%"INT32"ms "
"(%.2f %s/sec)\n",
(int)id,
(int32_t)s_count,
(int64_t)off,
(int64_t)poff,
(int32_t)size,
(int32_t)(now - start) ,
sps );
sps ,
str );
}
@ -16849,7 +16933,7 @@ void dumpCachedRecs (char *coll,int32_t startFileNum,int32_t numFiles,bool inclu
int32_t filenum = 0;
char filename[64];
sprintf(filename, "%s-%"INT32".ddmp", coll, filenum);
int FD = open(filename, O_CREAT|O_WRONLY, S_IROTH);
//int FD = open(filename, O_CREAT|O_WRONLY, S_IROTH);
int32_t numDumped = 0;
uint32_t bytesDumped = 0;
loop:
@ -17016,7 +17100,7 @@ void dumpCachedRecs (char *coll,int32_t startFileNum,int32_t numFiles,bool inclu
filenum++;
sprintf(filename, "%s-%"INT32".ddmp", coll, filenum);
//close(FD);
FD = open(filename, O_CREAT|O_WRONLY, S_IROTH);
//FD = open(filename, O_CREAT|O_WRONLY, S_IROTH);
bytesDumped = 0;
fprintf(stderr, "Started new file: %s. starts at docId: %"INT64".\n",filename, lastDocId);
}

qa.cpp (43 changed lines)

@ -248,10 +248,10 @@ void makeQADir ( ) {
char dir[1024];
snprintf(dir,1000,"%sqa",g_hostdb.m_dir);
log("mkdir mkdir %s",dir);
int32_t status = ::mkdir ( dir ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH );
int32_t status = ::mkdir ( dir ,getDirCreationFlags() );
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH );
if ( status == -1 && errno != EEXIST && errno )
log("qa: Failed to make directory %s: %s.",
dir,mstrerror(errno));
@ -1459,6 +1459,13 @@ bool qaTimeAxis ( ) {
"format=xml&u=");
sb.urlEncode ( s_urlPtrs[s_flags[URL_COUNTER]]);
sb.safePrintf("&hasmime=1");
// add some meta data now, the current time stamp so we can
// make sure the meta data is updated even if it's EDOCUNCHANGED
sb.safePrintf("&metadata=");
static int32_t s_count9 = 0;
SafeBuf tmp;
tmp.safePrintf("{\"qatesttime\":%"INT32"}\n",s_count9++);
sb.urlEncode ( tmp.getBufStart(), tmp.getLength() );
sb.safePrintf("&content=");
sb.urlEncode(s_contentPtrs[contentIndex]);
@ -1494,13 +1501,17 @@ bool qaTimeAxis ( ) {
return false;
}
// if ( ! s_flags[EXAMINE_RESULTS] ) {
// s_flags[16] = true;
// if ( ! getUrl ( "/search?c=qatest123&qa=1&q=%2Bthe"
// "&dsrt=500",
// 702467314 ) )
// return false;
// }
// this doc should have qatesttime:197 and qatesttime:198
// since it had an EDOCUNCHANGED error the 2nd time around but
// different metadata.
if ( ! s_flags[EXAMINE_RESULTS1] ) {
s_flags[EXAMINE_RESULTS1] = true;
if ( ! getUrl ( "/search?c=qatest123&qa=1&"
"format=json&"
"q=qatesttime:197",
702467314 ) )
return false;
}
return true;
}
@ -1534,6 +1545,8 @@ bool qaWarcFiles ( ) {
"&obeyRobots=0"
// This is what we are testing
"&usetimeaxis=1"
// we are indexing warc files
"&indexwarcs=1"
,
// checksum of reply expected
0 ) )
@ -1638,7 +1651,7 @@ bool qaInjectMetadata ( ) {
char* metadata = "{\"testtest\":42,\"a-hyphenated-name\":5, "
"\"a-string-value\":\"can we search for this\", "
"an array:['a','b', 'c', 1,2,3], "
"\"an array\":[\"a\",\"b\", \"c\", 1,2,3], "
"\"a field with spaces\":6, \"compound\":{\"field\":7}}";
s_flags[ADD_INITIAL_URLS]++;
@ -3401,9 +3414,9 @@ static QATest s_qatests[] = {
"when content has changed, even if the url is the same. "},
{qaWarcFiles,
"indexWarcFiles",
"Ensure the spider handles arc.gz and warc.gz file formats."},
// {qaWarcFiles,
// "indexWarcFiles",
// "Ensure the spider handles arc.gz and warc.gz file formats."},
{qaInjectMetadata,
"injectMetadata",

@ -11,13 +11,14 @@ import sqlite3
import datetime
import sys
import time
import flask
# import flask
import signal, os
import random
from itertools import repeat
staleTime = datetime.timedelta(90,0,0) # three months for now
app = flask.Flask(__name__)
app.secret_key = 'oaisj84alwsdkjhf9238u'
staleTime = datetime.timedelta(7,0,0) # one week for now
# app = flask.Flask(__name__)
# app.secret_key = 'oaisj84alwsdkjhf9238u'
def getDb(makeDates=True):
if makeDates:
@ -33,6 +34,9 @@ def handler(signum, frame):
#Generate environment with:
#pex -r requests -r multiprocessing -e inject:main -o warc-inject -s '.' --no-wheel
#pex -r requests -r multiprocessing -o warc-inject
# see the Makefile
# TODO: add argument parser
# import argparse
# parser = argparse.ArgumentParser()
# parser.add_argument('--foo', help='foo help')
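# one possible shape for it, mirroring the sys.argv checks in main()
# (hypothetical, not wired up):
# sub = parser.add_subparsers(dest='command')
# run = sub.add_parser('run')
# run.add_argument('threads', type=int)
# run.add_argument('query', nargs='?', default='')
# test = sub.add_parser('test')
# test.add_argument('query', nargs='?', default='')
# args = parser.parse_args()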
@ -63,13 +67,16 @@ def reallyExecuteMany(c, query, qargs):
def injectItem(item, db, mode):
itemStart = time.time()
c = db.cursor()
res = reallyExecute(c, 'select * from items where item = ?', (item,)).fetchone()
db.commit()
itemId = None
if res:
if res[1] > (datetime.datetime.now() - staleTime):
print 'skipping %s because we checked recently' % item
return 0 # We checked recently
return time.time() - itemStart # We checked recently
itemId = res[0]
@ -83,7 +90,7 @@ def injectItem(item, db, mode):
except Exception, e:
print 'error: metadata feed went down (%s) for: %s' % (e, item)
time.sleep(10)
if itemId is None:
reallyExecute(c, "insert INTO items VALUES (?,?)", (item, datetime.datetime.now()))
@ -91,11 +98,12 @@ def injectItem(item, db, mode):
db.commit()
if 'files' not in md:
return
return time.time() - itemStart
res = None
res = reallyExecute(c, "select fileName, updated, status, took from files where itemId = ?",
(itemId,)).fetchall()
db.commit()
lastUpdate = {}
for fileName, updated, status, took in res:
@ -105,22 +113,31 @@ def injectItem(item, db, mode):
dbUpdates = []
skipped = 0
for ff in md['files']:
if not ff['name'].endswith('arc.gz'): continue
warcs = filter(lambda x: 'name' in x and x['name'].endswith and x['name'].endswith('arc.gz'), md['files'])
collectionName = md['metadata'].get('archiveit-collection-name', '')
for ii, ff in enumerate(warcs):
#if not ff['name'].endswith('arc.gz'): continue
itemMetadata = {'mtime':ff['mtime']}
updateTime = datetime.datetime.fromtimestamp(float(ff['mtime']))
if ff['name'] in lastUpdate and updateTime <= lastUpdate[ff['name']]:
if mode != 'force' and ff['name'] in lastUpdate and updateTime <= lastUpdate[ff['name']]:
print "skip {0} because it is up to date".format(ff['name'])
skipped += 1
requests.post('http://localhost:10008/progress',
json={'item':item, 'total':len(warcs), 'done':ii+1,
'collection-name':collectionName})
continue
itemMetadata.update(md['metadata'])
postVars = {'url':'http://archive.org/download/%s/%s' %
(item,ff['name']),
'metadata':json.dumps(itemMetadata),
'c':'ait'}
'c':'ait',
'spiderlinks':0}
start = time.time()
if mode == 'production':
if mode == 'testing':
time.sleep(random.randint(1,4))
statusCode = 999
else:
try:
rp = requests.post("http://localhost:8000/admin/inject", postVars)
statusCode = rp.status_code
@ -129,49 +146,60 @@ def injectItem(item, db, mode):
print 'error: gb inject', postVars['url'], e
statusCode = -1
#print postVars['url'], rp.status_code
else:
time.sleep(random.randint(1,4))
statusCode = 999
took = time.time() - start
print "sent", ff['name'],'to gb, took', took
sys.stdout.flush()
dbUpdates.append((itemId, ff['name'], updateTime, statusCode, took))
requests.post('http://localhost:10008/progress',
json={'item':item, 'total':len(warcs), 'done':ii+1,
'collection-name':collectionName})
reallyExecuteMany(c, "DELETE FROM files where fileName = ? ", zip(lastUpdate.iterkeys()))
reallyExecuteMany(c, "INSERT INTO files VALUES (?,?,?,?,?)",
dbUpdates)
db.commit()
if len(dbUpdates):
reallyExecuteMany(c, "DELETE FROM files where fileName = ? ", zip(lastUpdate.iterkeys()))
reallyExecuteMany(c, "INSERT INTO files VALUES (?,?,?,?,?)",
dbUpdates)
db.commit()
print 'completed %s with %s items injected and %s skipped' % (item, len(dbUpdates), skipped)
return time.time() - itemStart
def getPage(zippedArgs):
page, mode = zippedArgs
page, mode, resultsPerPage, extraQuery = zippedArgs
query = 'collection%3Aarchiveitdigitalcollection+' + extraQuery
#r = requests.get('https://archive.org/advancedsearch.php?q=collection%3Aarchiveitdigitalcollection&fl%5B%5D=identifier&rows=1&page={0}&output=json&save=yes'.format(page))
r = requests.get('https://archive.org/advancedsearch.php?q=collection%3Aarchiveitdigitalcollection&fl%5B%5D=identifier&sort[]=date+desc&rows=100&page={0}&output=json&save=yes'.format(page))
if r.status_code != 200:
return 0
url = 'https://archive.org/advancedsearch.php?q={1}&fl%5B%5D=identifier&sort[]=date+asc&rows={2}&page={0}&output=json'.format(page, query, resultsPerPage)
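# e.g. page=2 with rows=100 and no extra query expands to
# https://archive.org/advancedsearch.php?q=collection%3Aarchiveitdigitalcollection+&fl%5B%5D=identifier&sort[]=date+asc&rows=100&page=2&output=json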
try:
r = requests.get(url)
if r.status_code != 200:
return 0
contents = r.content
jsonContents = json.loads(contents)
items = [x['identifier'] for x in jsonContents['response']['docs']]
numFound = jsonContents['response']['numFound']
if len(items) == 0:
print 'got 0 items for search page', page
return 0
print 'loading %s items, %s - %s of %s' % (len(items), items[0], items[-1], numFound)
db = getDb()
for item in items:
injectItem(item, db, mode)
db.close()
return len(items)
contents = r.content
jsonContents = json.loads(contents)
items = [x['identifier'] for x in jsonContents['response']['docs']]
numFound = jsonContents['response']['numFound']
if len(items) == 0:
requests.post('http://localhost:10008/progress', json={'total':numFound, 'completed':'', 'query':extraQuery})
print 'got 0 items for search page', page
return 0
print 'loading %s items, %s - %s of %s' % (len(items), items[0], items[-1], numFound)
for item in items:
db = getDb()
took = injectItem(item, db, mode)
db.close()
requests.post('http://localhost:10008/progress', json={'total':numFound,
'completed':item,
'query':extraQuery,
'took':took})
return len(items)
except Exception, e:
print 'Caught', e, 'sleep and retry', url
time.sleep(60)
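# retries the same page indefinitely via recursion, backing off a
# minute per failed attempt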
return getPage(zippedArgs)
def dumpDb():
@ -197,6 +225,10 @@ def showItems():
def nuke(lastPid, fromOrbit=False):
try:
requests.post('http://localhost:10008/shutdown', {})
except:
pass
sig = signal.SIGTERM
if fromOrbit:
sig = signal.SIGKILL
@ -209,7 +241,7 @@ def nuke(lastPid, fromOrbit=False):
except:
pass
killed = subprocess.Popen("""kill `ps auxx |grep warc-inject|awk -e '{print $2}'`""" % sys.argv[0],
killed = subprocess.Popen("""kill `ps auxx |grep warc-inject|grep -v grep|awk -e '{print $2}'`""",
shell=True,stdout=subprocess.PIPE).communicate()[0]
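# the added grep -v grep keeps the pipeline's own grep process out of
# the kill list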
if killed == 'Terminated':
@ -219,13 +251,47 @@ def nuke(lastPid, fromOrbit=False):
def main():
try:
lastPid = open('running.pid', 'r').read()
except:
lastPid = None
global staleTime
print 'arguments were', sys.argv, 'pid is', os.getpid()
open('running.pid', 'w').write(str(os.getpid()))
if sys.argv[1] != 'monitor':
try:
lastPid = open('running.pid', 'r').read()
except:
lastPid = None
open('running.pid', 'w').write(str(os.getpid()))
# p = multiprocessing.Process(target=serveForever)
# p.start()
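# subcommands handled below: test [query], run <threads> [query],
# monitor, init, force <item>, injectfile <file> <threads>,
# forcefile <file> <threads>, injectitems <file> <threads>, and others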
if sys.argv[1] == 'test':
query = ''
if len(sys.argv) == 3:
query = sys.argv[2]
#subprocess.Popen(['python','inject', 'monitor'])
mode = 'testing'
runInjects(10, 'testing', query)
if sys.argv[1] == 'run':
query = ''
if len(sys.argv) == 4:
query = sys.argv[3]
#subprocess.Popen(['./warc-inject','monitor'])
threads = int(sys.argv[2])
runInjects(threads, 'production', query)
print "done running"
if len(sys.argv) == 2:
if sys.argv[1] == 'monitor':
import monitor
monitor.main()
if sys.argv[1] == 'init':
init()
print 'initialized'
@ -247,6 +313,8 @@ def main():
nuke(lastPid, fromOrbit=True)
if sys.argv[1] == 'test':
subprocess.Popen(['./warc-inject','monitor'])
mode = 'testing'
runInjects(10, 'testing')
@ -308,33 +376,106 @@ def main():
signal.alarm(0) # Disable the alarm
if sys.argv[1] == 'serve':
serveForever()
# if sys.argv[1] == 'serve':
# serveForever()
if len(sys.argv) == 3:
if sys.argv[1] == 'force':
itemName = sys.argv[2]
db = getDb()
injectItem(itemName, db, 'production')
sys.exit(0)
if len(sys.argv) == 4:
if sys.argv[1] == 'injectfile':
staleTime = datetime.timedelta(0,0,0)
from multiprocessing.pool import ThreadPool
fileName = sys.argv[2]
items = filter(lambda x: x, open(fileName, 'r').read().split('\n'))
threads = int(sys.argv[3])
pool = ThreadPool(processes=threads)
#print zip(files, repeat(getDb(), len(files)), repeat('production', len(files)))
def injectItemTupleWrapper(itemName):
db = getDb()
ret = injectItem(itemName, db, 'production')
db.close()
return ret
answer = pool.map(injectItemTupleWrapper, items)
print 'finished: ', answer
sys.exit(0)
if sys.argv[1] == 'forcefile':
staleTime = datetime.timedelta(0,0,0)
from multiprocessing.pool import ThreadPool
fileName = sys.argv[2]
items = filter(lambda x: x, open(fileName, 'r').read().split('\n'))
threads = int(sys.argv[3])
pool = ThreadPool(processes=threads)
#print zip(files, repeat(getDb(), len(files)), repeat('production', len(files)))
def injectItemTupleWrapper(itemName):
db = getDb()
ret = injectItem(itemName, db, 'force')
db.close()
return ret
answer = pool.map(injectItemTupleWrapper, items)
print 'finished: ', answer
sys.exit(0)
if sys.argv[1] == 'injectitems':
from multiprocessing.pool import ThreadPool
fileName = sys.argv[2]
items = filter(lambda x: x, open(fileName, 'r').read().split('\n'))
threads = int(sys.argv[3])
pool = ThreadPool(processes=threads)
#print zip(files, repeat(getDb(), len(files)), repeat('production', len(files)))
def injectItemTupleWrapper(itemName):
db = getDb()
ret = injectItem(itemName, db, 'production')
db.close()
return ret
answer = pool.map(injectItemTupleWrapper, items)
sys.exit(0)
def getNumResults(query):
query = 'collection%3Aarchiveitdigitalcollection+' + query
r = requests.get('https://archive.org/advancedsearch.php?q={0}&fl%5B%5D=identifier&sort[]=date+asc&rows=1&page=0&output=json'.format(query))
if r.status_code != 200:
return 0
contents = r.content
jsonContents = json.loads(contents)
numFound = jsonContents['response']['numFound']
return numFound
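# e.g. getNumResults('') returns the total item count for
# archiveitdigitalcollection, or 0 if the search endpoint errors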
if sys.argv[1] == 'run':
threads = int(sys.argv[2])
runInjects(threads)
# else:
# #getPage(3)
# from multiprocessing.pool import ThreadPool
# pool = ThreadPool(processes=150)
# pool.map(getPage, xrange(1,1300))
def runInjects(threads, mode='production'):
def runInjects(threads, mode='production', query=''):
from multiprocessing.pool import ThreadPool
import math
pool = ThreadPool(processes=threads)
try:
from itertools import repeat
maxPages = 1300
answer = pool.map(getPage, zip(xrange(1,maxPages), repeat(mode, maxPages)))
totalResults = getNumResults(query)
resultsPerPage = 100
maxPages = int(math.ceil(totalResults / float(resultsPerPage)))
if maxPages < threads:
maxPages = threads
resultsPerPage = int(math.ceil(totalResults / float(maxPages)))
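# e.g. 250 results at 100/page -> 3 pages; with 8 threads that becomes
# 8 pages of ceil(250/8) = 32 results each, so every thread gets a page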
print threads, ' threads,', totalResults, 'total,', maxPages, 'pages', resultsPerPage, 'results per page'
answer = pool.map(getPage, zip(xrange(1,maxPages+1),
repeat(mode, maxPages),
repeat(resultsPerPage, maxPages),
repeat(query, maxPages)))
print "finished item pass", answer
except (KeyboardInterrupt, SystemExit):
print 'ok, caught'
raise
requests.post('http://localhost:10008/shutdown', {})
sys.exit(0)
#raise
def init():
@ -351,73 +492,67 @@ def init():
db.close()
def serveForever():
@app.route('/',
methods=['GET', 'POST'], endpoint='home')
def home():
db = getDb(makeDates=False)
res = db.execute('select * from items limit 10')
for item, checked in res.fetchall():
print item
try:
metadata = subprocess.Popen(['./ia','metadata', item],
stdout=subprocess.PIPE).communicate()[0]
# def serveForever():
# @app.route('/',
# methods=['GET', 'POST'], endpoint='home')
# def home():
# db = getDb(makeDates=False)
# res = db.execute('select * from items limit 10')
# for item, checked in res.fetchall():
# print item
# try:
# metadata = subprocess.Popen(['./ia','metadata', item],
# stdout=subprocess.PIPE).communicate()[0]
break
except Exception, e:
pass
db.close()
# break
# except Exception, e:
# pass
# db.close()
# return flask.make_response(metadata)
# @app.route('/progress',
# methods=['GET', 'POST'], endpoint='progress')
# def progress():
# r = requests.get('https://archive.org/advancedsearch.php?q=collection%3Aarchiveitdigitalcollection&fl%5B%5D=identifier&sort[]=date+desc&rows=1&page=1&output=json')
# if r.status_code != 200:
# return flask.make_response(json.dumps({error:'ia search feed is down'}),
# 'application/json')
# contents = r.content
# jsonContents = json.loads(contents)
# numFound = jsonContents['response']['numFound']
# db = getDb()
# examinedItems = db.execute('select count(*) from items').fetchone()
# itemsWithWarc = db.execute('select count(*) from items where ROWID in (select itemId from files where files.status = 200)').fetchone()
# return flask.make_response(json.dumps({'totalItems':numFound,
# 'examinedItems':examinedItems,
# 'itemsWithWarc':itemsWithWarc
# }, indent=4), 'application/json')
# @app.route('/items',
# methods=['GET', 'POST'], endpoint='items')
# def items():
# db = getDb(makeDates=False)
# c = db.cursor()
# res = c.execute("select item, checked from items")
# out = []
# for item, checked in res.fetchall():
# out.append({'item':item, 'checked':checked})
# db.close()
return flask.make_response('hihih' + metadata)
# return flask.make_response(json.dumps(out), 'application/json')
@app.route('/progress',
methods=['GET', 'POST'], endpoint='progress')
def progress():
r = requests.get('https://archive.org/advancedsearch.php?q=collection%3Aarchiveitdigitalcollection&fl%5B%5D=identifier&sort[]=date+desc&rows=1&page=1&output=json')
if r.status_code != 200:
return flask.make_response(json.dumps({error:'ia search feed is down'}),
'application/json')
contents = r.content
jsonContents = json.loads(contents)
numFound = jsonContents['response']['numFound']
db = getDb()
examinedItems = db.execute('select count(*) from items').fetchone()
itemsWithWarc = db.execute('select count(*) from items where ROWID in (select itemId from files where files.status = 200)').fetchone()
return flask.make_response(json.dumps({'totalItems':numFound,
'examinedItems':examinedItems,
'itemsWithWarc':itemsWithWarc
}, indent=4), 'application/json')
@app.route('/items',
methods=['GET', 'POST'], endpoint='items')
def items():
db = getDb(makeDates=False)
c = db.cursor()
res = c.execute("select item, checked from items")
out = []
for item, checked in res.fetchall():
out.append({'item':item, 'checked':checked})
db.close()
return flask.make_response(json.dumps(out), 'application/json')
app.run('0.0.0.0',
port=7999,
debug=True,
use_reloader=True,
use_debugger=True)
# app.run('0.0.0.0',
# port=7999,
# debug=False,
# use_reloader=False,
# use_debugger=False)
if __name__ == '__main__':

script/inject/monitor.py Normal file

File diff suppressed because one or more lines are too long

@ -100,7 +100,7 @@ def getSplitTime():
def copyToTwins(fname):
def copyToTwins(fname, backToFront=False):
fh = open(fname, 'r')
ret = {}
hosts = []
@ -117,23 +117,25 @@ def copyToTwins(fname):
continue
#print directory, ip1, note
step = len(hosts)/2
hostPlex = {}
someIp = None
cmds = []
for hostId, dnsPort, httpsPort, httpPort, udbPort,ip1, ip2, directory, note in hosts[:step]:
if ip1 not in hostPlex:
hostPlex[ip1] = []
someIp = ip1
hostPlex[ip1].append('scp -r %s:%s* %s:%s. ' % (ip1, directory, (hosts[hostId + step][5]), (hosts[hostId + step][7])))
backHostId, backDnsPort, backHttpsPort, backHttpPort, backUdbPort,backIp1, backIp2, backDirectory, backNote = hosts[hostId + step]
if note != directory:
print 'oh looks like you overlooked host %s' % hostId
if backNote != backDirectory:
print 'oh looks like you overlooked host %s' % backHostId
if backToFront:
cmd = 'scp -r %s:%s* %s:%s. &' % (backIp1, backDirectory, ip1, directory )
else:
cmd = 'scp -r %s:%s* %s:%s. &' % (ip1, directory, backIp1, backDirectory)
cmds.append(cmd)
#print 'scp -r %s:%s* %s:%s. &' % (ip1, directory, (hosts[hostId + step][5]), (hosts[hostId + step][7]))
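# a produced command looks like (ips and dirs hypothetical):
# scp -r 10.5.0.2:/home/gb/host2* 10.5.0.18:/home/gb/host18. &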
while len(hostPlex[someIp]) > 0:
cmd = []
for cmd in cmds:
print cmd
for ip in hostPlex.iterkeys():
cmd.append(hostPlex[ip].pop())
#print hostPlex[ip].pop()
print '&\n'.join(cmd), ';'
def testDiskSpeed(host, directory):

Binary file not shown.