Merge branch 'diffbot-testing' of github.com:gigablast/open-source-search-engine into diffbot-testing
Conflicts:
	Errno.cpp
	Errno.h
Files changed:
BigFile.cpp BigFile.h Collectiondb.cpp Collectiondb.h Conf.cpp Conf.h Dns.cpp Errno.cpp Errno.h File.cpp File.h HashTableX.cpp Hostdb.cpp Hostdb.h HttpServer.cpp Images.cpp Json.cpp Json.h Language.cpp LanguageIdentifier.cpp Linkdb.cpp Linkdb.h Log.cpp Loop.cpp Loop.h Makefile Matches.cpp Matches.h Mem.cpp Msg13.cpp Msg1f.cpp Msg20.cpp Msg22.cpp Msg3.cpp Msg3a.cpp Msg4.cpp Msg40.cpp Msg40.h MsgC.cpp Multicast.cpp PageCrawlBot.cpp PageGet.cpp PageHosts.cpp PageInject.cpp PageLogView.cpp PageResults.cpp PageRoot.cpp PageSockets.cpp PageStatsdb.cpp Parms.cpp Posdb.cpp Process.cpp Profiler.cpp Punycode.cpp Punycode.h Query.cpp Rdb.cpp RdbBase.cpp RdbBase.h RdbBuckets.cpp RdbCache.cpp RdbList.cpp RdbTree.cpp SafeBuf.cpp SafeBuf.h Speller.cpp Spider.cpp SpiderProxy.cpp Statsdb.cpp Tagdb.cpp TcpServer.cpp Threads.cpp Threads.h UdpServer.cpp UdpServer.h UdpSlot.cpp UdpSlot.h Unicode.h Url.cpp Url.h XmlDoc.cpp XmlDoc.h fctypes.cpp fctypes.h gbfilter.cpp main.cpp qa.cpp script
BigFile.cpp (118 changes)

@@ -33,7 +33,7 @@ BigFile::~BigFile () {
 //#define O_DIRECT 040000

 BigFile::BigFile () {
-m_permissions = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH ;
+//m_permissions = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH ;
 m_flags = O_RDWR ; // | O_DIRECT;
 m_usePartFiles = true;
 // NULLify all ptrs to files
@@ -289,7 +289,7 @@ bool BigFile::open ( int flags ,

 m_flags = flags;
 //m_pc = pc;
-m_permissions = permissions;
+//m_permissions = permissions;
 m_isClosing = false;
 // this is true except when parsing big warc files
 m_usePartFiles = true;//usePartFiles;
@@ -363,7 +363,7 @@ int BigFile::getfd ( int32_t n , bool forReading ) { // , int64_t *vfd ) {
 }
 // open it if not opened
 if ( ! f->calledOpen() ) {
-if ( ! f->open ( m_flags , m_permissions ) ) {
+if ( ! f->open ( m_flags , getFileCreationFlags() ) ) {
 log("disk: Failed to open file part #%"INT32".",n);
 return -1;
 }
@@ -1481,6 +1481,15 @@ bool BigFile::chopHead ( int32_t part ,
 return unlinkRename ( NULL, part, true, callback, state );
 }

+class UnlinkRenameState {
+public:
+char m_oldFilename [ 1024 ];
+char m_newFilename [ 1024 ];
+int m_fd;
+File *m_file;
+collnum_t m_collnum;
+};
+
 static void *renameWrapper_r ( void *state , ThreadEntry *t ) ;
 static void *unlinkWrapper_r ( void *state , ThreadEntry *t ) ;
 static void doneRenameWrapper ( void *state , ThreadEntry *t ) ;
@@ -1604,6 +1613,38 @@ bool BigFile::unlinkRename ( // non-NULL for renames, NULL for unlinks
 // save callback for when all parts are unlinked or renamed
 m_callback = callback;
 m_state = state;

+#ifdef FIXBUG
+// now use a special state in case RdbBase gets nuked
+// because the collection gets deleted in the middle of this
+UnlinkRenameState stackUr;
+char *st =(char *)mmalloc( sizeof(UnlinkRenameState),"ulrnst");
+UnlinkRenameState *urs = (UnlinkRenameState *)st;
+if ( ! urs ) {
+log("disk: failed to alloc unlinkrename state. "
+"skipping thread.");
+urs = &stackUr;
+}
+urs->m_fd = m_fd;
+urs->m_collnum = collnum; // can we supply this now?
+urs->m_file = this;
+urs->m_closedIt = false;
+makeFilename_r ( m_baseFilename.getBufStart() ,
+NULL ,
+i ,
+urs->m_oldFilename ,
+1024 );
+// rename also takes the new name
+if ( ! m_isUnlink )
+makeFilename_r ( m_newBaseFilename.getBufStart() ,
+m_newBaseFilenameDir.getBufStart(),
+i ,
+urs->m_newFilename ,
+1024 );
+if ( urs == &stackUr )
+goto skipThread;
+#endif
+
 // . we spawn the thread here now
 // . returns true on successful spawning
 // . we can't make a disk thread cuz Threads.cpp checks its
@@ -1668,6 +1709,30 @@ bool BigFile::unlinkRename ( // non-NULL for renames, NULL for unlinks
 }

 void *renameWrapper_r ( void *state , ThreadEntry *t ) {

+#ifdef FIXBUG
+UnlinkRenameState *urs = (UnlinkRenameState *)state;
+if ( ::rename ( urs->m_oldFilename , urs->m_newFilename ) ) {
+// reset errno and return true if file does not exist
+if ( errno == ENOENT ) {
+log("disk: file %s does not exist.",urs->m_oldFilename);
+errno = 0;
+}
+// otherwise, it's a more serious error i guess
+else log("disk: rename %s to %s: %s",
+urs->m_oldFilename,urs->m_newFilename,mstrerror(errno));
+return NULL;
+}
+// we must close the file descriptor in the thread otherwise the
+// file will not actually be renamed in this thread
+//f->close1_r();
+// we can't call f->close1_r() because f might have been deleted
+// because the collection was deleted.
+if ( close1ByFd_r( urs->m_fd) )
+urs->m_closedIt = true;
+return NULL;
+#endif
+
 // extract our class
 File *f = (File *)state;
 // . by getting the inode in the cache space the call to f->close()
@@ -1721,6 +1786,16 @@ void *renameWrapper_r ( void *state , ThreadEntry *t ) {
 }

 void *unlinkWrapper_r ( void *state , ThreadEntry *t ) {
+#ifdef FIXBUG
+UnlinkRenameState *urs = (UnlinkRenameState *)state;
+::unlink ( urs->m_oldFilename );
+// we can't call f->close1_r() because f might have been deleted
+// because the collection was deleted.
+if ( close1ByFd_r( urs->m_fd) )
+urs->m_closedIt = true;
+return NULL;
+#endif
+
 // get ourselves
 File *f = (File *)state;
 // . by getting the inode in the cache space the call to delete(f)
@@ -1742,6 +1817,25 @@ void *unlinkWrapper_r ( void *state , ThreadEntry *t ) {
 }

 void doneRenameWrapper ( void *state , ThreadEntry *t ) {

+#ifdef FIXBUG
+// if collection got nuked, then file will be invalid
+// so when we nuke a collection we scan all threads for unlink/rename
+// operations that reference files from the collection being nuked and
+// set their m_collectionGotNuked flag to true
+UnlinkRenameState *urs = (UnlinkRenameState *)state;
+File *f = urs->m_file;
+collnum_t cn = urs->m_collnum;
+RdbBase *base = getRdbBase ( cn );
+mfree ( urs , sizeof(UnlinkRenameState), "urnst" );
+if ( ! base ) { // urs->m_collectionGotNuked ) {
+log("bigfile: captured rename on nuked collection %i",(int)cn);
+g_unlinkRenameThreads--;
+return;
+}
+
+#endif
+
 // extract our class
 File *f = (File *)state;
 // . finish the close
@@ -1795,6 +1889,24 @@ void doneRenameWrapper ( void *state , ThreadEntry *t ) {
 }

 void doneUnlinkWrapper ( void *state , ThreadEntry *t ) {

+#ifdef FIXBUG
+// if collection got nuked, then file will be invalid
+// so when we nuke a collection we scan all threads for unlink/rename
+// operations that reference files from the collection being nuked and
+// set their m_collectionGotNuked flag to true
+UnlinkRenameState *urs = (UnlinkRenameState *)state;
+File *f = urs->m_file;
+collnum_t cn = urs->m_collnum;
+RdbBase *base = getRdbBase ( cn );
+mfree ( urs , sizeof(UnlinkRenameState), "urnst" );
+if ( ! base ) { // urs->m_collectionGotNuked ) {
+log("bigfile: captured unlink on nuked collection %i",(int)cn);
+g_unlinkRenameThreads--;
+return;
+}
+#endif
+
 // extract our class
 File *f = (File *)state;
 // finish the close
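A minimal sketch (not from this commit) of the pattern the FIXBUG block introduces: copy the filenames and fd into heap-owned state so the rename/unlink worker never dereferences a File object that a deleted collection may have freed, with a stack-copy fallback when the allocation fails. Names below are hypothetical stand-ins, the worker runs inline instead of in a disk thread, and the sketch declares the m_closedIt member that the committed UnlinkRenameState class omits even though its wrappers set it.

// Minimal sketch of the heap-owned thread-state pattern; not the committed code.
#include <cstdio>
#include <cstdlib>

struct RenameState {             // stands in for UnlinkRenameState
	char m_oldFilename[1024];
	char m_newFilename[1024];
	bool m_closedIt;         // declared here; the committed class omits it
};

static void *renameWorker(void *state) {
	RenameState *urs = (RenameState *)state;
	// the worker only reads its own copies, never the parent File object
	if (std::rename(urs->m_oldFilename, urs->m_newFilename) != 0)
		std::perror("rename");
	urs->m_closedIt = true;
	return NULL;
}

static bool startRename(const char *from, const char *to) {
	RenameState *urs = (RenameState *)std::calloc(1, sizeof(*urs));
	if (!urs) return false;  // committed code falls back to a stack copy
	std::snprintf(urs->m_oldFilename, sizeof urs->m_oldFilename, "%s", from);
	std::snprintf(urs->m_newFilename, sizeof urs->m_newFilename, "%s", to);
	renameWorker(urs);       // committed code spawns a disk thread instead
	std::free(urs);          // committed code frees in the done-callback
	return true;
}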
BigFile.h

@@ -353,7 +353,7 @@ class BigFile {
 SafeBuf m_newBaseFilenameDir ;//[256];


-int32_t m_permissions;
+//int32_t m_permissions;
 int32_t m_flags;

 // determined in open() override
Collectiondb.cpp

@@ -333,6 +333,9 @@ bool Collectiondb::addExistingColl ( char *coll, collnum_t collnum ) {
 if ( cr->m_isCustomCrawl ) {
 cr->m_getLinkInfo = false;
 cr->m_computeSiteNumInlinks = false;
+// limit each shard to 5 spiders per collection to prevent
+// ppl from spidering the web and hogging up resources
+cr->m_maxNumSpiders = 5;
 }

 // we need to compile the regular expressions or update the url
@@ -633,10 +636,11 @@ bool Collectiondb::addNewColl ( char *coll ,

 // MDW: create the new directory
 retry22:
-if ( ::mkdir ( dname ,
-S_IRUSR | S_IWUSR | S_IXUSR |
-S_IRGRP | S_IWGRP | S_IXGRP |
-S_IROTH | S_IXOTH ) ) {
+if ( ::mkdir ( dname ,
+getDirCreationFlags() ) ) {
+// S_IRUSR | S_IWUSR | S_IXUSR |
+// S_IRGRP | S_IWGRP | S_IXGRP |
+// S_IROTH | S_IXOTH ) ) {
 // valgrind?
 if ( errno == EINTR ) goto retry22;
 g_errno = errno;
@@ -1401,10 +1405,11 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
 log("admin: Trying to create collection %s but "
 "directory %s already exists on disk.",cr->m_coll,dname);
 }
-if ( ::mkdir ( dname ,
-S_IRUSR | S_IWUSR | S_IXUSR |
-S_IRGRP | S_IWGRP | S_IXGRP |
-S_IROTH | S_IXOTH ) ) {
+if ( ::mkdir ( dname ,
+getDirCreationFlags() ) ) {
+// S_IRUSR | S_IWUSR | S_IXUSR |
+// S_IRGRP | S_IWGRP | S_IXGRP |
+// S_IROTH | S_IXOTH ) ) {
 // valgrind?
 //if ( errno == EINTR ) goto retry22;
 //g_errno = errno;
@@ -1971,6 +1976,29 @@ bool CollectionRec::load ( char *coll , int32_t i ) {
 // it is binary now
 gbmemcpy ( &m_localCrawlInfo , sb.getBufStart(),sb.length() );

+// if it had corrupted data from saving corrupted mem zero it out
+CrawlInfo *stats = &m_localCrawlInfo;
+// point to the stats for that host
+int64_t *ss = (int64_t *)stats;
+// are stats crazy?
+bool crazy = false;
+for ( int32_t j = 0 ; j < NUMCRAWLSTATS ; j++ ) {
+// crazy stat?
+if ( *ss > 1000000000LL ||
+*ss < -1000000000LL ) {
+crazy = true;
+break;
+}
+ss++;
+}
+if ( m_localCrawlInfo.m_collnum != m_collnum )
+crazy = true;
+if ( crazy ) {
+log("coll: had crazy spider stats for coll %s. zeroing out.",
+m_coll);
+m_localCrawlInfo.reset();
+}
+

 if ( ! g_conf.m_doingCommandLine && ! g_collectiondb.m_initializing )
 log("coll: Loaded %s (%"INT32") local hasurlsready=%"INT32"",
@@ -3787,12 +3815,30 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
 i++;
 }

+// don't bother re-spidering old pages if hopcount == maxhopcount
+// and only process new urls is true. because we don't need to
+// harvest outlinks from them.
+if ( m_diffbotOnlyProcessIfNewUrl && m_diffbotMaxHops > 0 &&
+// only crawls, not bulk jobs
+m_isCustomCrawl == 1 ) {
+m_regExs[i].purge();
+m_regExs[i].safePrintf("isindexed && hopcount==%"INT32,
+m_diffbotMaxHops );
+m_spiderPriorities [i] = 14;
+m_spiderFreqs [i] = 0.0;
+m_maxSpidersPerRule [i] = 0; // turn off spiders
+m_harvestLinks [i] = false;
+i++;
+}
+
 // diffbot needs to retry even on 500 or 404 errors since sometimes
 // a seed url gets a 500 error mistakenly and it halts the crawl.
 // so take out "!hastmperror".

 m_regExs[i].set("errorcount>=1 && !hastmperror");
-m_spiderPriorities [i] = 15;
-m_spiderFreqs [i] = 0.0;
-m_maxSpidersPerRule [i] = 0; // turn off spiders if not tmp error
+m_spiderPriorities [i] = 14;
+m_spiderFreqs [i] = 0.0416; // every hour
+//m_maxSpidersPerRule [i] = 0; // turn off spiders if not tmp error
 i++;

 // and for docs that have errors respider once every 5 hours

Collectiondb.h

@@ -494,6 +494,7 @@ class CollectionRec {
 char m_useSimplifiedRedirects ;
 char m_useIfModifiedSince ;
 char m_useTimeAxis ;
+char m_indexWarcs;
 char m_buildVecFromCont ;
 int32_t m_maxPercentSimilarPublishDate;
 char m_useSimilarityPublishDate;
Conf.cpp (19 changes)

@@ -9,6 +9,25 @@

 Conf g_conf;

+static bool s_setUmask = false;
+
+mode_t getFileCreationFlags() {
+if ( ! s_setUmask ) {
+s_setUmask = true;
+umask ( 0 );
+}
+return S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH ;
+}
+
+mode_t getDirCreationFlags() {
+if ( ! s_setUmask ) {
+s_setUmask = true;
+umask ( 0 );
+}
+return S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH |
+S_IXUSR | S_IXGRP;
+}
+
 Conf::Conf ( ) {
 m_save = true;
 m_doingCommandLine = false;
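The umask(0) call above matters: POSIX open() and mkdir() clear any mode bits that are set in the process umask, so with a typical umask of 022 the group-write bit these helpers grant would be silently stripped. A minimal standalone illustration (hypothetical paths, standard calls only), not from the commit:

// Sketch: the requested mode is filtered through the umask at create time.
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void) {
	umask(022);                    // common login default
	// asks for rw-rw-r-- but the file is created rw-r--r--
	int a = open("/tmp/masked.dat", O_CREAT | O_RDWR, 0664);
	umask(0);                      // what getFileCreationFlags() does once
	// now 0664 really means rw-rw-r--
	int b = open("/tmp/exact.dat", O_CREAT | O_RDWR, 0664);
	if (a >= 0) close(a);
	if (b >= 0) close(b);
	return 0;
}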
Conf.h (7 changes)

@@ -43,6 +43,9 @@

 #define MAX_GEOCODERS 4

+mode_t getFileCreationFlags();
+mode_t getDirCreationFlags ();
+
 class Conf {

 public:
@@ -180,7 +183,9 @@ class Conf {
 //bool m_tagdbUseSeals;
 //int32_t m_tagdbMinFilesToMerge;
 //bool m_tagdbSaveCache;

+
+//bool m_makeAllFilesGroupWritable;

 // catdb parameters
 int32_t m_catdbMaxTreeMem;
 //int32_t m_catdbMaxDiskPageCacheMem;
Dns.cpp (4 changes)

@@ -2470,7 +2470,8 @@ Host *Dns::getResponsibleHost ( key_t key ) {
 // get the hostNum that should handle this
 int32_t hostId = key.n1 % hostdb->getNumHosts();
 // return it if it is alive
-if ( ! hostdb->isDead ( hostId ) ) return hostdb->getHost ( hostId );
+Host* h = hostdb->getHost ( hostId );
+if ( h->m_spiderEnabled && ! hostdb->isDead ( hostId ) ) return h;
 // how many are up?
 int32_t numAlive = hostdb->getNumHostsAlive();
 // NULL if none
@@ -2482,6 +2483,7 @@ Host *Dns::getResponsibleHost ( key_t key ) {
 for ( int32_t i = 0 ; i < hostdb->m_numHosts ; i++ ) {
 // get the ith host
 Host *host = &hostdb->m_hosts[i];
+if ( !host->m_spiderEnabled ) continue;
 // skip him if he is dead
 if ( hostdb->isDead ( host ) ) continue;
 // count it if alive, continue if not our number
Errno.cpp

@@ -196,6 +196,7 @@ case EDNSERROR : return "DNS lookup error";
 case ETHREADSDISABLED:return "Threads Disabled";
 case EMALFORMEDQUERY: return "Malformed query";
 case ESHARDDOWN: return "One or more shards are down";
 case EDOCWARC: return "Doc is WARC or ARC and support is disabled";
+case EDIFFBOTREQUESTTIMEDOUTTHIRDPARTY: return "Diffbot request of third-party content timed out";
 }
 // if the remote error bit is clear it must be a regular errno
Errno.h (2 changes)

@@ -201,6 +201,8 @@ enum {
 ETHREADSDISABLED,
 EMALFORMEDQUERY,
 ESHARDDOWN,
 EDOCWARC,
+EWRONGSHARD,
+EDIFFBOTREQUESTTIMEDOUTTHIRDPARTY
 };
 #endif
File.cpp (9 changes)

@@ -238,7 +238,10 @@ bool File::open ( int flags , int permissions ) {
 }
 // save these in case we need to reopen in getfd()
 m_flags = flags;
-m_permissions = permissions;
+//m_permissions = permissions;
+// just override and use system settings so we can get the group
+// writable/readable/executable bits if set that way in g_conf
+//m_permissions = getFileCreationFlags();
 m_calledOpen = true;
 // sanity check
 //int32_t ss = 0;
@@ -668,7 +671,7 @@ int File::getfd () {
 if ( fd == -1 ) {
 t1 = gettimeofdayInMilliseconds();
 retry7:
-fd = ::open ( getFilename() , m_flags , m_permissions );
+fd = ::open ( getFilename() , m_flags,getFileCreationFlags());
 // valgrind
 if ( fd == -1 && errno == EINTR ) goto retry7;
 // 0 means stdout, right? why am i seeing it get assigned???
@@ -676,7 +679,7 @@ int File::getfd () {
 log("disk: Got fd of 0 when opening %s.",
 getFilename());
 if ( fd == 0 )
-fd = ::open ( getFilename(), m_flags , m_permissions );
+fd=::open(getFilename(),m_flags,getFileCreationFlags());
 if ( fd == 0 )
 log("disk: Got fd of 0 when opening2 %s.",
 getFilename());
File.h (2 changes)

@@ -193,7 +193,7 @@ class File {

 // save the permission and flag sets in case of re-opening
 int m_flags;
-int m_permissions;
+//int m_permissions;

 char m_calledOpen;
 char m_calledSet;
HashTableX.cpp

@@ -623,8 +623,10 @@ bool HashTableX::save ( char *dir ,
 char s[1024];
 sprintf ( s , "%s/%s", dir , filename );
 int fd = ::open ( s ,
-O_RDWR | O_CREAT | O_TRUNC , S_IRUSR | S_IWUSR |
-S_IRGRP | S_IWGRP | S_IROTH);
+O_RDWR | O_CREAT | O_TRUNC ,
+getFileCreationFlags() );
+// S_IRUSR | S_IWUSR |
+// S_IRGRP | S_IWGRP | S_IROTH);
 if ( fd < 0 ) {
 //m_saveErrno = errno;
 return log("db: Could not open %s for writing: %s.",
Hostdb.cpp (65 changes)

@@ -691,16 +691,26 @@ bool Hostdb::init ( int32_t hostIdArg , char *netName ,

 //skip:

+h->m_queryEnabled = true;
+h->m_spiderEnabled = true;
 // check for something after the working dir
 h->m_note[0] = '\0';
 if ( *p != '\n' ) {
 // save the note
 char *n = p;
 while ( *n && *n != '\n' && n < pend ) n++;
+
 int32_t noteSize = n - p;
 if ( noteSize > 127 ) noteSize = 127;
 gbmemcpy(h->m_note, p, noteSize);
 *p++ = '\0'; // NULL terminate for atoip
+
+if(strstr(h->m_note, "noquery")) {
+h->m_queryEnabled = false;
+}
+if(strstr(h->m_note, "nospider")) {
+h->m_spiderEnabled = false;
+}
 }
 else
 *p = '\0';
@@ -1642,6 +1652,56 @@ Host *Hostdb::getLiveHostInShard ( int32_t shardNum ) {
 return &shard[0];
 }

+int32_t Hostdb::getHostIdWithSpideringEnabled ( uint32_t shardNum ) {
+Host *hosts = g_hostdb.getShard ( shardNum);
+int32_t numHosts = g_hostdb.getNumHostsPerShard();
+
+int32_t hostNum = 0;
+int32_t numTried = 0;
+while( !hosts [ hostNum ].m_spiderEnabled && numTried < numHosts ) {
+hostNum = (hostNum+1) % numHosts;
+numTried++;
+}
+if( !hosts [ hostNum ].m_spiderEnabled) {
+log("build: cannot spider when entire shard has nospider enabled");
+char *xx = NULL; *xx = 0;
+}
+return hosts [ hostNum ].m_hostId ;
+}
+
+// if niceness 0 can't pick noquery host.
+// if niceness 1 can't pick nospider host.
+Host *Hostdb::getLeastLoadedInShard ( uint32_t shardNum , char niceness ) {
+int32_t minOutstandingRequests = 0x7fffffff;
+int32_t minOutstandingRequestsIndex = -1;
+Host *shard = getShard ( shardNum );
+Host *bestDead = NULL;
+for(int32_t i = 0; i < m_numHostsPerShard; i++) {
+Host *hh = &shard[i];
+// don't pick a 'no spider' host if niceness is 1
+if ( niceness > 0 && ! hh->m_spiderEnabled ) continue;
+// don't pick a 'no query' host if niceness is 0
+if ( niceness == 0 && ! hh->m_queryEnabled ) continue;
+if ( ! bestDead ) bestDead = hh;
+if(isDead(hh)) continue;
+// log("host %"INT32 " numOutstanding is %"INT32, hh->m_hostId,
+// hh->m_pingInfo.m_udpSlotsInUseIncoming);
+if ( hh->m_pingInfo.m_udpSlotsInUseIncoming >
+minOutstandingRequests )
+continue;
+
+minOutstandingRequests =hh->m_pingInfo.m_udpSlotsInUseIncoming;
+minOutstandingRequestsIndex = i;
+}
+// we should never return a nospider/noquery host depending on
+// the niceness, so return bestDead
+if(minOutstandingRequestsIndex == -1) return bestDead;//shard;
+return &shard[minOutstandingRequestsIndex];
+}
+
+
+
+
 // if all are dead just return host #0
 Host *Hostdb::getFirstAliveHost ( ) {
 for ( int32_t i = 0 ; i < m_numHosts ; i++ )
@@ -1990,8 +2050,9 @@ bool Hostdb::saveHostsConf ( ) {
 sprintf ( filename, "%shosts.conf", m_dir );
 log ( LOG_INFO, "conf: Writing hosts.conf file to: %s",
 filename );
-int32_t fd = open ( filename, O_CREAT|O_WRONLY|O_TRUNC,
-S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH );
+int32_t fd = open ( filename, O_CREAT|O_WRONLY|O_TRUNC ,
+getFileCreationFlags() );
+// S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH );
 if ( !fd ) {
 log ( "conf: Failed to open %s for writing.", filename );
 return false;
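For reference, the noquery/nospider flags parsed above come from the free-form note that may follow a host's working directory in hosts.conf; init() just substring-matches the note. A hypothetical fragment (column layout abbreviated, not copied from a real config):

# hypothetical hosts.conf fragment; only the trailing note matters here --
# init() greps it for the substrings "noquery" and "nospider"
0 5998 7000 8000 9000 /home/gb/host0/
1 5998 7000 8000 9000 /home/gb/host1/ noquery
2 5998 7000 8000 9000 /home/gb/host2/ nospider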
Hostdb.h (8 changes)

@@ -211,6 +211,7 @@ class Host {
 int64_t m_lastPing;

 char m_tmpBuf[4];
+int16_t m_tmpCount;

 // . first time we sent an unanswered ping request to this host
 // . used so we can determine when to send an email alert
@@ -337,6 +338,10 @@ class Host {
 int32_t m_lastTryError;
 int32_t m_lastTryTime;

+bool m_spiderEnabled;
+bool m_queryEnabled;
+
+
 //char m_requestBuf[MAX_PING_SIZE];
 PingInfo m_pingInfo;//RequestBuf;
 };
@@ -445,6 +450,8 @@ class Hostdb {

 //Host *getLiveHostInGroup ( int32_t groupId );
 Host *getLiveHostInShard ( int32_t shardNum );
+Host *getLeastLoadedInShard ( uint32_t shardNum , char niceness );
+int32_t getHostIdWithSpideringEnabled ( uint32_t shardNum );

 // in the entire cluster. return host #0 if its alive, otherwise
 // host #1, etc.
@@ -464,6 +471,7 @@ class Hostdb {
 return &m_hosts[shardNum * m_numHostsPerShard];
 };

+
 //Host *getGroupFromGroupId ( uint32_t gid ) {
 // return getGroup ( gid );
 //};
HttpServer.cpp

@@ -1778,8 +1778,8 @@ bool HttpServer::sendSuccessReply ( TcpSocket *s , char format, char *addMsg) {
 else now = getTimeLocal();
 // . buffer for the MIME request and brief html err msg
 // . NOTE: ctime appends a \n to the time, so we don't need to
-char msg[1024];
-SafeBuf sb(msg,1024,0,false);
+char msg[1524];
+SafeBuf sb(msg,1524,0,false);

 char *tt = asctime(gmtime ( &now ));
 tt [ gbstrlen(tt) - 1 ] = '\0';
@@ -1838,7 +1838,7 @@ bool HttpServer::sendSuccessReply ( TcpSocket *s , char format, char *addMsg) {

 // use this new function that will compress the reply now if the
 // request was a ZET instead of a GET
-return sendReply2 ( msg , sb.length() , NULL , 0 , s );
+return sendReply2 ( sb.getBufStart(), sb.length() , NULL , 0 , s );
 }

 bool HttpServer::sendErrorReply ( GigablastRequest *gr ) {
@@ -1851,8 +1851,8 @@ bool HttpServer::sendErrorReply ( GigablastRequest *gr ) {
 else now = getTimeLocal();

 int32_t format = gr->m_hr.getReplyFormat();
-char msg[1024];
-SafeBuf sb(msg,1024,0,false);
+char msg[1524];
+SafeBuf sb(msg,1524,0,false);
 char *tt = asctime(gmtime ( &now ));
 tt [ gbstrlen(tt) - 1 ] = '\0';

@@ -1904,7 +1904,7 @@ bool HttpServer::sendErrorReply ( GigablastRequest *gr ) {

 // use this new function that will compress the reply now if the
 // request was a ZET instead of a GET
-return sendReply2 ( msg , sb.length() , NULL , 0 , gr->m_socket );
+return sendReply2 ( sb.getBufStart(),sb.length(),NULL,0,gr->m_socket );
 }

 // . send an error reply, like "HTTP/1.1 404 Not Found"
@@ -1931,8 +1931,8 @@ bool HttpServer::sendErrorReply ( TcpSocket *s , int32_t error , char *errmsg ,

 // . buffer for the MIME request and brief html err msg
 // . NOTE: ctime appends a \n to the time, so we don't need to
-char msg[1024];
-SafeBuf sb(msg,1024,0,false);
+char msg[1524];
+SafeBuf sb(msg,1524,0,false);
 // if it's a 404, redirect to home page
 /*
 if ( error == 404 )
@@ -2000,8 +2000,8 @@ bool HttpServer::sendErrorReply ( TcpSocket *s , int32_t error , char *errmsg ,
 // record it
 if ( bytesSent ) *bytesSent = sb.length();//sendBufSize;
 // use this new function that will compress the reply now if the
-// request was a ZET instead of a GET
-return sendReply2 ( msg , sb.length() , NULL , 0 , s );
+// request was a ZET instead of a GET mdw
+return sendReply2 ( sb.getBufStart() , sb.length() , NULL , 0 , s );

 /*
 // . this returns false if blocked, true otherwise
Images.cpp

@@ -1007,7 +1007,10 @@ void Images::thumbStart_r ( bool amThread ) {

 // Open/Create temporary file to store image to
 int fhndl;
-if( (fhndl = open( in, O_RDWR+O_CREAT, S_IWUSR+S_IRUSR )) < 0 ) {
+if( (fhndl = open( in, O_RDWR+O_CREAT ,
+getFileCreationFlags()
+// // S_IWUSR+S_IRUSR
+)) < 0 ) {
 log( "image: Could not open file, %s, for writing: %s - %d.",
 in, mstrerror( m_errno ), fhndl );
 m_imgDataSize = 0;
Json.cpp (63 changes)

@@ -512,3 +512,66 @@ bool endsInCurly ( char *s , int32_t slen ) {
 if ( e >= m && *e == '}' ) return true;
 return false;
 }
+
+
+// Accepts a json string which has a top level object and a "key":val pair
+// return false unless jsonStr has the new key:val
+bool Json::prependKey(SafeBuf& jsonStr, char* keyVal) {
+int32_t ndx = jsonStr.indexOf('{');
+// no object? try array? fail for now
+if( ndx == -1 || ndx == jsonStr.length() - 1 ) return false;
+ndx++; //the insert pos
+if(ndx == jsonStr.length()) return false;
+
+// find if the object had any other keys
+int32_t jsonStrLen = jsonStr.length();
+int32_t i = ndx;
+while(i < jsonStrLen && isspace(jsonStr[i])) i++;
+if( i == jsonStrLen ) return false;
+
+
+
+if (jsonStr[i] != '}') {
+jsonStr.insert(",\n", i);
+} //else we are the only item, no comma
+
+return jsonStr.insert(keyVal, ndx);
+
+
+}
+
+
+// bool Json::printToString(SafeBuf& out, JsonItem* ji = NULL) {
+// if(!ji) ji = getFirstItem();
+
+// for ( ; ji ; ji = ji->m_next ) {
+// switch (ji->m_type) {
+// case JT_NULL:
+// out.safeMemcpy("null", 4);
+// break;
+// case JT_NUMBER:
+// int32_t vl;
+// char* v = ji->getValueAsString(&vl);
+// out.safeMemcpy(v, vl);
+// break;
+// case JT_STRING:
+// int32_t vl;
+// char* v = ji->getValueAsString(&vl);
+// out.pushChar('"');
+// out.safeMemcpy(v, vl);
+// out.pushChar('"');
+// break;
+// case JT_ARRAY:
+// // wha? really? I would've thought this would contain
+// // jsonitems and not a string
+// safeMemcpy(ji->m_valueArray, ji->m_valueArray);
+// break;
+// case JT_OBJECT:
+// out.pushChar('{');
+// out.safeMemcpy(v, vl);
+// out.pushChar("\"");
+// break;
+// }
+// }
+// out->
+// }
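A standalone restatement (not from the commit) of what prependKey() does, using std::string in place of SafeBuf so it runs by itself: splice the raw "key":value fragment right after the first '{', adding a comma only when the object already has members. The key name in main() is invented.

#include <cassert>
#include <cctype>
#include <string>

static bool prependKey(std::string &json, const std::string &keyVal) {
	size_t brace = json.find('{');
	if (brace == std::string::npos || brace + 1 >= json.size()) return false;
	size_t i = brace + 1;
	while (i < json.size() && isspace((unsigned char)json[i])) i++;
	if (i == json.size()) return false;
	if (json[i] != '}')              // object already has members:
		json.insert(i, ",\n");   // separate them from the new pair
	json.insert(brace + 1, keyVal);  // splice right after '{'
	return true;
}

int main() {
	std::string doc = "{\"type\":\"article\"}";
	assert(prependKey(doc, "\"newKey\":1"));
	assert(doc == "{\"newKey\":1,\n\"type\":\"article\"}");
	return 0;
}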
Json.h (7 changes)

@@ -24,6 +24,7 @@ class JsonItem {
 class JsonItem *m_next,*m_prev;
 class JsonItem *m_parent;//child;

+
 // the JT_* values above
 int m_type;

@@ -43,7 +44,6 @@ class JsonItem {

 char *m_valueArray;

-
 // for JT_String
 int32_t getValueLen() { return m_valueLen; };

@@ -78,6 +78,8 @@ class Json {

 JsonItem *parseJsonStringIntoJsonItems ( char *json , int32_t niceness );

+bool printToString(SafeBuf& out);
+
 JsonItem *getFirstItem ( ) ;

 JsonItem *getItem ( char *name );
@@ -86,6 +88,9 @@ class Json {

 Json() { m_stackPtr = 0; m_prev = NULL; };

+static bool prependKey(SafeBuf& jsonString, char* newKey);
+
+
 SafeBuf m_sb;
 JsonItem *m_stack[MAXJSONPARENTS];
 int32_t m_stackPtr;
Language.cpp (26 changes)

@@ -145,7 +145,7 @@ bool Language::convertLatin1DictToUTF8( char *infile ){
 // then open a new one for appending
 int fdw = open ( ff ,
 O_CREAT | O_RDWR | O_APPEND ,
-S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
+// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
 if ( fdw < 0 ){
 return log("lang: Could not open for %s "
 "writing: %s.",ff, strerror(errno));
@@ -2763,7 +2763,7 @@ bool Language::makeWordFiles ( int32_t numWordsToDump , int32_t numWordsPerPhras
 // then open a new one for appending
 fds[i] = open ( ff ,
 O_CREAT | O_RDWR | O_APPEND ,
-S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
+// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
 if ( fds[i] < 0 )
 return log("lang: Could not open %s for writing: "
 "%s.",ff, strerror(errno));
@@ -3146,7 +3146,7 @@ bool Language::makePopFiles ( int32_t numWordsToDump , int32_t numWordsPerPhrase
 // then open a new one for appending
 fds[i] = open ( ff ,
 O_CREAT | O_RDWR | O_APPEND ,
-S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
+// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
 if ( fds[i] < 0 )
 return log("lang: Could not open %s for writing: "
 "%s.",ff, strerror(errno));
@@ -3683,7 +3683,7 @@ bool Language::makeQueryFiles ( ) {
 // then open a new one for appending
 int fdw = open ( ff ,
 O_CREAT | O_RDWR | O_APPEND ,
-S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
+// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
 if ( fdw < 0 ){
 return log("lang: Could not open for %s "
 "writing: %s.",ff, strerror(errno));
@@ -3874,7 +3874,7 @@ bool Language::makeWikiFiles( ) {
 // then open a new one for appending
 int fdw = open ( ff ,
 O_CREAT | O_RDWR | O_APPEND ,
-S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
+// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
 if ( fdw < 0 ){
 log("lang: Could not open for %s "
 "writing: %s.",ff, strerror(errno));
@@ -4250,7 +4250,7 @@ bool Language::gotTermFreqs( StateDict *st ){
 // then open a new one for appending
 fd = open ( ff ,
 O_CREAT | O_RDWR | O_APPEND ,
-S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
+// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
 if ( fd < 0 ){
 log("lang: Could not open %s for writing: "
 "%s.",ff, strerror(errno));
@@ -4338,7 +4338,7 @@ bool StateAff::openAffinityFile( ){
 unlink ( ff );
 // then open a new one for appending
 m_fdw = open ( ff , O_CREAT | O_RDWR | O_APPEND ,
-S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
+// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
 if ( m_fdw < 0 ){
 log("lang: Could not open for %s "
 "writing: %s.",ff, strerror(errno));
@@ -4537,7 +4537,7 @@ bool Language::cleanDictFile ( ) {
 // then open a new one for appending
 int fdw = open ( ff ,
 O_CREAT | O_RDWR | O_APPEND ,
-S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
+// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
 if ( fdw < 0 ){
 return log("lang: Could not open for %s "
 "writing: %s.",ff, strerror(errno));
@@ -4590,7 +4590,7 @@ bool Language::makePhonet( char *infile){
 // then open a new one for appending
 fdw = open ( outfile ,
 O_CREAT | O_RDWR | O_APPEND ,
-S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
+// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
 if ( fdw < 0 )
 return log("lang: Could not open %s for writing: "
 "%s.", outfile, strerror(errno));
@@ -4711,7 +4711,7 @@ bool Language::genTopPopFile ( char *infile ){
 // then open a new one for appending
 fdw = open ( outfile ,
 O_CREAT | O_RDWR | O_APPEND ,
-S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
+// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
 if ( fdw < 0 )
 return log("lang: Could not open %s for writing: "
 "%s.", outfile, strerror(errno));
@@ -4761,7 +4761,8 @@ bool Language::genDistributedPopFile ( char *infile, uint32_t myHash ){
 // then open a new one for appending
 fdw = open ( outfile ,
 O_CREAT | O_RDWR | O_APPEND ,
-S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
+getFileCreationFlags() );
+// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
 if ( fdw < 0 )
 return log("lang: Could not open %s for writing: "
 "%s.", outfile, strerror(errno));
@@ -4848,7 +4849,8 @@ int32_t Language::spellcheckDict(){
 // then open a new one for appending
 fdw = open ( outfile ,
 O_CREAT | O_RDWR | O_APPEND ,
-S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
+getFileCreationFlags() );
+// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
 if ( fdw < 0 )
 return log("lang: Could not open %s for writing: "
 "%s.", outfile, strerror(errno));
LanguageIdentifier.cpp

@@ -961,7 +961,7 @@ static bool s_isLangTag(char *str) {

 static uint8_t s_getCountryFromSpec(char *str) {
 char code[6];
-memset(code, 6, 0);
+memset(code, 0,6);
 gbmemcpy(code, str, s_wordLen(str));
 for(int x = 0; x < 6; x++)
 if(code[x] > 'A' && code[x] < 'Z') code[x] -= ('A' - 'a');
Linkdb.cpp (18 changes)

@@ -603,6 +603,10 @@ bool getLinkInfo ( SafeBuf *reqBuf ,
 Host *hosts = g_hostdb.getShard ( shardNum); // Group ( groupId );
 if ( hostNum >= numHosts ) { char *xx = NULL; *xx = 0; }
 int32_t hostId = hosts [ hostNum ].m_hostId ;
+if( !hosts [ hostNum ].m_spiderEnabled) {
+hostId = g_hostdb.getHostIdWithSpideringEnabled ( shardNum );
+}
+

 // . serialize the string buffers
 // . use Msg25Request::m_buf[MAX_NEEDED]
@@ -665,7 +669,16 @@ static void sendReplyWrapper ( void *state ) {
 // sanity
 if ( req->m_udpSlot != slot2 ) { char *xx=NULL;*xx=0;}
 // if in table, nuke it
-g_lineTable.removeKey ( &req->m_siteHash64 );
+// but only if it was in SITE mode, not PAGE. we've lost our
+// table entry like this before.
+// TODO: if this still doesn't work then ensure the stored 'req'
+// is the same!
+if ( req->m_mode == MODE_SITELINKINFO ) {
+g_lineTable.removeKey ( &req->m_siteHash64 );
+if ( g_conf.m_logDebugLinkInfo )
+log("linkdb: removing sitehash64=%"INT64"",
+req->m_siteHash64);
+}

 nextLink:

@@ -746,6 +759,7 @@ void handleRequest25 ( UdpSlot *slot , int32_t netnice ) {
 if ( head->m_next )
 req->m_next = head->m_next;
 head->m_next = req;
+req->m_waitingInLine = 1;
 // note it for debugging
 log("build: msg25 request waiting in line for %s "
 "udpslot=0x%"PTRFMT"",
@@ -755,6 +769,8 @@ void handleRequest25 ( UdpSlot *slot , int32_t netnice ) {
 return;
 }

+req->m_waitingInLine = 0;
+
 // make a new Msg25
 Msg25 *m25;
 try { m25 = new ( Msg25 ); }
Linkdb.h (9 changes)

@@ -76,6 +76,15 @@ public:
 int32_t m_ourHostHash32 ;
 int32_t m_ourDomHash32 ;

+uint8_t m_waitingInLine:1;
+uint8_t m_reserved1:1;
+uint8_t m_reserved2:1;
+uint8_t m_reserved3:1;
+uint8_t m_reserved4:1;
+uint8_t m_reserved5:1;
+uint8_t m_reserved6:1;
+uint8_t m_reserved7:1;
+
 // new stuff
 int32_t m_siteHash32;
 int64_t m_siteHash64;
Log.cpp (10 changes)

@@ -132,8 +132,9 @@ bool Log::init ( char *filename ) {
 // open it for appending.
 // create with -rw-rw-r-- permissions if it's not there.
 m_fd = open ( m_filename ,
-O_APPEND | O_CREAT | O_RDWR ,
-S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH );
+O_APPEND | O_CREAT | O_RDWR ,
+getFileCreationFlags() );
+// S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH );
 if ( m_fd >= 0 ) return true;
 // bitch to stderr and return false on error
 fprintf(stderr,"could not open log file %s for appending\n",
@@ -422,8 +423,9 @@ bool Log::makeNewLogFile ( ) {
 // open it for appending.
 // create with -rw-rw-r-- permissions if it's not there.
 m_fd = open ( m_filename ,
-O_APPEND | O_CREAT | O_RDWR ,
-S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH );
+O_APPEND | O_CREAT | O_RDWR ,
+getFileCreationFlags() );
+// S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH );
 if ( m_fd >= 0 ) return true;
 // bitch to stderr and return false on error
 fprintf(stderr,"could not open new log file %s for appending\n",
Loop.cpp (49 changes)

@@ -1014,7 +1014,7 @@ void printStackTrace ( int signum , siginfo_t *info , void *ptr ) {
 // right now only works for 32 bit
 //if ( arch != 32 ) return;

-logf(LOG_DEBUG,"gb: seg fault. printing stack trace. use "
+logf(LOG_DEBUG,"gb: Printing stack trace. use "
 "'addr2line -e gb' to decode the hex below.");

 if ( g_inMemFunction ) {
@@ -1035,6 +1035,16 @@ void printStackTrace ( int signum , siginfo_t *info , void *ptr ) {
 //,ba
 //,g_profiler.getFnName(ba,0));
 );
+#ifdef INLINEDECODE
+char cmd[256];
+sprintf(cmd,"addr2line -e gb 0x%"XINT64" > ./tmpout"
+,(uint64_t)s_bt[i]);
+gbsystem ( cmd );
+char obuf[1024];
+SafeBuf fb (obuf,1024);
+fb.load("./tmpout");
+log("stack: %s",fb.getBufStart());
+#endif
 }
 }

@@ -1171,7 +1181,8 @@ void sigvtalrmHandler ( int x , siginfo_t *info , void *y ) {
 //g_inSigHandler = true;
 // NOT SAFE for pthreads cuz we're in sig handler
 #ifndef PTHREADS
-log("loop: missed quickpoll");
+log("loop: missed quickpoll. Dumping stack.");
+printStackTrace( x , info , y );
 #endif
 //g_inSigHandler = false;
 // seems to core a lot in gbcompress() we need to
@@ -1183,15 +1194,19 @@ void sigvtalrmHandler ( int x , siginfo_t *info , void *y ) {
 }

 // if it has been a while since heartbeat (> 10000ms) dump core so
-// we can see where the process was... that is a missed quick poll?
+// we can see where the process was... we are in a long niceness 0
+// function or a niceness 1 function without a quickpoll, so that
+// heartbeatWrapper() function never gets called.
 if ( g_process.m_lastHeartbeatApprox == 0 ) return;
 if ( g_conf.m_maxHeartbeatDelay <= 0 ) return;
 if ( g_nowApprox - g_process.m_lastHeartbeatApprox >
 g_conf.m_maxHeartbeatDelay ) {
 #ifndef PTHREADS
-logf(LOG_DEBUG,"gb: CPU seems blocked. Forcing core.");
+logf(LOG_DEBUG,"gb: CPU seems blocked. Dumping stack.");
+printStackTrace( x , info , y );
 #endif
 //char *xx=NULL; *xx=0;

 }

 //logf(LOG_DEBUG, "xxx now: %"INT64"! approx: %"INT64"", g_now, g_nowApprox);
@@ -2708,6 +2723,32 @@ void Loop::enableTimer() {
 }


+FILE* gbpopen(char* cmd) {
+// Block everything from interrupting this system call because
+// if there is an alarm or a child thread crashes (pdftohtml)
+// then this will hang forever.
+// We should actually write our own popen so that we do
+// fork, close all fds in the child, then exec.
+// These child processes can hold open the http server and
+// prevent a new gb from running even after it has died.
+g_loop.disableTimer();
+
+sigset_t oldSigs;
+sigset_t sigs;
+sigfillset ( &sigs );
+
+if ( sigprocmask ( SIG_BLOCK , &sigs, &oldSigs ) < 0 ) {
+log("build: had error blocking signals for popen");
+}
+FILE* fh = popen(cmd, "r");
+
+if ( sigprocmask ( SIG_SETMASK , &oldSigs, NULL ) < 0 ) {
+log("build: had error unblocking signals for popen");
+}
+
+g_loop.enableTimer();
+return fh;
+}
+
+
 //calling with a 0 niceness will turn off the timer interrupt
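A usage sketch for the new helper (not from the commit): gbpopen() hands back a FILE* exactly as popen(cmd, "r") would, and the commit adds no matching gbpclose(), so plain pclose() reaps the child. The command string below is hypothetical.

#include <cstdio>

FILE* gbpopen(char* cmd);        // declared in Loop.h by this commit

static bool readFirstLine(char *out, int outSize) {
	FILE *fh = gbpopen((char *)"pdftohtml -v 2>&1");  // hypothetical command
	if (!fh) return false;
	bool ok = (fgets(out, outSize, fh) != NULL);
	pclose(fh);              // reap the child like a normal popen() stream
	return ok;
}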
Loop.h (4 changes)

@@ -18,7 +18,9 @@
 #define QUERYPRIORITYWEIGHT 16
 #define QUICKPOLL_INTERVAL 10

-int gbsystem(char *cmd ) ;
+int gbsystem(char *cmd);
+FILE* gbpopen(char* cmd);
+

 #define sleep(a) { char *xx=NULL;*xx=0; }
 //#define sleep(a) logf(LOG_INFO,"sleep: sleep");
Makefile (15 changes)

@@ -67,7 +67,7 @@ OBJS = UdpSlot.o Rebalance.o \
 Dates.o Sections.o SiteGetter.o Syncdb.o qa.o \
 Placedb.o Address.o Test.o GeoIP.o GeoIPCity.o Synonyms.o \
 Cachedb.o Monitordb.o dlstubs.o PageCrawlBot.o Json.o PageBasic.o \
-Version.o
+Punycode.o Version.o

 CHECKFORMATSTRING = -D_CHECK_FORMAT_STRING_

@@ -407,7 +407,7 @@ Linkdb.o:

 # final gigabit generation in here:
 Msg40.o:
-	$(CC) $(DEFS) $(CPPFLAGS) -O3 -c $*.cpp
+	$(CC) $(DEFS) $(CPPFLAGS) -O2 -c $*.cpp

 seo.o:
 	$(CC) $(DEFS) $(CPPFLAGS) -O3 -c $*.cpp

@@ -788,14 +788,5 @@ install-pkgs-local:
 warcinjector:
 	-rm -r /home/zak/.pex/build/inject-*
 	-rm -r /home/zak/.pex/install/inject-*
-	cd script && pex -v . requests pyopenssl ndg-httpsclient pyasn1 multiprocessing flask -e inject -o warc-inject --inherit-path --no-wheel
+	cd script && pex -v . gevent gevent-socketio requests pyopenssl ndg-httpsclient pyasn1 multiprocessing -e inject -o warc-inject --inherit-path --no-wheel
-
-
-
-#pex -v inject requests pyopenssl ndg-httpsclient pyasn1 multiprocessing flask -e inject:main -o script/warc-inject -f '/home/zak/repos/open-source-search-engine/script' --inherit-path --no-wheel
-
-
-#pex -v inject requests pyopenssl ndg-httpsclient pyasn1 multiprocessing flask -e inject:main -o script/warc-inject -f '/home/zak/repos/open-source-search-engine/script' --inherit-path --no-wheel
-
-
-# pex -r requests -r pyopenssl -r ndg-httpsclient -r pyasn1 -r multiprocessing -e inject.inject:main -o script/warc-inject -f '/home/zak/repos/open-source-search-engine/script/' --inherit-path --no-wheel
Matches.cpp (70 changes)

@@ -1736,6 +1736,75 @@ bool Matches::negTermsFound ( ) {
 }
 */

+bool Matches::docHasQueryTerms(int32_t totalInlinks) {
+// Loop through all matches keeping a count of query term matches
+// from link text.
+// If a match is not from a link text max it out.
+// Tally up the matched terms vs number of matches
+// if only one or two link text matches out of > 10 then
+// return false indicating that the doc does not
+// have the term
+
+if(m_numMatches == 0) {
+// if there is no query and no matches then short circuit
+return true;
+}
+
+int32_t qterms = 1024;
+int32_t tmpBuf[qterms];
+int32_t *numMatches = tmpBuf;
+
+if(qterms < m_q->m_numTerms) {
+qterms = m_q->m_numTerms;
+numMatches = (int32_t *)mmalloc(qterms * sizeof(int32_t),
+"matchesAnomaly");
+}
+memset(numMatches, 0, qterms * sizeof(int32_t));
+
+for ( int32_t i = 0 ; i < m_numMatches ; i++ ) {
+// get the match
+Match *m = &m_matches[i];
+if(m->m_flags & MF_LINK) {
+numMatches[m->m_qwordNum]++;
+continue;
+}
+numMatches[m->m_qwordNum] = m_numMatches;
+// log("match flag %x wordnum %"INT32 " totalinlinks:%"INT32,
+// m->m_flags, m->m_wordNum, totalInlinks);
+}
+
+
+// Assume the best, since we're really only after anomalous link text
+// at this point.
+bool hasTerms = true;
+int32_t nqt = m_q->m_numTerms;
+for ( int32_t i = 0 ; i < nqt ; i++ ) {
+QueryTerm *qt = &m_q->m_qterms[i];
+// skip if ignored *in certain ways only*
+if ( ! isMatchableTerm ( qt ) ) {
+continue;
+}
+// get the word it is from
+QueryWord *qw = qt->m_qword;
+
+// It is a match if it matched something other than link text
+// or it matched at least 1 link text and there arent many link texts
+// or it matched more than 2 link texts and there are many link texts
+hasTerms &= ((numMatches[qw->m_wordNum] >= m_numMatches) ||
+(numMatches[qw->m_wordNum] > 0 && totalInlinks < 10) ||
+(numMatches[qw->m_wordNum] > 2 && totalInlinks > 10));
+}
+
+if (numMatches != tmpBuf) {
+mfree(numMatches, qterms * sizeof(int32_t), "matchesAnomaly");
+}
+return hasTerms;
+}
+
+
+
+
+
 MatchOffsets::MatchOffsets() {
 reset();
 }
@@ -1804,6 +1873,7 @@ bool MatchOffsets::set(Xml * xml, Words *words, Matches *matches,
 return true;
 }

+
 int32_t MatchOffsets::getStoredSize() {
 return m_numMatches * 5
 + 4 //numMatches
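The per-term acceptance test in docHasQueryTerms() above reduces to three clauses; plugging in numbers shows the anomalous-anchor-text case it filters. A standalone restatement (not from the commit) with invented counts:

#include <cstdio>

// count = numMatches[qw->m_wordNum]; other arguments as in the new function
static bool termPresent(int count, int numMatches, int totalInlinks) {
	return (count >= numMatches)            // matched outside link text: maxed out
	    || (count > 0 && totalInlinks < 10) // few inlinks: one link hit is enough
	    || (count > 2 && totalInlinks > 10);// many inlinks: need more than two hits
}

int main() {
	// 40 matches total, 50 inlinks: a term seen in only two link texts fails,
	// which is the anomalous link-text case the commit filters out
	printf("%d\n", termPresent(2, 40, 50)); // 0
	printf("%d\n", termPresent(3, 40, 50)); // 1
	printf("%d\n", termPresent(1, 40, 5));  // 1
	return 0;
}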
Matches.h

@@ -148,6 +148,7 @@ class Matches {
 //int32_t getTermsFound ( bool *hadPhrases , bool *hadWords );
 uint32_t getTermsFound2(bool *hadPhrases, bool *hadWords);
 //bool negTermsFound ( );
+bool docHasQueryTerms(int32_t totalInlinks);

 // used internally and by PageGet.cpp
 bool isMatchableTerm ( class QueryTerm *qt );//, int32_t i );
Mem.cpp (2 changes)

@@ -1718,7 +1718,7 @@ void Mem::gbfree ( void *ptr , int size , const char *note ) {
 int32_t slot = g_mem.getMemSlot ( ptr );
 if ( slot < 0 ) {
 log(LOG_LOGIC,"mem: could not find slot (note=%s)",note);
-log(LOG_LOGIC,"mem: FIXME!!!");
+//log(LOG_LOGIC,"mem: FIXME!!!");
 // return for now so procog does not core all the time!
 return;
 //char *xx = NULL; *xx = 0;
Msg13.cpp (22 changes)

@@ -282,6 +282,12 @@ bool Msg13::forwardRequest ( ) {
 int32_t nh = g_hostdb.m_numHosts;
 int32_t hostId = hash32h(((uint32_t)r->m_firstIp >> 8), 0) % nh;

+if((uint32_t)r->m_firstIp >> 8 == 0) {
+// If the first IP is not set for the request then we don't
+// want to hammer the first host with spidering enabled.
+hostId = hash32n ( r->ptr_url ) % nh;
+}
+
 // avoid host #0 for diffbot hack which is dropping some requests
 // because of the streaming bug methinks
 if ( hostId == 0 && nh >= 2 && g_conf.m_diffbotMsg13Hack )
@@ -295,12 +301,22 @@ bool Msg13::forwardRequest ( ) {
 // get that host
 //h = g_hostdb.getProxy ( hostId );;
 h = g_hostdb.getHost ( hostId );
-// stop if he is alive
-if ( ! g_hostdb.isDead ( h ) ) break;
+
+// Get the other one in shard instead of getting the first
+// one we find sequentially because that makes the load
+// imbalanced to the lowest host with spidering enabled.
+if(!h->m_spiderEnabled) {
+h = g_hostdb.getHost(g_hostdb.getHostIdWithSpideringEnabled(
+h->m_hostId));
+}
+
+// stop if he is alive and able to spider
+if ( h->m_spiderEnabled && ! g_hostdb.isDead ( h ) ) break;
 // get the next otherwise
 if ( ++hostId >= nh ) hostId = 0;
 }

 hostId = 0; // HACK!!

 // forward it to self if we are the spider proxy!!!
@@ -2364,7 +2380,7 @@ bool getTestSpideredDate ( Url *u , int32_t *origSpideredDate , char *testDir )
 bool addTestSpideredDate ( Url *u , int32_t spideredTime , char *testDir ) {

 // ensure dir exists
-::mkdir(testDir,S_IRWXU);
+::mkdir(testDir,getDirCreationFlags());

 // set this
 int64_t uh64 = hash64(u->getUrl(),u->getUrlLen());
@@ -57,8 +57,9 @@ void handleRequest ( UdpSlot *slot , int32_t netnice ) {
 return;
 }

-int32_t fd = open ( filename , O_RDONLY,
-S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH );
+int32_t fd = open ( filename , O_RDONLY ,
+getFileCreationFlags() );
+// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH );
 if ( ! fd ) {
 log(LOG_DEBUG, "logviewer: Failed to open %s for reading: ",
 filename);
Msg20.cpp (24 changes)

@@ -177,6 +177,12 @@ bool Msg20::getSummary ( Msg20Request *req ) {
 int32_t timeout = 9999999; // 10 million seconds, basically inf.
 if ( req->m_niceness == 0 ) timeout = 20;

+// for diffbot make timeout super long so we aren't tripped up
+// by dead hosts that aren't really dead.
+// CollectionRec *cr = g_collectiondb.getRec ( req->m_collnum );
+// if ( cr && cr->m_isCustomCrawl && req->m_niceness == 0 )
+// timeout = 300;
+
 // get our group
 int32_t allNumHosts = hostdb->getNumHostsPerShard();
 Host *allHosts = hostdb->getShard ( shardNum );//getGroup(groupId );
@@ -189,13 +195,29 @@ bool Msg20::getSummary ( Msg20Request *req ) {
 Host *hh = &allHosts[i];
 // skip if dead
 if ( g_hostdb.isDead(hh) ) continue;
+
+// Respect no-spider, no-query directives from hosts.conf
+if ( !req->m_getLinkInfo && ! hh->m_queryEnabled ) continue;
+if ( req->m_getLinkInfo && ! hh->m_spiderEnabled ) continue;
 // add it if alive
 cand[nc++] = hh;
 }
 // if none alive, make them all candidates then
 bool allDead = (nc == 0);
-for ( int32_t i = 0 ; allDead && i < allNumHosts ; i++ )
+for ( int32_t i = 0 ; allDead && i < allNumHosts ; i++ ) {
+// NEVER add a noquery host to the candidate list, even
+// if the query host is dead
+if ( ! allHosts[i].m_queryEnabled ) continue;
 cand[nc++] = &allHosts[i];
+}
+
+if ( nc == 0 ) {
+log("msg20: error sending mcast: no queryable hosts "
+"available to handle summary generation");
+g_errno = EBADENGINEER;
+m_gotReply = true;
+return true;
+}
+
 // route based on docid region, not parity, because we want to hit
 // the urldb page cache as much as possible
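The docid-region routing referred to in the last two comment lines (and visible in the block Msg22.cpp deletes below) buckets the docid over a 128M ring divided evenly among the nc candidates, so a given docid keeps hitting the same host's urldb page cache. A worked restatement (not from the commit) with invented numbers:

#include <cstdint>
#include <cstdio>

int main() {
	int64_t nc = 2;                          // live candidate hosts
	int64_t ring = 128LL * 1024 * 1024;      // 128M docid ring
	int64_t sectionWidth = ring / nc + 1;    // bucket width per candidate
	int64_t docId = 123456789LL;
	int32_t hostNum = (int32_t)((docId % ring) / sectionWidth);
	if (hostNum < 0) hostNum = 0;            // guard against negative docids
	printf("docid %lld -> candidate #%d\n", (long long)docId, hostNum);
	return 0;
}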
Msg22.cpp (45 changes)

@@ -157,46 +157,13 @@ bool Msg22::getTitleRec ( Msg22Request *r ,
 if ( hostNum >= numHosts ) { char *xx = NULL; *xx = 0; }
 firstHostId = hosts [ hostNum ].m_hostId ;
 */

+Host *firstHost ;
+// if niceness 0 can't pick noquery host.
+// if niceness 1 can't pick nospider host.
+firstHost = g_hostdb.getLeastLoadedInShard ( shardNum, r->m_niceness );
+int32_t firstHostId = firstHost->m_hostId;
+
-// get our group
-int32_t allNumHosts = g_hostdb.getNumHostsPerShard();
-Host *allHosts = g_hostdb.getShard ( shardNum );//Group ( groupId );
-
-// put all alive hosts in this array
-Host *cand[32];
-int64_t nc = 0;
-for ( int32_t i = 0 ; i < allNumHosts ; i++ ) {
-// get that host
-Host *hh = &allHosts[i];
-// skip if dead
-if ( g_hostdb.isDead(hh) ) continue;
-// add it if alive
-cand[nc++] = hh;
-}
-// if none alive, make them all candidates then
-bool allDead = (nc == 0);
-for ( int32_t i = 0 ; allDead && i < allNumHosts ; i++ )
-cand[nc++] = &allHosts[i];
-
-// route based on docid region, not parity, because we want to hit
-// the urldb page cache as much as possible
-int64_t sectionWidth =((128LL*1024*1024)/nc)+1;//(DOCID_MASK/nc)+1LL;
-// we mod by 1MB since tied scores resort to sorting by docid
-// so we don't want to overload the host responsible for the lowest
-// range of docids. CAUTION: do this for msg22 too!
-// in this way we should still ensure a pretty good biased urldb
-// cache...
-// . TODO: fix the urldb cache preload logic
-int32_t hostNum = (docId % (128LL*1024*1024)) / sectionWidth;
-if ( hostNum < 0 ) hostNum = 0; // watch out for negative docids
-if ( hostNum >= nc ) { char *xx = NULL; *xx = 0; }
-int32_t firstHostId = cand [ hostNum ]->m_hostId ;
-
 // while this prevents tfndb seeks, it also causes bottlenecks
 // if one host is particularly slow, because load balancing is
 // bypassed.
 //if ( ! g_conf.m_useBiasedTfndb ) firstHostId = -1;
 // flag it
 m_outstanding = true;
 r->m_inUse = 1;
Msg3.cpp (2 changes)

@@ -1115,6 +1115,8 @@ bool Msg3::doneScanning ( ) {
 m_lists[i].getListSize() ,
 0 ); // timestamp. 0 = now

+QUICKPOLL(m_niceness);
+
 // if from our 'page' cache, no need to constrain
 if ( ! m_lists[i].constrain ( m_startKey ,
 m_constrainKey , // m_endKey
Msg3a.cpp (16 changes)

@@ -470,6 +470,12 @@ bool Msg3a::gotCacheReply ( ) {
 for ( int32_t i = 0; i < m_numHosts ; i++ ) { // m_indexdbSplit; i++ ) {
 // get that host
 Host *h = g_hostdb.getHost(i);
+
+if(!h->m_queryEnabled) {
+m_numReplies++;
+continue;
+}
+
 // if not a full split, just round robin the group, i am not
 // going to sweat over performance on non-fully split indexes
 // because they suck really bad anyway compared to full
@@ -701,10 +707,12 @@ bool Msg3a::gotAllShardReplies ( ) {
 // bad reply?
 if ( ! mr || replySize < 29 ) {
 m_skippedShards++;
-log(LOG_LOGIC,"query: msg3a: Bad reply (size=%i) from "
-"host #%"INT32". Dead? Timeout? OOM?"
-,(int)replySize
-,i);
+if(g_hostdb.getHost(i)->m_queryEnabled) {
+log(LOG_LOGIC,"query: msg3a: Bad reply (size=%i) from "
+"host #%"INT32". Dead? Timeout? OOM?"
+,(int)replySize
+,i);
+}
 m_reply [i] = NULL;
 m_replyMaxSize[i] = 0;
 // it might have been timed out, just ignore it!!
57
Msg4.cpp
57
Msg4.cpp
@@ -1434,8 +1434,9 @@ bool saveAddsInProgress ( char *prefix ) {
sprintf ( filename , "%s%saddsinprogress.saving",
g_hostdb.m_dir , prefix );

int32_t fd = open ( filename, O_RDWR | O_CREAT | O_TRUNC ,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH );
int32_t fd = open ( filename, O_RDWR | O_CREAT | O_TRUNC ,
getFileCreationFlags() );
// S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH );
if ( fd < 0 ) {
log ("build: Failed to open %s for writing: %s",
filename,strerror(errno));
@@ -1460,6 +1461,12 @@ bool saveAddsInProgress ( char *prefix ) {
// 4 bytes is how much of the total buffer is used, including
// those 4 bytes.
if ( used == 4 ) continue;
// test it
if ( used <= 4 || used > 300000000 ) { // > 300MB????
log("msg4: saving addsinprogress. bad bucket "
"used size of %"INT32,used);
continue;
}
// the buf itself
write ( fd , s_hostBufs[i] , used );
}
@@ -1473,6 +1480,20 @@ bool saveAddsInProgress ( char *prefix ) {
if ( ! slot->m_callback ) continue;
// skip if got reply
if ( slot->m_readBuf ) continue;
// if not sending something, skip
if ( ! slot->m_sendBuf ) continue;
// test it
int32_t used = *(int32_t *)slot->m_sendBuf;
if ( used <= 4 || used > 300000000 ) { // > 300MB????
log("msg4: saving addsinprogress. bad slot "
"used size of %"INT32,used);
continue;
}
if ( used != slot->m_sendBufSize ) {
log("msg4: saving addsinprogress. bad used size of "
"%"INT32" != %"INT32,used,slot->m_sendBufSize);
continue;
}
// write hostid sent to
write ( fd , &slot->m_hostId , 4 );
// write that
@@ -1510,6 +1531,9 @@ bool saveAddsInProgress ( char *prefix ) {
g_hostdb.m_dir , prefix );

::rename ( filename , newFilename );

log(LOG_INFO,"build: Renamed %s to %s",filename,newFilename);

return true;
}

@@ -1577,12 +1601,12 @@ bool loadAddsInProgress ( char *prefix ) {
p += 4;
if ( numHostBufs != s_numHostBufs ) {
g_errno = EBADENGINEER;
return log("build: addsinprogress.dat has wrong number of "
"host bufs.");
log("build: addsinprogress.dat has wrong number of "
"host bufs.");
}

// deserialize each hostbuf
for ( int32_t i = 0 ; i < s_numHostBufs ; i++ ) {
for ( int32_t i = 0 ; i < numHostBufs ; i++ ) {
// break if nothing left to read
if ( p >= pend ) break;
// USED size of the buf
@@ -1595,6 +1619,8 @@ bool loadAddsInProgress ( char *prefix ) {
s_hostBufSizes[i] = 0;
continue;
}
if ( used < 4 || used > 300000000 )
return log("msg4: bad used bytes in bucket 1");
// malloc the min buf size
int32_t allocSize = MAXHOSTBUFSIZE;
if ( allocSize < used ) allocSize = used;
@@ -1620,6 +1646,12 @@ bool loadAddsInProgress ( char *prefix ) {
log("build: file %s is bad.",filename);
char *xx = NULL; *xx = 0;
}
if ( i >= s_numHostBufs ) {
mfree ( buf , allocSize ,"hostbuf");
log("build: skipping host buf #%"INT32,i);
continue;
}

// set the array
s_hostBufs [i] = buf;
s_hostBufSizes [i] = allocSize;
@@ -1635,15 +1667,12 @@ bool loadAddsInProgress ( char *prefix ) {
p += 4;
// get host
Host *h = g_hostdb.getHost(hostId);
// must be there
if ( ! h ) {
close (fd);
return log("build: bad msg4 hostid %"INT32"",hostId);
}
// how many bytes
int32_t numBytes;
read ( fd , (char *)&numBytes , 4 );
p += 4;
if ( numBytes < 4 || numBytes > 300000000 )
return log("msg4: bad used bytes in slot 1");
// allocate buffer
char *buf = (char *)mmalloc ( numBytes , "msg4loadbuf");
if ( ! buf ) {
@@ -1657,6 +1686,14 @@ bool loadAddsInProgress ( char *prefix ) {
return log("build: bad msg4 buf read");
}
p += numBytes;
// must be there
if ( ! h ) {
//close (fd);
log("build: bad msg4 hostid %"INT32" nb=%"INT32,
hostId,nb);
mfree ( buf , numBytes,"hostbuf");
continue;
}
// send it!
if ( ! g_udpServer.sendRequest ( buf ,
numBytes ,
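Both the save and load paths above reject any buffer whose leading 4-byte "used" word is implausible. A small helper capturing the sanity check these hunks apply, as inferred from the diff (the 300MB ceiling is the value the source uses):

// a host buf is its 4-byte used size (which counts itself) followed
// by the data; anything <= 4 bytes or over 300MB is treated as
// corruption and skipped rather than aborting the whole load.
static bool validUsedSize ( int32_t used ) {
    return used > 4 && used <= 300000000;
}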
45 Msg40.cpp
@@ -83,7 +83,7 @@ static bool gotSummaryWrapper ( void *state );
bool isSubDom(char *s , int32_t len);

Msg40::Msg40() {
m_firstTime = true;
m_calledFacets = false;
m_doneWithLookup = false;
m_socketHadError = 0;
m_buf = NULL;
@@ -109,6 +109,8 @@ Msg40::Msg40() {
m_printCount = 0;
//m_numGigabitInfos = 0;
m_numCollsToSearch = 0;
m_numMsg20sIn = 0;
m_numMsg20sOut = 0;
}

#define MAX2 50
@@ -1427,8 +1429,12 @@ bool Msg40::launchMsg20s ( bool recalled ) {
// hard limit
if ( m_numRequests-m_numReplies >= maxOut ) break;
// do not launch another until m_printi comes back because
// all summaries are bottlenecked on printing him out now
// all summaries are bottlenecked on printing him out now.
if ( m_si->m_streamResults &&
// must have at least one outstanding summary guy
// otherwise we can return true below and cause
// the stream to truncate results in gotSummary()
//m_numReplies < m_numRequests &&
i >= m_printi + MAX_OUTSTANDING_MSG20S - 1 )
break;

@@ -1499,8 +1505,21 @@ bool Msg40::launchMsg20s ( bool recalled ) {
// if to a dead host, skip it
int64_t docId = m_msg3a.m_docIds[i];
uint32_t shardNum = g_hostdb.getShardNumFromDocId ( docId );
if ( g_hostdb.isShardDead ( shardNum ) ) {
log("msg40: skipping summary lookup #%"INT32" of "
// get the collection rec
CollectionRec *cr = g_collectiondb.getRec(m_firstCollnum);
// if shard is dead then do not send to it if not crawlbot
if ( g_hostdb.isShardDead ( shardNum ) &&
cr &&
// diffbot urls.csv downloads often encounter dead
// hosts that are not really dead, so wait for it
! cr->m_isCustomCrawl &&
// this is causing us to truncate streamed results
// too early when we have false positives that a
// host is dead because the server is locking up
// periodically
! m_si->m_streamResults ) {
log("msg40: skipping summary "
"lookup #%"INT32" of "
"docid %"INT64" for dead shard #%"INT32""
, i
, docId
@@ -1547,8 +1566,6 @@ bool Msg40::launchMsg20s ( bool recalled ) {
// keep for-loops shorter with this
//if ( i > m_maxiLaunched ) m_maxiLaunched = i;

// get the collection rec
CollectionRec *cr =g_collectiondb.getRec(m_firstCollnum);
//getRec(m_si->m_coll2,m_si->m_collLen2);
if ( ! cr ) {
log("msg40: missing coll");
@@ -1737,7 +1754,7 @@ Msg20 *Msg40::getAvailMsg20 ( ) {
if ( m_msg20[i]->m_launched ) continue;
return m_msg20[i];
}
// how can this happen???
// how can this happen??? THIS HAPPENED
char *xx=NULL;*xx=0;
return NULL;
}
@@ -1762,7 +1779,7 @@ bool gotSummaryWrapper ( void *state ) {
THIS->m_numReplies,
THIS->m_msg3a.m_numDocIds);
// it returns false if we're still awaiting replies
if ( ! THIS->gotSummary ( ) ) return false;
if ( ! THIS->m_calledFacets && ! THIS->gotSummary ( ) ) return false;
// lookup facets
if ( THIS->m_si &&
! THIS->m_si->m_streamResults &&
@@ -2215,12 +2232,11 @@ bool Msg40::gotSummary ( ) {

complete:

// . ok, now i wait for everybody.
// . ok, now i wait for all msg20s (getsummary) to come back in.
// . TODO: evaluate if this hurts us
if ( m_numReplies < m_numRequests )
return false;

// if streaming results, we are done
if ( m_si && m_si->m_streamResults ) {
// unless waiting for last transmit to complete
@@ -2444,6 +2460,9 @@ bool Msg40::gotSummary ( ) {
for ( int32_t i = 0 ; dedupPercent && i < m_numReplies ; i++ ) {
// skip if already invisible
if ( m_msg3a.m_clusterLevels[i] != CR_OK ) continue;
// Skip if invalid
if ( m_msg20[i]->m_errno ) continue;

// start with the first docid we have not yet checked!
//int32_t m = oldNumContiguous;
// get it
@@ -2462,6 +2481,8 @@ bool Msg40::gotSummary ( ) {
// skip if already invisible
if ( *level != CR_OK ) continue;
// get it
if ( m_msg20[m]->m_errno ) continue;

Msg20Reply *mrm = m_msg20[m]->m_r;
// do not dedup CT_STATUS results, those are
// spider reply "documents" that indicate the last
@@ -6280,8 +6301,8 @@ bool Msg40::lookupFacets ( ) {

if ( m_doneWithLookup ) return true;

if ( m_firstTime ) {
m_firstTime = false;
if ( !m_calledFacets ) {
m_calledFacets = true;
m_numMsg20sOut = 0;
m_numMsg20sIn = 0;
m_j = 0;
2 Msg40.h
@@ -223,7 +223,7 @@ class Msg40 {
bool m_doneWithLookup;
HashTableX m_facetTextTable;
SafeBuf m_facetTextBuf;
bool m_firstTime;
bool m_calledFacets;
int32_t m_omitCount;

bool printFacetTables ( class SafeBuf *sb ) ;
2 MsgC.cpp
@@ -88,7 +88,7 @@ bool MsgC::getIp(char *hostname , int32_t hostnameLen ,
if ( g_dns.isInCache ( key , ip ) ) {
if ( *ip == 3 ) { char *xx=NULL;*xx=0; }
// debug msg
//log("dns::getIp: %s (key=%"UINT64") has ip=%s in cache!!!",
//log(LOG_DEBUG, "dns::getIp: %s (key=%"UINT64") has ip=%s in cache!!!",
// tmp,key.n0,iptoa(*ip));
return true;
}
Multicast.cpp
@@ -607,6 +607,11 @@ loop:
// debug msg
//log("Multicast:: no hosts left to send to");
g_errno = ENOHOSTS; return false; }

// log("build: msg %x sent to host %"INT32 " first hostId is %"INT32
// " outstanding msgs %"INT32,
// m_msgType, i, firstHostId, m_hostPtrs[i]->m_numOutstandingRequests);

// . send to this guy, if we haven't yet
// . returns false and sets g_errno on error
// . if it returns true, we sent ok, so we should return true
PageCrawlBot.cpp
@@ -3570,6 +3570,10 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,

" <a href=/v3/crawl/download/%s_urls.csv>"
"new csv format</a>"

" <a href=/search?q=gbsortby"
"int%%3AgbssSpiderTime&n=50&c=%s>"
"last 50 download attempts</a>"

"</td>"
"</tr>"
@@ -3645,6 +3649,10 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
// urls.csv new format v3
, cr->m_coll

// last 50 downloaded urls
, cr->m_coll

// latest objects in html
, cr->m_coll
, rand64
PageGet.cpp
@@ -483,7 +483,7 @@ bool processLoop ( void *state ) {
"><td>"
//"<font face=times,sans-serif color=black size=-1>"
"<span style=\"%s\">"
"This is Gigablast's cached page of </span>"
"This is <a href=/>Gigablast<a>'s cached page of </span>"
"<a href=\"%s\" style=\"%s\">%s</a>"
"" , styleTitle, f->getUrl(), styleLink,
f->getUrl() );
PageHosts.cpp
@@ -200,7 +200,7 @@ skipReplaceHost:

//"<td><b>resends sent</td>"
//"<td><b>errors recvd</td>"
"<td><b>try agains recvd</b></td>"
"<td><b>try agains sent</b></td>"

"<td><a href=\"/admin/hosts?c=%s&sort=3\">"
"<b>dgrams resent</b></a></td>"
@@ -630,6 +630,15 @@ skipReplaceHost:
if ( !(flags & PFLAG_MERGEMODE0) )
fb.safePrintf ( "y");

if ( format == FORMAT_HTML && !h->m_spiderEnabled) {
fb.safePrintf("<span title=\"Spider Disabled\" style=\"text-decoration:line-through;\">S</span>");
}
if ( format == FORMAT_HTML && !h->m_queryEnabled) {
fb.safePrintf("<span title=\"Query Disabled\" style=\"text-decoration:line-through;\">Q</span>");
}

// clear it if it is us, this is invalid
if ( ! h->m_gotPingReply ) {
fb.reset();
@@ -758,6 +767,13 @@ skipReplaceHost:
sb.safePrintf("\t\t<note>%s</note>\n",
h->m_note );

sb.safePrintf("\t\t<spider>%"INT32"</spider>\n",
(int32_t)h->m_spiderEnabled );

sb.safePrintf("\t\t<query>%"INT32"</query>\n",
(int32_t)h->m_queryEnabled );

sb.safePrintf("\t</host>\n");

continue;
@@ -859,6 +875,14 @@ skipReplaceHost:
sb.safePrintf("\t\t\"note\":\"%s\"\n",
h->m_note );

sb.safePrintf("\t\t\"spider\":\"%"INT32"\"\n",
(int32_t)h->m_spiderEnabled );

sb.safePrintf("\t\t\"query\":\"%"INT32"\"\n",
(int32_t)h->m_queryEnabled );

sb.safePrintf("\t},\n");

continue;
@@ -1313,12 +1337,14 @@ skipReplaceHost:
*/

"<tr class=poo>"
"<td>try agains recvd</td>"
"<td>try agains sent</td>"
"<td>How many ETRYAGAIN errors "
"were received in response to a "
"has this host sent out? they are sent out sometimes "
"in response to a "
"request to add data. Usually because the host's memory "
"is full and it is dumping its data to disk. This number "
"can be high if the host is failing to dump the data "
"can be relatively high if the host is failing to dump "
"the data "
"to disk because of some malfunction, and it can therefore "
"bottleneck the entire cluster."
"</td>"
PageInject.cpp
@@ -131,6 +131,53 @@ Host *getHostToHandleInjection ( char *url ) {
Host *group = g_hostdb.getShard ( shardNum );
int32_t hostNum = docId % g_hostdb.m_numHostsPerShard;
Host *host = &group[hostNum];

bool isWarcInjection = false;
int32_t ulen = gbstrlen(url);
if ( ulen > 10 && strcmp(url+ulen-8,".warc.gz") == 0 )
isWarcInjection = true;
if ( ulen > 10 && strcmp(url+ulen-5,".warc") == 0 )
isWarcInjection = true;

if ( ! isWarcInjection ) return host;

// warc files end up calling XmlDoc::indexWarcOrArc() which spawns
// a msg7 injection request for each doc in the warc/arc file
// so let's do load balancing differently for them so one host
// doesn't end up doing a bunch of wget/gunzips on warc files
// thereby bottlenecking the cluster. get the first hostid that
// we have not sent a msg7 injection request to that is still out
for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
Host *h = g_hostdb.getHost(i);
h->m_tmpCount = 0;
}
for ( UdpSlot *slot = g_udpServer.m_head2 ;
slot ;
slot = slot->m_next2 ) {
// skip if not injection request
if ( slot->m_msgType != 0x07 ) continue;
//if ( ! slot->m_weInitiated ) continue;
// if we did not initiate the injection request, i.e. if
// it is to us, skip it
if ( ! slot->m_callback ) continue;
// who is it from?
int32_t hostId = slot->m_hostId;
if ( hostId < 0 ) continue;
Host *h = g_hostdb.getHost ( hostId );
if ( ! h ) continue;
h->m_tmpCount++;
}
int32_t min = 999999;
Host *minh = NULL;
for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
Host *h = g_hostdb.getHost(i);
if ( h->m_tmpCount == 0 ) return h;
if ( h->m_tmpCount >= min ) continue;
min = h->m_tmpCount;
minh = h;
}
if ( minh ) return minh;
// how can this happen?
return host;
}
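The load-balancing rule just added tallies outstanding msg7 requests per host into m_tmpCount and prefers an idle host, falling back to the least-loaded one. The selection step in isolation, as a sketch with plain types (the Host scratch field and hostdb accessors are as in the diff):

// pick the first idle host, else the host with the fewest
// outstanding warc-injection requests; NULL if there are no hosts
static Host *pickLeastLoaded ( ) {
    int32_t min = 999999;
    Host *minh = NULL;
    for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
        Host *h = g_hostdb.getHost(i);
        if ( h->m_tmpCount == 0 ) return h; // idle host wins outright
        if ( h->m_tmpCount < min ) { min = h->m_tmpCount; minh = h; }
    }
    return minh;
}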
@@ -182,6 +229,9 @@ bool Msg7::sendInjectionRequestToHost ( InjectionRequest *ir ,
return log("inject: url too big.");
}

// hack fix core
if ( ir->size_metadata == 0 ) ir->ptr_metadata = NULL;

int32_t sirSize = 0;
char *sir = serializeMsg2 ( ir ,
sizeof(InjectionRequest),
@@ -615,7 +665,7 @@ void sendUdpReply7 ( void *state ) {

uint32_t statColor = 0xccffcc;
if(xd->m_indexCode) {
statColor = 0x4e99e9;
statColor = 0xaaddaa;//0x4e99e9;
}
g_stats.addStat_r ( xd->m_rawUtf8ContentSize,
xd->m_injectStartTime,
@@ -652,11 +702,29 @@ void sendUdpReply7 ( void *state ) {

void handleRequest7 ( UdpSlot *slot , int32_t netnice ) {

InjectionRequest *ir = (InjectionRequest *)slot->m_readBuf;

// now just supply the first guy's char ** and size ptr
deserializeMsg2 ( &ir->ptr_url, &ir->size_url );
if ( ! deserializeMsg2 ( &ir->ptr_url, &ir->size_url ) ) {
log("inject: error deserializing inject request from "
"host ip %s port %i",iptoa(slot->m_ip),(int)slot->m_port);
g_errno = EBADREQUEST;
g_udpServer.sendErrorReply(slot,g_errno);
//g_corruptCount++;
return;
}

// the url can be like xyz.com. so need to do another corruption
// test for ia
if ( ! ir->ptr_url ) { // || strncmp(ir->ptr_url,"http",4) != 0 ) {
//log("inject: trying to inject NULL or non http url.");
log("inject: trying to inject NULL url.");
g_errno = EBADURL;
//g_corruptCount++;
g_udpServer.sendErrorReply(slot,g_errno);
return;
}

CollectionRec *cr = g_collectiondb.getRec ( ir->m_collnum );
if ( ! cr ) {
@@ -692,6 +760,10 @@ void handleRequest7 ( UdpSlot *slot , int32_t netnice ) {
s_injectHead = xd;
s_injectTail = xd;
}
if(ir->ptr_content && ir->ptr_content[ir->size_content - 1]) {
// XmlDoc expects this buffer to be null terminated.
char *xx=NULL;*xx=0;
}

if ( ! xd->injectDoc ( ir->ptr_url , // m_injectUrlBuf.getBufStart() ,
cr ,
@@ -722,7 +794,8 @@ void handleRequest7 ( UdpSlot *slot , int32_t netnice ) {
ir->m_injectDocIp ,
ir->ptr_contentDelim,
ir->ptr_metadata,
ir->size_metadata
ir->size_metadata,
ir->size_content - 1 // there should be a null in that last byte
) )
// we blocked...
return;
PageLogView.cpp
@@ -240,7 +240,7 @@ bool showLine ( SafeBuf *sb , char *s , int32_t len ) {

return sb->brify ( s , len ,
0 , // niceness
80 , // cols
8000 , // cols
"<br>",
false ); // isHtml?
}
PageResults.cpp
@@ -600,6 +600,14 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
// ! cr->m_isCustomCrawl )
// si->m_docsWanted = maxpp;

// BUT if it is a custom diffbot crawl with no &stream=1 option,
// then to prevent a results page of 1.6GB, limit it here
if ( si->m_docsWanted > 1000 && ! si->m_streamResults ) {
si->m_docsWanted = 1000;
log("query: limiting query %s without &stream=1 option to "
"%"INT32" results.",st->m_si.m_displayQuery,1000);
}

st->m_numDocIds = si->m_docsWanted;

// watch out for cowboys
@@ -5008,26 +5016,31 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
// print the URL
//
////////////

StackBuf(tmpBuf);
char* displayUrl = Url::getDisplayUrl(url, &tmpBuf);
uint32_t displayUrlLen = tmpBuf.length();

// hack off the http:// if any for displaying it on screen
if ( urlLen > 8 && strncmp ( url , "http://" , 7 )==0 ) {
url += 7; urlLen -= 7; }
if ( displayUrlLen > 8 && strncmp ( displayUrl , "http://" , 7 )==0 ) {
displayUrl += 7; displayUrlLen -= 7; }
// . remove trailing /
// . only remove from root urls in case user cuts and
// pastes it for link: search
if ( url [ urlLen - 1 ] == '/' ) {
if ( displayUrl [ displayUrlLen - 1 ] == '/' ) {
// see if any other slash before us
int32_t j;
for ( j = urlLen - 2 ; j >= 0 ; j-- )
if ( url[j] == '/' ) break;
for ( j = displayUrlLen - 2 ; j >= 0 ; j-- )
if ( displayUrl[j] == '/' ) break;
// if there wasn't, we must have been a root url
// so hack off the last slash
if ( j < 0 ) urlLen--;
if ( j < 0 ) displayUrlLen--;
}
if ( si->m_format == FORMAT_HTML ) {
sb->safePrintf ("<font color=gray>" );
//sb->htmlEncode ( url , gbstrlen(url) , false );
// 20 for the date after it
sb->safeTruncateEllipsis ( url , 50 ); // cols - 30 );
sb->safeTruncateEllipsis ( displayUrl , 50 ); // cols - 30 );
// turn off the color
sb->safePrintf ( "</font>\n" );
}
@@ -5058,12 +5071,12 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {

if ( si->m_format == FORMAT_XML ) {
sb->safePrintf("\t\t<url><![CDATA[");
sb->safeMemcpy ( url , urlLen );
sb->safeMemcpy ( displayUrl , displayUrlLen );
sb->safePrintf("]]></url>\n");
}
if ( si->m_format == FORMAT_JSON ) {
sb->safePrintf("\t\t\"url\":\"");
sb->jsonEncode ( url , urlLen );
sb->jsonEncode ( displayUrl , displayUrlLen );
sb->safePrintf("\",\n");
}

@@ -5717,10 +5730,12 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
*/

if ( mr->size_metadataBuf && si->m_format == FORMAT_JSON) {
sb->safePrintf("\t\t\"metadata\":");
sb->safeMemcpy(mr->ptr_metadataBuf, mr->size_metadataBuf);
sb->pushChar(',');

sb->safePrintf("\t\t\"metadata\":[");
//sb->safeMemcpy(mr->ptr_metadataBuf, mr->size_metadataBuf);
sb->safeStrcpy(mr->ptr_metadataBuf);
// without this \n we seem to lose our ] i guess it gets
// backed up over
sb->safePrintf("],\n");
}
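The printResult() hunks above switch every URL display site from the raw url to a copy produced by Url::getDisplayUrl(), then trim the scheme and any root-url trailing slash on that copy. The trimming logic in isolation, as a sketch over a plain NUL-terminated string (standalone names; only the rules come from the diff):

#include <string.h>

// trim "http://" and, for root urls only, the trailing slash;
// returns the new length and advances *s past the scheme
static int trimDisplayUrl ( const char **s , int len ) {
    if ( len > 8 && strncmp ( *s , "http://" , 7 ) == 0 ) {
        *s += 7; len -= 7;
    }
    if ( len > 0 && (*s)[len-1] == '/' ) {
        int j = len - 2;
        while ( j >= 0 && (*s)[j] != '/' ) j--;
        if ( j < 0 ) len--; // no earlier slash: it was a root url
    }
    return len;
}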
12 PageRoot.cpp
@@ -2523,7 +2523,17 @@ bool sendPageAddUrl ( TcpSocket *sock , HttpRequest *hr ) {

ir->ptr_url = hr->getString("u",NULL);
if ( ! ir->ptr_url ) ir->ptr_url = hr->getString("url",NULL);

if ( ! ir->ptr_url ) ir->ptr_url = hr->getString("urls",NULL);

if ( ! ir->ptr_url ) {
g_errno = EBADURL;
doneInjectingWrapper3 ( st1 );
return true;
}

// include \0 in size
ir->size_url = gbstrlen(ir->ptr_url)+1;

// get back a short reply so we can show the status code easily
ir->m_shortReply = 1;
PageSockets.cpp
@@ -7,6 +7,7 @@
#include "Dns.h"
#include "SafeBuf.h"
#include "Msg13.h"
#include "Linkdb.h" // Msg25Request

static void printTcpTable (SafeBuf *p,char *title,TcpServer *server);
static void printUdpTable (SafeBuf *p,char *title,UdpServer *server,
@@ -554,6 +555,62 @@ void printUdpTable ( SafeBuf *p, char *title, UdpServer *server ,
if ( msgType == 0x50 ) desc = "get root quality";
if ( msgType == 0x25 ) desc = "get link info";
if ( msgType == 0xfd ) desc = "proxy forward";

char *req = NULL;
int32_t reqSize = 0;
if ( s->m_callback ) {
req = s->m_sendBuf;
reqSize = s->m_sendBufSize;
}
// are we receiving the request?
else {
req = s->m_readBuf;
reqSize = s->m_readBufSize;
// if not completely read in yet...
if ( s->hasDgramsToRead ())
req = NULL;
}

SafeBuf tmp;
char *altText = "";

// MSG25
if ( req && msgType == 0x25 ) {
Msg25Request *mr = (Msg25Request *)req;
// it doesn't hurt if we call Msg25Request::deserialize
// again if it has already been called
mr->deserialize();
if ( mr->m_mode == 2 ) { // MODE_SITELINKINFO ) {
tmp.safePrintf(" title=\""
"getting site link info for "
"%s "
"in collnum %i.\n"
"sitehash64=%"UINT64" "
"waitinginline=%i"
"\""
,mr->ptr_site
,(int)mr->m_collnum
,mr->m_siteHash64
,(int)mr->m_waitingInLine
);
desc = "getting site link info";
}
else {
tmp.safePrintf(" title=\""
"getting page link info for "
"%s "
"in collnum %i."
"\""
,mr->ptr_url
,(int)mr->m_collnum
);
desc = "getting page link info";
}
}

if ( tmp.getLength() )
altText = tmp.getBufStart();

p->safePrintf ( "<tr bgcolor=#%s>"
"<td>%s</td>" // age
@@ -609,12 +666,14 @@ void printUdpTable ( SafeBuf *p, char *title, UdpServer *server ,
if ( ! s->m_callback ) toFrom = "from";
//"<td><a href=http://%s:%hu/cgi/15.cgi>%"INT32"</a></td>"
p->safePrintf ( "<td>0x%hhx</td>" // msgtype
"<td><nobr>%s</nobr></td>" // desc
"<td%s><nobr>"
"%s</nobr></td>" // desc
"<td><nobr>%s <a href=http://%s:%hu/"
"admin/sockets?"
"c=%s>%s</a></nobr></td>"
"<td>%s%"INT32"%s</td>" , // niceness
s->m_msgType ,
altText,
desc,
//iptoa(s->m_ip) ,
//s->m_port ,
PageStatsdb.cpp
@@ -49,9 +49,18 @@ class StateStatsdb {
static time_t genDate( char *date, int32_t dateLen ) ;
static void sendReply ( void *st ) ;

static bool s_graphInUse = false;

// . returns false if blocked, otherwise true
// . sets g_errno on error
bool sendPageGraph ( TcpSocket *s, HttpRequest *r ) {

if ( s_graphInUse ) {
char *msg = "stats graph calculating for another user. "
"Try again later.";
g_httpServer.sendErrorReply(s,500,msg);
return true;
}

char *cgi;
int32_t cgiLen;
@@ -121,7 +130,6 @@ bool sendPageGraph ( TcpSocket *s, HttpRequest *r ) {
st->m_endDate = st->m_endDateR;
}

g_statsdb.addDocsIndexed();
//
// this is no longer a gif, but an html graph in g_statsdb.m_sb
//
@@ -130,8 +138,10 @@ bool sendPageGraph ( TcpSocket *s, HttpRequest *r ) {
st->m_samples ,
&st->m_sb2 ,
st ,
sendReply ) )
sendReply ) ) {
s_graphInUse = true;
return false;
}

// if we didn't block call it ourselves directly
sendReply ( st );
@@ -139,6 +149,15 @@ bool sendPageGraph ( TcpSocket *s, HttpRequest *r ) {
return true;
}

void genStatsDataset(SafeBuf *buf, StateStatsdb *st) {
if ( ! g_conf.m_useStatsdb ) {
buf->safePrintf("{\"error\":\"statsdb disabled\"}\n" );
return;
}

}

static void writeControls ( SafeBuf *buf, StateStatsdb *st ) ;
void genStatsGraphTable(SafeBuf *buf, StateStatsdb *st) {
if ( ! g_conf.m_useStatsdb )
@@ -186,6 +205,8 @@ void genStatsGraphTable(SafeBuf *buf, StateStatsdb *st) {

void sendReply ( void *state ) {

s_graphInUse = false;

StateStatsdb *st = (StateStatsdb *)state;

if ( g_errno ) {
@@ -196,6 +217,10 @@ void sendReply ( void *state ) {

TcpSocket *s = st->m_socket;

if(st->m_request.getLong("json", 0)) {
//xxxxxxxxxxxxxxxxxxxxxxxxx
}

if(st->m_request.getLong("justgraph", 0)) {
SafeBuf buf( 1024*32 , "tmpbuf0" );
genStatsGraphTable(&buf, st);
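The s_graphInUse guard above is set only on the blocking path and cleared first thing in the completion callback, so a synchronous (non-blocking) completion never strands the flag. The pattern in isolation, as a self-contained sketch:

// single-user guard for a callback-driven job: the busy flag is
// raised only when the job actually blocks, and the callback always
// lowers it before doing anything else.
static bool s_busy = false;

static void jobDone ( ) { s_busy = false; /* ...send reply... */ }

static bool tryStartJob ( bool (*compute)(void (*cb)()) ) {
    if ( s_busy ) return false;      // another user is mid-graph; retry later
    if ( ! compute ( jobDone ) ) {   // blocked; jobDone() will fire later
        s_busy = true;
        return true;
    }
    jobDone ( );                     // completed synchronously; flag untouched
    return true;
}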
72 Parms.cpp
@@ -6800,7 +6800,7 @@ void Parms::init ( ) {
m->m_off = (char *)&cr.m_maxSearchResultsPerQuery - x;
m->m_type = TYPE_LONG;
m->m_def = "100";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_flags = 0;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
@@ -10548,7 +10548,7 @@ void Parms::init ( ) {
m->m_off = (char *)&g_conf.m_maxHeartbeatDelay - g;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_flags = PF_CLONE; // PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
@@ -12401,12 +12401,31 @@ void Parms::init ( ) {
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_flags = PF_API;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_group = 0;
m++;

/*
m->m_title = "files group writable";
m->m_desc = "Make all created files group writable? If you have "
"multiple user accounts starting Gigablast processes you "
"will want the files to be group writable. You will "
"need to make sure you run gigablast under the "
"primary group you want to use for gigablast administration.";
m->m_cgi = "afgw";
m->m_off = (char *)&g_conf.m_makeAllFilesGroupWritable - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_API;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_group = 0;
m++;
*/

m->m_title = "verify disk writes";
m->m_desc = "Read what was written in a verification step. Decreases "
"performance, but may help fight disk corruption mostly on "
@@ -16655,6 +16674,21 @@ void Parms::init ( ) {
m->m_flags = PF_CLONE;
m++;

m->m_title = "index warc or arc files";
m->m_desc = "If this is true Gigablast will index .warc and .arc "
"files by injecting the pages contained in them as if they "
"were spidered with the content in the .warc or .arc file. "
"The spidered time will be taken from the archive file "
"as well.";
m->m_cgi = "indexwarcs";
m->m_off = (char *)&cr.m_indexWarcs - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m++;

/*
m->m_title = "add url enabled";
m->m_desc = "If this is enabled others can add "
@@ -21338,9 +21372,23 @@ void tryToCallCallbacks ( ) {
if ( pn->m_calledCallback ) continue;
// should we call the callback?
bool callIt = false;
// 8 seconds is enough to wait for all replies to come in
if ( now - pn->m_startTime > 8 ) callIt = true;
if ( pn->m_numReplies >= pn->m_numRequests ) callIt = true;
// sometimes we don't launch any requests to update parms
// because we are jammed up. same logic as we use for
// freeing the pn below.
if ( pn->m_numGoodReplies < pn->m_numHostsTotal )
callIt = false;

// 8 seconds is enough to wait for all replies to come in.
// a host might be dead, so we need this here lest the
// underlying page handler (i.e. sendPageCrawlbot()) never
// get called if a host is dead. if you are updating some
// parms you want the page to return.
if ( now - pn->m_startTime > 8 &&
! callIt &&
g_hostdb.hasDeadHost() )
callIt = true;

if ( ! callIt ) continue;
// callback is NULL for updating parms like spiderRoundNum
// in Spider.cpp
@@ -21475,6 +21523,8 @@ bool Parms::doParmSendingLoop ( ) {

if ( ! s_headNode ) return true;

if ( g_isDumpingRdbFromMain ) return true;

if ( s_inLoop ) return true;

s_inLoop = true;
@@ -21551,8 +21601,8 @@ bool Parms::doParmSendingLoop ( ) {
}

// debug log
log(LOG_INFO,"parms: sending parm request "
"to hostid %"INT32"",h->m_hostId);
log(LOG_INFO,"parms: sending parm request id %i "
"to hostid %"INT32"",(int)pn->m_parmId,h->m_hostId);

// count it
pn->m_numRequests++;
@@ -22946,6 +22996,14 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {
"\"temporary\" errors like DNS timeouts."
"</td></tr>"

"<tr class=poo><td>errorcode==32880</td>"
"<td>"
"If the last time it was spidered it had this "
"numeric error code. See the error codes in "
"Errno.cpp. In this particular example 32880 is "
"for EBADURL."
"</td></tr>"

"<tr class=poo><td>hastmperror</td>"
"<td>"
"This is true if the last spider attempt resulted "
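The tryToCallCallbacks() rewrite above changes when a parm-update callback fires: all replies in, but held back while good replies are still short of the host count, unless 8 seconds have elapsed and a host is known dead. The new decision, factored into a standalone predicate (field values passed in as plain arguments):

// returns true when it is safe (or necessary) to fire the callback
static bool shouldCallCallback ( long now , long startTime ,
                                 int numReplies , int numRequests ,
                                 int numGoodReplies , int numHostsTotal ,
                                 bool hasDeadHost ) {
    bool callIt = false;
    if ( numReplies >= numRequests ) callIt = true;
    // still waiting on hosts we never managed to send to
    if ( numGoodReplies < numHostsTotal ) callIt = false;
    // but never hang the page handler forever on a dead host
    if ( now - startTime > 8 && ! callIt && hasDeadHost )
        callIt = true;
    return callIt;
}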
14 Posdb.cpp
@@ -6019,7 +6019,6 @@ void PosdbTable::intersectLists10_r ( ) {
#define RINGBUFSIZE 4096
//#define RINGBUFSIZE 1024
unsigned char ringBuf[RINGBUFSIZE+10];
unsigned char *ringBufEnd = ringBuf + RINGBUFSIZE;
// for overflow conditions in loops below
ringBuf[RINGBUFSIZE+0] = 0xff;
ringBuf[RINGBUFSIZE+1] = 0xff;
@@ -6363,18 +6362,7 @@ void PosdbTable::intersectLists10_r ( ) {
// for 'search engine'. it might save time!

// reset ring buf. make all slots 0xff. should be 1000 cycles or so.
for ( int32_t *rb = (int32_t *)ringBuf ; ; ) {
rb[0] = 0xffffffff;
rb[1] = 0xffffffff;
rb[2] = 0xffffffff;
rb[3] = 0xffffffff;
rb[4] = 0xffffffff;
rb[5] = 0xffffffff;
rb[6] = 0xffffffff;
rb[7] = 0xffffffff;
rb += 8;
if ( rb >= (int32_t *)ringBufEnd ) break;
}
memset ( ringBuf, 0xff, RINGBUFSIZE );

// now to speed up 'time enough for love' query which does not
// have many super high scoring guys on top we need a more restrictive
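The Posdb hunk above replaces a hand-unrolled 32-bit fill with a plain memset; the two are byte-for-byte equivalent here because every byte written is 0xff. A standalone check of that equivalence:

#include <string.h>
#include <assert.h>

int main ( ) {
    unsigned char a[4096], b[4096];
    // the old unrolled fill, writing 0xffffffff a word at a time
    for ( int *p = (int *)a ; p < (int *)(a + 4096) ; p++ )
        *p = 0xffffffff;
    // the new one-liner
    memset ( b , 0xff , 4096 );
    assert ( memcmp ( a , b , 4096 ) == 0 );
    return 0;
}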
108 Process.cpp
@@ -885,6 +885,9 @@ void hdtempWrapper ( int fd , void *state ) {
// or if haven't waited long enough
if ( now < s_nextTime ) return;

// see if this fixes the missed heartbeats
//return;

// set it
g_process.m_threadOut = true;
// . call thread to call popen
@@ -968,7 +971,11 @@ float getDiskUsage ( int64_t *diskAvail ) {
char cmd[10048];
char out[1024];
sprintf(out,"%sdiskusage",g_hostdb.m_dir);
snprintf(cmd,10000,"df -ka %s | tail -1 | "
snprintf(cmd,10000,
// "ulimit -v 25000 ; "
// "ulimit -t 30 ; "
// "ulimit -a; "
"df -ka %s | tail -1 | "
"awk '{print $4\" \"$5}' > %s",
g_hostdb.m_dir,
out);
@@ -982,7 +989,9 @@ float getDiskUsage ( int64_t *diskAvail ) {
return -1.0; // unknown
}
// this will happen if you don't upgrade glibc to 2.2.4-32 or above
if ( err != 0 ) {
// for some reason it returns no mem but the file is ok.
// something to do with being in a thread?
if ( err != 0 && errno != ENOMEM ) {
log("build: Call to system(\"%s\") had error: %s",
cmd,mstrerror(errno));
return -1.0; // unknown
@@ -1175,8 +1184,12 @@ void heartbeatWrapper ( int fd , void *state ) {
// check the "cat /proc/<pid>/status | grep SigQ" output
// to see if its overflowed. hopefully i will fix this by
// queueing the signals myself in Loop.cpp.
log("db: missed heartbeat by %"INT64" ms. Num elapsed alarms = "
"%"INT32"", elapsed-100,(int32_t)(g_numAlarms - s_lastNumAlarms));
log("db: missed calling niceness 0 heartbeatWrapper "
"function by %"INT64" ms. Either you need a quickpoll "
"somewhere or a niceness 0 function is taking too long. "
"Num elapsed alarms = "
"%"INT32"", elapsed-100,(int32_t)(g_numAlarms -
s_lastNumAlarms));
s_last = now;
s_lastNumAlarms = g_numAlarms;

@@ -1524,21 +1537,32 @@ bool Process::shutdown2 ( ) {

static bool s_printed = false;

// wait for all threads to return
//int32_t n = g_threads.getNumThreadsOutOrQueued() ;
int32_t n = g_threads.getNumWriteThreadsOut();
waitLoop:

// wait for all 'write' threads to be done. they can be done
// and just waiting for a join, in which case we won't count them.
int32_t n = g_threads.getNumActiveWriteUnlinkRenameThreadsOut();
// we can't wait for the write thread if we had a seg fault, but
// do print a msg in the log
if ( n != 0 && m_urgent ) {
log(LOG_INFO,"gb: Has %"INT32" write/unlink/rename "
"threads active. Waiting.",n);
sleep(1);
goto waitLoop;
}

if ( n != 0 && ! m_urgent ) {
log(LOG_INFO,"gb: Has %"INT32" write threads out. Waiting for "
log(LOG_INFO,"gb: Has %"INT32" write/unlink/rename "
"threads out. Waiting for "
"them to finish.",n);
return false;
}
else if ( ! s_printed && ! m_urgent ) {
s_printed = true;
log(LOG_INFO,"gb: No write threads out.");
log(LOG_INFO,"gb: No write/unlink/rename threads active.");
}

// disable all spidering
// we can exit while spiders are in the queue because
// if they are in the middle of being added they will be
@@ -1650,11 +1674,18 @@ bool Process::shutdown2 ( ) {

// urgent means we need to dump core, SEGV or something
if ( m_urgent ) {
// log it
log("gb: Dumping core after saving.");
// at least destroy the page caches that have shared memory
// because they seem to not clean it up
resetPageCaches();

if ( g_threads.amThread() ) {
uint64_t tid = (uint64_t)getpidtid();
log("gb: calling abort from thread with tid of "
"%"UINT64" (thread)",tid);
}
else {
pid_t pid = getpid();
log("gb: calling abort from main process "
"with pid of %"UINT64" (main process)",
(uint64_t)pid);
}

// let's ensure our core file can dump
struct rlimit lim;
@@ -1662,9 +1693,48 @@ bool Process::shutdown2 ( ) {
if ( setrlimit(RLIMIT_CORE,&lim) )
log("gb: setrlimit: %s.", mstrerror(errno) );

// if we are in this code then we are the main process
// and not a thread.
// see if this makes it so we always dump core again.
// joins with all threads, too.
log("gb: Joining with all threads");
g_threads.killAllThreads();

// log it
log("gb: Dumping core after saving.");
// at least destroy the page caches that have shared memory
// because they seem to not clean it up
//resetPageCaches();

// use the default segmentation fault handler which should
// dump core rather than call abort() which doesn't always
// work because perhaps of threads doing something
int signum = SIGSEGV;
signal(signum, SIG_DFL);
kill(getpid(), signum);

// this is the trick: it will trigger the core dump by
// calling the original SIGSEGV handler.
//int signum = SIGSEGV;
//signal(signum, SIG_DFL);
//kill(getpid(), signum);

// try resetting the SEGV sig handle to default. when
// we return it should call the default handler.
// struct sigaction sa;
// sigemptyset (&sa.sa_mask);
// sa.sa_flags = SA_RESETHAND;
// sa.sa_sigaction = NULL;
// sigaction ( SIGSEGV, &sa, 0 ) ;
// return true;

// . force an abnormal termination which will cause a core dump
// . do not dump core on SIGHUP signals any more though
abort();
//abort();

// return from this signal handler so we can execute
// original SIGSEGV handler right afterwards
// default handler should be called after we return now
// keep compiler happy
return true;
}
@@ -1674,6 +1744,12 @@ bool Process::shutdown2 ( ) {
// cleanup threads, this also launches them too
g_threads.timedCleanUp(0x7fffffff,MAX_NICENESS);

// there's no write/unlink/rename threads active,
// so just kill the remaining threads and join
// with them so we can try to get a proper exit status code
log("gb: Joining with all threads");
g_threads.killAllThreads();

// wait for all threads to complete...
//int32_t n = g_threads.getNumThreadsOutOrQueued() ;
//if ( n > 0 )
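The shutdown2() rewrite above stops calling abort() and instead restores the default SIGSEGV disposition and re-raises the signal, which produces a core file more reliably when other threads are still running. The trick in isolation:

#include <signal.h>
#include <unistd.h>

// force a core dump by delivering SIGSEGV to ourselves with the
// default handler installed; unlike abort(), the default SIGSEGV
// action is to terminate with a core file
static void dumpCoreNow ( ) {
    signal ( SIGSEGV , SIG_DFL );
    kill ( getpid ( ) , SIGSEGV );
}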
Profiler.cpp
@@ -1866,7 +1866,7 @@ Profiler::printRealTimeInfo(SafeBuf *sb,
ff.safePrintf("%strash/profile.txt",g_hostdb.m_dir);
char *filename = ff.getBufStart();
unlink ( filename );
int fd = open ( filename , O_RDWR | O_CREAT , S_IRWXU );
int fd = open ( filename , O_RDWR | O_CREAT , getFileCreationFlags() );
if ( fd < 0 ) {
sb->safePrintf("FAILED TO OPEN %s for writing: %s"
,ff.getBufStart(),mstrerror(errno));
@@ -2094,7 +2094,7 @@ Profiler::printRealTimeInfo(SafeBuf *sb,
ff.reset();
ff.safePrintf("%strash/qp.txt",g_hostdb.m_dir);
filename = ff.getBufStart();
fd = open ( filename , O_RDWR | O_CREAT , S_IRWXU );
//fd = open ( filename , O_RDWR | O_CREAT , S_IRWXU );
if ( fd < 0 ) {
sb->safePrintf("FAILED TO OPEN %s for writing: %s"
,ff.getBufStart(),strerror(errno));
268 Punycode.cpp (new file)
@@ -0,0 +1,268 @@
#include "Punycode.h"
#include <string.h>

/* #include "punycode.h" */

/*** Bootstring parameters for Punycode ***/

enum { base = 36, tmin = 1, tmax = 26, skew = 38, damp = 700,
initial_bias = 72, initial_n = 0x80, delimiter = 0x2D };

/* basic(cp) tests whether cp is a basic code point: */
#define basic(cp) ((punycode_uint)(cp) < 0x80)

/* delim(cp) tests whether cp is a delimiter: */
#define delim(cp) ((cp) == delimiter)

/* decode_digit(cp) returns the numeric value of a basic code */
/* point (for use in representing integers) in the range 0 to */
/* base-1, or base if cp does not represent a value. */

static punycode_uint decode_digit(punycode_uint cp)
{
return cp - 48 < 10 ? cp - 22 : cp - 65 < 26 ? cp - 65 :
cp - 97 < 26 ? cp - 97 : base;
}

/* encode_digit(d,flag) returns the basic code point whose value */
/* (when used for representing integers) is d, which needs to be in */
/* the range 0 to base-1. The lowercase form is used unless flag is */
/* nonzero, in which case the uppercase form is used. The behavior */
/* is undefined if flag is nonzero and digit d has no uppercase form. */

static char encode_digit(punycode_uint d, int flag)
{
return d + 22 + 75 * (d < 26) - ((flag != 0) << 5);
/* 0..25 map to ASCII a..z or A..Z */
/* 26..35 map to ASCII 0..9 */
}

/* flagged(bcp) tests whether a basic code point is flagged */
/* (uppercase). The behavior is undefined if bcp is not a */
/* basic code point. */

#define flagged(bcp) ((punycode_uint)(bcp) - 65 < 26)

/* encode_basic(bcp,flag) forces a basic code point to lowercase */
/* if flag is zero, uppercase if flag is nonzero, and returns */
/* the resulting code point. The code point is unchanged if it */
/* is caseless. The behavior is undefined if bcp is not a basic */
/* code point. */

static char encode_basic(punycode_uint bcp, int flag)
{
bcp -= (bcp - 97 < 26) << 5;
return bcp + ((!flag && (bcp - 65 < 26)) << 5);
}

/*** Platform-specific constants ***/

/* maxint is the maximum value of a punycode_uint variable: */
static const punycode_uint maxint = -1;
/* Because maxint is unsigned, -1 becomes the maximum value. */

/*** Bias adaptation function ***/

static punycode_uint adapt(
punycode_uint delta, punycode_uint numpoints, int firsttime )
{
punycode_uint k;

delta = firsttime ? delta / damp : delta >> 1;
/* delta >> 1 is a faster way of doing delta / 2 */
delta += delta / numpoints;

for (k = 0; delta > ((base - tmin) * tmax) / 2; k += base) {
delta /= base - tmin;
}

return k + (base - tmin + 1) * delta / (delta + skew);
}

/*** Main encode function ***/

enum punycode_status punycode_encode(
size_t input_length_orig,
const punycode_uint input[],
const unsigned char case_flags[],
size_t *output_length,
char output[] )
{
punycode_uint input_length, n, delta, h, b, bias, j, m, q, k, t;
size_t out, max_out;

/* The Punycode spec assumes that the input length is the same type */
/* of integer as a code point, so we need to convert the size_t to */
/* a punycode_uint, which could overflow. */

if (input_length_orig > maxint) return punycode_overflow;
input_length = (punycode_uint) input_length_orig;

/* Initialize the state: */

n = initial_n;
delta = 0;
out = 0;
max_out = *output_length;
bias = initial_bias;

/* Handle the basic code points: */

for (j = 0; j < input_length; ++j) {
if (basic(input[j])) {
if (max_out - out < 2) return punycode_big_output;
output[out++] = case_flags ?
encode_basic(input[j], case_flags[j]) : (char) input[j];
}
/* else if (input[j] < n) return punycode_bad_input; */
/* (not needed for Punycode with unsigned code points) */
}

h = b = (punycode_uint) out;
/* cannot overflow because out <= input_length <= maxint */

/* h is the number of code points that have been handled, b is the */
/* number of basic code points, and out is the number of ASCII code */
/* points that have been output. */

if (b > 0) output[out++] = delimiter;

/* Main encoding loop: */

while (h < input_length) {
/* All non-basic code points < n have been */
/* handled already. Find the next larger one: */

for (m = maxint, j = 0; j < input_length; ++j) {
/* if (basic(input[j])) continue; */
/* (not needed for Punycode) */
if (input[j] >= n && input[j] < m) m = input[j];
}

/* Increase delta enough to advance the decoder's */
/* <n,i> state to <m,0>, but guard against overflow: */

if (m - n > (maxint - delta) / (h + 1)) return punycode_overflow;
delta += (m - n) * (h + 1);
n = m;

for (j = 0; j < input_length; ++j) {
/* Punycode does not need to check whether input[j] is basic: */
if (input[j] < n /* || basic(input[j]) */ ) {
if (++delta == 0) return punycode_overflow;
}

if (input[j] == n) {
/* Represent delta as a generalized variable-length integer: */

for (q = delta, k = base; ; k += base) {
if (out >= max_out) return punycode_big_output;
t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */
k >= bias + tmax ? tmax : k - bias;
if (q < t) break;
output[out++] = encode_digit(t + (q - t) % (base - t), 0);
q = (q - t) / (base - t);
}

output[out++] = encode_digit(q, case_flags && case_flags[j]);
bias = adapt(delta, h + 1, h == b);
delta = 0;
++h;
}
}

++delta, ++n;
}

*output_length = out;
return punycode_success;
}

/*** Main decode function ***/

enum punycode_status punycode_decode(
size_t input_length,
const char input[],
size_t *output_length,
punycode_uint output[],
unsigned char case_flags[] )
{
punycode_uint n, out, i, max_out, bias, oldi, w, k, digit, t;
size_t b, j, in;

/* Initialize the state: */

n = initial_n;
out = i = 0;
max_out = *output_length > maxint ? maxint
: (punycode_uint) *output_length;
bias = initial_bias;

/* Handle the basic code points: Let b be the number of input code */
/* points before the last delimiter, or 0 if there is none, then */
/* copy the first b code points to the output. */

for (b = j = 0; j < input_length; ++j) if (delim(input[j])) b = j;
if (b > max_out) return punycode_big_output;

for (j = 0; j < b; ++j) {
if (case_flags) case_flags[out] = flagged(input[j]);
if (!basic(input[j])) return punycode_bad_input;
output[out++] = input[j];
}

/* Main decoding loop: Start just after the last delimiter if any */
/* basic code points were copied; start at the beginning otherwise. */

for (in = b > 0 ? b + 1 : 0; in < input_length; ++out) {

/* in is the index of the next ASCII code point to be consumed, */
/* and out is the number of code points in the output array. */

/* Decode a generalized variable-length integer into delta, */
/* which gets added to i. The overflow checking is easier */
/* if we increase i as we go, then subtract off its starting */
/* value at the end to obtain delta. */

for (oldi = i, w = 1, k = base; ; k += base) {
if (in >= input_length) return punycode_bad_input;
digit = decode_digit(input[in++]);
if (digit >= base) return punycode_bad_input;
if (digit > (maxint - i) / w) return punycode_overflow;
i += digit * w;
t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */
k >= bias + tmax ? tmax : k - bias;
if (digit < t) break;
if (w > maxint / (base - t)) return punycode_overflow;
w *= (base - t);
}

bias = adapt(i - oldi, out + 1, oldi == 0);

/* i was supposed to wrap around from out+1 to 0, */
/* incrementing n each time, so we'll fix that now: */

if (i / (out + 1) > maxint - n) return punycode_overflow;
n += i / (out + 1);
i %= (out + 1);

/* Insert n at position i of the output: */

/* not needed for Punycode: */
/* if (basic(n)) return punycode_bad_input; */
if (out >= max_out) return punycode_big_output;

if (case_flags) {
memmove(case_flags + i + 1, case_flags + i, out - i);
/* Case of last ASCII code point determines case flag: */
case_flags[i] = flagged(input[in - 1]);
}

memmove(output + i + 1, output + i, (out - i) * sizeof *output);
output[i++] = n;
}

*output_length = (size_t) out;
/* cannot overflow because out <= old value of *output_length */
return punycode_success;
}
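A minimal round trip through the new Punycode API added above; "bücher" (RFC 3492's usual style of example) encodes to "bcher-kva":

#include "Punycode.h"
#include <stdio.h>

int main ( ) {
    punycode_uint in[6] = { 'b', 0xFC /* ü */, 'c', 'h', 'e', 'r' };
    char enc[64]; size_t encLen = sizeof(enc);
    if ( punycode_encode ( 6, in, NULL, &encLen, enc ) != punycode_success )
        return 1;
    printf ( "%.*s\n", (int)encLen, enc );  // prints: bcher-kva
    punycode_uint out[64]; size_t outLen = 64;
    if ( punycode_decode ( encLen, enc, &outLen, out, NULL )
         != punycode_success )
        return 1;
    return outLen == 6 ? 0 : 1;             // decoded back to 6 code points
}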
154 Punycode.h (new file)
@@ -0,0 +1,154 @@
/*
punycode-sample.c 2.0.0 (2004-Mar-21-Sun)
http://www.nicemice.net/idn/
Adam M. Costello
http://www.nicemice.net/amc/

This is ANSI C code (C89) implementing Punycode 1.0.x.
*/
#include <limits.h>
#include <stddef.h>

enum punycode_status {
punycode_success = 0,
punycode_bad_input = 1, /* Input is invalid. */
punycode_big_output = 2, /* Output would exceed the space provided. */
punycode_overflow = 3 /* Wider integers needed to process input. */
};

/* punycode_uint needs to be unsigned and needs to be */
/* at least 26 bits wide. The particular type can be */
/* specified by defining PUNYCODE_UINT, otherwise a */
/* suitable type will be chosen automatically. */

#ifdef PUNYCODE_UINT
typedef PUNYCODE_UINT punycode_uint;
#elif UINT_MAX >= (1 << 26) - 1
typedef unsigned int punycode_uint;
#else
typedef unsigned long punycode_uint;
#endif

enum punycode_status punycode_encode(
size_t, /* input_length */
const punycode_uint [], /* input */
const unsigned char [], /* case_flags */
size_t *, /* output_length */
char [] /* output */
);

/*
punycode_encode() converts a sequence of code points (presumed to be
Unicode code points) to Punycode.

Input arguments (to be supplied by the caller):

input_length
The number of code points in the input array and the number
of flags in the case_flags array.

input
An array of code points. They are presumed to be Unicode
code points, but that is not strictly necessary. The
array contains code points, not code units. UTF-16 uses
code units D800 through DFFF to refer to code points
10000..10FFFF. The code points D800..DFFF do not occur in
any valid Unicode string. The code points that can occur in
Unicode strings (0..D7FF and E000..10FFFF) are also called
Unicode scalar values.

case_flags
A null pointer or an array of boolean values parallel to
the input array. Nonzero (true, flagged) suggests that the
corresponding Unicode character be forced to uppercase after
being decoded (if possible), and zero (false, unflagged)
suggests that it be forced to lowercase (if possible).
ASCII code points (0..7F) are encoded literally, except that
ASCII letters are forced to uppercase or lowercase according
to the corresponding case flags. If case_flags is a null
pointer then ASCII letters are left as they are, and other
code points are treated as unflagged.

Output arguments (to be filled in by the function):

output
An array of ASCII code points. It is *not* null-terminated;
it will contain zeros if and only if the input contains
zeros. (Of course the caller can leave room for a
terminator and add one if needed.)

Input/output arguments (to be supplied by the caller and overwritten
by the function):

output_length
The caller passes in the maximum number of ASCII code points
that it can receive. On successful return it will contain
the number of ASCII code points actually output.

Return value:

Can be any of the punycode_status values defined above except
punycode_bad_input. If not punycode_success, then output_size
and output might contain garbage.
*/

enum punycode_status punycode_decode(
size_t, /* input_length */
const char [], /* input */
size_t *, /* output_length */
punycode_uint [], /* output */
unsigned char [] /* case_flags */
);

/*
punycode_decode() converts Punycode to a sequence of code points
(presumed to be Unicode code points).

Input arguments (to be supplied by the caller):

input_length
The number of ASCII code points in the input array.

input
An array of ASCII code points (0..7F).

Output arguments (to be filled in by the function):

output
An array of code points like the input argument of
punycode_encode() (see above).

case_flags
A null pointer (if the flags are not needed by the caller)
or an array of boolean values parallel to the output array.
Nonzero (true, flagged) suggests that the corresponding
Unicode character be forced to uppercase by the caller (if
possible), and zero (false, unflagged) suggests that it
be forced to lowercase (if possible). ASCII code points
(0..7F) are output already in the proper case, but their
flags will be set appropriately so that applying the flags
would be harmless.

Input/output arguments (to be supplied by the caller and overwritten
by the function):

output_length
The caller passes in the maximum number of code points
that it can receive into the output array (which is also
the maximum number of flags that it can receive into the
case_flags array, if case_flags is not a null pointer). On
successful return it will contain the number of code points
actually output (which is also the number of flags actually
output, if case_flags is not a null pointer). The decoder
will never need to output more code points than the number
of ASCII code points in the input, because of the way the
encoding is defined. The number of code points output
cannot exceed the maximum possible value of a punycode_uint,
even if the supplied output_length is greater than that.

Return value:

Can be any of the punycode_status values defined above. If not
punycode_success, then output_length, output, and case_flags
might contain garbage.
*/
Query.cpp
@@ -3212,7 +3212,7 @@ bool Query::setQWords ( char boolFlag ,
// no punct, alnum only
if ( words.isPunct(i) ) continue;
// skip if not a stop word
if ( ! bits.m_bits[i] & D_IS_STOPWORD ) continue;
if ( ! (bits.m_bits[i] & D_IS_STOPWORD) ) continue;
// continue if you can still pair across prev punct word
if ( bits.m_bits[i-1] & D_CAN_PAIR_ACROSS ) continue;
// otherwise, we can now start a phrase
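The Query.cpp fix above is an operator-precedence bug: ! binds tighter than &, so the old test evaluated (!bits) & FLAG, which is always 0 whenever FLAG is not bit 0, rather than "flag not set". A standalone check of the difference:

#include <assert.h>

int main ( ) {
    unsigned char bits = 0x04;        // D_IS_STOPWORD-style flag is set
    // old form: (!bits) & 0x04 == 0 & 0x04 == 0, regardless of the flag
    assert ( (! bits & 0x04) == 0 );
    // new form: correctly reports the flag as set (so "skip" is false)
    assert ( ! (bits & 0x04) == 0 );
    return 0;
}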
70 Rdb.cpp
@ -374,16 +374,16 @@ bool Rdb::updateToRebuildFiles ( Rdb *rdb2 , char *coll ) {
|
||||
char dstDir[256];
|
||||
// make the trash dir if not there
|
||||
sprintf ( dstDir , "%s/trash/" , g_hostdb.m_dir );
|
||||
int32_t status = ::mkdir ( dstDir ,
|
||||
S_IRUSR | S_IWUSR | S_IXUSR |
|
||||
S_IRGRP | S_IWGRP | S_IXGRP |
|
||||
S_IROTH | S_IXOTH ) ;
|
||||
int32_t status = ::mkdir ( dstDir , getDirCreationFlags() );
|
||||
// S_IRUSR | S_IWUSR | S_IXUSR |
|
||||
// S_IRGRP | S_IWGRP | S_IXGRP |
|
||||
// S_IROTH | S_IXOTH ) ;
|
||||
// we have to create it
|
||||
sprintf ( dstDir , "%s/trash/rebuilt%"UINT32"/" , g_hostdb.m_dir , t );
|
||||
status = ::mkdir ( dstDir ,
|
||||
S_IRUSR | S_IWUSR | S_IXUSR |
|
||||
S_IRGRP | S_IWGRP | S_IXGRP |
|
||||
S_IROTH | S_IXOTH ) ;
|
||||
status = ::mkdir ( dstDir , getDirCreationFlags() );
|
||||
// S_IRUSR | S_IWUSR | S_IXUSR |
|
||||
// S_IRGRP | S_IWGRP | S_IXGRP |
|
||||
// S_IROTH | S_IXOTH ) ;
|
||||
if ( status && errno != EEXIST ) {
|
||||
g_errno = errno;
|
||||
return log("repair: Could not mkdir(%s): %s",dstDir,
|
||||
@ -643,10 +643,10 @@ bool Rdb::deleteAllRecs ( collnum_t collnum ) {
|
||||
bool makeTrashDir() {
|
||||
char trash[1024];
|
||||
sprintf(trash, "%strash/",g_hostdb.m_dir);
|
||||
if ( ::mkdir ( trash,
|
||||
S_IRUSR | S_IWUSR | S_IXUSR |
|
||||
S_IRGRP | S_IWGRP | S_IXGRP |
|
||||
S_IROTH | S_IXOTH ) == -1 ) {
|
||||
if ( ::mkdir ( trash , getDirCreationFlags() ) ) {
|
||||
// S_IRUSR | S_IWUSR | S_IXUSR |
|
||||
// S_IRGRP | S_IWGRP | S_IXGRP |
|
||||
// S_IROTH | S_IXOTH ) == -1 ) {
|
||||
if ( errno != EEXIST ) {
|
||||
log("dir: mkdir %s had error: %s",
|
||||
trash,mstrerror(errno));
|
||||
@ -1424,10 +1424,12 @@ bool Rdb::gotTokenForDump ( ) {
|
||||
RdbBucket *b = m_buckets.m_buckets[i];
|
||||
collnum_t cn = b->getCollnum();
|
||||
int32_t nk = b->getNumKeys();
|
||||
for ( int32_t j = 0 ; j < nk; j++ ) {
|
||||
cr = g_collectiondb.m_recs[cn];
|
||||
if ( cr ) cr->m_treeCount++;
|
||||
}
|
||||
// for ( int32_t j = 0 ; j < nk; j++ ) {
|
||||
// cr = g_collectiondb.m_recs[cn];
|
||||
// if ( cr ) cr->m_treeCount++;
|
||||
// }
|
||||
cr = g_collectiondb.m_recs[cn];
|
||||
if ( cr ) cr->m_treeCount += nk;
|
||||
}
|
||||
}
|
||||
|
||||
@ -1542,6 +1544,20 @@ bool Rdb::dumpCollLoop ( ) {
|
||||
"available secondary id for titledb: %s." ,
|
||||
mstrerror(g_errno) );
|
||||
}
|
||||
|
||||
// if we add to many files then we can not merge, because merge op
|
||||
// needs to add a file and it calls addNewFile() too
|
||||
static int32_t s_flag = 0;
|
||||
if ( base->m_numFiles + 1 >= MAX_RDB_FILES ) {
|
||||
if ( s_flag < 10 )
|
||||
log("db: could not dump tree to disk for cn="
|
||||
"%i %s because it has %"INT32" files on disk. "
|
||||
"Need to wait for merge operation.",
|
||||
(int)m_dumpCollnum,m_dbname,base->m_numFiles);
|
||||
s_flag++;
|
||||
goto loop;
|
||||
}
|
||||
|
||||
// this file must not exist already, we are dumping the tree into it
|
||||
m_fn = base->addNewFile ( id2 ) ;
|
||||
if ( m_fn < 0 ) return log(LOG_LOGIC,"db: rdb: Failed to add new file "
|
||||
@ -1797,6 +1813,8 @@ void attemptMergeAll2 ( ) {
|
||||
|
||||
tryLoop:
|
||||
|
||||
QUICKPOLL(niceness);
|
||||
|
||||
// if a collection got deleted, reset this to 0
|
||||
if ( s_lastCollnum >= g_collectiondb.m_numRecs ) {
|
||||
s_lastCollnum = 0;
|
||||
@ -1836,6 +1854,26 @@ void attemptMergeAll2 ( ) {
|
||||
if ( base && base->attemptMerge(niceness,force,true) )
|
||||
return;
|
||||
|
||||
// also try to merge on rdbs being rebuilt
|
||||
base = cr->getBasePtr(RDB2_POSDB2);
|
||||
if ( base && base->attemptMerge(niceness,force,true) )
|
||||
return;
|
||||
base = cr->getBasePtr(RDB2_TITLEDB2);
|
||||
if ( base && base->attemptMerge(niceness,force,true) )
|
||||
return;
|
||||
base = cr->getBasePtr(RDB2_TAGDB2);
|
||||
if ( base && base->attemptMerge(niceness,force,true) )
|
||||
return;
|
||||
base = cr->getBasePtr(RDB2_LINKDB2);
|
||||
if ( base && base->attemptMerge(niceness,force,true) )
|
||||
return;
|
||||
base = cr->getBasePtr(RDB2_SPIDERDB2);
|
||||
if ( base && base->attemptMerge(niceness,force,true) )
|
||||
return;
|
||||
base = cr->getBasePtr(RDB2_CLUSTERDB2);
|
||||
if ( base && base->attemptMerge(niceness,force,true) )
|
||||
return;
|
||||
|
||||
// try next collection
|
||||
s_lastCollnum++;
|
||||
|
||||
|
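The getDirCreationFlags()/getFileCreationFlags() helpers that replace the literal mode masks here and in the files below are not shown in this diff; a plausible sketch, assuming they simply centralize the bit patterns the call sites used to pass (the real definitions from the commit may differ):

    // sketch only; not the definitions from this commit
    #include <sys/stat.h>
    int getFileCreationFlags ( ) {
        // rw-rw-r--, the mask the open() call sites used to pass
        return S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH;
    }
    int getDirCreationFlags ( ) {
        // directories additionally need the execute (search) bits
        return getFileCreationFlags() | S_IXUSR | S_IXGRP | S_IXOTH;
    }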
46
RdbBase.cpp
@@ -165,10 +165,10 @@ bool RdbBase::init ( char *dir ,
}
// make a special "cat" dir for it if we need to
sprintf ( tmp , "%s%s" , dir , dbname );
int32_t status = ::mkdir ( tmp ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH );
int32_t status = ::mkdir ( tmp , getDirCreationFlags() );
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH );
if ( status == -1 && errno != EEXIST && errno )
return log("db: Failed to make directory %s: %s.",
tmp,mstrerror(errno));
@@ -186,9 +186,9 @@ bool RdbBase::init ( char *dir ,
// make a special "cat" dir for it if we need to
sprintf ( tmp , "%scat" , dir );
if ( ::mkdir ( tmp ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH ) == -1 && errno != EEXIST )
getDirCreationFlags() ) == -1 && errno != EEXIST )
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH ) == -1 && errno != EEXIST )
return log("db: Failed to make directory %s: %s.",
tmp,mstrerror(errno));
}
@@ -202,9 +202,9 @@ bool RdbBase::init ( char *dir ,
// make a special "stats" dir for it if necessary
sprintf ( tmp , "%sstats" , dir );
if ( ::mkdir ( tmp ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH ) == -1 && errno != EEXIST )
getDirCreationFlags() ) == -1 && errno != EEXIST )
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH ) == -1 && errno != EEXIST )
return log( "db: Failed to make directory %s: %s.",
tmp, mstrerror( errno ) );
}
@@ -218,9 +218,9 @@ bool RdbBase::init ( char *dir ,
// make a special "stats" dir for it if necessary
sprintf ( tmp , "%saccess" , dir );
if ( ::mkdir ( tmp ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH ) == -1 && errno != EEXIST )
getDirCreationFlags() ) == -1 && errno != EEXIST )
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH ) == -1 && errno != EEXIST )
return log( "db: Failed to make directory %s: %s.",
tmp, mstrerror( errno ) );
}
@@ -234,9 +234,9 @@ bool RdbBase::init ( char *dir ,
// make a special "stats" dir for it if necessary
sprintf ( tmp , "%ssyncdb" , dir );
if ( ::mkdir ( tmp ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH ) == -1 && errno != EEXIST )
getDirCreationFlags() ) == -1 && errno != EEXIST )
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH ) == -1 && errno != EEXIST )
return log( "db: Failed to make directory %s: %s.",
tmp, mstrerror( errno ) );
}
@@ -1607,8 +1607,8 @@ bool RdbBase::attemptMerge ( int32_t niceness, bool forceMergeAll, bool doLog ,
if ( ! m_mergeUrgent && numFiles - 14 >= m_minToMerge ) {
m_mergeUrgent = true;
if ( doLog )
log(LOG_INFO,"merge: Entering urgent merge mode for %s.",
m_dbname);
log(LOG_INFO,"merge: Entering urgent merge mode for %s "
"coll=%s.", m_dbname,m_coll);
g_numUrgentMerges++;
}

@@ -1811,7 +1811,8 @@ void RdbBase::gotTokenForMerge ( ) {
m_mergeUrgent = true;
if ( m_doLog )
log(LOG_INFO,
"merge: Entering urgent merge mode for %s.", m_dbname);
"merge: Entering urgent merge mode (2) for %s coll=%s.",
m_dbname,m_coll);
g_numUrgentMerges++;
}
// tfndb has his own merge class since titledb merges write tfndb recs
@@ -1892,8 +1893,13 @@ void RdbBase::gotTokenForMerge ( ) {
// sanity check
if ( n <= 1 ) {
log(LOG_LOGIC,"merge: attemptMerge: Resuming. bad "
"engineer");
"engineer for %s coll=%s",m_dbname,m_coll);
//g_msg35.releaseToken();
if ( m_mergeUrgent ) {
log("merge: leaving urgent merge mode");
g_numUrgentMerges--;
m_mergeUrgent = false;
}
return false;
}
// make a log note
RdbBase.h
@@ -338,10 +338,10 @@ class RdbBase {
// . older files are listed first (lower fileIds)
// . filenames should include the directory (full filenames)
// . TODO: RdbMgr should control what rdb gets merged?
BigFile *m_files [ MAX_RDB_FILES ];
int32_t m_fileIds [ MAX_RDB_FILES ];
int32_t m_fileIds2 [ MAX_RDB_FILES ]; // for titledb/tfndb linking
RdbMap *m_maps [ MAX_RDB_FILES ];
BigFile *m_files [ MAX_RDB_FILES+1 ];
int32_t m_fileIds [ MAX_RDB_FILES+1 ];
int32_t m_fileIds2 [ MAX_RDB_FILES+1 ]; // for titledb/tfndb linking
RdbMap *m_maps [ MAX_RDB_FILES+1 ];
int32_t m_numFiles;

// this class contains a ptr to us
RdbBuckets.cpp
@@ -2060,8 +2060,10 @@ bool RdbBuckets::fastSave_r() {
char s[1024];
sprintf ( s , "%s/%s-buckets-saving.dat", m_dir , m_dbname );
int fd = ::open ( s ,
O_RDWR | O_CREAT | O_TRUNC , S_IRUSR | S_IWUSR |
S_IRGRP | S_IWGRP | S_IROTH);
O_RDWR | O_CREAT | O_TRUNC ,
getFileCreationFlags() );
// S_IRUSR | S_IWUSR |
// S_IRGRP | S_IWGRP | S_IROTH);
if ( fd < 0 ) {
m_saveErrno = errno;
return log("db: Could not open %s for writing: %s.",
RdbCache.cpp
@@ -1484,7 +1484,7 @@ bool RdbCache::save_r ( ) {
//f.set ( g_hostdb.m_dir , filename );
// open the file
//if ( ! f.open ( O_RDWR | O_CREAT ) )
int fd = open ( filename , O_RDWR | O_CREAT , S_IRWXU );
int fd = open ( filename , O_RDWR | O_CREAT , getFileCreationFlags() );
if ( fd < 0 )
return log("db: Had opening file to save cache to: %s.",
mstrerror(errno));
32
RdbList.cpp
@@ -1340,6 +1340,7 @@ bool RdbList::constrain ( char *startKey ,
// ensure our first key is 12 bytes if m_useHalfKeys is true
if ( m_useHalfKeys && isHalfBitOn ( m_list ) ) {
g_errno = ECORRUPTDATA;
g_numCorrupt++;
return log("db: First key is 6 bytes. Corrupt data "
"file.");
}
@@ -1347,12 +1348,14 @@ bool RdbList::constrain ( char *startKey ,
// sanity. hint key should be full key
if ( m_ks == 18 && hintKey && (hintKey[0]&0x06)){
g_errno = ECORRUPTDATA;
g_numCorrupt++;
return log("db: Hint key is corrupt.");
//char *xx=NULL;*xx=0;}
}

if ( hintOffset > m_listSize ) { //char *xx=NULL;*xx=0; }
g_errno = ECORRUPTDATA;
g_numCorrupt++;
return log("db: Hint offset %"INT32" > %"INT32" is corrupt."
,hintOffset,
m_listSize);
@@ -1418,6 +1421,7 @@ bool RdbList::constrain ( char *startKey ,
m_listPtrHi = savelistPtrHi ;
m_listPtrLo = savelistPtrLo ;
g_errno = ECORRUPTDATA;
g_numCorrupt++;
return log("db: Got record size of %"INT32" < 0. "
"Corrupt data file.",recSize);
}
@@ -1525,13 +1529,16 @@ bool RdbList::constrain ( char *startKey ,
if ( minRecSizes < 0 ) maxPtr = m_listEnd;
// size of last rec we read in the list
int32_t size = -1 ;
// char *savedp = p;
// if ( savedp == (char *)0x001 ) { char *xx=NULL;*xx=0;}
// advance until endKey or minRecSizes kicks us out
//while ( p < m_listEnd && getKey(p) <= endKey && p < maxPtr ) {
while ( p < m_listEnd ) {
QUICKPOLL(niceness);
getKey(p,k);
if ( KEYCMP(k,endKey,m_ks)>0 ) break;
if ( p >= maxPtr ) break;
// only break out if we've set the size AND are >= maxPtr
if ( p >= maxPtr && size > 0 ) break;
size = getRecSize ( p );
// watch out for corruption, let Msg5 fix it
if ( size < 0 ) {
@@ -1540,6 +1547,7 @@ bool RdbList::constrain ( char *startKey ,
m_listPtrLo = savelistPtrLo;
m_listPtr = savelist;
g_errno = ECORRUPTDATA;
g_numCorrupt++;
return log("db: Corrupt record size of %"INT32" "
"bytes in %s.",size,filename);
}
@@ -1559,6 +1567,7 @@ bool RdbList::constrain ( char *startKey ,
m_listPtrLo = savelistPtrLo;
m_listPtr = savelist;
g_errno = ECORRUPTDATA;
g_numCorrupt++;
return log("db: Corrupt record size of %"INT32" "
"bytes in %s.",size,filename);
}
@@ -1580,6 +1589,7 @@ bool RdbList::constrain ( char *startKey ,
m_listPtrLo = savelistPtrLo;
m_listPtr = savelist;
g_errno = ECORRUPTDATA;
g_numCorrupt++;
return log("db: Corrupt record size of %"INT32" "
"bytes in %s.",size,filename);
}
@@ -1587,17 +1597,23 @@ bool RdbList::constrain ( char *startKey ,
//endKey = getKey ( p - size );
getKey(p-size,endKey);
}
// bitch if size is -1 still
if ( size == -1 ) {
log("db: Corruption. Encountered bad endkey in %s.",filename);
char *xx=NULL;*xx=0;
m_list = savelist;
m_listPtrHi = savelistPtrHi;
m_listPtrLo = savelistPtrLo;
m_listPtr = savelist;
g_errno = ECORRUPTDATA;
g_numCorrupt++;
return false;
}
// cut the tail
m_listEnd = p;
m_listSize = m_listEnd - m_list;
// bitch if size is -1 still
if ( size == -1 ) {
log("db: Encountered bad endkey in %s. listSize=%"INT32"",
filename,m_listSize);
char *xx=NULL;*xx=0;
}
// otherwise store the last key if size is not -1
else if ( m_listSize > 0 ) {
if ( m_listSize > 0 ) {
//m_lastKey = getKey ( p - size );
getKey(p-size,m_lastKey);
m_lastKeyIsValid = true;
RdbTree.cpp
@@ -2488,8 +2488,10 @@ bool RdbTree::fastSave_r() {
char s[1024];
sprintf ( s , "%s/%s-saving.dat", m_dir , m_dbname );
int fd = ::open ( s ,
O_RDWR | O_CREAT | O_TRUNC , S_IRUSR | S_IWUSR |
S_IRGRP | S_IWGRP | S_IROTH);
O_RDWR | O_CREAT | O_TRUNC ,
getFileCreationFlags() );
// S_IRUSR | S_IWUSR |
// S_IRGRP | S_IWGRP | S_IROTH);
if ( fd < 0 ) {
m_saveErrno = errno;
return log("db: Could not open %s for writing: %s.",
50
SafeBuf.cpp
@@ -198,6 +198,15 @@ bool SafeBuf::safeMemcpy ( Words *w , int32_t a , int32_t b ) {
return safeMemcpy ( p , pend - p );
}

char* SafeBuf::pushStr (char* str, uint32_t len) {
int32_t initLen = m_length;
bool status = safeMemcpy ( str , len );
status &= nullTerm();
m_length++; //count the null so it isn't overwritten
if(!status) return NULL;
return m_buf + initLen;
}

bool SafeBuf::pushPtr ( void *ptr ) {
if ( m_length + (int32_t)sizeof(char *) > m_capacity )
if(!reserve(sizeof(char *)))//2*m_capacity + 1))
@@ -431,7 +440,7 @@ bool SafeBuf::reserve(int32_t i , char *label, bool clearIt ) {
//buffer size.
bool SafeBuf::reserve2x(int32_t i, char *label) {
//watch out for overflow!
if((m_capacity << 1) + i < 0) return false;
if((m_capacity << 1) + i < m_capacity) return false;
if(i + m_length >= m_capacity)
return reserve(m_capacity + i,label);
else return true;
@@ -449,8 +458,9 @@ int32_t SafeBuf::save ( char *fullFilename ) {

int32_t SafeBuf::dumpToFile(char *filename ) {
retry22:
int32_t fd = open ( filename , O_CREAT | O_WRONLY | O_TRUNC,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH );
int32_t fd = open ( filename , O_CREAT | O_WRONLY | O_TRUNC ,
getFileCreationFlags() );
//S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH );
if ( fd < 0 ) {
// valgrind
if ( errno == EINTR ) goto retry22;
@@ -484,8 +494,9 @@ int32_t SafeBuf::safeSave (char *filename ) {
fn.safePrintf( "%s.saving",filename );

int32_t fd = open ( fn.getBufStart() ,
O_CREAT | O_WRONLY | O_TRUNC,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH );
O_CREAT | O_WRONLY | O_TRUNC ,
getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH );
if ( fd < 0 ) {
// valgrind
if ( errno == EINTR ) goto retry22;
@@ -571,8 +582,8 @@ int32_t SafeBuf::fillFromFile(char *filename) {
reserve(results.st_size+1);

retry:
int32_t fd = open ( filename , O_RDONLY,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH );
int32_t fd = open ( filename , O_RDONLY , getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH );
if ( ! fd ) {
// valgrind
if ( errno == EINTR ) goto retry;
@@ -862,6 +873,22 @@ bool SafeBuf::utf8Encode2(char *s, int32_t len, bool encodeHTML,int32_t nicenes
return htmlEncode(m_length-tmp,niceness);
}

bool SafeBuf::utf32Encode(UChar32* codePoints, int32_t cpLen) {
if(m_encoding != csUTF8) return safePrintf("FIXME %s:%i", __FILE__, __LINE__);

int32_t need = 0;
for(int32_t i = 0; i < cpLen;i++) need += utf8Size(codePoints[i]);
if(!reserve(need)) return false;

for(int32_t i = 0; i < cpLen;i++) {
m_length += ::utf8Encode(codePoints[i], m_buf + m_length);
}

return true;
}

/*
bool SafeBuf::utf32Encode(UChar32 c) {
if(!reserve2x(8)) return false;
@@ -3666,3 +3693,12 @@ bool SafeBuf::hasDigits() {
if ( is_digit(m_buf[i]) ) return true;
return false;
}

int32_t SafeBuf::indexOf(char c) {
char* p = m_buf;
char* pend = m_buf + m_length;
while (p < pend && *p != c) p++;
if (p == pend) return -1;
return p - m_buf;
}
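A quick usage sketch for the pushStr() and indexOf() helpers added above (values illustrative): pushStr() returns a pointer to the NUL-terminated copy inside the buffer, and indexOf() returns a byte offset or -1.

    SafeBuf sb;
    // the NUL is counted in m_length so later appends cannot clobber it
    char *copy = sb.pushStr ( (char *)"foo:bar" , 7 );
    if ( ! copy ) { /* allocation failed */ }
    int32_t pos = sb.indexOf ( ':' );   // 3 here; -1 if the byte is absent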
SafeBuf.h
@@ -259,6 +259,7 @@ public:
int32_t niceness=0);
bool latin1Encode(char *s, int32_t len, bool htmlEncode=false,
int32_t niceness=0);
bool utf32Encode(UChar32* codePoints, int32_t cpLen);
//bool utf16Encode(UChar *s, int32_t len, bool htmlEncode=false);
//bool utf16Encode(char *s, int32_t len, bool htmlEncode=false) {
// return utf16Encode((UChar*)s, len>>1, htmlEncode); };
@@ -327,6 +328,7 @@ public:
return true;
};

int32_t indexOf(char c);

bool safeCdataMemcpy(char *s, int32_t len);
bool pushChar (char i) {
@@ -346,6 +348,7 @@ public:
// hack off trailing 0's
bool printFloatPretty ( float f ) ;

char* pushStr (char* str, uint32_t len);
bool pushPtr ( void *ptr );
bool pushLong (int32_t i);
bool pushLongLong (int64_t i);
Speller.cpp
@@ -1805,7 +1805,8 @@ bool Speller::createUnifiedDict (){
// then open a new one for appending
int fdw = open ( ff ,
O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
getFileCreationFlags());
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fdw < 0 ){
return log("lang: Could not open for %s "
"writing: %s.",ff, strerror(errno));
209
Spider.cpp
@@ -2759,6 +2759,7 @@ int32_t SpiderColl::getNextIpFromWaitingTree ( ) {
// remove all his keys just because we restarted and think he
// is alive even though we have gotten no ping from him.
//if ( hp->m_numPingRequests > 0 )
removeFromTree:
// these operations should fail if writes have been disabled
// and because the trees/tables for spidercache are saving
// in Process.cpp's g_spiderCache::save() call
@@ -2793,7 +2794,15 @@ int32_t SpiderColl::getNextIpFromWaitingTree ( ) {
m_waitingTreeKeyValid = true;
m_scanningIp = firstIp;
// sanity
if ( firstIp == 0 || firstIp == -1 ) { char *xx=NULL;*xx=0; }
if ( firstIp == 0 || firstIp == -1 ) {
//char *xx=NULL;*xx=0; }
log("spider: removing corrupt spiderreq firstip of %"INT32
" from waiting tree collnum=%i",
firstIp,(int)m_collnum);
goto removeFromTree;
}
// avoid corruption

// we set this to true when done
//m_isReadDone = false;
// compute the best request from spiderdb list, not valid yet
@@ -2877,6 +2886,7 @@ void SpiderColl::populateWaitingTreeFromSpiderdb ( bool reentry ) {
if ( m_deleteMyself ) { char *xx=NULL;*xx=0; }
// skip if spiders off
if ( ! m_cr->m_spideringEnabled ) return;
if ( ! g_hostdb.getMyHost( )->m_spiderEnabled ) return;
// skip if udp table is full
if ( g_udpServer.getNumUsedSlotsIncoming() >= MAXUDPSLOTS ) return;
// if entering for the first time, we need to read list from spiderdb
@@ -3160,6 +3170,8 @@ void SpiderColl::populateDoledbFromWaitingTree ( ) { // bool reentry ) {
// since addSpiderRequest() calls addToWaitingTree() which then calls
// this.
if ( ! g_conf.m_spideringEnabled ) return;
if ( ! g_hostdb.getMyHost( )->m_spiderEnabled ) return;

// skip if udp table is full
if ( g_udpServer.getNumUsedSlotsIncoming() >= MAXUDPSLOTS ) return;
@@ -4106,6 +4118,20 @@ bool SpiderColl::scanListForWinners ( ) {
//srep = NULL;
continue;
}
// ignore these to fix diffbot's malformed url bug
if ( tmp->m_errCode == 32880 &&
// and is before about dec 18th 2015
tmp->m_spideredTime < 1450488447 )
continue;
// ignore these to fix diffbot's ebadtitlerec error
// 'bad cached document'.
// ignore them so we can respider the urls and
// the new logic in xmldoc.cpp can ignore them.
// i fixed xmldoc.cpp to index these status docs.
if ( tmp->m_errCode == 32792 &&
// and is before about dec 22nd 2015
tmp->m_spideredTime < 1450897197 )
continue;
// bad langid?
if ( ! getLanguageAbbr (tmp->m_langId) ) {
log("spider: got corrupt 4 spiderReply in "
@@ -4268,7 +4294,18 @@ bool SpiderColl::scanListForWinners ( ) {
m_lastCBlockIp = cblock;

// only add firstip if manually added and not fake

// if ( uh48 == 272628060426254 )
// log("spider: got special seed");

// #undef sleep
// if ( uh48 == 272628060426254 ) {
// log("spider: got special seed");
// bool flag = true;
// sleepLoop:
// sleep(1);
// if ( flag ) goto sleepLoop;
// }
// #define sleep(a) { char *xx=NULL;*xx=0; }

//
// just calculating page counts? if the url filters are based
@@ -5889,6 +5926,8 @@ uint64_t SpiderColl::getSpiderTimeMS ( SpiderRequest *sreq,
if ( ! srep && sreq->m_isInjecting ) return spiderTimeMS;
if ( ! srep && sreq->m_isPageReindex ) return spiderTimeMS;

//log("spider: getting spider time %"INT64, spiderTimeMS);
// to avoid hammering an ip, get last time we spidered it...
int64_t lastMS ;
lastMS = m_lastDownloadCache.getLongLong ( m_collnum ,
@@ -6073,6 +6112,8 @@ bool isAssignedToUs ( int32_t firstIp ) {
// . ignore lower 8 bits of ip since one guy often owns a whole block!
//int32_t hostId=(((uint32_t)firstIp) >> 8) % g_hostdb.getNumHosts();

if( !g_hostdb.getMyHost()->m_spiderEnabled ) return false;

// get our group
//Host *group = g_hostdb.getMyGroup();
Host *shard = g_hostdb.getMyShard();
@@ -6097,22 +6138,30 @@ bool isAssignedToUs ( int32_t firstIp ) {
int32_t i = ((uint32_t)h64) % hpg;
Host *h = &shard[i];
// return that if alive
if ( ! g_hostdb.isDead(h) ) return (h->m_hostId == g_hostdb.m_hostId);
if ( ! g_hostdb.isDead(h) && h->m_spiderEnabled) {
return (h->m_hostId == g_hostdb.m_hostId);
}
// . select another otherwise
// . put all alive in an array now
Host *alive[64];
int32_t upc = 0;

for ( int32_t j = 0 ; j < hpg ; j++ ) {
Host *h = &shard[i];
Host *h = &shard[j];
if ( g_hostdb.isDead(h) ) continue;
if( ! h->m_spiderEnabled ) continue;
alive[upc++] = h;
}
// if none, that is bad! return the first one that we wanted to
if ( upc == 0 ) return (h->m_hostId == g_hostdb.m_hostId);
if ( upc == 0 ) {
log("spider: no hosts can handle spider request for ip=%s", iptoa(firstIp));
return false;
//return (h->m_hostId == g_hostdb.m_hostId);
}
// select from the good ones now
i = ((uint32_t)firstIp) % hpg;
i = ((uint32_t)firstIp) % upc;
// get that
h = &shard[i];
h = alive[i]; //&shard[i];
// guaranteed to be alive... kinda
return (h->m_hostId == g_hostdb.m_hostId);
}
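Distilling the isAssignedToUs() changes above: hosts with spidering disabled are now skipped, the loop that fills alive[] indexes with j (the old i was a bug), and the final pick is taken modulo the number of alive hosts so it always lands inside alive[]. The selection reduces to:

    // distilled from the hunk above
    Host *alive[64];
    int32_t upc = 0;
    for ( int32_t j = 0 ; j < hpg ; j++ ) {
        Host *h = &shard[j];
        if ( g_hostdb.isDead(h) ) continue;
        if ( ! h->m_spiderEnabled ) continue;
        alive[upc++] = h;
    }
    if ( upc == 0 ) return false;                  // nobody can take it
    Host *h = alive [ ((uint32_t)firstIp) % upc ]; // same pick on every host
    return ( h->m_hostId == g_hostdb.m_hostId );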
@@ -6217,7 +6266,11 @@ void SpiderLoop::startLoop ( ) {
// in case host went dead.
// now that we only send the info on startup and if changed,
// let's move back down to 1 second
if ( !g_loop.registerSleepCallback(3000,
// . make it 20 seconds because handlerequestc1 is always on
// profiler when we have thousands of collections
// . let's try 10 seconds so as not to think a job is done when
// it is not
if ( !g_loop.registerSleepCallback(10000,
this,
updateAllCrawlInfosSleepWrapper))
log("build: failed to register updatecrawlinfowrapper");
@@ -6232,6 +6285,8 @@ void doneSleepingWrapperSL ( int fd , void *state ) {

// if spidering disabled then do not do this crap
if ( ! g_conf.m_spideringEnabled ) return;
if ( ! g_hostdb.getMyHost( )->m_spiderEnabled ) return;

//if ( ! g_conf.m_webSpideringEnabled ) return;
// or if trying to exit
if ( g_process.m_mode == EXIT_MODE ) return;
@@ -6250,6 +6305,8 @@ void doneSleepingWrapperSL ( int fd , void *state ) {
return;
}

//if ( g_hostdb.hasDeadHost() ) return;

static int32_t s_count = -1;
// count these calls
s_count++;
@@ -6299,6 +6356,7 @@ void doneSleepingWrapperSL ( int fd , void *state ) {
// if ( ! cr ) continue;
// skip if not enabled
if ( ! crp->m_spideringEnabled ) continue;

// get it
//SpiderColl *sc = cr->m_spiderColl;
SpiderColl *sc = g_spiderCache.getSpiderColl(crp->m_collnum);
@@ -6694,6 +6752,8 @@ void SpiderLoop::spiderDoledUrls ( ) {

// must be spidering to dole out
if ( ! g_conf.m_spideringEnabled ) return;
if ( ! g_hostdb.getMyHost( )->m_spiderEnabled ) return;

// or if trying to exit
if ( g_process.m_mode == EXIT_MODE ) return;
// if we don't have all the url counts from all hosts, then wait.
@@ -7543,7 +7603,7 @@ bool SpiderLoop::gotDoledbList2 ( ) {
// note it
if ( (g_corruptCount % 1000) == 0 )
log("spider: got corrupt doledb record. ignoring. "
"pls fix!!!");
"pls fix!!! cn=%i",(int)m_collnum);
g_corruptCount++;
// skip for now....!! what is causing this???
m_list.skipCurrentRecord();
@@ -8278,7 +8338,13 @@ bool SpiderLoop::spiderUrl2 ( ) {
// count it as a hit
//g_stats.m_spiderUrlsHit++;
// sanity check
if (m_sreq->m_priority <= -1 ) { char *xx=NULL;*xx=0; }
if (m_sreq->m_priority <= -1 ) {
log("spider: fixing bogus spider req priority of %i for "
"url %s",
(int)m_sreq->m_priority,m_sreq->m_url);
m_sreq->m_priority = 0;
//char *xx=NULL;*xx=0;
}
//if(m_sreq->m_priority >= MAX_SPIDER_PRIORITIES){char *xx=NULL;*xx=0;}
// update this
m_sc->m_outstandingSpiders[(unsigned char)m_sreq->m_priority]++;
@@ -9588,7 +9654,10 @@ bool printList ( State11 *st ) {
if ( list->getCurrentRecSize() <= 16 ) { char *xx=NULL;*xx=0;}
// sanity check. requests ONLY in doledb
if ( ! g_spiderdb.isSpiderRequest ( (key128_t *)rec )) {
char*xx=NULL;*xx=0;}
log("spider: not printing spiderreply");
continue;
//char*xx=NULL;*xx=0;
}
// get the spider rec, encapsed in the data of the doledb rec
SpiderRequest *sreq = (SpiderRequest *)rec;
// print it into sbTable
@@ -11428,7 +11497,7 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
if ( langId >= 0 ) { // if ( srep ) {
// this is NULL on corruption
lang = getLanguageAbbr ( langId );//srep->m_langId );
langLen = gbstrlen(lang);
if (lang) langLen = gbstrlen(lang);
}

// . get parent language in the request
@@ -12919,6 +12988,37 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
// skip fast
p += 10;
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}

// EBADURL malformed url is ... 32880
if ( *p=='e' && strncmp(p,"errorcode",9) == 0 ) {
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// skip for msg20
if ( isForMsg20 ) continue;
// reply based
if ( ! srep ) continue;
// shortcut
int32_t a = srep->m_errCode;
// make it point to the retry count
int32_t b = atoi(s);
// compare
if ( sign == SIGN_EQ && a != b ) continue;
if ( sign == SIGN_NE && a == b ) continue;
if ( sign == SIGN_GT && a <= b ) continue;
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
// skip fast
p += 9;
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
@@ -13810,6 +13910,8 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// . TODO: do not update on error???
for ( ; ptr < end ; ptr++ ) {

QUICKPOLL ( slot->m_niceness );

// get collnum
collnum_t collnum = (collnum_t)(ptr->m_collnum);

@@ -13875,6 +13977,12 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// loop over
for ( int32_t x = 0 ; x < g_collectiondb.m_numRecs ; x++ ) {

QUICKPOLL ( slot->m_niceness );

// a niceness 0 routine could have nuked it?
if ( x >= g_collectiondb.m_numRecs )
break;

CollectionRec *cr = g_collectiondb.m_recs[x];
if ( ! cr ) continue;

@@ -13897,20 +14005,35 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
if ( ! cia ) continue;

for ( int32_t k = 0 ; k < g_hostdb.m_numHosts; k++ ) {
QUICKPOLL ( slot->m_niceness );
// get the CrawlInfo for the ith host
CrawlInfo *stats = &cia[k];
// point to the stats for that host
int64_t *ss = (int64_t *)stats;
int64_t *gs = (int64_t *)gi;
// add each hosts counts into the global accumulators
// are stats crazy?
bool crazy = false;
for ( int32_t j = 0 ; j < NUMCRAWLSTATS ; j++ ) {
*gs = *gs + *ss;
// crazy stat?
if ( *ss > 1000000000LL ||
*ss < -1000000000LL )
*ss < -1000000000LL ) {
log("spider: crazy stats %"INT64" "
"from host #%"INT32" coll=%s",
"from host #%"INT32" coll=%s. "
"ignoring.",
*ss,k,cr->m_coll);
crazy = true;
break;
}
ss++;
}
// reset ptr to accumulate
ss = (int64_t *)stats;
for ( int32_t j = 0 ; j < NUMCRAWLSTATS ; j++ ) {
// do not accumulate if corrupted.
// probably mem got corrupted and it saved
// to disk.
if ( crazy ) break;
*gs = *gs + *ss;
gs++;
ss++;
}
@@ -14177,7 +14300,7 @@ void handleRequestc1 ( UdpSlot *slot , int32_t niceness ) {

for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {

QUICKPOLL(MAX_NICENESS);
QUICKPOLL(slot->m_niceness);

CollectionRec *cr = g_collectiondb.m_recs[i];
if ( ! cr ) continue;
@@ -14370,10 +14493,27 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , int32_t *status ) {

uint32_t now = (uint32_t)getTimeGlobal();

// hit crawl round max? this could be SP_ROUNDDONE and it doesn't
// get converted to SP_MAXROUNDS until we call spiderDoledUrls()
// so fix the crawlbot nightly smoke test by setting this here
// to SP_MAXROUNDS.
// smoketest msg = FAIL: testCrawlRounds (__main__.TestRepeatCrawl)
// self.assertEqual(j['jobs'][0]['jobStatus']['status'],1,msg=self.name
// AssertionError: 4 != 1 : 1227151934RepeatCrawlself.
// assertEqual(j['jobs'][0]['jobStatus']['status'],1,msg=self.name)
int32_t spiderStatus = cx->m_spiderStatus;
if ( spiderStatus == SP_ROUNDDONE &&
cx->m_maxCrawlRounds > 0 &&
cx->m_isCustomCrawl &&
cx->m_spiderRoundNum >= cx->m_maxCrawlRounds )
spiderStatus = SP_MAXROUNDS;

// try to fix crawlbot nightly test complaining about job status
// for TestRepeatCrawlWithMaxToCrawl
if ( (cx->m_spiderStatus == SP_MAXTOCRAWL ||
cx->m_spiderStatus == SP_MAXTOPROCESS ) &&
if ( (spiderStatus == SP_MAXTOCRAWL ||
spiderStatus == SP_MAXTOPROCESS ) &&
cx->m_collectiveRespiderFrequency > 0.0 &&
now < cx->m_spiderRoundStartTime &&
cx->m_spiderRoundNum >= cx->m_maxCrawlRounds ) {
@@ -14384,7 +14524,7 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , int32_t *status ) {

// . 0 means not to RE-crawl
// . indicate if we are WAITING for next round...
if ( cx->m_spiderStatus == SP_MAXTOCRAWL &&
if ( spiderStatus == SP_MAXTOCRAWL &&
cx->m_collectiveRespiderFrequency > 0.0 &&
now < cx->m_spiderRoundStartTime ) {
*status = SP_ROUNDDONE;
@@ -14395,7 +14535,7 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , int32_t *status ) {
now));
}

if ( cx->m_spiderStatus == SP_MAXTOPROCESS &&
if ( spiderStatus == SP_MAXTOPROCESS &&
cx->m_collectiveRespiderFrequency > 0.0 &&
now < cx->m_spiderRoundStartTime ) {
*status = SP_ROUNDDONE;
@@ -14407,19 +14547,19 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , int32_t *status ) {
}

if ( cx->m_spiderStatus == SP_MAXTOCRAWL ) {
if ( spiderStatus == SP_MAXTOCRAWL ) {
*status = SP_MAXTOCRAWL;
return msg->safePrintf ( "Job has reached maxToCrawl "
"limit." );
}

if ( cx->m_spiderStatus == SP_MAXTOPROCESS ) {
if ( spiderStatus == SP_MAXTOPROCESS ) {
*status = SP_MAXTOPROCESS;
return msg->safePrintf ( "Job has reached maxToProcess "
"limit." );
}

if ( cx->m_spiderStatus == SP_MAXROUNDS ) {
if ( spiderStatus == SP_MAXROUNDS ) {
*status = SP_MAXROUNDS;
return msg->safePrintf ( "Job has reached maxRounds "
"limit." );
@@ -14453,7 +14593,7 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , int32_t *status ) {
// return msg->safePrintf("Crawl is waiting for urls.");
//}

if ( cx->m_spiderStatus == SP_INITIALIZING ) {
if ( spiderStatus == SP_INITIALIZING ) {
*status = SP_INITIALIZING;
return msg->safePrintf("Job is initializing.");
}
@@ -14479,7 +14619,7 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , int32_t *status ) {
"repeat is scheduled.");
}

if ( cx->m_spiderStatus == SP_ROUNDDONE && ! cx->m_isCustomCrawl ) {
if ( spiderStatus == SP_ROUNDDONE && ! cx->m_isCustomCrawl ) {
*status = SP_ROUNDDONE;
return msg->safePrintf ( "Nothing currently "
"available to spider. "
@@ -14502,7 +14642,7 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , int32_t *status ) {
}

if ( cx->m_spiderStatus == SP_ROUNDDONE ) {
if ( spiderStatus == SP_ROUNDDONE ) {
*status = SP_ROUNDDONE;
return msg->safePrintf ( "Job round completed.");
}
@@ -14755,12 +14895,21 @@ bool SpiderRequest::isCorrupt ( ) {
}

// sanity check. check for http(s)://
if ( m_url[0] != 'h' &&
// might be a docid from a pagereindex.cpp
! is_digit(m_url[0]) ) {
if ( m_url[0] == 'h' )
return false;
// might be a docid from a pagereindex.cpp
if ( ! is_digit(m_url[0]) ) {
log("spider: got corrupt 1 spiderRequest");
return true;
}
// if it is a digit\0 it is ok, not corrupt
if ( ! m_url[1] )
return false;
// if it is not a digit after the first digit, that is bad
if ( ! is_digit(m_url[1]) ) {
log("spider: got corrupt 2 spiderRequest");
return true;
}

return false;
}
SpiderProxy.cpp
@@ -143,7 +143,10 @@ bool buildProxyTable ( ) {
*s = '\0';
log("buf: %s for %s",msg,p);
*s = c;
return false;
//return false;
// advance p
p = s;
continue;
}

// convert it
@@ -706,6 +709,7 @@ void handleRequest54 ( UdpSlot *udpSlot , int32_t niceness ) {
int32_t hslot = s_loadTable.getSlot ( &urlIp );
// scan all proxies that have this urlip outstanding
for ( int32_t i = hslot ; i >= 0 ; i = s_loadTable.getNextSlot(i,&urlIp)){
QUICKPOLL(niceness);
// get the bucket
LoadBucket *lb;
lb = (LoadBucket *)s_loadTable.getValueFromSlot(i);
@@ -736,6 +740,7 @@ void handleRequest54 ( UdpSlot *udpSlot , int32_t niceness ) {
// get the min of the counts
int32_t minCount = 999999;
for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
QUICKPOLL(niceness);
// skip empty slots
if ( ! s_iptab.m_flags[i] ) continue;
// get the spider proxy
@@ -824,6 +829,7 @@ void handleRequest54 ( UdpSlot *udpSlot , int32_t niceness ) {
int32_t slotCount = s_iptab.getNumSlots();
// . now find the best proxy with the minCount
for ( int32_t i = start ; ; i++ ) {
QUICKPOLL(niceness);
// scan all slots in hash table, then stop
if ( slotCount-- <= 0 ) break;
// wrap around to zero if we hit the end
@@ -896,8 +902,8 @@ void handleRequest54 ( UdpSlot *udpSlot , int32_t niceness ) {
static int32_t s_lbid = 0;
// add it now, iff not for passing to diffbot backend
if ( preq->m_opCode != OP_GETPROXYFORDIFFBOT ) {
s_loadTable.addKey ( &urlIp , &bb );
bb.m_id = s_lbid++;
s_loadTable.addKey ( &urlIp , &bb );
// winner count update
winnersp->m_timesUsed++;
}
@@ -931,12 +937,29 @@ void handleRequest54 ( UdpSlot *udpSlot , int32_t niceness ) {
// and the loadbucket id
//*(int32_t *)p = bb.m_id; p += 4;

//int32_t sanityCount = 0;//s_loadTable.getNumSlots();
// top:
// with dup keys we end up with long chains of crap and this
// takes forever. so just flush the whole thing every 2 minutes AND
// when 20000+ entries are in there
static time_t s_lastTime = 0;
time_t now = nowms / 1000;
if ( s_lastTime == 0 ) s_lastTime = now;
time_t elapsed = now - s_lastTime;
if ( elapsed > 120 && s_loadTable.getNumSlots() > 10000 ) {
log("sproxy: flushing %i entries from proxy loadtable that "
"have accumulated since %i seconds ago",
(int)s_loadTable.m_numSlotsUsed,(int)elapsed);
s_loadTable.clear();
// only do this one per minute
s_lastTime = now;
}

int32_t sanityCount = 0;//s_loadTable.getNumSlots();
// top:
// now remove old entries from the load table. entries that
// have completed and have a download end time more than 10 mins ago
for ( int32_t i = 0 ; i < s_loadTable.getNumSlots() ; i++ ) {
// have completed and have a download end time more than 10 mins ago.
for ( int32_t i = s_loadTable.getNumSlots() - 1 ; i >= 0 ; i-- ) {
QUICKPOLL(niceness);
// skip if empty
if ( ! s_loadTable.m_flags[i] ) continue;
// get the bucket
@@ -948,8 +971,8 @@ void handleRequest54 ( UdpSlot *udpSlot , int32_t niceness ) {
// < 10 mins? now it's < 15 seconds to prevent clogging.
if ( took < LOADPOINT_EXPIRE_MS ) continue;

// 100 at a time
//if ( sanityCount++ > 100 ) break;
// 100 at a time so we don't slam cpu
if ( sanityCount++ > 100 ) break;

// ok, its too old, nuke it to save memory
s_loadTable.removeSlot(i);
@@ -957,7 +980,7 @@ void handleRequest54 ( UdpSlot *udpSlot , int32_t niceness ) {
// miss out on analyzing any keys if we just keep looping here
// should we? TODO: figure it out. if we miss a few it's not
// a big deal.
i--;
//i--;
//goto top;
}
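Note the expiry loop above now also walks the table from the top slot down, presumably so that any compaction done by removeSlot() disturbs only slots already visited. The general pattern, independent of HashTableX (isExpired() is a hypothetical placeholder for the LOADPOINT_EXPIRE_MS test):

    // scan downward so removals cannot shift an unvisited slot under us;
    // cap the work per call so one request never hogs the cpu
    int32_t budget = 100;
    for ( int32_t i = table.getNumSlots() - 1 ; i >= 0 ; i-- ) {
        if ( ! table.m_flags[i] ) continue;   // empty slot
        if ( ! isExpired ( i ) ) continue;    // still within expiry window
        if ( budget-- <= 0 ) break;           // 100 at a time
        table.removeSlot ( i );
    }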
77
Statsdb.cpp
@@ -65,21 +65,19 @@ static Label s_labels[] = {
// . 300MB/s is max read rate regardless to stop graph shrinkage
// . use 1KB as the min resolution per pixel
// . stored in Bps so use 1/1000 as scalar to get into KBps
{ GRAPH_QUANTITY,200,"disk_read",1,"%.0f MBps",1.0/(1000.0*1000.0),0x000000,
"disk read"},
{ GRAPH_QUANTITY,200,"disk_read",1,"%.0f MBps",1.0/(1000.0*1000.0),0x000000,"disk read"},

// . 300MB/s is max write rate regardless to stop graph shrinkage
// . use 1KB as the min resolution per pixel
// . stored in Bps so use 1/1000 as scalar to get into KBps
{GRAPH_QUANTITY,200,"disk_write",1,"%.0f Mbps",1.0/(1000.0*1000.0), 0xff0000,
"disk write"},
{GRAPH_QUANTITY,200,"disk_write",1,"%.0f Mbps",1.0/(1000.0*1000.0), 0xff0000, "disk write"},

// . 20 is the max dps regardless to stop graph shrinkage
// . use .03 qps as the min resolution per pixel
{GRAPH_OPS,20,"parse_doc", .005,"%.1f dps" , 1.0 , 0x00fea915,"parsed doc" },

{GRAPH_QUANTITY_PER_OP,1000,"docs_per_second", .005,"%.1f docs" , .001 , 0x1F2F5C,"docs per second" },
{GRAPH_QUANTITY_PER_OP,-1,"docs_per_second", .1,"%.1f docs per second" , -1 , 0x1F2F5C,"*successfully* indexed docs per second" },

// . use .1 * 1000 docs as the min resolution per pixel
// . max = -1, means dynamic size the ymax!
@@ -88,7 +86,7 @@ static Label s_labels[] = {
// . make it 2M now not 50M. seems like it is per pixel and there's
// like 1000 pixels vertically. but we need to autoscale it
// eventually
{GRAPH_QUANTITY,2000000.0,"docs_indexed", .1,"%.0fK docs" , .001 , 0x00cc0099,"docs indexed" }
{GRAPH_QUANTITY,-1,"docs_indexed", .1,"%.0f docs" , -1, 0x00cc0099,"docs indexed" }

//{ "termlist_intersect",0x0000ff00},
@@ -122,6 +120,7 @@ Label *Statsdb::getLabel ( int32_t labelHash ) {
return *label;
}

Statsdb::Statsdb ( ) {
m_init = false;
m_disabled = true;
@@ -246,6 +245,8 @@ void flushStatsWrapper ( int fd , void *state ) {
void Statsdb::addDocsIndexed ( ) {

if ( ! isClockInSync() ) return;
if ( g_hostdb.hasDeadHost() ) return;

// only host #0 needs this
if ( g_hostdb.m_hostId != 0 ) return;
@@ -270,18 +271,23 @@ void Statsdb::addDocsIndexed ( ) {
// divide by # of groups
total /= g_hostdb.getNumHostsPerShard();
// skip if no change

if ( total == s_lastTotal ) return;

int32_t docsIndexedInInterval = total - s_lastTotal;
float docsPerSecond = docsIndexedInInterval / (float)interval;

s_lastTotal = total;
log("build: total docs indexed: %f. docs per second %f %i %i", (float)total, docsPerSecond, docsIndexedInInterval, interval);

// add it if changed though
int64_t nowms = gettimeofdayInMillisecondsGlobal();
addStat ( MAX_NICENESS,"docs_indexed", nowms, nowms, (float)total );
addStat ( MAX_NICENESS,"docs_per_second", nowms, nowms, docsPerSecond );
// Prevent a datapoint which adds all of the docs indexed to date.
if( s_lastTotal != 0 ) {
addStat ( MAX_NICENESS,"docs_per_second", nowms, nowms, docsPerSecond );
}

s_lastTotal = total;
}
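Worked through, the rate above is just the delta of an absolute counter over the sampling interval; assuming, say, s_lastTotal = 11900, total = 12000 and interval = 1 (second):

    // (12000 - 11900) / 1.0 = 100.0 docs per second for this sample;
    // the s_lastTotal != 0 guard skips the very first sample, which would
    // otherwise report every doc ever indexed as one interval's burst
    int32_t docsIndexedInInterval = total - s_lastTotal;   // 100
    float docsPerSecond = docsIndexedInInterval / (float)interval;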
// . m_key bitmap in statsdb:
@@ -896,12 +902,13 @@ char *Statsdb::plotGraph ( char *pstart ,
bool needMax = true;
float ymin = 0.0;
float ymax = 0.0;

float yscalar = label->m_yscalar;
char *p = pstart;

for ( ; p < pend ; p += 12 ) {
// breathe
QUICKPOLL ( m_niceness );
if ( m_gw.getLength() > 10000000 ) break;
// get the y
float y2 = *(float *)(p+4);
// get color of this point
@@ -909,7 +916,8 @@ char *Statsdb::plotGraph ( char *pstart ,
// stop if not us
if ( gh != graphHash ) continue;
// put into scaled space right away
y2 = y2 * label->m_yscalar;
if (label->m_yscalar >= 0)
y2 = y2 * label->m_yscalar;
// . limit y to absolute max
// . these units should be scaled as well!
if ( y2 > label->m_absYMax && label->m_absYMax > 0.0 )
@@ -922,13 +930,21 @@ char *Statsdb::plotGraph ( char *pstart ,
}

// force to zero for now
ymin = 0.0;
//ymin = 0.0;
// . and force to ymax for now as well
// . -1 indicates dynamic though!
if ( label->m_absYMax > 0.0 ) ymax = label->m_absYMax;
// add a 20% ceiling
else ymax *= 1.20;
// else ymax *= 1.20;

if( label->m_yscalar <= 0 ) {
if(ymax == ymin) {
yscalar = 0;
} else {
yscalar = (float)DY2 / (ymax - ymin);
}
}
// return that!
char *retp = p;

@@ -951,7 +967,7 @@ char *Statsdb::plotGraph ( char *pstart ,

// . pad y range if total range is small
// . only do this for certain types of stats, like qps and disk i/o
if ( ourDiff < minDiff ) {
if ( label->m_yscalar >=0 && ourDiff < minDiff ) {
float pad = (minDiff - ourDiff) / 2;
// pad it out
ymin -= pad ;
@@ -981,16 +997,23 @@ char *Statsdb::plotGraph ( char *pstart ,
for ( ; p < pend ; ) {
// breathe
QUICKPOLL ( m_niceness );
if ( m_gw.getLength() > 10000000 ) break;
// first is x pixel pos
int32_t x2 = *(int32_t *)p; p += 4;
// then y pos
float y2 = *(float *)p; p += 4;

// scale it right away
y2 *= label->m_yscalar;
if(label->m_yscalar < 0) {
y2 = (y2 - ymin) * yscalar;
}
else {
y2 *= yscalar;
}
// adjust
if ( y2 > ymax ) y2 = ymax;
if ( y2 < 0 ) y2 = 0;

// then graphHash
int32_t gh = *(int32_t *)p; p += 4;
@@ -1003,8 +1026,10 @@ char *Statsdb::plotGraph ( char *pstart ,
float y1 = lasty;

// normalize y into pixel space
y2 = ((float)DY2 * (y2 - ymin)) / (ymax-ymin);

if(label->m_yscalar >= 0 && ymax != ymin) {
y2 = ((float)DY2 * (y2 - ymin)) / (ymax-ymin);
}

// set lasts for next iteration of this loop
lastx = x2;
lasty = y2;
@@ -1073,13 +1098,20 @@ char *Statsdb::plotGraph ( char *pstart ,
}

float lastZ = -1;
for ( float z = ymin ; z < ymax ; z += deltaz ) {
// breathe
QUICKPOLL ( m_niceness );
// draw it
drawHR ( z , ymin , ymax , m_gw , label , zoff , color );
if(z == lastZ) break;
lastZ = z;
//if ( m_gw.getLength() > 10000000 ) break;
}

if ( m_gw.getLength() > 10000000 )
log("statsdb: graph too big");

return retp;
//#endif

@@ -1158,7 +1190,7 @@ void Statsdb::drawHR ( float z ,
"font-size:14px;"
"min-height:20px;"
"min-width:3px;\""
" class=\"color-%"XINT32"\";"
" class=\"color-%"XINT32"\""
">%s</div>\n"
, (int32_t)(m_bx)
, (int32_t)z2 +m_by
@@ -1194,6 +1226,13 @@ bool Statsdb::processList ( ) {
m_done = true;
}

// HACK: the user can request all of the events, it can
// become quite large. so limit to 100 mb right now.
if( m_sb3.length() > 100000000) {
log("statsdb: truncating statsdb results.");
m_done = true;
}

//
// all these points are accumulated into 1-second buckets
@@ -1590,7 +1629,7 @@ void Statsdb::drawLine3 ( SafeBuf &sb ,
"z-index:-5;"
"min-height:%"INT32"px;"
"min-width:%"INT32"px;\""
"class=\"color-%"XINT32"\"></div>\n"
" class=\"color-%"XINT32"\"></div>\n"
, x1 + m_bx
, (fy1 - width/2) + m_by
, color
@@ -1599,3 +1638,5 @@ void Statsdb::drawLine3 ( SafeBuf &sb ,
, color
);
}
23
Tagdb.cpp
@@ -2803,24 +2803,15 @@ bool Msg8a::launchGetRequests ( ) {
//uint32_t gid = g_hostdb.getGroupId ( m_rdbId , &startKey , true );
//Host *group = g_hostdb.getGroup ( gid );
int32_t shardNum = getShardNum ( m_rdbId , &startKey );//, true );
Host *group = g_hostdb.getShard ( shardNum );

//int32_t numTwins = g_hostdb.getNumHostsPerShard();
// use top byte!
uint8_t *sks = (uint8_t *)&startKey;
uint8_t top = sks[sizeof(TAGDB_KEY)-1];
//int32_t hostNum = 0;
//if ( numTwins == 2 && (top & 0x80) ) hostNum = 1;
// TODO: fix this!
//if ( numTwins >= 3 ) { char *xx=NULL;*xx=0; }
// support more than 2 stripes now...
int32_t hostNum = top % g_hostdb.getNumHostsPerShard();
int32_t hostId = group[hostNum].m_hostId;

Host *firstHost ;
// if niceness 0 can't pick noquery host.
// if niceness 1 can't pick nospider host.
firstHost = g_hostdb.getLeastLoadedInShard ( shardNum , m_niceness );
int32_t firstHostId = firstHost->m_hostId;

// . launch this request, even if to ourselves
// . TODO: just use msg0!!
bool status = m->getList ( hostId , // hostId
bool status = m->getList ( firstHostId , // hostId
0 , // ip
0 , // port
0 , // maxCacheAge
@@ -2837,7 +2828,7 @@ bool Msg8a::launchGetRequests ( ) {
true , // error correction?
true , // include tree?
true , // doMerge?
-1 , // firstHostId
firstHostId , // firstHostId
0 , // startFileNum
-1 , // numFiles
3600*24*365 );// timeout
TcpServer.cpp
@@ -2918,6 +2918,67 @@ int TcpServer::sslHandshake ( TcpSocket *s ) {
SSL_set_connect_state(s->m_ssl);
}

// . set hostname for SNI (Server Name Identification)
// . can test with page parser on the test page: https://sni.velox.ch/
// . we can parse the mime reliably here because we are the ones
// that created the request, so we know it should be standardish.
if ( s->m_sendBuf && ! s->m_readBuf ) {
// grab hostname from the mime
// skip first line
char *p = s->m_sendBuf;
char *pend = p + s->m_sendBufSize;
if ( p+10 >= pend )
goto skipSNI;
bool gotIt = false;
if ( p[0] == 'G' && p[1] == 'E' && p[2] == 'T' && p[3]==' ' )
gotIt = true;
if ( p[0] == 'P' && p[1] == 'O' && p[2] == 'S' && p[3]=='T' &&
p[4] == ' ' )
gotIt = true;
// need to start with "GET " or "POST "
if ( ! gotIt )
goto skipSNI;
scanMimeSomeMore:
// skip to the first \r, indicating end of line
for ( ; p < pend && *p != '\r' ; p++ );
// if we couldn't find it, then there's no Host: directive
if ( p == pend )
goto skipSNI;
// skip \r\n
if ( *p == '\r' )
p++;
if ( p == pend )
goto skipSNI;
if ( *p == '\n' )
p++;
if ( p == pend )
goto skipSNI;
// end of mime (\r\n\r\n)
if ( p+2<pend && p[0] == '\r' && p[1] == '\n' )
goto skipSNI;
// is it host:?
if ( p+6 >= pend )
goto skipSNI;
if ( strncasecmp(p,"Host:",5) )
goto scanMimeSomeMore;
p += 5;
if ( p<pend && *p == ' ' ) p++;
if ( p<pend && *p == ' ' ) p++;
char *hostname = p;
// find end of line
for ( ; p<pend && *p != '\r' ; p++ );
if ( p == pend )
goto skipSNI;
// temp null
char c = *p;
*p = '\0';
/// @todo what if we can't set TLS servername extension?
SSL_set_tlsext_host_name(s->m_ssl, hostname );
// replace the \0 with original char
*p = c;
}
skipSNI:

// SSL_connect() calls malloc()
g_inMemFunction = true;
int r = SSL_connect(s->m_ssl);
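For context, SSL_set_tlsext_host_name() is stock OpenSSL; stripped of the mime scanning above, the essential client-side SNI sequence is just (hostname illustrative, taken from the test page mentioned in the comments):

    // minimal client-side SNI with OpenSSL, assuming ctx already exists
    SSL *ssl = SSL_new ( ctx );
    SSL_set_connect_state ( ssl );
    SSL_set_tlsext_host_name ( ssl , "sni.velox.ch" ); // servername extension
    int r = SSL_connect ( ssl ); // the ClientHello now carries the hostname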
58
Threads.cpp
58
Threads.cpp
@ -320,7 +320,7 @@ bool Threads::init ( ) {
|
||||
// i raised since global specs new servers have 2 (hyperthreaded?) cpus
int32_t max = g_conf.m_maxCpuThreads;
if ( max < 1 ) max = 1;
if ( ! g_threads.registerType ( INTERSECT_THREAD,max,200) )
if ( ! g_threads.registerType ( INTERSECT_THREAD,max,10) )
return log("thread: Failed to register thread type." );
// filter thread spawned to call popen() to filter an http reply
if ( ! g_threads.registerType ( FILTER_THREAD, 2/*maxThreads*/,300) )
@ -334,10 +334,10 @@ bool Threads::init ( ) {
// it was taking forever to go one at a time through the unlink
// thread queue. seemed like a 1 second space between unlinks.
// 1/23/2014
if ( ! g_threads.registerType ( UNLINK_THREAD,30/*maxThreads*/,3000) )
if ( ! g_threads.registerType ( UNLINK_THREAD,5/*maxThreads*/,3000) )
return log("thread: Failed to register thread type." );
// generic multipurpose
if ( ! g_threads.registerType (GENERIC_THREAD,100/*maxThreads*/,100) )
if ( ! g_threads.registerType (GENERIC_THREAD,20/*maxThreads*/,100) )
return log("thread: Failed to register thread type." );
// for calling SSL_accept() which blocks for 10ms even when socket
// is non-blocking...
@ -435,6 +435,13 @@ int32_t Threads::getNumWriteThreadsOut() {
return m_threadQueues[DISK_THREAD].getNumWriteThreadsOut();
}

int32_t Threads::getNumActiveWriteUnlinkRenameThreadsOut() {
// these do not count threads that are done and are just awaiting join
int32_t n = m_threadQueues[DISK_THREAD].getNumWriteThreadsOut();
n += m_threadQueues[UNLINK_THREAD].getNumActiveThreadsOut();
return n;
}

// . returns false (and may set errno) if failed to launch a thread
// . returns true if thread added to queue successfully
// . may be launched instantly or later depending on # of threads in the queue
@ -853,6 +860,19 @@ bool ThreadQueue::init ( char threadType, int32_t maxThreads, int32_t maxEntries
return true;
}

int32_t ThreadQueue::getNumActiveThreadsOut() {
int32_t n = 0;
for ( int32_t i = 0 ; i < m_maxEntries ; i++ ) {
ThreadEntry *e = &m_entries[i];
if ( ! e->m_isOccupied ) continue;
if ( ! e->m_isLaunched ) continue;
// if it is done and just waiting for a join, do not count
if ( e->m_isDone ) continue;
n++;
}
return n;
}

int32_t ThreadQueue::getNumThreadsOutOrQueued() {
// MDW: we also need to count threads that are returned but need their
// callback called so, in the case of RdbDump, the rdblist that was written
@ -1108,6 +1128,7 @@ int32_t Threads::timedCleanUp (int32_t maxTime, int32_t niceness) {
return 0;

if ( ! m_needsCleanup ) return 0;

//if ( g_inSigHandler ) return 0;
int64_t startTime = gettimeofdayInMillisecondsLocal();
int64_t took = 0;
@ -1299,7 +1320,15 @@ bool ThreadQueue::timedCleanUp ( int32_t maxNiceness ) {
// . join up with that thread
// . damn, sometimes he can block forever on his
// call to sigqueue(),
int64_t startTime = gettimeofdayInMillisecondsLocal();
int64_t took;
int32_t status = pthread_join ( t->m_joinTid , NULL );
took = gettimeofdayInMillisecondsLocal() - startTime;
if ( took > 50 ) {
log("threads: pthread_join took %i ms",
(int)took);
}

if ( status != 0 ) {
log("threads: pthread_join %"INT64" = %s (%"INT32")",
(int64_t)t->m_joinTid,mstrerror(status),
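
A note on the join timing above: elapsed wall-clock time is end time minus start time, so the delta must be taken in that order or every slow join reports a negative duration and the 50 ms warning never fires. A minimal sketch of the pattern, assuming this codebase's gettimeofdayInMillisecondsLocal() and mstrerror() helpers:

        int64_t t0 = gettimeofdayInMillisecondsLocal();
        int status = pthread_join ( tid , NULL );
        // end minus start, so a slow join yields a positive delta
        int64_t elapsed = gettimeofdayInMillisecondsLocal() - t0;
        if ( elapsed > 50 )
                log("threads: pthread_join took %i ms",(int)elapsed);
        if ( status != 0 )
                log("threads: pthread_join = %s",mstrerror(status));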
@ -2088,7 +2117,8 @@ bool ThreadQueue::launchThread2 ( ) {

if ( m_threadType != DISK_THREAD ) {
// if one thread of this type is already out, forget it
if ( m_launchedHead ) return false;
// then we can't have 100 GENERIC THREADS!!! with this...
//if ( m_launchedHead ) return false;
// first try niceness 0 queue
ThreadEntry **bestHeadPtr = &m_waitHead0;
ThreadEntry **bestTailPtr = &m_waitTail0;
@ -3315,3 +3345,23 @@ void Threads::printState() {
}
}
}

void ThreadQueue::killAllThreads ( ) {
for ( int32_t i = 0 ; i < m_maxEntries ; i++ ) {
ThreadEntry *e = &m_entries[i];
if ( ! e->m_isOccupied ) continue;
if ( ! e->m_isLaunched ) continue;
log("threads: killing thread id %i",(int)e->m_joinTid);
pthread_kill ( e->m_joinTid , SIGKILL );
log("threads: joining with thread id %i",(int)e->m_joinTid);
pthread_join ( e->m_joinTid , NULL );
}
}

void Threads::killAllThreads ( ) {
log("threads: killing all threads");
for ( int32_t j = 0 ; j < m_numQueues ; j++ ) {
ThreadQueue *tq = &m_threadQueues[j];
tq->killAllThreads();
}
}
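
One caveat on killAllThreads(): POSIX delivers a pthread_kill() signal to the named thread, but the default disposition of SIGKILL still terminates the entire process, so this path is only suitable as a last-resort shutdown. A gentler per-thread teardown is sketched below using pthread_cancel(); this is an illustration, not part of the merge:

        void ThreadQueue::cancelAllThreads ( ) {
                for ( int32_t i = 0 ; i < m_maxEntries ; i++ ) {
                        ThreadEntry *e = &m_entries[i];
                        if ( ! e->m_isOccupied ) continue;
                        if ( ! e->m_isLaunched ) continue;
                        // ask the thread to exit at its next cancellation point
                        pthread_cancel ( e->m_joinTid );
                        // reap it so its stack and TLS are released
                        pthread_join ( e->m_joinTid , NULL );
                }
        }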
@ -161,6 +161,7 @@ class ThreadQueue {

int32_t getNumThreadsOutOrQueued();
int32_t getNumWriteThreadsOut() ;
int32_t getNumActiveThreadsOut() ;

// . for adding an entry
@ -196,6 +197,8 @@ class ThreadQueue {
void suspendLowPriorityThreads();
void resumeLowPriorityThreads();

void killAllThreads();

// this is true if low priority threads are temporarily suspended
bool m_isLowPrioritySuspended ;

@ -246,6 +249,8 @@ class Threads {
bool areThreadsDisabled() { return m_disabled; };
bool areThreadsEnabled () { return ! m_disabled; };

void killAllThreads();

// . returns false and sets errno if thread launch failed
// . returns true on success
// . when thread is done a signal will be put on the g_loop's
@ -301,6 +306,8 @@ class Threads {
int32_t getNumThreadsOutOrQueued();
int32_t getNumWriteThreadsOut() ;

int32_t getNumActiveWriteUnlinkRenameThreadsOut() ;

// counts the high/low priority (niceness <= 0) threads
//int64_t m_hiLaunched;
//int64_t m_hiReturned;

@ -286,6 +286,7 @@ bool UdpServer::init ( uint16_t port, UdpProtocol *proto, int32_t niceness,
// no requests waiting yet
m_requestsInWaiting = 0;
// special count
m_msg07sInWaiting = 0;
m_msg10sInWaiting = 0;
m_msgc1sInWaiting = 0;
//m_msgDsInWaiting = 0;
@ -1005,7 +1006,7 @@ UdpSlot *UdpServer::getBestSlotToSend ( int64_t now ) {
UdpSlot *maxi = NULL;
int32_t score;
//UdpSlot *slot;
// . we send dgrams with the lowest "score" first
// . the "score" is just number of ACKs you're waiting for
// . that way transmissions that are the most caught up to their ACKs
// are considered faster so we send to them first
@ -1482,6 +1483,9 @@ int32_t UdpServer::readSock_ass ( UdpSlot **slotPtr , int64_t now ) {
// rate, these are pretty lightweight. msg 0x10 reply gen times
// are VERY low. MDW
bool getSlot = true;
if ( msgType == 0x07 && m_msg07sInWaiting >= 100 )
getSlot = false;

if ( msgType == 0x10 && m_msg10sInWaiting >= 50 )
getSlot = false;
// crawl update info from Spider.cpp
@ -1671,6 +1675,7 @@ int32_t UdpServer::readSock_ass ( UdpSlot **slotPtr , int64_t now ) {
// if we connected to a request slot, count it
m_requestsInWaiting++;
// special count
if ( msgType == 0x07 ) m_msg07sInWaiting++;
if ( msgType == 0x10 ) m_msg10sInWaiting++;
if ( msgType == 0xc1 ) m_msgc1sInWaiting++;
//if ( msgType == 0xd ) m_msgDsInWaiting++;
@ -3122,6 +3127,7 @@ void UdpServer::destroySlot ( UdpSlot *slot ) {
// one less request in waiting
m_requestsInWaiting--;
// special count
if ( slot->m_msgType == 0x07 ) m_msg07sInWaiting--;
if ( slot->m_msgType == 0x10 ) m_msg10sInWaiting--;
if ( slot->m_msgType == 0xc1 ) m_msgc1sInWaiting--;
//if ( slot->m_msgType == 0xd ) m_msgDsInWaiting--;

@ -390,6 +390,7 @@ class UdpServer {
int32_t m_requestsInWaiting;

// like m_requestsInWaiting but requests which spawn other requests
int32_t m_msg07sInWaiting;
int32_t m_msg10sInWaiting;
int32_t m_msgc1sInWaiting;
//int32_t m_msgDsInWaiting;

@ -1280,6 +1280,8 @@ bool UdpSlot::readDatagramOrAck ( int sock ,
}
// handle acks
if ( m_proto->isAck ( peek , peekSize ) ) {
// if ack for msg4 core to test its save stuff
//if ( m_msgType == 0x04 ) { char *xx=NULL;*xx=0; }
readAck ( sock, dgramNum , now );
// keep stats
if ( m_host ) m_host->m_dgramsFrom++;

20
UdpSlot.h
@ -10,6 +10,12 @@
#include "UdpProtocol.h"
#include "Hostdb.h"

// i'm seeing some networks not liking big dgrams, so
// lets go super small. we won't be able to send back
// huge msgs unfortunately, so we'll have to fix that
// a different way later.
#define SMALLDGRAMS

// . we want to avoid the overhead of IP level fragmentation
// . so for an MTU of 1500 we got 28 bytes overhead (IP and UDP headers)
// . later we can try large DGRAM_SIZE values to see if faster
@ -19,9 +25,9 @@
//#define DGRAM_SIZE 7500
//#define DGRAM_SIZE ((1500-28)*5)
// this was the most stable size, but now, 4/8/04, i'm trying bigger...
#ifdef _SMALLDGRAMS_
#ifdef SMALLDGRAMS
// newspaperarchive machines need this smaller size
#define DGRAM_SIZE (1500-28)
#define DGRAM_SIZE (1500-28-10)
#else
// . here's the new size, 4/8/04, about 20x bigger
// . only use this for our machines
@ -30,10 +36,11 @@
// . let's see if smaller dgrams fix the ping spike problem on gk0c
// . this is in addition to lower the ack windows from 12 to 4
#define DGRAM_SIZE 16400
#endif

// . the 45k dgram doesn't travel well over the internet, and javier needs
// to do that for the "interface client" code
#define DGRAM_SIZE_INTERNET (1500-28)
#endif
#define DGRAM_SIZE_INTERNET (1500-28-10)

// i'd like to have less dgram to decrease interrupts and
// to decrease the MAX_DGRAMS define which decreases UdpSlot size
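
The arithmetic behind these defines: a 1500-byte Ethernet MTU minus the 28 bytes of IPv4 (20) and UDP (8) headers leaves 1472 bytes of payload per unfragmented datagram, and the new values shave a further 10 bytes of headroom, giving 1462. A compile-time restatement of that reasoning (C++11 sketch, not part of the diff):

        enum {
                MTU         = 1500 ,
                IP_UDP_HDRS = 28 ,   // 20-byte IPv4 header + 8-byte UDP header
                HEADROOM    = 10     // extra slack the new defines subtract
        };
        static_assert ( MTU - IP_UDP_HDRS            == 1472 , "payload per MTU" );
        static_assert ( MTU - IP_UDP_HDRS - HEADROOM == 1462 , "new DGRAM_SIZE" );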
@ -76,10 +83,11 @@
// raised from 50MB to 80MB so Msg13 compression proxy can send back big replies > 5MB
// raised from 80MB to 180MB since we could be sending back a Msg95Reply
// which is a list of QueryChanges. 3/29/13.
#define MAX_DGRAMS (((180*1024*1024) / DGRAM_SIZE_LB) + 1)
//#define MAX_DGRAMS (((180*1024*1024) / DGRAM_SIZE_LB) + 1)
#define MAX_DGRAMS (((80*1024*1024) / DGRAM_SIZE) + 1)
//#endif

#define MAX_ABSDOCLEN ((MAX_DGRAMS * DGRAM_SIZE_LB)-50000)
#define MAX_ABSDOCLEN ((MAX_DGRAMS * DGRAM_SIZE)-50000)

// . the max size of an incoming request for a hot udp server
// . we cannot call malloc so it must fit in here
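
Plugging the numbers in: with SMALLDGRAMS defined, DGRAM_SIZE is 1462, so the new MAX_DGRAMS is (80*1024*1024)/1462 + 1 = 57378 and MAX_ABSDOCLEN is 57378*1462 - 50000 = 83836636 bytes, i.e. just under 80MB per document. A sketch that prints the derived limits (illustrative only):

        #include <cstdio>
        int main ( ) {
                const long dgramSize = 1500 - 28 - 10;                  // 1462
                const long maxDgrams = (80L*1024*1024) / dgramSize + 1; // 57378
                const long maxDocLen = maxDgrams * dgramSize - 50000;   // ~80MB
                printf("MAX_DGRAMS=%ld MAX_ABSDOCLEN=%ld\n",maxDgrams,maxDocLen);
                return 0;
        }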
17
Unicode.h
@ -66,15 +66,26 @@ static int utf8_sane[] = {

// how many bytes is char pointed to by p?
inline char getUtf8CharSize ( uint8_t *p ) {
return bytes_in_utf8_code[*p];
uint8_t c = *p;
if(c<128)
return 1;
else
return bytes_in_utf8_code[c];
}

inline char getUtf8CharSize ( char *p ) {
return bytes_in_utf8_code[*(uint8_t *)p];
uint8_t c = (uint8_t)*p;
if(c<128)
return 1;
else
return bytes_in_utf8_code[c];
}

inline char getUtf8CharSize ( uint8_t c ) {
return bytes_in_utf8_code[c];
if(c<128)
return 1;
else
return bytes_in_utf8_code[c];
}

inline char getUtf8CharSize2 ( uint8_t *p ) {
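
The rewritten inlines give ASCII a fast path: by the UTF-8 definition every byte below 128 is a complete one-byte character, so only lead bytes of 128 and above need to consult bytes_in_utf8_code[]. A small check of the expected sizes (a sketch; it assumes the bytes_in_utf8_code table from this header):

        #include <cstdio>
        static void testUtf8Sizes ( ) {
                // 'a' is 1 byte; 0xC3, 0xE4 and 0xF0 lead 2-, 3- and
                // 4-byte sequences respectively
                unsigned char samples[] = { 'a' , 0xC3 , 0xE4 , 0xF0 };
                for ( int i = 0 ; i < 4 ; i++ )
                        printf ( "byte 0x%02x -> %d bytes\n" ,
                                 samples[i] ,
                                 (int)getUtf8CharSize ( &samples[i] ) );
        }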
252
Url.cpp
@ -5,6 +5,8 @@
#include "Errno.h"
#include "HashTable.h"
#include "Speller.h"
#include "Punycode.h"
#include "Unicode.h"

static void print_string ( char *s , int32_t len );

@ -137,7 +139,7 @@ void Url::set (Url *baseUrl,char *s,int32_t len,bool addWWW,bool stripSessionId,
// . i know sun.com has urls like "http://sun.com/;$sessionid=123ABC$"
// . url should be ENCODED PROPERLY for this to work properly
void Url::set ( char *t , int32_t tlen , bool addWWW , bool stripSessionId ,
bool stripPound , bool stripCommonFile ,
int32_t titleRecVersion ) {
reset();
// debug
@ -157,11 +159,163 @@ void Url::set ( char *t , int32_t tlen , bool addWWW , bool stripSessionId ,
while ( tlen > 0 && !is_alnum_a(*t) && *t!='-' && *t!='/'){t++;tlen--;}
// . stop t at first space or binary char
// . url should be in encoded form!
int32_t i ;
int32_t i = 0;
int32_t nonAsciiPos = -1;
for ( i = 0 ; i < tlen ; i++ ) {
if ( ! is_ascii(t[i]) ) break; // no non-ascii chars allowed
if ( is_wspace_a(t[i]) ) break; // no spaces allowed

if ( ! is_ascii(t[i]) ) {
// Sometimes the length with the null is passed in,
// so ignore nulls FIXME?
if( t[i] ) nonAsciiPos = i;
break; // no non-ascii chars allowed
}
}

if(nonAsciiPos != -1) {
// Try turning utf8 and latin1 encodings into punycode.
// All labels(between dots) in the domain are encoded
// separately. We don't support encoded tlds, but they are
// not widespread yet.
// If it is a non ascii domain it needs to take the form
// xn--<punycoded label>.xn--<punycoded label>.../
char tmp = t[tlen];
if(t[tlen]) t[tlen] = 0;
log(LOG_DEBUG, "build: attempting to decode unicode url %s pos at %"INT32, t, nonAsciiPos);
if(tmp) t[tlen] = tmp;
char encoded [ MAX_URL_LEN ];
size_t encodedLen = MAX_URL_LEN;
char *encodedDomStart = encoded;
char *p = t;
char *pend = t+tlen;

// Find the start of the domain
if(tlen > 7 && strncmp(p, "http://", 7) == 0) p += 7;
else if(tlen > 8 && strncmp(p, "https://", 8) == 0) p += 8;

gbmemcpy(encodedDomStart, t, p-t);
encodedDomStart += p-t;

while(p < pend && *p != '/') {
char *labelStart = p;
uint32_t tmpBuf[MAX_URL_LEN];
int32_t tmpLen = 0;

while(p < pend && *p != '.' && *p != '/') p++;
int32_t labelLen = p - labelStart;

bool tryLatin1 = false;
// For utf8 urls
p = labelStart;
bool labelIsAscii = true;

// Convert the domain to code points and copy it to
// tmpbuf to be punycoded
for(;p-labelStart<labelLen;
p += utf8Size(tmpBuf[tmpLen]), tmpLen++) {

labelIsAscii &= is_ascii(*p);
tmpBuf[tmpLen] = utf8Decode(p);
if(!tmpBuf[tmpLen]) { // invalid char?
tryLatin1 = true;
break;
}
}
if(labelIsAscii) {
if(labelStart[labelLen] == '.') {
labelLen++;
p++;
}
gbmemcpy(encodedDomStart, labelStart, labelLen);
encodedDomStart += labelLen;
continue;
}

if( tryLatin1 ) {
// For latin1 urls
tmpLen = 0;
for(;tmpLen<labelLen;tmpLen++) {
tmpBuf[tmpLen] = labelStart[tmpLen];
}
}

gbmemcpy(encodedDomStart, "xn--", 4);
encodedDomStart += 4;

punycode_status status ;
status = punycode_encode(tmpLen,
tmpBuf,
NULL,
&encodedLen,
encodedDomStart);
if ( status != 0 ) {
// Give up? try again?
log("build: Bad Engineer, failed to "
"punycode international url %s", t);
return;
}
// We should check if what we encoded were valid url
// characters, no spaces, etc
// FIXME: should we exclude just the bad chars? I've
// seen plenty of urls with
// a newline in the middle. Just discard the whole
// chunk for now
bool badUrlChars = false;
for(uint32_t i=0;i<encodedLen;i++) {
if(is_wspace_a(encodedDomStart[i])){
badUrlChars = true;
break;
}
}

if(encodedLen == 0 || badUrlChars) {
encodedDomStart -= 4; //don't need the xn--
p++;
} else {
encodedDomStart += encodedLen;
*encodedDomStart++ = *p++; // Copy in the . or the /

}
}

// p now points to the end of the domain
// encodedDomStart now points to the first free space in encoded string

// Now copy the rest of the url in. Watch out for non-ascii chars
// truncate the url, and keep it under max url length
uint32_t newUrlLen = encodedDomStart - encoded;

while(p < pend) {
if ( ! *p ) break; // null?
if(!is_ascii(*p)) {
//break;
// url encode utf8 characters now
char cs = getUtf8CharSize(p);
// bad utf8 char?
if ( cs <= 1 ) break;
// too long?
if ( newUrlLen + 12 >= MAX_URL_LEN )
break;
char stored = urlEncode ( &encoded[newUrlLen],
12 ,
p ,
cs );
p += cs;
newUrlLen += stored;
continue;
}
if(is_wspace_a(*p)) break;
if(newUrlLen >= MAX_URL_LEN) break;
encoded[newUrlLen++] = *p++;
}

//gbmemcpy(encodedDomStart, p, restOfUrlLen);
encoded[newUrlLen] = '\0';
return this->set(encoded, newUrlLen, addWWW, stripSessionId,
stripPound, stripCommonFile, titleRecVersion);
}
// truncate length to the first occurrence of an unacceptable char
tlen = i;
// . decode characters that should not have been encoded
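
The block above processes the host label by label: decode a label to code points, then hand the array to punycode_encode() behind an "xn--" prefix. A stripped-down sketch of one label, using the punycode_encode() signature as it appears here (count, code points, case flags, in/out length, output buffer); "bücher" -> "xn--bcher-kva" is the standard IDNA example:

        // encode one label: u"bücher"
        uint32_t cps[] = { 'b' , 0xFC /* ü */ , 'c' , 'h' , 'e' , 'r' };
        char out [ 64 ];
        size_t outLen = sizeof(out);
        punycode_status st = punycode_encode ( 6 , cps , NULL , &outLen , out );
        if ( st == 0 )
                // out now holds "bcher-kva" (not NUL-terminated), so the
                // encoded label is "xn--" plus those outLen bytes
                printf ( "xn--%.*s\n" , (int)outLen , out );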
@ -955,6 +1109,10 @@ char *Url::getPathComponent ( int32_t num , int32_t *clen ) {
// return pc + pclen;
//}

bool Url::isHostWWW ( ) {
if ( m_hlen < 4 ) return false;
if ( m_host[0] != 'w' ) return false;
@ -2380,3 +2538,91 @@ bool Url::hasMediaExtension ( ) {

return false;
}

uint32_t Url::unitTests() {
char* urls[] = {
"http://www.fas.org/blog/ssp/2009/08/securing-venezuela\032s-arsenals.php",
"http://topbeskæring.dk/velkommen",
"www.Alliancefrançaise.nu",
"française.Alliance.nu",
"française.Alliance.nu/asdf",
"http://française.Alliance.nu/asdf",
"http://française.Alliance.nu/",
"幸运.龍.com",
"幸运.龍.com/asdf/运/abc",
"幸运.龍.com/asdf",
"http://幸运.龍.com/asdf",
"http://Беларуская.org/Акадэмічная",
"https://hi.Български.com",
"https://fakedomain.中文.org/asdf",
"https://gigablast.com/abc/文/efg",
"https://gigablast.com/?q=文",
"http://www.example.сайт",
"http://genocidearchiverwanda.org.rw/index.php/Category:Official_Communiqués",
"http://www.example.com/xn--fooled-you-into-trying-to-decode-this",
"http://www.example.сайт/xn--fooled-you-into-trying-to-decode-this",
"http://腕時計通販.jp/",
// Let's check some bad urls too:
"https://pypi.python\n\n\t\t\t\t.org/packages/source/p/pyramid/pyramid-1.5.tar.gz#md5=8747658dcbab709a9c491e43d3b0d58b"
};

StackBuf(sb);
uint32_t len = sizeof(urls) / sizeof(char*);
for(uint32_t i = 0; i < len; i++) {
Url u;
u.set(urls[i], strlen(urls[i]));
log("build:%s normalized to %s, printed to %s ",
urls[i], u.getUrl(), Url::getDisplayUrl(u.getUrl(), &sb));
sb.reset();
}
//FIXME: need to return an error if there is a problem
return 0;
}

char* Url::getDisplayUrl(char* url, SafeBuf* sb) {
char* found;
char* labelCursor = url;
if((found = strstr(labelCursor, "xn--"))) {
sb->safeMemcpy(url, found - url);

char* p = url;
char* pend = url + gbstrlen(url);
if(strncmp(p, "http://", 7) == 0) p += 7;
else if(strncmp(p, "https://", 8) == 0) p += 8;

while(p < pend && *p != '/') p++;
char* domEnd = p;

do {
if(found > domEnd) {
// Don't even look if it is past the domain
break;
}

char* encodedStart = found + 4;
uint32_t decoded [ MAX_URL_LEN];
size_t decodedLen = MAX_URL_LEN - 1 ;
char* labelEnd = encodedStart;
while( labelEnd < domEnd && *labelEnd != '/' && *labelEnd != '.' )
labelEnd++;

punycode_status status = punycode_decode(labelEnd - encodedStart,
encodedStart,
&decodedLen,
decoded, NULL);
if(status != 0) {
log("build: Bad Engineer, failed to depunycode international url %s", url);
sb->safePrintf("%s", url);
return url;
}
sb->utf32Encode(decoded, decodedLen);
//sb->pushChar(*labelEnd);
labelCursor = labelEnd;
} while((found = strstr(labelCursor, "xn--")));
}
// Copy in the rest
sb->safePrintf("%s", labelCursor);
sb->nullTerm();
return sb->getBufStart();
}
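
Going the other way, getDisplayUrl() finds each "xn--" label in the host, punycode-decodes it into the SafeBuf, and copies everything past the domain through untouched, falling back to the raw url if a label fails to decode. A usage sketch built on the same standard example:

        SafeBuf sb;
        char url[] = "http://xn--bcher-kva.example/path";
        char *shown = Url::getDisplayUrl ( url , &sb );
        // shown should now read "http://bücher.example/path"; on a
        // punycode_decode() failure the original url is returned instead
        log ( "display url: %s" , shown );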
4
Url.h
@ -232,6 +232,7 @@ public:
// this is private
bool isSpam ( char *s , int32_t slen ) ;

// . detects crazy repetitive urls like this:
// http://www.pittsburghlive.com:8000/x/tribune-review/opinion/
// steigerwald/letters/send/archive/letters/send/archive/bish/
@ -244,6 +245,9 @@ public:
// is probably more accurate than this function.
bool isLinkLoop();

static uint32_t unitTests();
static char* getDisplayUrl(char* url, SafeBuf* sb);

// private:

char m_url[MAX_URL_LEN]; // the normalized url

986
XmlDoc.cpp
File diff suppressed because it is too large
33
XmlDoc.h
@ -475,7 +475,7 @@ class XmlDoc {
key_t *doledbKey ,
char *coll ,
class SafeBuf *pbuf ,
int32_t niceness ,
char *utf8Content = NULL ,
bool deleteFromIndex = false ,
int32_t forcedIp = 0 ,
@ -483,9 +483,11 @@ class XmlDoc {
uint32_t spideredTime = 0 , // time_t
bool contentHasMime = false ,
// for container docs, what is the separator of subdocs?
char *contentDelim = NULL,
char *metadata = NULL,
uint32_t metadataLen = 0) ;
char *contentDelim = NULL,
char *metadata = NULL,
uint32_t metadataLen = 0,
// for injected docs we have the recv, buffer size don't exceed that
int32_t payloadLen = -1) ;

// we now call this right away rather than at download time!
int32_t getSpideredTime();
@ -513,7 +515,9 @@ class XmlDoc {
bool indexDoc2 ( );
bool isContainerDoc ( );
bool indexContainerDoc ( );
bool indexWarcOrArc ( char ct ) ;

bool readMoreWarc();
bool indexWarcOrArc ( ) ;
key_t *getTitleRecKey() ;
//char *getSkipIndexing ( );
char *prepareToMakeTitleRec ( ) ;
@ -521,6 +525,7 @@ class XmlDoc {
bool setTitleRecBuf ( SafeBuf *buf , int64_t docId, int64_t uh48 );
// sets m_titleRecBuf/m_titleRecBufValid/m_titleRecKey[Valid]
SafeBuf *getTitleRecBuf ( );
bool appendNewMetaInfo ( SafeBuf *metaList , bool forDelete ) ;
SafeBuf *getSpiderStatusDocMetaList ( class SpiderReply *reply ,
bool forDelete ) ;
SafeBuf *getSpiderStatusDocMetaList2 ( class SpiderReply *reply ) ;
@ -705,7 +710,7 @@ class XmlDoc {
char **getExpandedUtf8Content ( ) ;
char **getUtf8Content ( ) ;
// we download large files to a file on disk, like warcs and arcs
BigFile *getUtf8ContentInFile ( int64_t *fileSizeArg );
FILE *getUtf8ContentInFile ( );
int32_t *getContentHash32 ( ) ;
int32_t *getContentHashJson32 ( ) ;
//int32_t *getTagHash32 ( ) ;
@ -768,6 +773,8 @@ class XmlDoc {
uint64_t m_ipStartTime;
uint64_t m_ipEndTime;

bool m_updatedMetaData;

void copyFromOldDoc ( class XmlDoc *od ) ;

class SpiderReply *getFakeSpiderReply ( );
@ -813,6 +820,7 @@ class XmlDoc {
int32_t getBoostFromSiteNumInlinks ( int32_t inlinks ) ;
bool hashSpiderReply (class SpiderReply *reply ,class HashTableX *tt) ;
bool hashMetaTags ( class HashTableX *table ) ;
bool hashMetaData ( class HashTableX *table ) ;
bool hashIsClean ( class HashTableX *table ) ;
bool hashZipCodes ( class HashTableX *table ) ;
bool hashMetaZip ( class HashTableX *table ) ;
@ -1067,6 +1075,7 @@ class XmlDoc {
int32_t m_addedSpiderRequestSize;
int32_t m_addedSpiderReplySize;
int32_t m_addedStatusDocSize;
int64_t m_addedStatusDocId;

SafeBuf m_metaList2;
SafeBuf m_zbuf;
@ -1084,12 +1093,16 @@ class XmlDoc {
int32_t m_warcError ;
int32_t m_arcError ;
bool m_doneInjectingWarc ;
bool m_doneInjectingArc ;
int64_t m_fileOff ;

int64_t m_bytesStreamed;
char *m_fileBuf ;
int32_t m_fileBufAllocSize;
bool m_registeredWgetReadCallback;
char *m_fptr ;
char *m_fptrEnd ;

FILE* m_pipe;

BigFile m_file;
int64_t m_fileSize;
FileState m_fileState;
@ -2401,7 +2414,6 @@ class XmlDoc {
bool m_setFromDocId;
bool m_freeLinkInfo1;
bool m_freeLinkInfo2;

bool m_contentInjected;

bool m_recycleContent;
@ -2470,7 +2482,8 @@ class XmlDoc {
// for container docs consisting of subdocs to inject
char *contentDelim = NULL,
char* metadata = NULL,
uint32_t metadataLen = 0);
uint32_t metadataLen = 0,
int32_t payloadLen = -1);

bool injectLinks ( HashTableX *linkDedupTable ,

@ -2515,7 +2515,7 @@ int32_t deserializeMsg ( int32_t baseSize ,
return baseSize + (p - stringBuf);//getStringBuf());
}

void deserializeMsg2 ( char **firstStrPtr , // ptr_url
bool deserializeMsg2 ( char **firstStrPtr , // ptr_url
int32_t *firstSizeParm ) { // size_url
int nptrs=((char *)firstSizeParm-(char *)firstStrPtr)/sizeof(char *);
// point to our string buffer
@ -2531,7 +2531,7 @@ void deserializeMsg2 ( char **firstStrPtr , // ptr_url
// make it NULL if size is 0 though
if ( *sizePtr == 0 ) *strPtr = NULL;
// sanity check
if ( *sizePtr < 0 ) { char *xx = NULL; *xx =0; }
if ( *sizePtr < 0 ) return false;//{ char *xx = NULL; *xx =0; }
// advance our destination ptr
p += *sizePtr;
// advance both ptrs to next string
@ -2540,6 +2540,7 @@ void deserializeMsg2 ( char **firstStrPtr , // ptr_url
}
// return how many bytes we processed
//return baseSize + (p - stringBuf);//getStringBuf());
return true;
}
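
Since deserializeMsg2() now reports a bad size by returning false instead of writing through a NULL pointer, callers should check the result and fail the message rather than crash. A minimal caller sketch; the request struct and error code are illustrative, not from the diff:

        if ( ! deserializeMsg2 ( &req->ptr_url , &req->size_url ) ) {
                // corrupt or truncated buffer; refuse it instead of coring
                log ( "net: bad serialized msg, dropping" );
                g_errno = EBADREQUEST; // assumed error code, adjust to Errno.h
                return true;
        }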

// print it to stdout for debugging Dates.cpp
@ -2618,4 +2619,3 @@ bool verifyUtf8 ( char *txt ) {
int32_t tlen = gbstrlen(txt);
return verifyUtf8(txt,tlen);
}

@ -237,7 +237,7 @@ bool saveTimeAdjustment ( ) ;
#define is_hspace_a(c) g_map_is_hspace[(unsigned char)c]
#define is_ascii(c) g_map_is_ascii[(unsigned char)c]
#define is_ascii9(c) g_map_is_ascii[(unsigned char)c]
#define is_ascii3(c) g_map_is_ascii3[(unsigned char)c]
#define is_ascii3(c) ((unsigned char)c<128 || g_map_is_ascii3[(unsigned char)c])
#define is_punct_a(c) g_map_is_punct[(unsigned char)c]
#define is_alnum_a(c) g_map_is_alnum[(unsigned char)c]
#define is_alpha_a(c) g_map_is_alpha[(unsigned char)c]
@ -627,6 +627,6 @@ int32_t deserializeMsg ( int32_t baseSize ,
char **firstStrPtr ,
char *stringBuf ) ;

void deserializeMsg2 ( char **firstStrPtr , int32_t *firstSizeParm );
bool deserializeMsg2 ( char **firstStrPtr , int32_t *firstSizeParm );

#endif
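
A note on the reworked is_ascii3 macro: like the neighboring character-class macros it may evaluate its argument twice (the table lookup only runs when the byte is 128 or above), so an argument with side effects, e.g. is_ascii3(*p++), can advance p once or twice depending on the byte. An equivalent single-evaluation form, as a sketch:

        static inline bool is_ascii3_fn ( char ch ) {
                unsigned char c = (unsigned char)ch;
                return c < 128 || g_map_is_ascii3[c];
        }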
@ -236,7 +236,7 @@ int filterContent ( char *buf , int32_t n , int32_t mimeLen , char ctype , int32

//fprintf(stderr,"in=%s\n",in);

int fd = open ( in , O_CREAT | O_RDWR , S_IRWXU );
int fd = open ( in , O_CREAT | O_RDWR , S_IRWXU | S_IRWXG );
if ( fd < 0 ) {
fprintf(stderr,"gbfilter: open: %s\n",strerror(errno));
return -1;

144
main.cpp
@ -289,7 +289,7 @@ bool summaryTest1 ( char *rec, int32_t listSize, char *coll , int64_t docId ,
// time a big write, read and then seeks
bool thrutest ( char *testdir , int64_t fileSize ) ;
void seektest ( char *testdir , int32_t numThreads , int32_t maxReadSize ,
char *filename );
char *filename , bool doSeqWriteThread );

bool pingTest ( int32_t hid , uint16_t clientPort );
bool memTest();
@ -810,17 +810,21 @@ int main2 ( int argc , char *argv[] ) {
"parser speed tests\n\n"
*/

/*
"thrutest [dir] [fileSize]\n\tdisk write/read speed "
"test\n\n"
"thrutest [dir] [fileSize]\n\tdisk sequential "
"write then read speed tests.\n\n"

"seektest [dir] [numThreads] [maxReadSize] "
"[filename]\n"
"\tdisk seek speed test\n\n"
"\tdisk access speed test. (IOps)\n\n"

"rwtest [dir] [numThreads] [maxReadSize] "
"[filename]\n"
"\tdisk read access speed test while sequentially "
"writing. Simulates Gigablast while spidering and "
"querying nicely.\n\n"

"memtest\n"
"\t Test how much memory we can use\n\n"
*/

/*
// Quality Tests
@ -1390,7 +1394,20 @@ int main2 ( int argc , char *argv[] ) {
if ( cmdarg+2 < argc ) numThreads = atol(argv[cmdarg+2]);
if ( cmdarg+3 < argc ) maxReadSize = atoll1(argv[cmdarg+3]);
if ( cmdarg+4 < argc ) filename = argv[cmdarg+4];
seektest ( testdir , numThreads , maxReadSize , filename );
seektest ( testdir , numThreads , maxReadSize ,filename,false);
return 0;
}
// gb rwtest <testdir> <numThreads> <maxReadSize>
if ( strcmp ( cmd , "rwtest" ) == 0 ) {
char *testdir = "/tmp/";
int32_t numThreads = 20; //30;
int64_t maxReadSize = 20000;
char *filename = NULL;
if ( cmdarg+1 < argc ) testdir = argv[cmdarg+1];
if ( cmdarg+2 < argc ) numThreads = atol(argv[cmdarg+2]);
if ( cmdarg+3 < argc ) maxReadSize = atoll1(argv[cmdarg+3]);
if ( cmdarg+4 < argc ) filename = argv[cmdarg+4];
seektest ( testdir , numThreads , maxReadSize,filename,true);
return 0;
}

@ -2572,6 +2589,13 @@ int main2 ( int argc , char *argv[] ) {
false );// sendtoproxies
}

if ( strcmp ( cmd , "unittest" ) == 0 ) {
if ( cmdarg + 1 >= argc ) exit(1);
if(strcmp("url", argv[cmdarg+1]) == 0) {
exit(Url::unitTests());
}
}

// gb startclassifier coll ruleset [hostId]
/*
if ( strcmp ( cmd , "startclassifier" ) == 0 ) {
@ -4936,7 +4960,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
// ensure directory is there, if
// not then make it
"ssh %s 'mkdir %s' ; "
"scp -r %s %s:%s"
"scp -p -r %s %s:%s"
, ipStr
, h2->m_dir

@ -5022,7 +5046,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
if ( ! f.doesExist() ) target = "gb";

sprintf(tmp,
"scp -c arcfour " // blowfish is faster
"scp -p " // blowfish is faster
"%s%s "
"%s:%s/gb.installed%s",
dir,
@ -5058,7 +5082,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
// don't copy to ourselves
//if ( h2->m_hostId == h->m_hostId ) continue;
sprintf(tmp,
"scp "
"scp -p "
"%sgb.new "
"%s:%s/tmpgb.installed &",
dir,
@ -5071,7 +5095,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
// don't copy to ourselves
//if ( h2->m_hostId == h->m_hostId ) continue;
sprintf(tmp,
"scp %sgb.conf %shosts.conf %s:%s %s",
"scp -p %sgb.conf %shosts.conf %s:%s %s",
dir ,
dir ,
//h->m_hostId ,
@ -5453,7 +5477,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
}
*/
sprintf(tmp,
"scp "
"scp -p "
"%scatdb/content.rdf.u8 "
"%s:%scatdb/content.rdf.u8",
dir,
@ -5462,7 +5486,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"scp "
"scp -p "
"%scatdb/structure.rdf.u8 "
"%s:%scatdb/structure.rdf.u8",
dir,
@ -5471,7 +5495,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"scp "
"scp -p "
"%scatdb/gbdmoz.structure.dat "
"%s:%scatdb/gbdmoz.structure.dat",
dir,
@ -5480,7 +5504,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"scp "
"scp -p "
"%scatdb/gbdmoz.content.dat "
"%s:%scatdb/gbdmoz.content.dat",
dir,
@ -5503,7 +5527,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
// don't copy to ourselves
if ( h2->m_hostId == 0 ) continue;
sprintf(tmp,
"scp "
"scp -p "
"%scatdb/content.rdf.u8.new "
"%s:%scatdb/content.rdf.u8.new",
dir,
@ -5512,7 +5536,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"scp "
"scp -p "
"%scatdb/structure.rdf.u8.new "
"%s:%scatdb/structure.rdf.u8.new",
dir,
@ -5521,7 +5545,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"scp "
"scp -p "
"%scatdb/gbdmoz.structure.dat.new "
"%s:%scatdb/gbdmoz.structure.dat.new",
dir,
@ -5530,7 +5554,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"scp "
"scp -p "
"%scatdb/gbdmoz.content.dat.new "
"%s:%scatdb/gbdmoz.content.dat.new",
dir,
@ -5539,7 +5563,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"scp "
"scp -p "
"%scatdb/gbdmoz.content.dat.new.diff "
"%s:%scatdb/gbdmoz.content.dat.new.diff",
dir,
@ -6384,6 +6408,7 @@ void dumpTitledb (char *coll,int32_t startFileNum,int32_t numFiles,bool includeT
bool justPrintSentences,
bool justPrintWords ) {

g_isDumpingRdbFromMain = 1;
if (!ucInit(g_hostdb.m_dir, true)) {
log("Unicode initialization failed!");
return;
@ -6903,6 +6928,8 @@ void dumpDoledb (char *coll,int32_t startFileNum,int32_t numFiles,bool includeTr
printf("\n");
// must be a request -- for now, for stats
if ( ! g_spiderdb.isSpiderRequest((key128_t *)srec) ) {
// error!
continue;
char *xx=NULL;*xx=0; }
// cast it
SpiderRequest *sreq = (SpiderRequest *)srec;
@ -11642,17 +11669,19 @@ static BigFile s_f;
static int32_t s_numThreads = 0;
static int64_t s_maxReadSize = 1;
static int64_t s_startTime = 0;
static bool s_doSeqWriteThread;
//#define MAX_READ_SIZE (2000000)
#include <sys/types.h>
#include <sys/wait.h>

void seektest ( char *testdir, int32_t numThreads, int32_t maxReadSize ,
char *filename ) {
char *filename , bool doSeqWriteThread ) {

g_loop.init();
g_threads.init();
s_numThreads = numThreads;
s_maxReadSize = maxReadSize;
s_doSeqWriteThread = doSeqWriteThread;
if ( s_maxReadSize <= 0 ) s_maxReadSize = 1;
//if ( s_maxReadSize > MAX_READ_SIZE ) s_maxReadSize = MAX_READ_SIZE;

@ -11689,7 +11718,7 @@ void seektest ( char *testdir, int32_t numThreads, int32_t maxReadSize ,
"exist. Use ./gb thrutest ... to create speedtest* files.");
return;
skip:
s_f.open ( O_RDONLY );
s_f.open ( O_RDWR );
s_filesize = s_f.getFileSize();
log ( LOG_INIT, "admin: file size = %"INT64".",s_filesize);
// always block
@ -11719,6 +11748,30 @@ skip:
//s_lock = 1;
//pthread_t tid1 ; //, tid2;

//g_conf.m_logDebugThread = 1;

// garbage collection on ssds seems to be triggered by writes so
// that they do not hurt read times, do this:
g_conf.m_flushWrites = 1;

// disable linux file cache
// system("echo 1 > /proc/sys/vm/drop_caches");

// -o sync TOTAL WORKS!!!!!!!
// mount with -o sync to disable write page caching on linux

// disable on-disk write cache
// system("sudo hdparm -W 0 /dev/sda2");
// system("sudo hdparm -W 0 /dev/sdb1");
// system("sudo hdparm -W 0 /dev/sdc1");
// system("sudo hdparm -W 0 /dev/sdd1");

// disable read-ahead
// system("sudo hdparm -A 0 /dev/sda2");
// system("sudo hdparm -A 0 /dev/sdb1");
// system("sudo hdparm -A 0 /dev/sdc1");
// system("sudo hdparm -A 0 /dev/sdd1");

// set time
s_startTime = gettimeofdayInMilliseconds_force();

@ -11771,6 +11824,7 @@ void *startUp ( void *state , ThreadEntry *t ) {
// fprintf(stderr,"Threads::startUp: setpriority: failed\n");
// exit(-1);
//}

// read buf
//char buf [ MAX_READ_SIZE ];
#undef malloc
@ -11782,13 +11836,25 @@ void *startUp ( void *state , ThreadEntry *t ) {
}
// we got ourselves
s_launched++;

char *s = "reads";
if ( id == 0 && s_doSeqWriteThread )
s = "writes";
// msg
fprintf(stderr,"id=%"INT32" launched. Performing 100000 reads.\n",id);
fprintf(stderr,"threadid=%"INT32" launched. "
"Performing 100000 %s.\n",id,s);

// #undef sleep
// if ( id == 0 ) sleep(1000);
// #define sleep(a) { char *xx=NULL;*xx=0; }

// wait for lock to be unleashed
//while ( s_launched != s_numThreads ) usleep(10);
// now do a stupid loop
//int32_t j, off , size;
int64_t off , size;
int64_t seqOff = 0;
for ( int32_t i = 0 ; i < 100000 ; i++ ) {
uint64_t r = rand();
r <<= 32 ;
@ -11802,7 +11868,13 @@ void *startUp ( void *state , ThreadEntry *t ) {
int64_t start = gettimeofdayInMilliseconds_force();
//fprintf(stderr,"%"INT32") i=%"INT32" start\n",id,i );
//pread ( s_fd1 , buf , size , off );
s_f.read ( buf , size , off );
if ( id == 0 && s_doSeqWriteThread )
s_f.write ( buf , size , seqOff );
else
s_f.read ( buf , size , off );
seqOff += size;
if ( seqOff + size > s_filesize )
seqOff = 0;
//fprintf(stderr,"%"INT32") i=%"INT32" done\n",id,i );
int64_t now = gettimeofdayInMilliseconds_force();
#undef usleep
@ -11811,13 +11883,25 @@ void *startUp ( void *state , ThreadEntry *t ) {
s_count++;
float sps = (float)((float)s_count * 1000.0) /
(float)(now - s_startTime);
fprintf(stderr,"count=%"INT32" off=%012"INT64" size=%"INT32" time=%"INT32"ms "
"(%.2f seeks/sec)\n",
int64_t poff = off;
char *str = "seeks";
if ( id == 0 && s_doSeqWriteThread ) {
poff = seqOff;
str = "writes";
}
fprintf(stderr,"threadid=%i "
"count=%"INT32" "
"off=%012"INT64" "
"size=%"INT32" "
"time=%"INT32"ms "
"(%.2f %s/sec)\n",
(int)id,
(int32_t)s_count,
(int64_t)off,
(int64_t)poff,
(int32_t)size,
(int32_t)(now - start) ,
sps );
sps ,
str );
}

@ -16849,7 +16933,7 @@ void dumpCachedRecs (char *coll,int32_t startFileNum,int32_t numFiles,bool inclu
int32_t filenum = 0;
char filename[64];
sprintf(filename, "%s-%"INT32".ddmp", coll, filenum);
int FD = open(filename, O_CREAT|O_WRONLY, S_IROTH);
//int FD = open(filename, O_CREAT|O_WRONLY, S_IROTH);
int32_t numDumped = 0;
uint32_t bytesDumped = 0;
loop:
@ -17016,7 +17100,7 @@ void dumpCachedRecs (char *coll,int32_t startFileNum,int32_t numFiles,bool inclu
filenum++;
sprintf(filename, "%s-%"INT32".ddmp", coll, filenum);
close(FD);
FD = open(filename, O_CREAT|O_WRONLY, S_IROTH);
//FD = open(filename, O_CREAT|O_WRONLY, S_IROTH);
bytesDumped = 0;
fprintf(stderr, "Started new file: %s. starts at docId: %"INT64".\n",filename, lastDocId);
}

43
qa.cpp
@ -248,10 +248,10 @@ void makeQADir ( ) {
char dir[1024];
snprintf(dir,1000,"%sqa",g_hostdb.m_dir);
log("mkdir mkdir %s",dir);
int32_t status = ::mkdir ( dir ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH );
int32_t status = ::mkdir ( dir ,getDirCreationFlags() );
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH );
if ( status == -1 && errno != EEXIST && errno )
log("qa: Failed to make directory %s: %s.",
dir,mstrerror(errno));
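
makeQADir() now defers to getDirCreationFlags() rather than hard-coding the mode bits, matching the getFileCreationFlags() switch made for file opens elsewhere in this merge. The helper's real definition lives in another part of the merge; a sketch of what it plausibly returns, based on the bits that were commented out above:

        #include <sys/stat.h>
        // illustrative: 0775-style directories, further trimmed by the umask
        static int getDirCreationFlags ( ) {
                return S_IRUSR | S_IWUSR | S_IXUSR |
                       S_IRGRP | S_IWGRP | S_IXGRP |
                       S_IROTH | S_IXOTH ;
        }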
@ -1459,6 +1459,13 @@ bool qaTimeAxis ( ) {
|
||||
"format=xml&u=");
|
||||
sb.urlEncode ( s_urlPtrs[s_flags[URL_COUNTER]]);
|
||||
sb.safePrintf("&hasmime=1");
|
||||
// add some meta data now, the current time stamp so we can
|
||||
// make sure the meta data is updated even if its EDOCUNCHANGED
|
||||
sb.safePrintf("&metadata=");
|
||||
static int32_t s_count9 = 0;
|
||||
SafeBuf tmp;
|
||||
tmp.safePrintf("{\"qatesttime\":%"INT32"}\n",s_count9++);
|
||||
sb.urlEncode ( tmp.getBufStart(), tmp.getLength() );
|
||||
sb.safePrintf("&content=");
|
||||
sb.urlEncode(s_contentPtrs[contentIndex]);
|
||||
|
||||
@ -1494,13 +1501,17 @@ bool qaTimeAxis ( ) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// if ( ! s_flags[EXAMINE_RESULTS] ) {
|
||||
// s_flags[16] = true;
|
||||
// if ( ! getUrl ( "/search?c=qatest123&qa=1&q=%2Bthe"
|
||||
// "&dsrt=500",
|
||||
// 702467314 ) )
|
||||
// return false;
|
||||
// }
|
||||
// this doc should have qatesttime:197 and qatesttime:198
|
||||
// since it had a EDOCUNCHANGED error the 2nd time around but
|
||||
// different metadata.
|
||||
if ( ! s_flags[EXAMINE_RESULTS1] ) {
|
||||
s_flags[EXAMINE_RESULTS1] = true;
|
||||
if ( ! getUrl ( "/search?c=qatest123&qa=1&"
|
||||
"format=json&"
|
||||
"q=qatesttime:197",
|
||||
702467314 ) )
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
@ -1534,6 +1545,8 @@ bool qaWarcFiles ( ) {
|
||||
"&obeyRobots=0"
|
||||
// This is what we are testing
|
||||
"&usetimeaxis=1"
|
||||
// we are indexing warc files
|
||||
"&indexwarcs=1"
|
||||
,
|
||||
// checksum of reply expected
|
||||
0 ) )
|
||||
@ -1638,7 +1651,7 @@ bool qaInjectMetadata ( ) {
|
||||
|
||||
char* metadata = "{\"testtest\":42,\"a-hyphenated-name\":5, "
|
||||
"\"a-string-value\":\"can we search for this\", "
|
||||
"an array:['a','b', 'c', 1,2,3], "
|
||||
"\"an array\":[\"a\",\"b\", \"c\", 1,2,3], "
|
||||
"\"a field with spaces\":6, \"compound\":{\"field\":7}}";
|
||||
|
||||
s_flags[ADD_INITIAL_URLS]++;
|
||||
@ -3401,9 +3414,9 @@ static QATest s_qatests[] = {
|
||||
"when content has changed, even if the url is the same. "},
|
||||
|
||||
|
||||
{qaWarcFiles,
|
||||
"indexWarcFiles",
|
||||
"Ensure the spider handles arc.gz and warc.gz file formats."},
|
||||
// {qaWarcFiles,
|
||||
// "indexWarcFiles",
|
||||
// "Ensure the spider handles arc.gz and warc.gz file formats."},
|
||||
|
||||
{qaInjectMetadata,
|
||||
"injectMetadata",
|
||||
|
@ -11,13 +11,14 @@ import sqlite3
|
||||
import datetime
|
||||
import sys
|
||||
import time
|
||||
import flask
|
||||
# import flask
|
||||
import signal, os
|
||||
import random
|
||||
from itertools import repeat
|
||||
staleTime = datetime.timedelta(90,0,0) # three month for now
|
||||
|
||||
app = flask.Flask(__name__)
|
||||
app.secret_key = 'oaisj84alwsdkjhf9238u'
|
||||
staleTime = datetime.timedelta(7,0,0) # one week for now
|
||||
# app = flask.Flask(__name__)
|
||||
# app.secret_key = 'oaisj84alwsdkjhf9238u'
|
||||
|
||||
def getDb(makeDates=True):
|
||||
if makeDates:
|
||||
@ -33,6 +34,9 @@ def handler(signum, frame):
|
||||
#Generate environment with:
|
||||
#pex -r requests -r multiprocessing -e inject:main -o warc-inject -s '.' --no-wheel
|
||||
#pex -r requests -r multiprocessing -o warc-inject
|
||||
# see the Makefile
|
||||
|
||||
# TODO: add argument parser
|
||||
# import argparse
|
||||
# parser = argparse.ArgumentParser()
|
||||
# parser.add_argument('--foo', help='foo help')
|
||||
@ -63,13 +67,16 @@ def reallyExecuteMany(c, query, qargs):
|
||||
|
||||
|
||||
def injectItem(item, db, mode):
|
||||
itemStart = time.time()
|
||||
|
||||
c = db.cursor()
|
||||
res = reallyExecute(c, 'select * from items where item = ?', (item,)).fetchone()
|
||||
db.commit()
|
||||
itemId = None
|
||||
if res:
|
||||
if res[1] > (datetime.datetime.now() - staleTime):
|
||||
print 'skipping %s because we checked recently' % item
|
||||
return 0 # We checked recently
|
||||
return time.time() - itemStart # We checked recently
|
||||
itemId = res[0]
|
||||
|
||||
|
||||
@ -83,7 +90,7 @@ def injectItem(item, db, mode):
|
||||
except Exception, e:
|
||||
print 'error: metadata feed went down (%s) for: %s' % (e, item)
|
||||
time.sleep(10)
|
||||
|
||||
|
||||
|
||||
if itemId is None:
|
||||
reallyExecute(c, "insert INTO items VALUES (?,?)", (item, datetime.datetime.now()))
|
||||
@ -91,11 +98,12 @@ def injectItem(item, db, mode):
|
||||
db.commit()
|
||||
|
||||
if 'files' not in md:
|
||||
return
|
||||
time.time() - itemStart
|
||||
|
||||
res = None
|
||||
res = reallyExecute(c, "select fileName, updated, status, took from files where itemId = ?",
|
||||
(itemId,)).fetchall()
|
||||
db.commit()
|
||||
|
||||
lastUpdate = {}
|
||||
for fileName, updated, status, took in res:
|
||||
@ -105,22 +113,31 @@ def injectItem(item, db, mode):
|
||||
|
||||
dbUpdates = []
|
||||
skipped = 0
|
||||
for ff in md['files']:
|
||||
if not ff['name'].endswith('arc.gz'): continue
|
||||
warcs = filter(lambda x: 'name' in x and x['name'].endswith and x['name'].endswith('arc.gz'), md['files'])
|
||||
collectionName = md['metadata'].get('archiveit-collection-name', '')
|
||||
for ii, ff in enumerate(warcs):
|
||||
#if not ff['name'].endswith('arc.gz'): continue
|
||||
itemMetadata = {'mtime':ff['mtime']}
|
||||
updateTime = datetime.datetime.fromtimestamp(float(ff['mtime']))
|
||||
if ff['name'] in lastUpdate and updateTime <= lastUpdate[ff['name']]:
|
||||
if mode != 'force' and ff['name'] in lastUpdate and updateTime <= lastUpdate[ff['name']]:
|
||||
print "skip {0} because it is up to date".format(ff['name'])
|
||||
skipped += 1
|
||||
requests.post('http://localhost:10008/progress',
|
||||
json={'item':item, 'total':len(warcs), 'done':ii+1,
|
||||
'collection-name':collectionName})
|
||||
continue
|
||||
|
||||
itemMetadata.update(md['metadata'])
|
||||
postVars = {'url':'http://archive.org/download/%s/%s' %
|
||||
(item,ff['name']),
|
||||
'metadata':json.dumps(itemMetadata),
|
||||
'c':'ait'}
|
||||
'c':'ait',
|
||||
'spiderlinks':0}
|
||||
start = time.time()
|
||||
if mode == 'production':
|
||||
if mode == 'testing':
|
||||
time.sleep(random.randint(1,4))
|
||||
statusCode = 999
|
||||
else:
|
||||
try:
|
||||
rp = requests.post("http://localhost:8000/admin/inject", postVars)
|
||||
statusCode = rp.status_code
|
||||
@ -129,49 +146,60 @@ def injectItem(item, db, mode):
|
||||
print 'error: gb inject', postVars['url'], e
|
||||
statusCode = -1
|
||||
#print postVars['url'], rp.status_code
|
||||
else:
|
||||
time.sleep(random.randint(1,4))
|
||||
statusCode = 999
|
||||
took = time.time() - start
|
||||
|
||||
print "sent", ff['name'],'to gb, took', took
|
||||
sys.stdout.flush()
|
||||
|
||||
dbUpdates.append((itemId, ff['name'], updateTime, statusCode, took))
|
||||
requests.post('http://localhost:10008/progress',
|
||||
json={'item':item, 'total':len(warcs), 'done':ii+1,
|
||||
'collection-name':collectionName})
|
||||
|
||||
reallyExecuteMany(c, "DELETE FROM files where fileName = ? ", zip(lastUpdate.iterkeys()))
|
||||
reallyExecuteMany(c, "INSERT INTO files VALUES (?,?,?,?,?)",
|
||||
dbUpdates)
|
||||
db.commit()
|
||||
|
||||
if len(dbUpdates):
|
||||
reallyExecuteMany(c, "DELETE FROM files where fileName = ? ", zip(lastUpdate.iterkeys()))
|
||||
reallyExecuteMany(c, "INSERT INTO files VALUES (?,?,?,?,?)",
|
||||
dbUpdates)
|
||||
db.commit()
|
||||
print 'completed %s with %s items injected and %s skipped' % (item, len(dbUpdates), skipped)
|
||||
return time.time() - itemStart
|
||||
|
||||
|
||||
def getPage(zippedArgs):
|
||||
page, mode = zippedArgs
|
||||
page, mode, resultsPerPage, extraQuery = zippedArgs
|
||||
query = 'collection%3Aarchiveitdigitalcollection+' + extraQuery
|
||||
#r = requests.get('https://archive.org/advancedsearch.php?q=collection%3Aarchiveitdigitalcollection&fl%5B%5D=identifier&rows=1&page={0}&output=json&save=yes'.format(page))
|
||||
r = requests.get('https://archive.org/advancedsearch.php?q=collection%3Aarchiveitdigitalcollection&fl%5B%5D=identifier&sort[]=date+desc&rows=100&page={0}&output=json&save=yes'.format(page))
|
||||
if r.status_code != 200:
|
||||
return 0
|
||||
url = 'https://archive.org/advancedsearch.php?q={1}&fl%5B%5D=identifier&sort[]=date+asc&rows={2}&page={0}&output=json'.format(page, query, resultsPerPage)
|
||||
try:
|
||||
r = requests.get(url)
|
||||
if r.status_code != 200:
|
||||
return 0
|
||||
|
||||
contents = r.content
|
||||
jsonContents = json.loads(contents)
|
||||
items = [x['identifier'] for x in jsonContents['response']['docs']]
|
||||
numFound = jsonContents['response']['numFound']
|
||||
|
||||
if len(items) == 0:
|
||||
print 'got 0 items for search page', page
|
||||
return 0
|
||||
print 'loading %s items, %s - %s of %s' % (len(items), items[0], items[-1], numFound)
|
||||
|
||||
db = getDb()
|
||||
|
||||
for item in items:
|
||||
injectItem(item, db, mode)
|
||||
|
||||
db.close()
|
||||
return len(items)
|
||||
contents = r.content
|
||||
jsonContents = json.loads(contents)
|
||||
items = [x['identifier'] for x in jsonContents['response']['docs']]
|
||||
numFound = jsonContents['response']['numFound']
|
||||
|
||||
if len(items) == 0:
|
||||
requests.post('http://localhost:10008/progress', json={'total':numFound, 'completed':'', 'query':extraQuery})
|
||||
print 'got 0 items for search page', page
|
||||
return 0
|
||||
print 'loading %s items, %s - %s of %s' % (len(items), items[0], items[-1], numFound)
|
||||
|
||||
for item in items:
|
||||
db = getDb()
|
||||
took = injectItem(item, db, mode)
|
||||
db.close()
|
||||
requests.post('http://localhost:10008/progress', json={'total':numFound,
|
||||
'completed':item,
|
||||
'query':extraQuery,
|
||||
'took':took})
|
||||
return len(items)
|
||||
except Exception, e:
|
||||
print 'Caught', e, 'sleep and retry', url
|
||||
time.sleep(60)
|
||||
return getPage(zippedArgs)
|
||||
|
||||
|
||||
def dumpDb():
|
||||
@ -197,6 +225,10 @@ def showItems():
|
||||
|
||||
|
||||
def nuke(lastPid, fromOrbit=False):
|
||||
try:
|
||||
requests.post('http://localhost:10008/shutdown', {})
|
||||
except:
|
||||
pass
|
||||
sig = signal.SIGTERM
|
||||
if fromOrbit:
|
||||
sig = signal.SIGKILL
|
||||
@ -209,7 +241,7 @@ def nuke(lastPid, fromOrbit=False):
|
||||
except:
|
||||
pass
|
||||
|
||||
killed = subprocess.Popen("""kill `ps auxx |grep warc-inject|awk -e '{print $2}'`""" % sys.argv[0],
|
||||
killed = subprocess.Popen("""kill `ps auxx |grep warc-inject|grep -v grep|awk -e '{print $2}'`""",
|
||||
shell=True,stdout=subprocess.PIPE).communicate()[0]
|
||||
|
||||
if killed == 'Terminated':
|
||||
@ -219,13 +251,47 @@ def nuke(lastPid, fromOrbit=False):
|
||||
|
||||
|
||||
def main():
|
||||
try:
|
||||
lastPid = open('running.pid', 'r').read()
|
||||
except:
|
||||
lastPid = None
|
||||
global staleTime
|
||||
print 'arguments were', sys.argv, 'pid is', os.getpid()
|
||||
open('running.pid', 'w').write(str(os.getpid()))
|
||||
|
||||
if sys.argv[1] != 'monitor':
|
||||
try:
|
||||
lastPid = open('running.pid', 'r').read()
|
||||
except:
|
||||
lastPid = None
|
||||
open('running.pid', 'w').write(str(os.getpid()))
|
||||
|
||||
# p = multiprocessing.Process(target=serveForever)
|
||||
# p.start()
|
||||
|
||||
if sys.argv[1] == 'test':
|
||||
query = ''
|
||||
if len(sys.argv) == 3:
|
||||
query = sys.argv[2]
|
||||
|
||||
#subprocess.Popen(['python','inject', 'monitor'])
|
||||
|
||||
mode = 'testing'
|
||||
runInjects(10, 'testing', query)
|
||||
|
||||
if sys.argv[1] == 'run':
|
||||
query = ''
|
||||
if len(sys.argv) == 4:
|
||||
query = sys.argv[3]
|
||||
|
||||
#subprocess.Popen(['./warc-inject','monitor'])
|
||||
threads = int(sys.argv[2])
|
||||
runInjects(threads, 'production', query)
|
||||
print "done running"
|
||||
|
||||
|
||||
|
||||
|
||||
if len(sys.argv) == 2:
|
||||
if sys.argv[1] == 'monitor':
|
||||
import monitor
|
||||
monitor.main()
|
||||
|
||||
if sys.argv[1] == 'init':
|
||||
init()
|
||||
print 'initialized'
|
||||
@ -247,6 +313,8 @@ def main():
|
||||
nuke(lastPid, fromOrbit=True)
|
||||
|
||||
if sys.argv[1] == 'test':
|
||||
subprocess.Popen(['./warc-inject','monitor'])
|
||||
|
||||
mode = 'testing'
|
||||
runInjects(10, 'testing')
|
||||
|
||||
@ -308,33 +376,106 @@ def main():
|
||||
|
||||
signal.alarm(0) # Disable the alarm
|
||||
|
||||
|
||||
|
||||
if sys.argv[1] == 'serve':
|
||||
serveForever()
|
||||
# if sys.argv[1] == 'serve':
|
||||
# serveForever()
|
||||
|
||||
if len(sys.argv) == 3:
|
||||
if sys.argv[1] == 'force':
|
||||
itemName = sys.argv[2]
|
||||
db = getDb()
|
||||
injectItem(itemName, db, 'production')
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
|
||||
if len(sys.argv) == 4:
|
||||
if sys.argv[1] == 'injectfile':
|
||||
staleTime = datetime.timedelta(0,0,0)
|
||||
from multiprocessing.pool import ThreadPool
|
||||
fileName = sys.argv[2]
|
||||
items = filter(lambda x: x, open(fileName, 'r').read().split('\n'))
|
||||
threads = int(sys.argv[3])
|
||||
pool = ThreadPool(processes=threads)
|
||||
#print zip(files, repeat(getDb(), len(files)), repeat('production', len(files)))
|
||||
def injectItemTupleWrapper(itemName):
|
||||
db = getDb()
|
||||
ret = injectItem(itemName, db, 'production')
|
||||
db.close()
|
||||
return ret
|
||||
|
||||
answer = pool.map(injectItemTupleWrapper, items)
|
||||
print 'finished: ', answer
|
||||
sys.exit(0)
|
||||
if sys.argv[1] == 'forcefile':
|
||||
staleTime = datetime.timedelta(0,0,0)
|
||||
from multiprocessing.pool import ThreadPool
|
||||
fileName = sys.argv[2]
|
||||
items = filter(lambda x: x, open(fileName, 'r').read().split('\n'))
|
||||
threads = int(sys.argv[3])
|
||||
pool = ThreadPool(processes=threads)
|
||||
#print zip(files, repeat(getDb(), len(files)), repeat('production', len(files)))
|
||||
def injectItemTupleWrapper(itemName):
|
||||
db = getDb()
|
||||
ret = injectItem(itemName, db, 'force')
|
||||
db.close()
|
||||
return ret
|
||||
|
||||
answer = pool.map(injectItemTupleWrapper, items)
|
||||
print 'finished: ', answer
|
||||
sys.exit(0)
|
||||
|
||||
if sys.argv[1] == 'injectitems':
|
||||
from multiprocessing.pool import ThreadPool
|
||||
fileName = sys.argv[2]
|
||||
items = filter(lambda x: x, open(fileName, 'r').read().split('\n'))
|
||||
threads = int(sys.argv[3])
|
||||
pool = ThreadPool(processes=threads)
|
||||
#print zip(files, repeat(getDb(), len(files)), repeat('production', len(files)))
|
||||
def injectItemTupleWrapper(itemName):
|
||||
db = getDb()
|
||||
ret = injectItem(itemName, db, 'production')
|
||||
db.close()
|
||||
return ret
|
||||
|
||||
answer = pool.map(injectItemTupleWrapper, items)
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
def getNumResults(query):
|
||||
query = 'collection%3Aarchiveitdigitalcollection+' + query
|
||||
r = requests.get('https://archive.org/advancedsearch.php?q={0}&fl%5B%5D=identifier&sort[]=date+asc&rows=1&page=0&output=json'.format(query))
|
||||
if r.status_code != 200:
|
||||
return 0
|
||||
contents = r.content
|
||||
jsonContents = json.loads(contents)
|
||||
numFound = jsonContents['response']['numFound']
|
||||
return numFound
|
||||
|
||||
|
||||
if sys.argv[1] == 'run':
|
||||
threads = int(sys.argv[2])
|
||||
runInjects(threads)
|
||||
|
||||
# else:
|
||||
# #getPage(3)
|
||||
# from multiprocessing.pool import ThreadPool
|
||||
# pool = ThreadPool(processes=150)
|
||||
# pool.map(getPage, xrange(1,1300))
|
||||
|
||||
def runInjects(threads, mode='production'):
|
||||
def runInjects(threads, mode='production', query=''):
|
||||
from multiprocessing.pool import ThreadPool
|
||||
import math
|
||||
pool = ThreadPool(processes=threads)
|
||||
try:
|
||||
from itertools import repeat
|
||||
maxPages = 1300
|
||||
answer = pool.map(getPage, zip(xrange(1,maxPages), repeat(mode, maxPages)))
|
||||
totalResults = getNumResults(query)
|
||||
resultsPerPage = 100
|
||||
maxPages = int(math.ceil(totalResults / float(resultsPerPage)))
|
||||
if maxPages < threads:
|
||||
maxPages = threads
|
||||
resultsPerPage = int(math.ceil(totalResults / float(maxPages)))
|
||||
print threads, ' threads,', totalResults, 'total,', maxPages, 'pages', resultsPerPage, 'results per page'
|
||||
answer = pool.map(getPage, zip(xrange(1,maxPages),
|
||||
repeat(mode, maxPages),
|
||||
repeat(resultsPerPage, maxPages),
|
||||
repeat(query, maxPages)))
|
||||
print "finished item pass", answer
|
||||
except (KeyboardInterrupt, SystemExit):
|
||||
print 'ok, caught'
|
||||
raise
|
||||
requests.post('http://localhost:10008/shutdown', {})
|
||||
sys.exit(0)
|
||||
#raise
|
||||
|
||||

def init():
@ -351,73 +492,67 @@ def init():
    db.close()


def serveForever():
    @app.route('/',
               methods=['GET', 'POST'], endpoint='home')
    def home():
        db = getDb(makeDates=False)
        res = db.execute('select * from items limit 10')
        for item, checked in res.fetchall():
            print item
            try:
                metadata = subprocess.Popen(['./ia','metadata', item],
                                            stdout=subprocess.PIPE).communicate()[0]
# def serveForever():
#     @app.route('/',
#                methods=['GET', 'POST'], endpoint='home')
#     def home():
#         db = getDb(makeDates=False)
#         res = db.execute('select * from items limit 10')
#         for item, checked in res.fetchall():
#             print item
#             try:
#                 metadata = subprocess.Popen(['./ia','metadata', item],
#                                             stdout=subprocess.PIPE).communicate()[0]

                break
            except Exception, e:
                pass
        db.close()
#                 break
#             except Exception, e:
#                 pass
#         db.close()

#         return flask.make_response(metadata)
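The home() view above shells out to the ia command-line tool for item metadata. The same lookup can be done over HTTP against archive.org's public metadata endpoint, avoiding the subprocess round-trip; a sketch, assuming the /metadata/<identifier> API and an illustrative function name:

# Sketch: fetch item metadata over HTTP instead of shelling out to ./ia.
import requests

def getItemMetadata(item):
    r = requests.get('https://archive.org/metadata/%s' % item)
    if r.status_code != 200:
        return None          # caller decides how to handle a feed outage
    return r.json()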

# @app.route('/progress',
#            methods=['GET', 'POST'], endpoint='progress')
# def progress():
#     r = requests.get('https://archive.org/advancedsearch.php?q=collection%3Aarchiveitdigitalcollection&fl%5B%5D=identifier&sort[]=date+desc&rows=1&page=1&output=json')
#     if r.status_code != 200:
#         return flask.make_response(json.dumps({error:'ia search feed is down'}),
#                                    'application/json')

#     contents = r.content
#     jsonContents = json.loads(contents)
#     numFound = jsonContents['response']['numFound']


#     db = getDb()
#     examinedItems = db.execute('select count(*) from items').fetchone()
#     itemsWithWarc = db.execute('select count(*) from items where ROWID in (select itemId from files where files.status = 200)').fetchone()
#     return flask.make_response(json.dumps({'totalItems':numFound,
#                                            'examinedItems':examinedItems,
#                                            'itemsWithWarc':itemsWithWarc
#                                            }, indent=4), 'application/json')


# @app.route('/items',
#            methods=['GET', 'POST'], endpoint='items')
# def items():
#     db = getDb(makeDates=False)

#     c = db.cursor()
#     res = c.execute("select item, checked from items")

#     out = []
#     for item, checked in res.fetchall():
#         out.append({'item':item, 'checked':checked})
#     db.close()

        return flask.make_response('hihih' + metadata)
#     return flask.make_response(json.dumps(out), 'application/json')

    @app.route('/progress',
               methods=['GET', 'POST'], endpoint='progress')
    def progress():
        r = requests.get('https://archive.org/advancedsearch.php?q=collection%3Aarchiveitdigitalcollection&fl%5B%5D=identifier&sort[]=date+desc&rows=1&page=1&output=json')
        if r.status_code != 200:
            # note: was {error: ...}, a NameError; the key must be a string
            return flask.make_response(json.dumps({'error':'ia search feed is down'}),
                                       'application/json')

        contents = r.content
        jsonContents = json.loads(contents)
        numFound = jsonContents['response']['numFound']


        db = getDb()
        examinedItems = db.execute('select count(*) from items').fetchone()
        itemsWithWarc = db.execute('select count(*) from items where ROWID in (select itemId from files where files.status = 200)').fetchone()
        return flask.make_response(json.dumps({'totalItems':numFound,
                                               'examinedItems':examinedItems,
                                               'itemsWithWarc':itemsWithWarc
                                               }, indent=4), 'application/json')


    @app.route('/items',
               methods=['GET', 'POST'], endpoint='items')
    def items():
        db = getDb(makeDates=False)

        c = db.cursor()
        res = c.execute("select item, checked from items")

        out = []
        for item, checked in res.fetchall():
            out.append({'item':item, 'checked':checked})
        db.close()

        return flask.make_response(json.dumps(out), 'application/json')

    app.run('0.0.0.0',
            port=7999,
            debug=True,
            use_reloader=True,
            use_debugger=True)
    # app.run('0.0.0.0',
    #         port=7999,
    #         debug=False,
    #         use_reloader=False,
    #         use_debugger=False)

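With serveForever() running, the monitoring endpoints can be polled directly; a short usage sketch, assuming the port 7999 default configured above and a reachable archive.org feed:

# Sketch: poll the /progress endpoint started above.
import requests

r = requests.get('http://localhost:7999/progress')
if r.status_code == 200:
    stats = r.json()
    # totalItems is the numFound integer from the archive.org feed
    print stats['totalItems'], 'items in the collection feed'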
if __name__ == '__main__':
4109  script/inject/monitor.py (new file; file diff suppressed because one or more lines are too long)
@ -100,7 +100,7 @@ def getSplitTime():



def copyToTwins(fname):
def copyToTwins(fname, backToFront=False):
    fh = open(fname, 'r')
    ret = {}
    hosts = []
@ -117,23 +117,25 @@ def copyToTwins(fname):
            continue
        #print directory, ip1, note
    step = len(hosts)/2
    hostPlex = {}
    someIp = None
    cmds = []
    for hostId, dnsPort, httpsPort, httpPort, udbPort,ip1, ip2, directory, note in hosts[:step]:
        if ip1 not in hostPlex:
            hostPlex[ip1] = []
            someIp = ip1
        hostPlex[ip1].append('scp -r %s:%s* %s:%s. ' % (ip1, directory, (hosts[hostId + step][5]), (hosts[hostId + step][7])))
        backHostId, backDnsPort, backHttpsPort, backHttpPort, backUdbPort,backIp1, backIp2, backDirectory, backNote = hosts[hostId + step]

        if note != directory:
            print 'oh looks like you overlooked host %s' % hostId
        if backNote != backDirectory:
            print 'oh looks like you overlooked host %s' % backHostId

        if backToFront:
            cmd = 'scp -r %s:%s* %s:%s. &' % (backIp1, backDirectory, ip1, directory)
        else:
            cmd = 'scp -r %s:%s* %s:%s. &' % (ip1, directory, backIp1, backDirectory)
        cmds.append(cmd)
        #print 'scp -r %s:%s* %s:%s. &' % (ip1, directory, (hosts[hostId + step][5]), (hosts[hostId + step][7]))

    while len(hostPlex[someIp]) > 0:
    cmd = []
    for cmd in cmds:
        print cmd

        for ip in hostPlex.iterkeys():
            cmd.append(hostPlex[ip].pop())
            #print hostPlex[ip].pop()

        print '&\n'.join(cmd), ';'

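The twin layout pairs host i with host i + len(hosts)/2, so the new backToFront flag only swaps source and destination in the generated command. A reduced sketch of the pairing and command generation; the two-field host tuples and example IPs are illustrative, not the script's real nine-field records:

# Sketch: pair each front-half host with its twin and emit scp commands.
def twinCopyCommands(hosts, backToFront=False):
    step = len(hosts) / 2
    cmds = []
    for hostId in range(step):
        ip, directory = hosts[hostId]
        backIp, backDirectory = hosts[hostId + step]
        if backToFront:
            # restore: copy from the back-half twin onto the front host
            cmds.append('scp -r %s:%s* %s:%s. &' % (backIp, backDirectory, ip, directory))
        else:
            # normal direction: front host pushes to its back-half twin
            cmds.append('scp -r %s:%s* %s:%s. &' % (ip, directory, backIp, backDirectory))
    return cmds

for cmd in twinCopyCommands([('10.0.0.1', '/a/'), ('10.0.0.2', '/b/')]):
    print cmd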
def testDiskSpeed(host, directory):
Binary file not shown.