#include "SiteGetter.h"
|
|
#include "Url.h"
|
|
#include "Msg1.h"
|
|
#include "Rdb.h"
|
|
#include "Posdb.h"
|
|
|
|
//
|
|
// BASIC IDEA
|
|
//
|
|
|
|
// if we got xyz.com/a/b/c/index.html then we hash the root like:
|
|
// siteterm:xyz.com/a/b/c/ but we only hash that if we are an index.html or
|
|
// other identifiable index page. i.e. we need to look like we are the root
|
|
// of a subsite. we could also check the link structure? how?
|
|
|
|
// then if we got a ton of siteterm:xyz.com/a/b/*/ terms we figure that
|
|
// they must all be subsites.
|
|
|
|
// EXCEPTION: things like wikipedia? wikipedia doesn't seem to pose a prob itself
|
|
|
|
// examples:
|
|
// xyz.com/home/users/fred/
|
|
// xyz.com/home/users/jamie/
|
|
// xyz.com/home/users/bob/
|
|
|
|
// we only index the siteterm:* thing for certain urls. that is a key component
|
|
// of this algorithm. see XmlDoc::hashNoSplit() for those conditions!!!
|
|
|
|
// basically, if you are a root directory, you "vote" for your PARENT as the
|
|
// subsite by indexing your parent subdir with the siteterm: term prefix.
|
|
|
|
// so for the 3 examples above we would index:
|
|
// siteterm:xyz.com/home/users/
|
|
// siteterm:xyz.com/home/users/
|
|
// siteterm:xyz.com/home/users/
|
|
|
|
// thereby giving xyz.com/home/users/ a pretty good subsite score! but below
|
|
// you'll see that we need 100 such votes to be a subsite!
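// Illustrative sketch only (added for clarity, not compiled into the build):
// how one of those "votes" would be formed. a doc under
// xyz.com/home/users/fred/ hashes its PARENT subdir with the "siteterm"
// prefix, the same way getSiteList() below and XmlDoc::hashNoSplit() do.
// the helper name here is hypothetical.
#if 0
static int64_t makeSubsiteVoteTermId ( char *parentDir , int32_t parentDirLen ) {
	// hash the prefix first, like getSiteList() does below
	char *prefix = "siteterm";
	int64_t ph = hash64 ( prefix , gbstrlen(prefix) );
	// then fold in "xyz.com/home/users/" (host + path up to the parent dir)
	return hash64 ( parentDir , parentDirLen , ph ) & TERMID_MASK;
}
#endif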

// this screws us up:
// http://svn.wp-plugins.org/
// BUT none of those "sub-site" pages have pure text content. maybe we can
// include that as a factor? and make sure the text is unique???
// furthermore we should only divide a site into subsites if its siteNumInlinks
// is high!!

// test sites:
// blogs.ubc.ca/wetsocks/feed/
// my.donews.com/comboatme/2008/12/17/t-think-they-could-be-hydroxycut-hepatic-failure-upon/
// blogs.kaixo.com/maildepa/2008/12/23/by-thom-patterson-cnn-the-adorable-shemale-wave-of-millions-of-early/
// blog.seamolec.org/tim980/2008/12/30/purchase-thief-of-hearts-online/
// www.homeschoolblogger.com/sagerats/623595/
// blogs.x7web.com/lamonblog2427/
// blogs.msdn.com/predos_spot/

//
// POSSIBLE IMPROVEMENTS
//
// check that usernames are compounds that, when split the right way, contain
// a common name, like "dave", "pedro" or "williams". and possibly discard
// subsites whose path contains categories or other names in the dictionary.

static void gotSiteListWrapper ( void *state ) ;
//static void addedTagWrapper ( void *state ) ;
SiteGetter::SiteGetter ( ) {
	m_siteLen = 0;
	m_site[0] = '\0';
	m_errno = 0;
}

SiteGetter::~SiteGetter ( ) {
}

// . also sets m_sitePathDepth to what it should be
// . -1 indicates unknown (not enough data, etc.) or host/domain is the site
// . returns false if blocked, true otherwise
// . returns true and sets g_errno on error
// . sets m_site to reference into "url" so XmlDoc::updateTagdb() can just
//   pass a bunch of site ptrs to msg9a
// . "url" MUST BE NORMALIZED via Url.cpp. so using Links' buffer is ok!
// . TODO: consider setting "state" to null if your url host has tons of inlinks
bool SiteGetter::getSite ( char *url ,
			   TagRec *gr ,
			   int32_t timestamp,
			   //char *coll ,
			   collnum_t collnum,
			   int32_t niceness ,
			   //bool addTags ,
			   void *state ,
			   void (* callback)(void *state) ) {

	// save it
	m_gr = gr;
	m_url = url;
	//m_coll = coll;
	m_collnum = collnum;
	//m_addTags = addTags;
	m_state = state;
	m_callback = callback;
	m_timestamp = timestamp;
	m_niceness = niceness;
	m_errno = 0;

	// is it domain only?
	m_hasSubdomain = ::hasSubdomain ( url );

	// reset
	m_siteLen = 0;
	m_site[0] = '\0';
	m_allDone = false;
	m_addedTag.reset();

	// set this to unknown for now
	m_sitePathDepth = -1;
	m_oldSitePathDepth = -1;

	// reset this just in case
	g_errno = 0;

	//
	// HARDCODED algos
	//
	// ~ /user/ /users/ /profile/ myspace facebook linkedin
	//
	if ( setRecognizedSite ( ) ) {
		m_allDone = true;
		return true;
	}

	// bail if nothing else we can do
	if ( ! gr ) return setSite ( ) ;

	CollectionRec *cr = g_collectiondb.getRec ( collnum );
	// g_errno should be set if this is NULL
	if ( ! cr ) return true;
	//if ( ! cr->m_subsiteDetectionEnabled ) return true;

	// check the current tag for an age
	Tag *tag = gr->getTag("sitepathdepth");
	// if there and the age is young, skip it
	int32_t age = -1;
	//int32_t now = getTimeGlobal();
	//if ( tag ) age = now - tag->m_timestamp;
	// to parse consistently for the qa test "qatest123" coll use
	// "timestamp" as the "current time"
	if ( tag ) age = timestamp - tag->m_timestamp;
	// if there, at least get it (might be -1)
	if ( tag ) m_oldSitePathDepth = atol ( tag->getTagData() );
	// . if older than 10 days, we need to redo it
	// . if caller gives us a timestamp of 0, never redo it!
	if ( age > 10*24*60*60 && timestamp != 0 ) age = -1;
	//if ( strstr(m_url,"http://www.topix.com/yp/albuquerque/c/community-religion-and-spirituality-churches") )
	//	log("hey");

	// . if our site quality is low, forget about dividing it up too
	// . if age is valid, skip it
	// . also if caller does not want a callback, like XmlDoc.cpp,
	//   then use whatever we got
	if ( age >= 0 || ! m_state ) { // || hostRootNumInlinks < 500 ) {
		// do not add to tagdb
		m_state = NULL;
		// just use what we had, it is not expired
		m_sitePathDepth = m_oldSitePathDepth;
		// . now set the site with m_sitePathDepth
		// . sanity check, should not block since m_state is NULL
		if ( ! setSite () ) { char *xx=NULL;*xx=0; }
		// we did not block
		return true;
	}

	// right now we only run on host #0 so we do not flood the cluster
	// with queries...
	if ( g_hostdb.m_hostId != 0 ) {
		// do not add to tagdb and do not block!
		m_state = NULL;
		// . use a sitepathdepth of -1 by default then, until host #0
		//   has a chance to evaluate
		// . a sitepathdepth of -1 means to use the full hostname
		//   as the site
		m_sitePathDepth = -1;
		// sanity check, should not block since m_state is NULL
		if ( ! setSite () ) { char *xx=NULL;*xx=0; }
		// we did not block
		return true;
	}

	// . initial path depth
	// . this actually includes the first subdir name, up to, but not
	//   including the /, according to Url::getPathEnd()
	// . start with the broadest site as our possible subsite first
	//   in order to reduce errors i guess. because if we have examples:
	//   xyz.com/fred/
	//   xyz.com/jamie/
	//   xyz.com/bob/ ...
	//   and we also have:
	//   xyz.com/home/users/fred/
	//   xyz.com/home/users/jamie/
	//   xyz.com/home/users/bob/ ...
	//   then we need the first set to take precedence!
	m_pathDepth = 0;

	// set our full url class. do not addWWW
	//m_u.set ( m_url , gbstrlen(m_url) , false );

	// must have http:// i guess
	if ( strncmp(m_url,"http",4) ) {
		g_errno = EBADURL;
		return true;
		// don't let bad input from pageparser core us!
		char *xx=NULL;*xx=0;
	}

	// how many can we do? false = countFilename?
	//m_maxPathDepth = m_u.getPathDepth ( false );

	// . pathDepth==0 for "www.xyz.com"
	// . pathDepth==0 for "www.xyz.com/"
	// . pathDepth==0 for "www.xyz.com/foo"
	// . pathDepth==1 for "www.xyz.com/foo/"
	// . pathDepth==1 for "www.xyz.com/foo/x"
	// . pathDepth==2 for "www.xyz.com/foo/x/"
	// . pathDepth==2 for "www.xyz.com/foo/x/y"
	// . true --> we have the protocol, http:// in m_url
	m_maxPathDepth = getPathDepth ( m_url , true );

	// get it. return false if it blocked.
	return getSiteList();
}
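
// Illustrative usage sketch only (added for clarity, not compiled into the
// build): how a caller might drive getSite() given the blocking contract
// documented above it. the wrapper name and the way the caller reads the
// result afterwards are hypothetical; only the getSite() signature is from
// this file.
#if 0
static void exampleGotSiteWrapper ( void *state ) {
	// getSite() blocked earlier and has now finished; the caller would
	// read the site out of its SiteGetter (cast from "state") here
}

static bool exampleGetSite ( SiteGetter *sg , char *url , TagRec *gr ,
			     int32_t timestamp , collnum_t collnum ) {
	// returns false if it blocked; exampleGotSiteWrapper() gets called
	// later, once the posdb lists have been read
	if ( ! sg->getSite ( url , gr , timestamp , collnum ,
			     0 ,  // niceness
			     sg , // state handed back to the wrapper
			     exampleGotSiteWrapper ) )
		return false;
	// did not block; g_errno may be set on error
	return true;
}
#endif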

// . returns false if blocked, true otherwise
// . returns true on error and sets g_errno
bool SiteGetter::getSiteList ( ) {

 top:
	// . setSite() will return TRUE and set g_errno on error, and returns
	//   false if it blocked adding a tag, which will call callback once
	//   tag is added
	// . stop at this point
	if ( m_pathDepth >= 3 ) return setSite();
	// or if no more
	if ( m_pathDepth >= m_maxPathDepth ) return setSite();

	// . make the termid
	// . but here it is based on "m_pathDepth" which ranges
	//   from 1 to N
	// . if m_pathDepth==0 use "www.xyz.com" as site
	// . if m_pathDepth==1 use "www.xyz.com/foo/" as site ...
	char *pend = getPathEnd ( m_url , m_pathDepth );
	// hash up to that
	//char *host = m_u.getHost();
	char *host = getHostFast ( m_url , NULL );
	// hash the prefix first to match XmlDoc::hashNoSplit()
	char *prefix = "siteterm";
	// hash that and we will incorporate it to match XmlDoc::hashNoSplit()
	int64_t ph = hash64 ( prefix , gbstrlen(prefix) );
	// . this should match basically what is in XmlDoc.cpp::hash()
	// . and this now does not include pages that have no outlinks
	//   "underneath" them.
	int64_t termId = hash64 ( host , pend - host , ph ) & TERMID_MASK;

	// get all pages that have this as their termid!
	key144_t start ;
	key144_t end ;
	g_posdb.makeStartKey ( &start, termId );
	g_posdb.makeEndKey ( &end , termId );

	// . now see how many urls are at this path depth from this hostname
	// . if it is a huge # then we know they are all subsites!
	//   because it is too bushy to be anything else
	// . i'd say 100 nodes is good enough to qualify as a homestead site

	int32_t minRecSizes = 5000000;
	// get the group this list is in
	//uint32_t gid ;
	//gid = getGroupId ( RDB_POSDB , (char *)&start , false ); //split?
	//uint32_t shardNum ;
	//shardNum = getShardNum( RDB_POSDB , (char *)&start , false ); //split?

	// i guess this is split by termid and not docid????
	int32_t shardNum = g_hostdb.getShardNumByTermId ( &start );

	// we need a group #. the column #.
	//int32_t split = g_hostdb.getGroupNum ( gid );
	// shortcut
	Msg0 *m = &m_msg0;
	// get the list. returns false if blocked.
	if ( ! m->getList ( -1 , // hostId
			    0 , // ip
			    0 , // port
			    0 , // maxCacheAge
			    false , // addToCache
			    RDB_POSDB ,
			    m_collnum ,
			    &m_list ,
			    (char *)&start ,
			    (char *)&end ,
			    minRecSizes ,
			    this ,
			    gotSiteListWrapper ,
			    m_niceness , // MAX_NICENESS
			    // default parms follow
			    true , // doErrorCorrection?
			    true , // includeTree?
			    true , // doMerge?
			    -1 , // firstHostId
			    0 , // startFileNum
			    -1 , // numFiles
			    999999, // timeout
			    -1 , // syncPoint
			    -1 , // preferLocalReads
			    NULL , // msg5
			    NULL , // msg5b
			    false , // isrealmerge?
			    true , // allowpagecache?
			    false , // forceLocalIndexdb?
			    false , // doIndexdbSplit? nosplit
			    shardNum ) )//split ))
		return false;

	// return false if this blocked
	if ( ! gotSiteList() ) return false;
	// error?
	if ( g_errno ) return true;
	// or all done
	if ( m_allDone ) return true;
	// otherwise, try the next path component!
	goto top;
}

void gotSiteListWrapper ( void *state ) {
	SiteGetter *THIS = (SiteGetter *)state;
	if ( ! THIS->gotSiteList() ) return;
	// try again?
	if ( THIS->m_tryAgain ) {
		// return if blocked
		if ( ! THIS->getSiteList() ) return;
		// otherwise, if it did not block, we are really done because
		// it loops until it blocks
	}
	// call callback if all done now
	THIS->m_callback ( THIS->m_state );
}

// . returns false if blocked, returns true and sets g_errno on error
// . returns true with m_allDone set to false to process another subsite
// . we use voters to set SEC_VOTE_STATIC and SEC_VOTE_DYNAMIC flags
//   in addition to SEC_VOTE_TEXTY and SEC_VOTE_UNIQUE
bool SiteGetter::gotSiteList ( ) {
	// assume not trying again
	m_tryAgain = false;
	// error?
	if ( g_errno ) {
		// timeouts usually...
		log("site: sitegetter gotList: %s",mstrerror(g_errno));
		// mark it so caller knows
		m_errno = g_errno;
		// so try again without increasing m_pathDepth
		// i've seen a host return EBADRDBID for some reason
		// and put host #0 in an infinite log spam loop so stop it
		if ( g_errno != EBADRDBID ) m_tryAgain = true;
		return true;
	}
	// how many urls at this path depth?
	int32_t count = ( m_list.getListSize() - 6 ) / 6;
	// if we do not have enough to qualify this as a subsite path depth
	// try the next
	if ( count < 100 ) {
		// increment and try again
		m_pathDepth++;
		// clear just in case
		g_errno = 0;
		// get another list if we can, m_allDone is not true yet
		if ( m_pathDepth < m_maxPathDepth ) {
			m_tryAgain = true;
			return true;
		}
	}

	// ok, i guess this indicates we have a subsite level
	m_sitePathDepth = m_pathDepth;

	// this basically means none!
	if ( m_pathDepth >= m_maxPathDepth ) m_sitePathDepth = -1;

	// . sets m_site and m_siteLen from m_url
	// . this returns false if blocked, true otherwise
	return setSite ( ) ;
}
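
// Illustrative sketch only (added for clarity, not compiled into the build):
// the core decision gotSiteList() makes, in isolation. a path depth
// qualifies as the subsite boundary once roughly 100 docs have voted for it;
// the size-to-count estimate is the same one used above.
#if 0
static bool qualifiesAsSubsiteLevel ( int32_t listSizeBytes ) {
	// same vote estimate gotSiteList() uses
	int32_t count = ( listSizeBytes - 6 ) / 6;
	// need at least 100 voters to call this path depth a subsite level
	return ( count >= 100 );
}
#endif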

//void addedTagWrapper ( void *state ) {
//	SiteGetter *THIS = (SiteGetter *)state;
//	// all done
//	THIS->m_callback ( THIS->m_state );
//}

// . return false if blocked, return true with g_errno set on error
// . returns true if did not block
bool SiteGetter::setSite ( ) {

	// no more looping
	m_allDone = true;

	// assume this is false for now
	//m_isIndependentSubsite = false;

	// we need to distinguish this from the "site" tags that
	// Tagdb.cpp adds to new TagRecs that do not have a "site" tag. but
	// for now maybe we can comment out?

	/*
	MDW: take out for now
	// . extract from tag rec
	// . must not have a "sitepathdepth" for us to do this
	Tag *tag = NULL;
	// get the "site" tag if we should
	if ( m_gr && m_sitePathDepth == -1 ) tag = m_gr->getTag("site");
	// . if we had a "site" tag and no "sitepathdepth" tag, use that site
	// . but if sitepathdepth is zero, ignore this too and use the hostname
	//   as the site. i never want to use domains themselves as a site
	//   because XmlDoc::getSite() ends up returning a domain only
	//   and thus XmlDoc::getRootXmlDoc() uses the domain as the
	//   url and that is not what we want for XmlDoc::getRootLangId()
	if ( tag && tag->getTagDataSize() > 1 && m_sitePathDepth > 0 ) {
		// get length
		m_siteLen = tag->getTagDataSize() - 1;
		// sanity check
		if ( m_siteLen > MAX_SITE_LEN ) {
			m_site [ 0 ] = '\0';
			m_siteLen = 0;
			g_errno = EURLTOOBIG;
			return true;
		}
		// get the data, including terminating \0
		gbmemcpy ( m_site , tag->getTagData() , m_siteLen + 1 );
		// sanity check - must include the \0
		if (m_site[m_siteLen]!= '\0') {char*xx=NULL;*xx=0;}
		// all done
		return true;
	}
	*/
	// . get the host of our normalized url
	// . assume the hostname is the site
	int32_t hostLen;
	char *host = ::getHost ( m_url , &hostLen );

	// no, assume domain since Tagdb.cpp adds the domain as the value
	// for the "site" tag when it adds a TagRec to Tagdb
	//char *site = ::getDomFast ( m_url , &m_siteLen );

	// truncated?
	if ( hostLen + 6 > MAX_SITE_LEN ) {
		m_site [ 0 ] = '\0';
		m_siteLen = 0;
		g_errno = EURLTOOBIG;
		return true;
	}

	char *x = m_site;
	// check it
	if ( ! m_hasSubdomain ) {
		gbmemcpy ( x , "www.", 4 );
		x += 4;
	}
	// save it
	gbmemcpy ( x , host , hostLen );
	x += hostLen;

	m_siteLen = x - m_site;

	// null terminate
	m_site [ m_siteLen ] = '\0';

	// . -1 means to use the hostname as the site
	// . i am trying to obsolete the site filters table here...
	//if ( m_sitePathDepth == -1 ) return true;

	// TODO:
	// if we are linked to by the hostname root, ignore sitePathDepth
	// as well. that way we separate the subsites from subdirs off the root

	// we are an independent subsite. Tagdb.cpp needs to know this so
	// it will not inherit from the root url's tagdb rec
	//m_isIndependentSubsite = true;

	// . otherwise we got a subsite
	// . if m_pathDepth==0 use "www.xyz.com" as site
	// . if m_pathDepth==1 use "www.xyz.com/foo/" as site ...
	/*
	MDW: take out for now
	if ( m_sitePathDepth >= 0 ) {
		char *end = getPathEnd ( m_url , m_sitePathDepth );
		// set the site length
		m_siteLen = end - site;
	}

	// truncated?
	if ( m_siteLen > MAX_SITE_LEN ) {
		m_site [ 0 ] = '\0';
		m_siteLen = 0;
		g_errno = EURLTOOBIG;
		return true;
	}
	*/

	//logf(LOG_DEBUG,"site: site of %s is %s",m_url,m_site);

	// do not add if this is NULL, caller does not want us to block
	//if ( ! m_state ) return true;

	//if ( ! m_addTags ) return true;

	return true;

	/*
	// . to prevent slamming indexdb, store the site in tagdb now
	// make it a string so tagdb likes it
	// . this could be -1 which indicates to use hostname!
	char buf[12];
	sprintf ( buf , "%"INT32"",m_sitePathDepth);

	// sanity check
	if ( m_timestamp == 0 ) { char *xx=NULL;*xx=0; }

	// . now update tagdb with the new date and path depth
	// . this might be -1, which is nice because it gives us a timestamp
	//   so we only have to do this like once every 2 weeks or so
	//TagRec gr;
	m_addedTag.addTag ( "sitepathdepth" ,
			    // now XmlDoc must provide it to ensure that our
			    // injects into the "qatest123" coll are consistent
			    m_timestamp ,//getTime()// use now as timestamp
			    "sitegit" , // username
			    0 , // ip
			    buf , // data
			    gbstrlen(buf)+1 );// dateSize (includes \0)

	// we apply the sitepathdepth tag to the tag rec for this subdomain
	int32_t hlen; char *host = getHostFast ( m_url , &hlen );

	// null term temporarily
	char c = host[hlen];
	host[hlen] = '\0';

	// add it
	bool status = m_msg9a.addTags ( host , // "sites"
					NULL , // site ptrs
					0 , // num site ptrs
					m_coll ,
					this ,
					addedTagWrapper ,
					m_niceness ,
					// contains the tags to add
					&m_addedTag ,
					false , // nuke tags?
					NULL );// ip vec
	// remove NULL
	host[hlen] = c;

	// return false if blocked, true otherwise (g_errno could be set)
	return status;
	*/
}

//
// hardcoded support for popular formats and sites
//
bool SiteGetter::setRecognizedSite ( ) {

	// clear just in case
	g_errno = 0;

	// get path of url
	char *p = m_url;
	for ( ; *p && *p != ':' ; p++ );
	// error?
	if ( *p != ':' ) return false;
	// skip ://
	p += 3;
	// save host ptr
	char *host = p;
	// then another / for the path
	for ( ; *p && *p != '/' ; p++ );
	// error?
	if ( *p != '/' ) return false;
	//
	// ok, "p" now points to the path
	//
	char *path = p;

	// convenience vars
	int32_t len = 0;

	// . deal with site indicators
	// . these are applied to all domains uniformly
	// . if it is xyz.com/users/ use xyz.com/users/fred/ as the site

	// a lot of times these were not individual blogs, but the blog subsite
	// of a site... http://dccc.org/blog/P4575/
	//if ( strncasecmp(p,"/blogs/" , 7) == 0 ) len = 7;
	//if ( strncasecmp(p,"/blog/" , 6) == 0 ) len = 6;
	// commented out a bunch cuz they were profiles mostly, not blogs...
	if ( strncasecmp(p,"/~" , 2) == 0 ) len = 2;
	// assume this is a username. skip the first /
	//if ( sitepathdepth == 1 ) len = 1;
	if ( strncasecmp(p,"/users/" , 7) == 0 ) len = 7;
	if ( strncasecmp(p,"/user/" , 6) == 0 ) len = 6;
	if ( strncasecmp(p,"/members/" , 9) == 0 ) len = 9;
	if ( strncasecmp(p,"/membres/" , 9) == 0 ) len = 9;
	if ( strncasecmp(p,"/member/" , 8) == 0 ) len = 8;
	if ( strncasecmp(p,"/membre/" , 8) == 0 ) len = 8;
	if ( strncasecmp(p,"/member.php?u=",14) == 0 ) len = 14;

	// point to after the /users/, /blogs/, /user/, /blog/ or /~xxx/
	p += len;
	// assume there is NOT an alnum char after this
	char username = false;
	// . skip to next / OR ?
	// . stop at . or -, because we do not allow those in usernames and
	//   they are often indicative of filenames without file extensions
	// . no, fix http://www.rus-obr.ru/users/maksim-sokolov (no - or _ or.)
	while ( len && *p && *p!= '/'&&*p!='?' ) {
		// sometimes usernames are numbers!!!
		//if ( is_alpha_a(*p) ) username = true;
		// http://stackoverflow.com/users/271376/sigterm
		if ( is_alnum_a(*p) ) username = true;
		p++;
	}
	// if we hit this, not a username
	//if ( *p=='.' || *p == '-' || *p == '_' ) username = false;
	// did we get a match?
	// . www.cits.ucsb.edu/users/michael-osborne
	// . www.cits.ucsb.edu/users/michael-osborne/
	// . after /blog/ or /~ should be another / or \0, not a period,
	//   because that indicates probably a filename, which is not right,
	//   because we are expecting a username!
	if ( username && p - host + 6 < MAX_SITE_LEN ) {
		// jump up here to store
	storeIt:
		// for parsing
		char *x = m_site;
		// store www first if it's a domain only url
		if ( ! m_hasSubdomain ) {
			gbmemcpy ( x , "www." , 4 );
			x += 4;
		}
		// store it
		gbmemcpy ( x , host , p - host );
		x += p - host;
		// set the length of it
		m_siteLen = x - m_site;
		// make it end on a '/' if we can
		if ( m_site[m_siteLen-1] != '/' &&
		     // watch out for /?uid=xxxx crap
		     m_site[m_siteLen-1] != '=' ) {
			// force the / then
			m_site[m_siteLen] = '/';
			m_siteLen++;
		}
		// null term the site
		m_site [ m_siteLen ] = '\0';
		return true;
	}

	//
	// popular homesteads
	//
	int32_t depth = 0;
	// temporarily null terminate the host
	char c = *path;
	*path = '\0';
	if ( strstr(host,"vimeo.com" ) ) depth = 1;
	if ( strstr(host,"www.myspace.com") ) depth = 1;
	if ( strstr(host,"twitter.com" ) ) depth = 1;
	if ( strstr(host,"www.facebook.com") ) depth = 1;
	// revert
	*path = c;

	// return false to indicate no recognized site detected
	if ( ! depth ) return false;

	// skip over the initial root / after the hostname
	p = path + 1;

	// no path really? root path? just return the hostname then
	if ( ! *p && path - host + 6 < MAX_SITE_LEN ) {
		// for parsing
		char *x = m_site;
		// store www first if it's a domain only url
		if ( ! m_hasSubdomain ) {
			gbmemcpy ( x , "www." , 4 );
			x += 4;
		}
		// store it
		gbmemcpy ( x , host , path - host );
		x += path - host;
		m_siteLen = x - m_site;
		m_site [ m_siteLen ] = '\0';
		return true;
	}

	// advance to the / that ends the first "depth" path components
	for ( ; *p ; p++ )
		if ( *p == '/' && --depth == 0 ) break;

	if ( p - host + 6 >= MAX_SITE_LEN ) return false;

	goto storeIt;

	return true;
}