248 lines
7.6 KiB
C++
248 lines
7.6 KiB
C++
#include "Msgaa.h"
|
|
#include "Tagdb.h"
|
|
#include "Msg40.h"
|
|
|
|
// test sites:
|
|
// blogs.ubc.ca/wetsocks/feed/
|
|
// my.donews.com/comboatme/2008/12/17/t-think-they-could-be-hydroxycut-hepatic-failure-upon/
|
|
// blogs.kaixo.com/maildepa/2008/12/23/by-thom-patterson-cnn-the-adorable-shemale-wave-of-millions-of-early/
|
|
// blog.seamolec.org/tim980/2008/12/30/purchase-thief-of-hearts-online/
|
|
// www.homeschoolblogger.com/sagerats/623595/
|
|
// blogs.x7web.com/lamonblog2427/
|
|
// blogs.msdn.com/predos_spot/
|
|
|
|
//
|
|
// POSSIBLE IMPROVEMENTS
|
|
//
|
|
// check usernames to be compound that when split the right way contain
|
|
// a common name, like "dave", "pedro" or "williams". and possibly discard
|
|
// subsites whose path contains categories or other names in the dictionary.
|
|
|
|
//
|
|
// POTENTIAL ARTICLE CONTENT IDENTIFICATION ALGORITHM
|
|
//
|
|
// identify all tag hashes (includes hashes of all parents) on the page
|
|
// which preceed unique content, only found on that page. minimize the list
|
|
// of all such tag hashes, and store into tagdb for that SITE. also store
|
|
// for that domain and hostname if those tag recs have less than 10-20 such
|
|
// tag hashes already. (tag hash hashes all the alpha chars in the tag
|
|
// attributes as well. see DateParse.cpp ALGORITHM description)
|
|
// PROBLEM: duplicated pages, or printer-friendly pages! if pages is very
|
|
// similar content (not tags) to another (test using vectors) then toss it!!
|
|
// keep a count for each tag as how many pages think it is a repeated vs.
|
|
// unique content.
|
|
|
|
static void gotResultsAAWrapper ( Msg40 *msg40 , void *state ) ;
|
|
|
|
Msgaa::Msgaa ( ) {
|
|
m_msg40 = NULL;
|
|
}
|
|
|
|
Msgaa::~Msgaa ( ) {
|
|
if ( m_msg40 ) {
|
|
// free him, we sent his reply
|
|
mdelete ( m_msg40 , sizeof(Msg40),"msgaa");
|
|
delete ( m_msg40 );
|
|
}
|
|
m_msg40 = NULL;
|
|
}
|
|
|
|
#define SAMPLESIZE 100
|
|
|
|
// . also sets m_sitePathDepth to what it should be
|
|
// . -1 indicates unknown (not enough data, etc.) or host/domain is the site
|
|
// . returns false if blocked, true otherwise
|
|
// . returns true and sets g_errno on error
|
|
bool Msgaa::addSitePathDepth ( TagRec *gr ,
|
|
Url *url ,
|
|
char *coll ,
|
|
void *state ,
|
|
void (* callback)(void *state) ) {
|
|
|
|
// save it
|
|
m_gr = gr;
|
|
m_url = url;
|
|
m_coll = coll;
|
|
m_state = state;
|
|
m_callback = callback;
|
|
|
|
// set this to unknown for now
|
|
m_sitePathDepth = -1;
|
|
m_oldSitePathDepth = -1;
|
|
|
|
// reset this just in case
|
|
g_errno = 0;
|
|
|
|
// skip this for now!
|
|
return true;
|
|
|
|
CollectionRec *cr = g_collectiondb.getRec ( coll );
|
|
if ( ! cr ) return true;
|
|
if ( ! cr->m_subsiteDetectionEnabled ) return true;
|
|
|
|
// check the current tag for an age
|
|
Tag *tag = gr->getTag("sitepathdepth");
|
|
// if there and the age is young, skip it
|
|
int32_t age = -1;
|
|
int32_t now = getTimeGlobal();
|
|
if ( tag ) age = now - tag->m_timestamp;
|
|
// if there, at least get it
|
|
if ( tag ) m_oldSitePathDepth = (int32_t)tag->m_data[0];
|
|
// if older than 30 days, we need to redo it
|
|
if ( age > 30*24*60*60 ) age = -1;
|
|
// if age is valid, skip it
|
|
if ( age >= 0 ) {
|
|
// just use what we had, it is not expired
|
|
m_sitePathDepth = (int32_t)tag->m_data[0];
|
|
// all done, we did not block
|
|
return true;
|
|
}
|
|
|
|
// right now we only run on host #0 so we do not flood the cluster
|
|
// with queries...
|
|
if ( g_hostdb.m_hostId != 0 ) return true;
|
|
|
|
|
|
HASH all these things together into 1 termlist!!!
|
|
gbsitesample:<pathdepth> term. everything else is constant. use msg0 then, not msg40
|
|
|
|
rename msgaa to Site.
|
|
|
|
and use Site::set(Url *u) to do all this logic.
|
|
or Site::set(TitleRec *tr) to set it exactly from title rec
|
|
|
|
// make a new Msg40 to get search results with
|
|
try { m_msg40 = new (Msg40); }
|
|
catch ( ... ) {
|
|
g_errno = ENOMEM;
|
|
log("msgaa: new(%i): %s", sizeof(Msg40), mstrerror(g_errno));
|
|
return true;
|
|
}
|
|
mnew ( m_msg40 , sizeof(Msg40) , "Msgaa" );
|
|
|
|
// initial path depth
|
|
m_pathDepth = 1;
|
|
|
|
// see how many urls are non-cgi with a pathdepth of 1
|
|
char *p = m_qbuf;
|
|
strcpy ( p , "site:" );
|
|
p += 5;
|
|
gbmemcpy ( p , m_url->getHost() , m_url->getHostLen() );
|
|
p += m_url->getHostLen();
|
|
// sort them by the random score term, gbrandscore (see XmlDoc.cpp)
|
|
p += sprintf (p ,
|
|
" gbpathdepth:%"INT32""
|
|
" gbiscgi:0"
|
|
" gbhasfilename:0"
|
|
// www.xyz.com/viacom-extends-brad-greys-contract/ not!
|
|
" gbpermalink:0 "
|
|
"| gbrandscore:1",
|
|
m_pathDepth);
|
|
|
|
// set our SearchInput
|
|
m_si.reset();
|
|
// a sample of 100 results should be good!
|
|
m_si.m_docsWanted = SAMPLESIZE;
|
|
m_si.m_requireAllTerms = true;
|
|
m_si.m_firstResultNum = 0;
|
|
m_si.m_coll = m_coll;
|
|
m_si.m_doDupContentRemoval = false;
|
|
m_si.m_doSiteClustering = false;
|
|
m_si.m_docsToScanForReranking = 0;
|
|
// the query
|
|
m_si.m_query = m_qbuf;
|
|
m_si.m_queryLen = gbstrlen(m_qbuf);
|
|
// sanity check
|
|
if ( m_si.m_queryLen + 1 > MAX_QBUF_SIZE ) {char*xx=NULL;*xx=0; }
|
|
// do not generate titles or summaries to save time, we just need
|
|
// the url to check for bushiness. Msg20 if "getSummary" is false
|
|
// will already not set the title...
|
|
m_si.m_numLinesInSummary = 0;
|
|
|
|
// perform the Msg40 query
|
|
if ( ! m_msg40->getResults ( &m_si ,
|
|
false , // forward?
|
|
this , // state
|
|
gotResultsAAWrapper ) )
|
|
// return false if that blocks
|
|
return false;
|
|
// did not block
|
|
return gotResults();
|
|
}
|
|
|
|
void gotResultsAAWrapper ( Msg40 *msg40 , void *state ) {
|
|
Msgaa *THIS = (Msgaa *)state;
|
|
THIS->gotResults();
|
|
}
|
|
|
|
// . look at the urls we got in the results
|
|
// . they should be of the for abc.xyz.com/yyyyyy/
|
|
// . returns true and sets g_errno on error
|
|
bool Msgaa::gotResults ( ) {
|
|
// loop over each one
|
|
int32_t n = m_msg40->m_numMsg20s;
|
|
// we need at least half of requested to make a good estimation
|
|
if ( n <= SAMPLESIZE/2 ) return true;
|
|
// make a hashtable
|
|
HashTable ht;
|
|
// get the url
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
// get the ith result
|
|
Msg20Reply *r = m_msg40->m_msg20[i]->m_r;
|
|
// get the url string
|
|
char *us = r->ptr_ubuf;
|
|
int32_t uslen = r->size_ubuf - 1;
|
|
Url u; u.set ( us , uslen );
|
|
// get path component # m_pathDepth
|
|
int32_t clen;
|
|
char *c = u.getPathComponent ( i - 1 , &clen );
|
|
// must be there
|
|
if ( ! c || clen <= 0 ) {
|
|
log("msgaa: c is empty");
|
|
continue;
|
|
}
|
|
// now hash it
|
|
int32_t h = hash32 ( c , clen );
|
|
// count in table
|
|
int32_t slot = ht.getSlot ( h );
|
|
// how many times has this occurred in a result's url?
|
|
int32_t count = 0;
|
|
// inc if there
|
|
if ( slot >= 0 ) count = ht.getValueFromSlot ( slot );
|
|
// inc it
|
|
count++;
|
|
// put it back
|
|
if ( slot >= 0 ) ht.setValue ( slot , count );
|
|
// otherwise, add it new
|
|
else if ( ! ht.addKey ( h , count ) ) return true;
|
|
}
|
|
// now scan the hash table and see how many unique path components
|
|
int32_t unique = 0;
|
|
int32_t ns = ht.getNumSlots();
|
|
for ( int32_t i = 0 ;i < ns ; i++ ) {
|
|
// is empty
|
|
int32_t val = ht.getValueFromSlot ( i );
|
|
// count if non-empty
|
|
if ( val > 0 ) unique++;
|
|
}
|
|
// i'd say 50% of the SAMPLESIZE+ search results are required to have
|
|
// unique path components in order for us to consider this to be a
|
|
// subsite with a path depth of m_pathDepth.
|
|
int32_t required = n / 2;
|
|
if ( unique < required ) {
|
|
// ok, do not set m_sitePathDepth, leave it -1
|
|
log("msgaa: only have %"INT32" unique path components at path "
|
|
"depth %"INT32" out of %"INT32" results. %s does not have subsites.",
|
|
unique,m_pathDepth,n,m_url->getUrl());
|
|
return true;
|
|
}
|
|
// i guess we got it
|
|
log("msgaa: have %"INT32" unique path components at path "
|
|
"depth %"INT32" out of %"INT32" results. Enough to declare this as a "
|
|
"subsite for %s .",unique,m_pathDepth,n,m_url->getUrl());
|
|
// ok set it
|
|
m_sitePathDepth = m_pathDepth;
|
|
// we are done
|
|
return true;
|
|
}
|