#include "Msg25.h"
|
|
#include "Linkdb.h"
|
|
#include "UdpSlot.h"
|
|
#include "Serialize.h"
|
|
#include "linkspam.h"
|
|
#include "Xml.h"
|
|
#include "HashTableX.h"
|
|
#include "Titledb.h"
|
|
#include "Collectiondb.h"
|
|
#include "Process.h"
|
|
#include "UdpServer.h"
|
|
#include "HttpMime.h"
|
|
#include "HashTable.h"
|
|
#include "IPAddressChecks.h"
|
|
#include "GbMutex.h"
|
|
#include "ScopedLock.h"
|
|
#include "Conf.h"
|
|
#include "Mem.h"
|
|
#include "SiteGetter.h"
|
|
#include "ip.h"
|
|
#include "Errno.h"
|
|
#include "gbmemcpy.h"
|
|
|
|
#ifdef _VALGRIND_
|
|
#include <valgrind/memcheck.h>
|
|
#endif

// used by Msg25::addNote()
#define MAX_ENTRY_DOCIDS 10

class NoteEntry {
public:
	int32_t m_count;
	char *m_note;
	int64_t m_docIds[MAX_ENTRY_DOCIDS];
};

static LinkInfo *makeLinkInfo(int32_t ip,
                              Msg20Reply **replies,
                              int32_t numReplies,
                              int64_t linkeeDocId,
                              int32_t lastUpdateTime,
                              bool onlyNeedGoodInlinks,
                              Msg25 *msg25,
                              SafeBuf *linkInfoBuf);

static int32_t getNumWords(const char *s, int32_t len);

// 1MB read size for now
#define READSIZE 1000000

#define MAX_INTERNAL_INLINKS 10

static void gotListWrapper(void *state, RdbList *list, Msg5 *msg5);
static bool gotLinkTextWrapper(void *state);

Msg25::Msg25() {
	m_numRequests = 0;
	m_linkSpamOut = 0;
	m_numReplyPtrs = 0;
	//m_linkInfo = NULL;
	m_ownReplies = true;

	// mainly for Coverity
	m_req25 = NULL;
	m_linkInfoBuf = NULL;
	m_url = NULL;
	m_site = NULL;
	m_ourHostHash32 = 0;
	m_ourDomHash32 = 0;
	m_round = 0;
	m_linkHash64 = 0;
	memset(&m_nextKey, 0, sizeof(m_nextKey));
	m_onlyNeedGoodInlinks = false;
	m_docId = 0;
	m_collnum = 0;
	m_state = NULL;
	m_callback = NULL;
	m_siteNumInlinks = 0;
	m_mode = MODE_UNSET;
	m_printInXml = false;
	m_ip = 0;
	m_top = 0;
	m_midDomHash = 0;
	m_gettingList = false;
	m_k = NULL;
	m_maxNumLinkers = 0;
	memset(&m_replyPtrs, 0, sizeof(m_replyPtrs));
	memset(&m_replySizes, 0, sizeof(m_replySizes));
	memset(m_inUse, 0, sizeof(m_inUse));
	m_numDocIds = 0;
	m_cblocks = 0;
	m_uniqueIps = 0;
	m_minRecSizes = 0;
	m_numReplies = 0;
	m_errno = 0;
	m_pbuf = NULL;
	m_oneVotePerIpDom = false;
	m_doLinkSpamCheck = false;
	m_isInjecting = false;
	m_lastUpdateTime = 0;
	m_errors = 0;
	m_noText = 0;
	m_spideringEnabled = false;
	m_dupCount = 0;
	m_spamLinks = 0;
	m_niceness = 0;
	m_numFromSameIp = 0;
	m_sameMidDomain = 0;
	m_spamCount = 0;
	m_maxSpam = 0;
	m_ipDupsLinkdb = 0;
	m_docIdDupsLinkdb = 0;
	m_linkSpamLinkdb = 0;
	m_ipDups = 0;
	m_oldLinkInfo = NULL;
	m_bufPtr = NULL;
	m_bufEnd = NULL;
	m_requestSize = 0;
	m_qbuf = NULL;
	m_qbufSize = 0;
	memset(m_buf, 0, sizeof(m_buf));
	memset(m_request, 0, sizeof(m_request));
}

Msg25::~Msg25 ( ) {
	reset();
}

void Msg25::reset() {
	if ( ! m_ownReplies )
		m_numReplyPtrs = 0;
	for ( int32_t i = 0 ; i < m_numReplyPtrs ; i++ )
		mfree ( m_replyPtrs[i], m_replySizes[i], "msg25r");
	// reset array count to 0
	m_numReplyPtrs = 0;

	m_table.reset();
	m_ipTable.reset();
	m_fullIpTable.reset();
	m_firstIpTable.reset();
	m_docIdTable.reset();
}

// . we got a reply back from the msg25 request
// . reply should just be a LinkInfo class
// . set XmlDoc::m_linkInfoBuf safebuf to that reply
// . we store a ptr to that safebuf in Msg25Request::m_linkInfoBuf
static void gotMulticastReplyWrapper25(void *state, void *state2) {

	Msg25Request *req = (Msg25Request *)state;

	// call callback now if error is set
	if ( g_errno ) {
		req->m_callback ( req->m_state );
		return;
	}

	Multicast *mcast = req->m_mcast;

	int32_t replySize;
	int32_t replyMaxSize;
	bool freeit;
	char *reply = mcast->getBestReply (&replySize,&replyMaxSize,&freeit);
	{
		// validate linkinfo
		LinkInfo *linkInfo = (LinkInfo *)reply;
		if (linkInfo->m_version != 0 ||
		    linkInfo->m_lisize < 0 || linkInfo->m_lisize != replySize ||
		    linkInfo->m_numStoredInlinks < 0 || linkInfo->m_numGoodInlinks < 0) {
			gbshutdownCorrupted();
		}
	}

	// . store reply in caller's linkInfoBuf i guess
	// . mcast should free the reply
	req->m_linkInfoBuf->safeMemcpy ( reply , replySize );
	{
		// validate linkinfo
		LinkInfo *linkInfo = (LinkInfo *)req->m_linkInfoBuf->getBufStart();
		if (linkInfo->m_version != 0 ||
		    linkInfo->m_lisize < 0 || linkInfo->m_lisize != req->m_linkInfoBuf->length() ||
		    linkInfo->m_numStoredInlinks < 0 || linkInfo->m_numGoodInlinks < 0) {
			gbshutdownCorrupted();
		}
	}

	// i guess we gotta free this
	mfree ( reply , replyMaxSize , "rep25" );

	req->m_callback ( req->m_state );
}

// . returns false if would block, true otherwise
// . sets g_errno and returns true on launch error
// . calls req->m_callback when ready if it would block
bool getLinkInfo(SafeBuf *reqBuf,
		 Multicast *mcast,
		 const char *site,
		 const char *url,
		 bool isSiteLinkInfo,
		 int32_t ip,
		 int64_t docId,
		 collnum_t collnum,
		 void *state,
		 void (* callback)(void *state),
		 bool isInjecting,
		 bool printInXml,
		 int32_t siteNumInlinks,
		 const LinkInfo *oldLinkInfo,
		 int32_t niceness,
		 bool doLinkSpamCheck,
		 bool oneVotePerIpDom,
		 int32_t lastUpdateTime,
		 bool onlyNeedGoodInlinks,
		 int32_t ourHostHash32,
		 int32_t ourDomHash32,
		 SafeBuf *linkInfoBuf)
{

	int32_t siteLen = strlen(site);
	int32_t urlLen = strlen(url);

	int32_t oldLinkSize = 0;
	if ( oldLinkInfo )
		oldLinkSize = oldLinkInfo->getSize();

	if( oldLinkSize < 0 ) {
		log(LOG_ERROR,"!!! CORRUPTED LINKINFO detected for docId %" PRId64 " - resetting linkinfo", docId);
		oldLinkSize = 0;
		oldLinkInfo = NULL;
	}

	int32_t need = sizeof(Msg25Request) + siteLen+1 + urlLen+1 + oldLinkSize;

	// keep it in a safebuf so the caller can just add "SafeBuf m_msg25Req;"
	// to his .h file and not have to worry about freeing it.
	reqBuf->purge();

	// clear = true. put 0 bytes in there
	if ( ! reqBuf->reserve ( need ,"m25req", true ) )
		return true;

	Msg25Request *req = (Msg25Request *)reqBuf->getBufStart();

	req->m_linkInfoBuf = linkInfoBuf;

	req->m_mcast = mcast;

	req->ptr_site = const_cast<char*>(site);
	req->size_site = siteLen + 1;

	req->ptr_url = const_cast<char*>(url);
	req->size_url = urlLen + 1;

	req->ptr_oldLinkInfo = reinterpret_cast<const char *>(oldLinkInfo);
	if ( oldLinkInfo )
		req->size_oldLinkInfo = oldLinkInfo->getSize();
	else
		req->size_oldLinkInfo = 0;

	if ( isSiteLinkInfo )
		req->m_mode = Msg25::MODE_SITELINKINFO;
	else
		req->m_mode = Msg25::MODE_PAGELINKINFO;

	req->m_ip = ip;
	req->m_docId = docId;
	req->m_collnum = collnum;
	req->m_state = state;
	req->m_callback = callback;
	req->m_isInjecting = isInjecting;
	req->m_printInXml = printInXml;
	req->m_siteNumInlinks = siteNumInlinks;
	req->m_niceness = niceness;
	req->m_doLinkSpamCheck = doLinkSpamCheck;
	req->m_oneVotePerIpDom = oneVotePerIpDom;
	req->m_lastUpdateTime = lastUpdateTime;
	req->m_onlyNeedGoodInlinks = onlyNeedGoodInlinks;
	req->m_ourHostHash32 = ourHostHash32;
	req->m_ourDomHash32 = ourDomHash32;

	Url u;
	u.set ( req->ptr_url );

	req->m_linkHash64 = (uint64_t)u.getUrlHash64();

	req->m_siteHash32 = 0;
	req->m_siteHash64 = 0LL;
	if ( req->ptr_site ) {
		// hash collection # in with it
		int64_t h64 = hash64n ( req->ptr_site );
		h64 = hash64 ((char *)&req->m_collnum,sizeof(collnum_t),h64);
		req->m_siteHash64 = h64;
		req->m_siteHash32 = hash32n ( req->ptr_site );
	}

	// send to host for local linkdb lookup
	key224_t startKey ;
	//int32_t siteHash32 = hash32n ( req->ptr_site );
	// access different parts of linkdb depending on the "mode"
	if ( req->m_mode == Msg25::MODE_SITELINKINFO )
		startKey = Linkdb::makeStartKey_uk ( req->m_siteHash32 );
	else
		startKey = Linkdb::makeStartKey_uk (req->m_siteHash32, req->m_linkHash64 );
	// what group has this linkdb list?
	uint32_t shardNum = getShardNum ( RDB_LINKDB, &startKey );
	// use a biased lookup
	int32_t numTwins = g_hostdb.getNumHostsPerShard();
	int64_t sectionWidth = (0xffffffff/(int64_t)numTwins) + 1;
	// these are 224-bit keys; the top 32 bits are the site hash
	uint32_t x = req->m_siteHash32;//(startKey.n1 >> 32);
	int32_t hostNum = x / sectionWidth;
	int32_t numHosts = g_hostdb.getNumHostsPerShard();
	Host *hosts = g_hostdb.getShard ( shardNum); // Group ( groupId );
	if ( hostNum >= numHosts ) { g_process.shutdownAbort(true); }
	int32_t hostId = hosts [ hostNum ].m_hostId ;
	if( !hosts [ hostNum ].m_spiderEnabled) {
		hostId = g_hostdb.getHostIdWithSpideringEnabled(shardNum, true);
	}
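	// (illustrative: with 2 hosts per shard, sectionWidth is 0x80000000,
	// so siteHash32 values in the lower half of the 32-bit space always
	// pick twin 0 and the upper half always pick twin 1. the same site
	// thus always hits the same twin, presumably keeping its linkdb list
	// hot in that twin's cache.)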

	logTrace(g_conf.m_logTraceMsg25, "Send msg25 to host [%" PRId32 "] ptr_url [%s] (hashed from [%.*s]) ptr_site [%s] docId [%" PRId64 "] isSiteLinkInfo [%s]",
		 hostId, req->ptr_url, u.getUrlLen(), u.getUrl(), req->ptr_site, req->m_docId, isSiteLinkInfo?"true":"false");

	// . serialize the string buffers
	// . use Msg25Request::m_buf[MAX_NEEDED]
	// . turns the ptr_* members into offsets into req->m_buf[]
	req->serialize();

	// this should always block
	// if timeout is too low we core in XmlDoc.cpp after getNewSpiderReply() returns a -1 because it blocks for some reason.
	if (!mcast->send((char *)req, req->getStoredSize(), msg_type_25, false, shardNum, false, 0, req, NULL, gotMulticastReplyWrapper25, multicast_infinite_send_timeout, req->m_niceness, hostId, true)) {
		log( LOG_WARN, "linkdb: Failed to send multicast for %s err=%s", u.getUrl(),mstrerror(g_errno));
		return true;
	}

	// wait for req->m_callback(req->m_state) to be called
	return false;
}
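
// Typical use (a sketch, not from this file): the caller owns reqBuf, mcast
// and linkInfoBuf and must keep them alive until the callback fires, e.g.
//
//   if ( getLinkInfo( &m_msg25ReqBuf, &m_mcast, site, url, true, ip, docId,
//                     collnum, this, gotSiteLinkInfoWrapper, ... ) ) {
//       // completed (or failed) synchronously; check g_errno
//   }
//   // otherwise gotSiteLinkInfoWrapper(state) fires when the reply arrives
//
// (gotSiteLinkInfoWrapper is a hypothetical caller callback, not defined here.)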

static HashTableX g_lineTable;
static GbMutex g_mtxLineTable;

static void sendReplyWrapper(void *state) {

	int32_t saved = g_errno;

	Msg25 *m25 = (Msg25 *)state;
	// the original request
	Msg25Request *mr = m25->m_req25;
	// get udp slot for sending back reply
	UdpSlot *slot2 = mr->m_udpSlot;

	if (m25->m_linkInfoBuf->length() > 0) {
		// validate linkinfo
		LinkInfo *linkInfo = (LinkInfo *)m25->m_linkInfoBuf->getBufStart();
		if (linkInfo->m_version != 0 ||
		    linkInfo->m_lisize < 0 || linkInfo->m_lisize != m25->m_linkInfoBuf->length() ||
		    linkInfo->m_numStoredInlinks < 0 || linkInfo->m_numGoodInlinks < 0) {
			gbshutdownCorrupted();
		}
	}

	// shortcut
	SafeBuf *info = m25->m_linkInfoBuf;

	// steal this buffer
	char *reply1 = info->getBufStart();
	int32_t replySize = info->length();

	// sanity. if the collrec was not found, replySize is 0!
	if (!saved && replySize <= 0) {
		saved = g_errno = EBADENGINEER;
		log(LOG_WARN, "linkdb: sending back empty link text reply. did coll get deleted?");
	}

	// get original request
	Msg25Request *req = (Msg25Request *)slot2->m_readBuf;
	// sanity
	if ( req->m_udpSlot != slot2 ) { g_process.shutdownAbort(true);}
	// if in table, nuke it
	ScopedLock sl(g_mtxLineTable);
	g_lineTable.removeKey ( &req->m_siteHash64 );
	sl.unlock();

 nextLink:

	UdpSlot *udpSlot = req->m_udpSlot;

	// update for next udpSlot
	req = req->m_next;

	// just dup the reply for each one
	char *reply2 = (char *)mdup(reply1,replySize,"m25repd");

	if (reply2) {
		// validate linkinfo
		LinkInfo *linkInfo = (LinkInfo *)reply2;
		if (linkInfo->m_version != 0 ||
		    linkInfo->m_lisize < 0 || linkInfo->m_lisize != m25->m_linkInfoBuf->length() ||
		    linkInfo->m_numStoredInlinks < 0 || linkInfo->m_numGoodInlinks < 0) {
			gbshutdownCorrupted();
		}
	}

	// error?
	if ( saved || ! reply2 ) {
		int32_t err = saved;
		if ( ! err )
			err = g_errno;
		if ( ! err ) { g_process.shutdownAbort(true); }

		log(LOG_ERROR,"%s:%s:%d: call sendErrorReply. error=%s", __FILE__, __func__, __LINE__, mstrerror(err));
		g_udpServer.sendErrorReply(udpSlot,err);
	} else {
		// send it back to requester
		g_udpServer.sendReply(reply2, replySize, reply2, replySize, udpSlot);
	}

	// if we had a link
	if ( req )
		goto nextLink;

	// the destructor
	mdelete ( m25 ,sizeof(Msg25),"msg25");
	delete ( m25 );
}

void handleRequest25(UdpSlot *slot, int32_t netnice) {
	Msg25Request *req = (Msg25Request *)slot->m_readBuf;

	req->deserialize();

	// make sure this is always NULL for our linked list logic
	req->m_next = NULL;

	// udp socket for sending back the final linkInfo in m_linkInfoBuf
	// used by sendReply()
	req->m_udpSlot = slot;

	char ipbuf[16];
	logTrace(g_conf.m_logTraceMsg25,"ip [%s] port [%" PRIu16 "] ptr_url [%s] ptr_site [%s] docId [%" PRId64 "]",
		 iptoa(slot->getIp(),ipbuf), slot->getPort(), req->ptr_url, req->ptr_site, req->m_docId);

	if ( g_conf.m_logDebugLinkInfo && req->m_mode == Msg25::MODE_SITELINKINFO ) {
		log(LOG_DEBUG, "linkdb: got msg25 request sitehash64=%" PRId64" "
		    "site=%s "
		    ,req->m_siteHash64
		    ,req->ptr_site
		    );
	}

	ScopedLock sl(g_mtxLineTable);

	// set up the hashtable if our first time
	if ( ! g_lineTable.isInitialized() )
		g_lineTable.set ( 8,sizeof(Msg25Request *),256, NULL,0,false,"lht25");

	// . if already working on this same request, wait for it, don't
	//   overload the server with duplicate requests
	// . hashkey is a combo of collection, url, and m_mode
	// . TODO: ensure we do not send duplicate "page" link info requests,
	//   not just "site" link info requests
	int32_t slotNum = -1;
	bool isSiteLinkInfo = false;
	if ( req->m_mode == Msg25::MODE_SITELINKINFO ) {
		slotNum = g_lineTable.getSlot ( &req->m_siteHash64 );
		isSiteLinkInfo = true;
	}

	if ( slotNum >= 0 ) {
		Msg25Request *head = *(Msg25Request **)g_lineTable.getValueFromSlot(slotNum);
		if ( head->m_next )
			req->m_next = head->m_next;
		head->m_next = req;
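		// (note: waiters are spliced in right after the head rather
		// than appended at the tail, so the waiting line is unordered.
		// that is fine: sendReplyWrapper() walks the whole m_next
		// chain and dups the same reply to every waiter.)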
		// note it for debugging
		log("build: msg25 request waiting in line for %s "
		    "udpslot=%p",
		    req->ptr_url,slot);
		// we will send a reply back for this guy when done
		// getting the reply for the head msg25request
		return;
	}

	// make a new Msg25
	Msg25 *m25;
	try { m25 = new Msg25; }
	catch(std::bad_alloc&) {
		g_errno = ENOMEM;
		log(LOG_WARN, "build: msg25: new(%" PRId32"): %s",
		    (int32_t)sizeof(Msg25),mstrerror(g_errno));

		log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
		g_udpServer.sendErrorReply ( slot , g_errno );
		return;
	}
	mnew ( m25 , sizeof(Msg25) , "Msg25" );

	if ( isSiteLinkInfo ) {
		// add the initial entry
		g_lineTable.addKey ( &req->m_siteHash64 , &req );
	}
	sl.unlock();

	// point to a real safebuf here for populating with data
	m25->m_linkInfoBuf = &m25->m_realBuf;

	// set some new stuff. should probably be set in getLinkInfo2()
	// but we are trying to leave that as unaltered as possible to
	// try to reduce debugging.
	m25->m_req25 = req;

	// this should call our callback when done
	if ( ! m25->getLinkInfo2 ( req->ptr_site ,
				   req->ptr_url ,
				   isSiteLinkInfo ,
				   req->m_ip ,
				   req->m_docId ,
				   req->m_collnum , // coll
				   NULL, // qbuf
				   0 , // qbufSize
				   m25 , // state
				   sendReplyWrapper , // CALLBACK!
				   req->m_isInjecting ,
				   req->m_printDebugMsgs ,
				   req->m_printInXml ,
				   req->m_siteNumInlinks ,
				   (LinkInfo *)req->ptr_oldLinkInfo ,
				   req->m_niceness ,
				   req->m_doLinkSpamCheck ,
				   req->m_oneVotePerIpDom ,
				   req->m_lastUpdateTime ,
				   req->m_onlyNeedGoodInlinks ,
				   req->m_ourHostHash32 ,
				   req->m_ourDomHash32 ,
				   m25->m_linkInfoBuf ) ) // SafeBuf 4 output
		return;

	if ( m25->m_linkInfoBuf->length() <= 0 && ! g_errno ) { g_process.shutdownAbort(true); }

	if ( g_errno == ETRYAGAIN ) { g_process.shutdownAbort(true); }

	// wait for msg5 to be done reading list. this happens somehow,
	// i'm not 100% sure how. code has too many indirections.
	if ( m25->m_gettingList ) {
		log("linkdb: avoiding core");
		return;
	}

	// sanity
	if ( !m25->m_msg5.m_msg3.areAllScansCompleted()) { g_process.shutdownAbort(true); }

	if ( g_errno )
		log("linkdb: error getting linkinfo: %s",mstrerror(g_errno));
	// else
	//	log("linkdb: got link info without blocking");

	// it did not block... g_errno will be set on error so sendReply()
	// should in that case send an error reply.
	sendReplyWrapper ( m25 );
}

int32_t Msg25Request::getStoredSize() {
	return getMsgStoredSize(sizeof(*this), &size_site, &size_oldLinkInfo);
}

// . fix the char ptrs for sending over the network
// . use a for loop like we do in Msg20.cpp if we get too many strings
void Msg25Request::serialize() {
	char *p = ((char*)this) + sizeof(*this);

	gbmemcpy ( p , ptr_url , size_url );
	p += size_url;

	gbmemcpy ( p , ptr_site , size_site );
	p += size_site;

	gbmemcpy ( p , ptr_oldLinkInfo , size_oldLinkInfo );
	p += size_oldLinkInfo;
}
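
// After serialize() the wire image is:
//   [Msg25Request fixed-size fields][url bytes][site bytes][oldLinkInfo bytes]
// deserialize() below re-derives the ptr_* members by walking the same three
// sizes in the same order, so the two functions must stay in sync.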

void Msg25Request::deserialize() {

	char *p = ((char*)this) + sizeof(*this);

	ptr_url = p;
	p += size_url;

	if ( size_url == 0 )
		ptr_url = NULL;

	ptr_site = p;
	p += size_site;

	if ( size_site == 0 )
		ptr_site = NULL;

	ptr_oldLinkInfo = p;
	p += size_oldLinkInfo;

	if ( size_oldLinkInfo == 0 )
		ptr_oldLinkInfo = NULL;
}

//////
//
// OLD interface below here. use the stuff above now so we can send
// the request to a single host and multiple incoming requests can
// wait in line, and we can set network bandwidth too.
//
/////

// . returns false if blocked, true otherwise
// . sets g_errno on error
// . we need the siteRec of the url for merging the linkInfo's
// . NOTE: make sure no input vars are on the stack in case we block
// . reallyGetLinkInfo is set to false if caller does not want it but calls
//   us anyway for some reason forgotten...
bool Msg25::getLinkInfo2(const char *site,
			 const char *url,
			 // either MODE_PAGELINKINFO or MODE_SITELINKINFO
			 bool isSiteLinkInfo,
			 int32_t ip,
			 int64_t docId,
			 collnum_t collnum,
			 char *qbuf,
			 int32_t qbufSize,
			 void *state,
			 void (* callback)(void *state),
			 bool isInjecting,
			 bool printDebugMsgs,
			 bool printInXml,
			 int32_t siteNumInlinks,
			 LinkInfo *oldLinkInfo,
			 int32_t niceness,
			 bool doLinkSpamCheck,
			 bool oneVotePerIpDom,
			 int32_t lastUpdateTime,
			 bool onlyNeedGoodInlinks,
			 int32_t ourHostHash32,
			 int32_t ourDomHash32,
			 // put LinkInfo output class in here
			 SafeBuf *linkInfoBuf )
{
	logTrace(g_conf.m_logTraceMsg25,"site [%s] url [%s] isSiteLinkInfo [%s] docId [%" PRId64 "]", site, url, isSiteLinkInfo?"true":"false", docId);

	// reset the ip table
	reset();

	if ( isSiteLinkInfo )
		m_mode = MODE_SITELINKINFO;
	else
		m_mode = MODE_PAGELINKINFO;
	m_printInXml = printInXml;

	if ( printDebugMsgs )
		m_pbuf = &m_tmp;
	else
		m_pbuf = NULL;

	m_onlyNeedGoodInlinks = onlyNeedGoodInlinks;
	// save safebuf ptr, where we store the link info
	m_linkInfoBuf = linkInfoBuf;
	if ( ! linkInfoBuf ) { g_process.shutdownAbort(true); }
	// sanity check
	if ( m_mode == MODE_PAGELINKINFO && ! docId ) { g_process.shutdownAbort(true); }

	// get collection rec for our collection
	CollectionRec *cr = g_collectiondb.getRec ( collnum );
	// bail if NULL
	if ( ! cr ) {
		g_errno = ENOCOLLREC;
		log("build: No collection record found when getting "
		    "link info.");
		return true;
	}

	m_gettingList = false;
	// record this in case we were called by Msg3b with the spiders off
	m_spideringEnabled = g_conf.m_spideringEnabled;
	m_ourHostHash32 = ourHostHash32;
	m_ourDomHash32 = ourDomHash32;
	m_niceness = niceness;
	m_maxNumLinkers = MAX_LINKERS;
	m_errno = 0;
	m_numReplyPtrs = 0;
	m_bufPtr = m_buf;
	m_bufEnd = m_buf + MAX_NOTE_BUF_LEN;
	m_dupCount = 0;
	m_spamLinks = 0;
	m_errors = 0;
	m_noText = 0;
	m_ipDupsLinkdb = 0;
	m_docIdDupsLinkdb = 0;
	m_ipDups = 0;
	m_linkSpamLinkdb = 0;
	m_docId = docId;
	m_collnum = collnum;
	m_callback = callback;
	m_state = state;
	m_oneVotePerIpDom = oneVotePerIpDom;
	m_doLinkSpamCheck = doLinkSpamCheck;
	m_siteNumInlinks = siteNumInlinks; // -1 --> unknown
	m_qbuf = qbuf;
	m_qbufSize = qbufSize;
	m_isInjecting = isInjecting;
	m_oldLinkInfo = oldLinkInfo;
	m_ip = ip;
	m_top = iptop(m_ip);
	m_lastUpdateTime = lastUpdateTime;

	m_nextKey.setMin();

	m_table.set (4,sizeof(NoteEntry *),0, NULL,0,false,"msg25tab");

	m_url = url;
	m_site = site;

	// and the "mid domain hash" so that ibm.com and ibm.ru cannot both
	// vote even if from totally different ips
	Url u;
	u.set(url);
	m_midDomHash = hash32 ( u.getMidDomain() , u.getMidDomainLen() );

	//log("debug: entering getlinkinfo this=%" PRIx32,(int32_t)this);

	// then the url/site hash
	m_linkHash64 = (uint64_t) u.getUrlHash64();

	m_round = 0;

	// must have a valid ip
	if ( ! ip || ip == -1 ) {
		log("linkdb: no inlinks because ip is invalid");
		g_errno = EBADENGINEER;
		return true;
	}

	return doReadLoop();
}

// . returns false if blocked, returns true otherwise
// . returns true and sets g_errno on error
bool Msg25::doReadLoop() {
	// sanity. no double entry.
	if (m_gettingList) {
		gbshutdownLogicError();
	}

	// . get the top X results from this termlist
	// . but skip link: terms with a 1 (no link text) for a score
	// . these keys are ordered from lowest to highest
	key224_t startKey;
	key224_t endKey;

	logTrace(g_conf.m_logTraceMsg25, "doReadLoop. m_site [%s] m_url [%s]", m_site, m_url);

	int32_t siteHash32 = hash32n(m_site);

	// access different parts of linkdb depending on the "mode"
	if (m_mode == MODE_SITELINKINFO) {
		startKey = Linkdb::makeStartKey_uk(siteHash32);
		endKey = Linkdb::makeEndKey_uk(siteHash32);
	} else {
		startKey = Linkdb::makeStartKey_uk(siteHash32, m_linkHash64);
		endKey = Linkdb::makeEndKey_uk(siteHash32, m_linkHash64);
	}

	// resume from where we left off?
	if (m_round > 0) {
		gbmemcpy (&startKey, &m_nextKey, LDBKS);
	}

	// but the new links: algo does not need internal links with no link
	// text. see Links.cpp::hash() for the score table.

	m_minRecSizes = READSIZE;

	// It's expensive to include the tree. Can we avoid it?
	// When injecting a mass number of documents, eg a dump from wikipedia,
	// the document titles are usually ok already (inlink texts already
	// match the title). When crawling normally the tree usually holds
	// the parent linker document links, so in that case we want the tree.
	bool includeTree = !m_req25->m_isInjecting;

	logDebug(g_conf.m_logDebugLinkInfo, "msg25: reading linkdb list mode=%s site=%s url=%s docid=%" PRId64" linkdbstartkey=%s",
		 m_mode == MODE_SITELINKINFO ? "site" : "page",m_site,m_url,m_docId,KEYSTR(&startKey,LDBKS));

	if (g_process.isShuttingDown()) {
		log(LOG_DEBUG, "linkdb: shutting down. exiting link text loop.");
		g_errno = ESHUTTINGDOWN;
		return false;
	}

	m_gettingList = true;

	CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
	if (!cr) {
		log(LOG_WARN, "linkdb: no coll for collnum %" PRId32,(int32_t)m_collnum);
		g_errno = ENOCOLLREC;
		return true;
	}

	// . get the linkdb list
	// . we now get the WHOLE list so we can see how many linkers there are
	// . we need a high timeout because the udp server was getting
	//   suspended before for 30 seconds and this was timing out and
	//   yahoo.com was getting spidered w/o any link text -- that's bad.
	//   Now we hang indefinitely. We also fixed UdpServer to resend
	//   requests after 30 seconds even though it was fully acked in case
	//   the receiving host went down and is now back up.
	if (!m_msg5.getList(RDB_LINKDB,
			    cr->m_collnum,
			    &m_list,
			    (char *)&startKey,
			    (char *)&endKey,
			    m_minRecSizes,
			    includeTree,
			    0, // startFileNum
			    -1, // numFiles
			    this,
			    gotListWrapper,
			    m_niceness,
			    true, // error correct?
			    -1, //maxRetries
			    false)) { //isRealMerge
		return false;
	}

	// all done
	m_gettingList = false;

	// debug log
	logDebug(g_conf.m_logDebugBuild, "build: msg25 call to msg5 did not block");

	// sanity
	if ( !m_msg5.m_msg3.areAllScansCompleted()) { g_process.shutdownAbort(true); }

	// return true on error
	if (g_errno) {
		log(LOG_WARN, "build: Had error getting linkers to url %s : %s.", m_url, mstrerror(g_errno));
		return true;
	}

	// . this returns false if blocked, true otherwise
	// . sets g_errno on error
	return gotList();
}

static void gotListWrapper(void *state, RdbList *list, Msg5 *msg5) {
	Msg25 *THIS = (Msg25 *) state;

	//log("debug: entering gotlistwrapper this=%" PRIx32,(int32_t)THIS);

	// return if it blocked
	// . this calls sendRequests()
	// . which can call gotLinkText(NULL) if none sent
	// . which can call doReadLoop() if list was not empty (lost linker)
	// . which can block on msg0
	if ( ! THIS->gotList() )
		return;

	// error? wait for all replies to come in...
	if ( THIS->m_numRequests > THIS->m_numReplies ) {
		log(LOG_WARN, "msg25: had error %s numreplies=%" PRId32" numrequests=%" PRId32" "
		    "round=%" PRId32,
		    mstrerror(g_errno),THIS->m_numReplies,THIS->m_numRequests,
		    THIS->m_round);
		return;
	}

	// the call to gotList() may have launched another msg0 even
	// though it did not return false???
	// THIS FIXED a double entry core!!!! msg0 would return after
	// we had destroyed the xmldoc class behind this!!
	if ( THIS->m_gettingList ) {
		//log("debug: waiting for msg0 1");
		return;
	}

	//log("debug: calling final callback 1");

	// otherwise call callback, g_errno probably is set
	THIS->m_callback ( THIS->m_state );//, THIS->m_linkInfo );
}

// . this returns false if blocked, true otherwise
// . sets g_errno on error
bool Msg25::gotList() {
	// all done
	m_gettingList = false;

	// sanity
	if (!m_msg5.m_msg3.areAllScansCompleted()) {
		gbshutdownLogicError();
	}

	// return true on error
	if (g_errno) {
		log(LOG_WARN, "build: Had error getting linkers to url %s : %s.", m_url, mstrerror(g_errno));
		return true;
	}

	// . record the # of hits we got for weighting the score of the
	//   link text iff it's truncated by MAX_LINKERS
	// . TODO: if url is really popular, like yahoo, we should use the
	//   termFreq of the link: term!
	// . TODO: we now only read in the first 50k linkers so Msg0::getList()
	//   doesn't waste space through its stupid buffer pre-allocation.
	//   it should not preallocate for us since our niceness is over 1
	//   cuz we don't require a real-time signal handler to read our reply.
	m_list.resetListPtr();
	// clear this too
	m_k = (Inlink *)-1;

	// we haven't got any responses as of yet or sent any requests
	m_numReplies = 0;
	m_numRequests = 0;

	if (m_round == 0) {
		m_linkSpamOut = 0;
		m_numFromSameIp = 0;
		memset ( m_inUse , 0 , MAX_MSG20_OUTSTANDING );

		// use this to dedup ips in linkdb to avoid looking up their title recs... saves a lot of lookups
		if (!m_ipTable.set(4,0,256,NULL,0,false,"msg25ips")) {
			return true;
		}

		int64_t needSlots = m_list.getListSize() / LDBKS;

		// wtf?
		if (m_list.getListSize() > READSIZE + 10000) {
			log(LOG_WARN, "linkdb: read very big linkdb list %" PRId32" bytes bigger than needed",
			    m_list.getListSize() - READSIZE);
		}

		// triple for hash table speed
		needSlots *= 3;
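		// (hash tables degrade as they fill up; sizing to ~3x the
		// expected key count keeps the load factor around 33%.)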

		// ensure 256 min
		if (needSlots < 256) {
			needSlots = 256;
		}

		if (!m_fullIpTable.set(4, 0, needSlots, NULL, 0, false, "msg25ip32")) {
			return true;
		}

		if (!m_firstIpTable.set(4, 0, needSlots, NULL, 0, false, "msg25fip32")) {
			return true;
		}

		if (!m_docIdTable.set(8, 0, needSlots, NULL, 0, false, "msg25docid")) {
			return true;
		}

		// . how many link spam inlinks can we accept?
		// . they do not contribute to quality
		// . they only contribute to link text
		// . their number and weights depend on our ROOT QUALITY!
		// . we need this because our filters are too stringent!
		m_spamCount = 0;
		m_maxSpam = 0;
		m_numDocIds = 0;
		m_cblocks = 0;
		m_uniqueIps = 0;
	}

	// if we are doing site linkinfo, bail now
	if (m_mode == MODE_SITELINKINFO) {
		logTrace(g_conf.m_logTraceMsg25, "Read from disk. sending requests for linkers to url [%s]. m_siteNumInlinks=%" PRId32". mode=SITELINKINFO",
			 m_url, m_siteNumInlinks);
		return sendRequests();
	}

	// when MODE_PAGELINKINFO we must have a site quality for that site
	if (m_siteNumInlinks < 0) {
		gbshutdownLogicError();
	}

	if      ( m_siteNumInlinks >= 1000 ) {m_maxSpam = 4000;}
	else if ( m_siteNumInlinks >=  900 ) {m_maxSpam = 3000;}
	else if ( m_siteNumInlinks >=  800 ) {m_maxSpam = 2000;}
	else if ( m_siteNumInlinks >=  700 ) {m_maxSpam = 1000;}
	else if ( m_siteNumInlinks >=  600 ) {m_maxSpam =  100;}
	else if ( m_siteNumInlinks >=  500 ) {m_maxSpam =   20;}
	else if ( m_siteNumInlinks >=  200 ) {m_maxSpam =   15;}
	else if ( m_siteNumInlinks >=   70 ) {m_maxSpam =   10;}
	else if ( m_siteNumInlinks >=   20 ) {m_maxSpam =    7;}
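	// (sites with fewer than 20 inlinks keep the m_maxSpam of 0 set
	// above in the round-0 init, i.e. no spammy inlinks are tolerated
	// for them at all.)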

	logTrace(g_conf.m_logTraceMsg25, "Read from disk. sending requests for linkers to url [%s]. m_siteNumInlinks=%" PRId32". mode=PAGELINKINFO",
		 m_url, m_siteNumInlinks);

	// now send the requests
	m_list.resetListPtr();
	return sendRequests();
}

// . returns false if blocked, true otherwise
// . sets g_errno on error
bool Msg25::sendRequests() {
	uint64_t lastDocId = 0LL;

	// smaller clusters cannot afford to launch the full 300 msg20s
	// because it can clog up one host!
	float ratio = (float)g_hostdb.getNumHosts() / 128.0;
	int32_t ourMax = (int32_t)(ratio * (float)MAX_MSG20_OUTSTANDING);
	if (ourMax > MAX_MSG20_OUTSTANDING) {
		ourMax = MAX_MSG20_OUTSTANDING;
	}
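	// (e.g. a 64-host cluster gets ratio 0.5 and thus launches at most
	// half of MAX_MSG20_OUTSTANDING; a 256-host cluster computes 2x but
	// is clamped back down to MAX_MSG20_OUTSTANDING.)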

	CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
	if (!cr) {
		log(LOG_WARN, "linkdb: collnum %" PRId32" is gone 1", (int32_t)m_collnum);
		// that func doesn't set g_errno so we must
		g_errno = ENOCOLLREC;
		return true;
	}

	// if more than 300 sockets are in use, cap this at 1 to prevent
	// udp socket clog.
	if (g_udpServer.getNumUsedSlots() >= 300) {
		ourMax = 1;
	}

	// keep sending requests
	for (;;) {
		// if we still haven't gotten enough good inlinks, quit after
		// looking up this many titlerecs
		if (m_numRequests >= MAX_DOCIDS_TO_SAMPLE) {
			break;
		}

		// . we only need at most MAX_LINKERS in our sample
		// . but we do keep "losers" until the very end so we can
		//   remove them in an order-independent fashion to guarantee
		//   consistency. otherwise, "losers" will depend on the order
		//   in which the Msg20Replies are received to some degree
		if (m_numReplyPtrs >= MAX_LINKERS) {
			break;
		}

		// do not have more than this many outstanding Msg20s
		if (m_numRequests - m_numReplies >= ourMax) {
			break;
		}

		// we may have pre-allocated the LinkText classes for
		// use by currently outstanding Msg20 requests, therefore
		// they are not available at this time... m_numReplyPtrs is
		// how many replies we have kept.
		if (m_numReplyPtrs + m_numRequests - m_numReplies >= MAX_LINKERS) {
			break;
		}

		// reset g_errno just in case
		g_errno = 0;

		char isLinkSpam = 0;
		int32_t itop;
		uint32_t ip32;
		uint64_t docId;
		int32_t discovered = 0;

		// . recycle inlinks from the old link info guy
		// . this keeps our inlinks persistent!!! very nice...
		// . useful for when they disappear on an aggregator site
		if (m_list.isExhausted() && m_round == 0) {
			// recycle old inlinks at this point
			if (m_k == (Inlink *)-1) {
				m_k = nullptr;
			}

			// get it
			m_k = m_oldLinkInfo ? m_oldLinkInfo->getNextInlink(m_k) : nullptr;

			// if none left, we really are done
			if (!m_k) {
				break;
			}
			// set these
			itop = m_k->m_ip & 0x00ffffff;
			ip32 = m_k->m_ip;
			isLinkSpam = m_k->m_isLinkSpam;
			docId = m_k->m_docId;
			discovered = m_k->m_firstIndexedDate;
		} else if (m_list.isExhausted() && m_round != 0) {
			break;
		} else if (m_mode == MODE_PAGELINKINFO) {
			// get the current key if list has more left
			key224_t key;
			m_list.getCurrentKey(&key);

			itop = Linkdb::getLinkerIp24_uk(&key);
			ip32 = Linkdb::getLinkerIp_uk(&key);
			isLinkSpam = Linkdb::isLinkSpam_uk(&key);
			docId = Linkdb::getLinkerDocId_uk(&key);
			discovered = Linkdb::getDiscoveryDate_uk(&key);

			// update this
			gbmemcpy (&m_nextKey, &key, LDBKS);

			m_nextKey++;
		} else {
			// otherwise this is a "site" key. we are getting all
			// the inlinks to any page on the site...
			// get the current key if list has more left
			key224_t key;
			m_list.getCurrentKey(&key);

			itop = Linkdb::getLinkerIp24_uk(&key);
			ip32 = Linkdb::getLinkerIp_uk(&key);

			isLinkSpam = 0;
			docId = Linkdb::getLinkerDocId_uk(&key);

			discovered = Linkdb::getDiscoveryDate_uk(&key);

			// update this
			gbmemcpy (&m_nextKey, &key, LDBKS);

			m_nextKey++;
		}
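		// (m_nextKey now points one past the record we just consumed;
		// if this round is truncated, doReadLoop() restarts the
		// linkdb read from m_nextKey in the next round.)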

		// advance to next rec if the list has more to go
		if (!m_list.isExhausted()) {
			m_list.skipCurrentRecord();
		}

		// clear this if we should
		if (!m_doLinkSpamCheck) {
			isLinkSpam = 0;
		}

		// try using this to save mem then
		if (docId == lastDocId) {
			m_docIdDupsLinkdb++;
			continue;
		}

		// update this then
		lastDocId = docId;

		// count unique docids
		m_numDocIds++;

		//
		// get the next docId
		//
		// . now the 4 hi bits of the score represent special things
		// . see Msg18.cpp:125 where we repeat this

		// count unique ips for steve's stats
		if (!m_fullIpTable.isInTable(&ip32)) {
			// return true on error
			if (!m_fullIpTable.addKey(&ip32)) {
				return true;
			}

			// count it
			m_uniqueIps++;
		}

		// TODO: if inlinker is internal by having the same DOMAIN
		// even though a different ip, we should adjust this logic!!
		if (itop != m_top) {
			int32_t slot = m_ipTable.getSlot(&itop);
			if (slot != -1) {
				m_ipDupsLinkdb++;
				continue;
			}

			// store it
			if (!m_ipTable.addKey(&itop)) {
				return true;
			}

			// count unique cblock inlinks
			m_cblocks++;
		} else {
			// if we are local... allow up to MAX_INTERNAL_INLINKS
			// votes, weight is diminished
			// count your own, only once!
			if (m_numFromSameIp == 0) {
				m_cblocks++;
			}

			// count it as internal
			m_numFromSameIp++;

			// only get link text from the first few internal
			// linkers; they will all count as one external linker
			if (m_numFromSameIp > MAX_INTERNAL_INLINKS) {
				m_ipDupsLinkdb++;
				continue;
			}
		}

		// count this request as launched
		m_numRequests++;

		// if linkspam, count this
		if (isLinkSpam) {
			m_linkSpamOut++;
		}

		// find a msg20 we can use
		int32_t j ;
		for (j = 0; j < MAX_MSG20_OUTSTANDING; j++) {
			if (!m_inUse[j])
				break;
		}

		// sanity check
		if (j >= MAX_MSG20_OUTSTANDING) {
			gbshutdownLogicError();
		}

		// "claim" it
		m_inUse[j] = 1;

		// . this will return false if it blocks
		// . we EXPECT these recs to be there...
		// . now pass in the score for the newAlgo
		Msg20Request *r = &m_msg20Requests[j];
		r->reset();

		// set the request
		r->m_getLinkText = true;
		r->m_onlyNeedGoodInlinks = m_onlyNeedGoodInlinks;

		// is linkee a site? then we will try to find link text
		// to any page on that site...
		// if we are in site mode, then m_url should be m_site!
		if (m_mode == MODE_PAGELINKINFO) {
			r->m_isSiteLinkInfo = false;
			r->ptr_linkee = m_url;
			r->size_linkee = strlen(m_url) + 1; // include \0
			logTrace(g_conf.m_logTraceMsg25, "send request with linkee=m_url [%s], docId=%" PRId64, m_url, docId);
		} else {
			r->m_isSiteLinkInfo = true;
			r->ptr_linkee = m_site;
			r->size_linkee = strlen(m_site) + 1; // include \0
			logTrace(g_conf.m_logTraceMsg25, "send request with linkee=m_site [%s], docId=%" PRId64, m_site, docId);
		}

		r->m_collnum = cr->m_collnum;
		r->m_docId = docId;
		r->m_niceness = m_niceness;
		r->m_state = r;
		r->m_state2 = this;
		r->m_j = j;
		r->m_callback = gotLinkTextWrapper;
		// do NOT get summary stuff!! slows us down...
		r->m_numSummaryLines = 0;
		// get title now for steve
		r->m_titleMaxLen = 300;
		r->m_summaryMaxLen = 0;
		r->m_discoveryDate = discovered;
		// buzz sets the query to see if inlinker has the query terms
		// so we can set <absScore2>
		r->m_langId = langUnknown; // no synonyms i guess
		r->m_prefferedResultLangId = langUnknown;
		r->ptr_qbuf = m_qbuf;
		r->size_qbuf = m_qbufSize;

		// place holder used below
		r->m_isLinkSpam = isLinkSpam;
		// buzz may not want link spam checks! they are pretty clean.
		r->m_doLinkSpamCheck = m_doLinkSpamCheck;
		// this just gets the LinkInfo class from the TITLEREC
		// so that Msg25 should not be called. thus avoiding an
		// infinite loop!
		// we need the LinkInfo of each inlinker to get the inlinker's
		// numInlinksToSite, which is needed to call
		// makeLinkInfo() below.
		r->m_getLinkInfo = true;

		r->m_ourHostHash32 = m_ourHostHash32;
		r->m_ourDomHash32 = m_ourDomHash32;

		// . MAKE A FAKE MSG20REPLY for pre-existing Inlinks
		// . the opposite of Inlink::set(Msg20Reply *)
		// . we used that as a reference
		// . ISSUES:
		// . 1. if inlinker gets banned, we still recycle it
		// . 2. if ad id gets banned, we still recycle it
		// . 3. we cannot dedup by the vectors, because we do not
		//      store those in the Inlink class (Msg25::isDup())
		if (m_k && m_k != (Inlink *)-1) {
			Msg20Reply *rep = &m_msg20Replies[j];
			rep->reset();
			m_k->setMsg20Reply(rep);

			// let receiver know we are a recycle
			rep->m_recycled = 1;

			// . this returns true if we are done
			// . g_errno is set on error, and true is returned
			if (gotLinkText(r)) {
				return true;
			}

			// keep going
			continue;
		}

		logDebug(g_conf.m_logDebugLinkInfo, "msg25: getting single link mode=%s site=%s url=%s docid=%" PRId64" request=%" PRId32,
			 m_mode == MODE_SITELINKINFO ? "site" : "page", m_site, m_url, docId, m_numRequests - 1);

		// returns false if it blocks, true otherwise
		bool status = m_msg20s[j].getSummary(r);
		// if blocked launch another
		if (!status) {
			continue;
		}

		// . this returns true if we are done
		// . g_errno is set on error, and true is returned
		if (gotLinkText(r)) {
			return true;
		}
	}

	// we may still be waiting on some replies to come in
	if ( m_numRequests > m_numReplies )
		return false;

	// if the list had linkdb recs in it, but we launched no msg20s
	// because they were "lost" then we end up here.

	// . otherwise, we got everyone, so go right to the merge routine
	// . returns false if not all replies have been received
	// . returns true if done
	// . sets g_errno on error
	// . if all replies are in then this can call doReadLoop() and
	//   return false!
	return gotLinkText ( NULL );
}

static bool gotLinkTextWrapper(void *state) {
	Msg20Request *req = (Msg20Request *)state;
	// get our Msg25
	Msg25 *THIS = (Msg25 *)req->m_state2;

	//log("debug: entering gotlinktextwrapper this=%" PRIx32,(int32_t)THIS);

	// . this returns false if we're still awaiting replies
	// . returns true if all replies have been received and processed
	if ( THIS->gotLinkText ( req ) ) {

		//log("debug: calling final callback 2");
		if ( THIS->m_gettingList )
			return false;

		// . now call callback, we're done
		// . g_errno will be set on critical error
		THIS->m_callback ( THIS->m_state );//, THIS->m_linkInfo );
		return true;
	}
	// if gotLinkText() called doReadLoop() it blocked calling msg0
	if ( THIS->m_gettingList )
		return false;
	// . try to send more requests
	// . return if it blocked
	// . shit, this could call doReadLoop() now that we have added
	//   the lostdate filter, because there will end up being no requests
	//   sent out even though the list was of positive size, so it
	//   will try to read the next round of msg0.
	if ( ! THIS->sendRequests ( ) )
		return false;
	// . shit, therefore, return false if we did launch a msg0 after this
	if ( THIS->m_gettingList )
		return false;

	//log("debug: calling final callback 3");

	// otherwise we're done
	THIS->m_callback ( THIS->m_state );//, THIS->m_linkInfo );
	return true;
}

static const char *getExplanation(const char *note) {

	if ( ! note ) return NULL;
	if ( strcmp(note,"good")==0) return NULL;

	static const char * const s_notes[] = {

		"same mid domain",
		"inlinker's domain, excluding TLD, is same as page it "
		"links to",

		"linker banned or filtered",
		"inlinker's domain has been manually banned",

		"no link text",
		"inlink contains no text, probably an image",

		"banned by ad id",
		"inlinking page contains a google ad id that has been "
		"manually banned",

		"ip dup",
		"inlinker is from the same C Block as another inlinker",

		"first ip dup",
		"first recorded C Block of inlinker matches another inlinker",

		"post page",
		"inlink is from a page that contains a form tag whose "
		"submit url contains character sequences that are indicative "
		"of posting a comment, thereby indicating that the inlink "
		"could be in a comment section",

		"path is cgi",
		"inlinker url contains a question mark",

		"similar link desc",
		"the text surrounding the anchor text of this inlink is "
		"too similar to that of another processed inlink",

		"similar content",
		"the inlinker's page content is "
		"too similar to that of another processed inlink",

		"doc too big",
		"inlinker's page is too large, and was truncated, and "
		"might have lost some indicators",

		"link chain middle",
		"inlink is in the middle of a list of inlinks, without "
		"any non-link text separating it, indicative of text ads",

		"link chain right",
		"inlink is at the end of a list of inlinks, without "
		"any non-link text separating it, indicative of text ads",

		"link chain left",
		"inlink is at the beginning of a list of inlinks, without "
		"any non-link text separating it, indicative of text ads",

		"near sporny outlink",
		"inlink is near another outlink on that page which contains "
		"porn words in its domain or url",

		"70.8*.",
		"inlinker is from an IP address that is of the form "
		"70.8*.*.* which is a notorious block of spam",

		".info tld",
		"inlinker's tld is .info, indicative of spam",

		".biz tld",
		"inlinker's tld is .biz, indicative of spam",

		"textarea tag",
		"inlinker page contains a textarea html tag, indicative "
		"of being in a comment section",

		"stats page",
		"inlinker is from a web access stats page",

		"has dmoz path",
		"inlinker url looks like a dmoz mirror url",

		"guestbook in hostname",
		"inlinker is from a guestbook site",

		"ad table",
		"inlink appears to be in a table of ad links",

		"duplicateIPCClass",
		"duplicate ip c block"

	};
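
	// s_notes is a flat array of (note, explanation) pairs, hence the
	// i += 2 walk below.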
	int32_t n = sizeof(s_notes)/ sizeof(char *);
	for ( int32_t i = 0 ; i < n ; i += 2 ) {
		if ( strcmp(note,s_notes[i]) != 0 ) continue;
		return s_notes[i+1];
	}

	if ( strncmp(note,"path has",8) == 0 )
		return "inlinker's url contains keywords indicative "
		       "of a comment page, guestbook page or "
		       "link exchange";

	return "inlinker's page contains the described text, indicative of "
	       "being a link exchange or being in a comment section or "
	       "being an otherwise spammy page";
}

// . returns false if not all replies have been received (or timed/errored out)
// . returns true if done
// . sets g_errno on error
bool Msg25::gotLinkText(Msg20Request *msg20req) {
	int32_t j = -1;
	Msg20 *msg20 = NULL;
	Msg20Reply *msg20reply = NULL;
	// the alloc size of the reply
	int32_t replySize = 0;

	// set the reply
	if(msg20req) {
		j = msg20req->m_j;
		if(m_msg20Requests[j].m_j != j)
			g_process.shutdownAbort(true);
		// get the msg20
		msg20 = &m_msg20s[j];
		// set the reply
		msg20reply = msg20->m_r;
		// the reply size
		replySize = msg20->m_replyMaxSize;
		// inc # of replies
		m_numReplies++;
		// discount this if it was linkspam
		if ( msg20req->m_isLinkSpam )
			m_linkSpamOut--;
		// "make available" msg20 and msg20Request #j for re-use
		m_inUse [ j ] = 0;
		// . propagate internal error to g_errno
		// . if g_errno was set then the reply will be empty
		if ( msg20reply && msg20reply->m_errno && ! g_errno )
			g_errno = msg20reply->m_errno;
		// if it had an error print it for now
		if ( msg20reply && msg20reply->m_errno )
			log(LOG_WARN, "query: msg25: msg20 had error for docid %" PRId64" : "
			    "%s",msg20reply->m_docId, mstrerror(msg20reply->m_errno));
	}

	// what is the reason it cannot vote...?
	const char *note = NULL;
	int32_t noteLen = 0;

	// assume it CAN VOTE for now
	bool good = true;

	// just log then reset g_errno if it's set
	if ( g_errno ) {
		// a dummy docid
		int64_t docId = -1LL;
		// set it right
		if ( msg20reply ) docId = msg20reply->m_docId;
		// we often restrict link: termlist lookup to the indexdb root
		// file, so we end up including terms from deleted docs...
		// thus we get a lot of ENOTFOUND errors.
		// MDW: we no longer do this restriction...
		log(LOG_DEBUG,
		    "build: Got error getting link text from one document: "
		    "%s. Will have to restart later. docid=%" PRId64".",
		    mstrerror(g_errno),docId);
		// this is a special case
		if ( g_errno == ECANCELLED ||
		     g_errno == ENOCOLLREC ||
		     g_errno == ENOMEM ||
		     g_errno == ENOSLOTS ) {
			m_errors++;
			if ( m_numReplies < m_numRequests ) return false;
			if ( m_gettingList ) {
				log("linkdb: gotLinkText: gettinglist1");
				return false;
			}
			return true;
		}
		// otherwise, keep going, but this reply can not vote
		good = false;
		m_errors++;
		note = mstrerror ( g_errno );
		// reset g_errno
		g_errno = 0;
	}

	// are we an "internal" inlink?
	bool internal = false;

	if (msg20reply) {
		if (iptop(msg20reply->m_firstIp) == m_top || iptop(msg20reply->m_ip) == m_top) {
			internal = true;
		}

		// . if the mid domain hash of the inlinker matches ours, no voting
		// . this is set to 0 for recycles
		if (good && msg20reply->m_midDomHash == m_midDomHash && !internal) {
			good = false;
			m_sameMidDomain++;
			note = "same mid domain";
		}

		// is the inlinker banned?
		if (good && msg20reply->m_isBanned) {
			// it is no longer good
			good = false;
			// inc the general count, too
			m_spamLinks++;
			// count each *type* of "link spam". the type is given
			// by linkText->m_note and is a string...
			note = "linker banned or filtered";
		}

		// get the linker url
		Url linker;
		linker.set(msg20reply->ptr_ubuf, msg20reply->size_ubuf);

		// sanity check, Xml::set() requires this...
		if (msg20reply->size_rssItem > 0 && msg20reply->ptr_rssItem[msg20reply->size_rssItem - 1] != 0) {
			log(LOG_WARN, "admin: received corrupt rss item of size %" PRId32" not null terminated from linker %s",
			    msg20reply->size_rssItem,msg20reply->ptr_ubuf);
			// ignore it for now
			msg20reply->size_rssItem = 0;
			msg20reply->ptr_rssItem = NULL;
		}

		// . if no link text, count as error
		// . linkText->getLinkTextLen()
		if (good &&
		    msg20reply->size_linkText <= 0 &&
		    msg20reply->size_rssItem <= 0) {
			good = false;
			m_noText++;
			note = "no link text";
		}

		// discount if LinkText::isLinkSpam() or isLinkSpam2() said it
		// should not vote
		if (good && ! internal && msg20reply->m_isLinkSpam &&
		    // we can only allow link spam if it is below the max!
		    ++m_spamCount >= m_maxSpam ) {
			// it is no longer good
			good = false;
			// inc the general count, too
			m_spamLinks++;
			// count each *type* of "link spam". the type is given
			// by linkText->m_note and is a string...
			note = msg20reply-> ptr_note;
			noteLen = msg20reply->size_note - 1; // includes \0
		}
	}

	// loop over all the replies we got so far to see if "msg20reply" is a dup
	// or if another reply is a dup of "msg20reply"
	int32_t n = m_numReplyPtrs;

	// do not do the deduping if no reply given
	// or if "msg20reply" is already considered bad
	if (!msg20reply || !good) {
		n = 0;
	}

	// this is the "dup"
	Msg20Reply *dup = nullptr;
	int32_t dupi = -1;

	// . we do not actually remove the Msg20Replies at this point because
	//   this filter is dependent on the order in which we receive the
	//   Msg20Replies. we do the removal below after all replies are in.
	// . NO! not anymore, i don't want to hit MAX_LINKERS and end up
	//   removing all the dups below and end up with hardly any inlinkers
	for (int32_t i = 0; !internal && i < n; i++) {
		// get the reply in a ptr
		Msg20Reply *p = m_replyPtrs[i];

		// allow internal inlinks to match
		if (iptop(p->m_ip) == m_top || iptop(p->m_firstIp) == m_top) {
			continue;
		}

		// is "p" a dup of us? (or we of it?)
		const char *dupNote = isDup(msg20reply, p);

		// if it is not a dup, keep going
		if (!dupNote) {
			continue;
		}

		// getLoser() returns the lowest-scoring reply of "msg20reply" and "p"
		Msg20Reply *tmp = getLoser(msg20reply, p);
		// is it worse than the current "dup"? if so, update "dup"
		dup = getLoser(tmp, dup);

		// get the "i" value
		if (dup == msg20reply) {
			dupi = j;
		}

		if ( dup == p ) {
			dupi = i;
		}

		// we got a dup
		good = false;
		note = dupNote;
	}

	if (dup) {
		m_dupCount++;

		// if "p" is the lower-scoring dup, put "msg20reply" in its
		// place, and then set "msg20reply" to "p", doing a swap
		if (dup != msg20reply) {
			// sanity check
			if (dupi < 0) {
				gbshutdownLogicError();
			}

			// HACK: swap them
			Msg20Reply *tmp = m_replyPtrs[dupi];
			int32_t tmpSize = m_replySizes[dupi];
			m_replyPtrs[dupi] = msg20reply;
			m_replySizes[dupi] = replySize;
			msg20reply = tmp;
			replySize = tmpSize;
			// make Msg20 point to that old "dup" reply
			msg20->m_r = msg20reply;
			msg20->m_replyMaxSize = replySize;
		}
	}
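	// (after the swap the winning reply lives in m_replyPtrs[] and the
	// losing "dup" reply is owned by msg20 again, so the msg20->reset()
	// below frees the loser, not the keeper -- unless the loser is
	// re-stored below for debug display.)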
|
|
|
|
if (msg20reply && good) {
|
|
int32_t iptop1 = iptop(msg20reply->m_ip);
|
|
int32_t iptop2 = iptop(msg20reply->m_firstIp);
|
|
if (m_firstIpTable.isInTable(&iptop1) ||
|
|
m_firstIpTable.isInTable(&iptop2)) {
|
|
good = false;
|
|
m_ipDups++;
|
|
note = "first ip dup";
|
|
}
|
|
// add to table. return true with g_errno set on error
|
|
if (!m_firstIpTable.addKey(&iptop1) || !m_firstIpTable.addKey(&iptop2)) {
|
|
return true;
|
|
}
|
|
}
|
|
|
|
|
|
// BUT do not set good to false so it is stored so we can look
|
|
// at the linktext for indexing purposes anyway, but we do not
|
|
// want to count it towards the # of good siteinlinks because
|
|
// its internal
|
|
if (internal && !note) {
|
|
m_ipDups++;
|
|
note = "ip dup";
|
|
}
|
|
|
|
if (msg20reply && !good && !note) {
|
|
note = "unknown reason";
|
|
}
|
|
|
|
// compile the reason it could not vote
|
|
if (msg20reply && !good) {
|
|
// set "noteLen" if not yet set
|
|
if ( note && noteLen == 0 ) {
|
|
noteLen = strlen ( note );
|
|
}
|
|
|
|
// add it to our table
|
|
addNote ( note , noteLen , msg20reply->m_docId );
|
|
// . free the reply since it cannot vote
|
|
// . no, it should be auto-freed when the msg20 is re-used
|
|
}
|
|
|
|
bool store = true;
|
|
if (!good) {
|
|
store = false;
|
|
}
|
|
|
|
// . if doing for display, show good
|
|
// . steve needs to see the bad guys as well ppl can learn
|
|
if ( m_pbuf ) {
|
|
store = true;
|
|
}
|
|
|
|
// now for showing recommended link sources let's include the
|
|
// bad boys because we might have mislabelled them as bad, b/c maybe
|
|
// google thinks they are good! fix for
|
|
// XmlDoc::getRecommendedLinksBuf()
|
|
if (!m_onlyNeedGoodInlinks) {
|
|
store = true;
|
|
}
|
|
|
|
// how is this NULL?
|
|
if (!msg20reply) {
|
|
store = false;
|
|
}
|
|
|
|
if (store) {
|
|
// save the reply
|
|
m_replyPtrs[m_numReplyPtrs] = msg20reply;
|
|
m_replySizes[m_numReplyPtrs] = replySize;
|
|
// why we do this?
|
|
if (note && !msg20reply->ptr_note) {
|
|
msg20reply->ptr_note = (char *)note;
|
|
msg20reply->size_note = noteLen + 1;
|
|
}
|
|
|
|
// store this in the reply for convenience
|
|
msg20reply->m_discoveryDate = msg20req->m_discoveryDate;
|
|
m_numReplyPtrs++;
|
|
|
|
// do not allow Msg20 to free it
|
|
msg20->m_r = NULL;
|
|
}
|
|
|
|
// free the reply buf of this msg20 now to save mem because
|
|
// we can't send out like 100,000 of these for yahoo.com to find
|
|
// less than 1000 good ones!
|
|
// tell msg20 to free the reply if not null
|
|
if (msg20) {
|
|
msg20->reset();
|
|
}
|
|
|
|
// wait for all replies to come in
|
|
if (m_numReplies < m_numRequests) {
|
|
return false;
|
|
}
|
|
|
|
if (m_gettingList) {
|
|
log("linkdb: gotLinkText: gettinglist2");
|
|
return false;
|
|
}
|
|
|
|
//
|
|
//
|
|
// READ MORE FROM LINKDB to avoid truncation
|
|
//
|
|
//
|
|
|
|
// no! now we shrink a list by removing dup docids from it
|
|
// in Msg0.cpp before sending back to save memory and cpu and
|
|
// network. so it can be well below m_minRecSizes and still need
|
|
// to go on to the next round
|
|
if (m_list.getListSize() > 0 && m_numReplyPtrs < MAX_LINKERS) {
|
|
// count it
|
|
m_round++;
|
|
|
|
logDebug(g_conf.m_logDebugLinkInfo, "linkdb: recalling round=%" PRId32" for %s=%s req=%p numlinkerreplies=%" PRId32,
|
|
m_round, m_mode == MODE_SITELINKINFO ? "site" : "page", m_site, m_req25, m_numReplyPtrs);
|
|
|
|
// and re-call. returns true if did not block.
|
|
// returns true with g_errno set on error.
|
|
if (!doReadLoop()) {
|
|
return false;
|
|
}
|
|
}

	//
	//
	// process all "good" Msg20Replies
	//
	//

	logDebug(g_conf.m_logDebugLinkInfo, "msg25: making final linkinfo mode=%s site=%s url=%s docid=%" PRId64,
	         m_mode == MODE_SITELINKINFO ? "site" : "page", m_site, m_url, m_docId);

	const CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
	if (!cr) {
		log(LOG_WARN, "linkdb: collnum %" PRId32" is gone 2",(int32_t)m_collnum);
		// that func doesn't set g_errno so we must
		g_errno = ENOCOLLREC;
		return true;
	}

	const char *coll = cr->m_coll;

	// . this returns NULL and sets g_errno on error
	// . it serializes the LinkInfo into m_linkInfoBuf, which owns the mem
	// . LinkInfo::getSize() returns the allocated size
	makeLinkInfo(m_ip,
	             m_replyPtrs,
	             m_numReplyPtrs,
	             m_docId, // linkee docid
	             m_lastUpdateTime,
	             m_onlyNeedGoodInlinks,
	             this ,
	             m_linkInfoBuf );
	// return true with g_errno set on error
	if (!m_linkInfoBuf->length()) {
		log("build: msg25 linkinfo set: %s",mstrerror(g_errno));
		return true;
	}

	// if nothing to print out, be on our way
	if (!m_pbuf) {
		return true;
	}

	/////////////////////////////////////////
	//
	// print out for PageParser.cpp
	//
	/////////////////////////////////////////

	// bubble sort the replies so we get consistent output on
	// PageParser.cpp: descending siteRank first, then ascending docid
	char sflag = 1;
	while ( sflag ) {
		sflag = 0;
		for ( int32_t i = 1 ; i < m_numReplyPtrs ; i++ ) {
			// sort by quality first
			char q1 = m_replyPtrs[i-1]->m_siteRank;//docQuality;
			char q2 = m_replyPtrs[i  ]->m_siteRank;//docQuality;
			if ( q1 > q2 ) continue;
			// if tied, check docids
			int64_t d1 = m_replyPtrs[i-1]->m_docId;
			int64_t d2 = m_replyPtrs[i  ]->m_docId;
			if ( d1 == d2 )
				log(LOG_DEBUG, "build: got same docid in msg25 "
				    "d=%" PRId64" url=%s",d1,
				    m_replyPtrs[i]->ptr_ubuf);
			if ( q1 == q2 && d1 <= d2 ) continue;
			// swap them
			Msg20Reply *tmp  = m_replyPtrs [i-1];
			int32_t     size = m_replySizes[i-1];
			m_replyPtrs [i-1] = m_replyPtrs [i  ];
			m_replySizes[i-1] = m_replySizes[i  ];
			m_replyPtrs [i  ] = tmp;
			m_replySizes[i  ] = size;
			sflag = 1;
		}
	}
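	// note: a plain bubble sort is adequate here since m_numReplyPtrs is
	// capped at MAX_LINKERS; after this loop the replies are ordered by
	// descending siteRank, with ties broken by ascending docid.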

	time_t ttt = 0;
	struct tm tm_buf;
	struct tm *timeStruct = localtime_r(&ttt,&tm_buf);
	m_lastUpdateTime = ttt;
	char buf[64];
	if ( timeStruct )
		strftime ( buf, 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct );
	else
		sprintf(buf,"UNKNOWN time");

	const char *ss = "site";
	if ( m_mode == MODE_PAGELINKINFO ) ss = "page";

	LinkInfo *info = (LinkInfo *)m_linkInfoBuf->getBufStart();

	int32_t siteRank = ::getSiteRank ( info->m_numGoodInlinks );

	char ipbuf[16];
	if ( m_printInXml ) {

		m_pbuf->safePrintf("\t<desc>inlinks to %s</desc>\n",ss);

		m_pbuf->safePrintf("\t<sampleCreatedUTC>%" PRIu32
		                   "</sampleCreatedUTC>\n"
		                   , m_lastUpdateTime
		                   );

		// m_url should point into the Msg25Request buffer
		const char *u = m_url;
		if ( u )
			m_pbuf->safePrintf("\t<url><![CDATA[%s]]></url>\n",u);

		// m_site should point into the Msg25Request buffer
		const char *site = m_site;
		if ( site )
			m_pbuf->safePrintf("\t<site><![CDATA[%s]]></site>\n",
			                   site);

		int64_t d = m_docId;
		if ( d && d != -1LL )
			m_pbuf->safePrintf("\t<docId>%" PRId64"</docId>\n",d);

		m_pbuf->safePrintf(
			"\t<ipAddress><![CDATA[%s]]></ipAddress>\n"

			"\t<totalSiteInlinksProcessed>%" PRId32
			"</totalSiteInlinksProcessed>\n"

			"\t<totalGoodSiteInlinksProcessed>%" PRId32
			"</totalGoodSiteInlinksProcessed>\n"

			"\t<numUniqueCBlocksLinkingToPage>%" PRId32
			"</numUniqueCBlocksLinkingToPage>\n"

			"\t<numUniqueIpsLinkingToPage>%" PRId32
			"</numUniqueIpsLinkingToPage>\n"

			, iptoa(m_ip,ipbuf)
			// the total # of inlinkers. we may not have
			// read all of them from disk though.
			, m_numDocIds
			, info->m_numGoodInlinks
			, m_cblocks
			, m_uniqueIps
			);
	} else
		m_pbuf->safePrintf( "<table width=100%%>"
		                    "<td bgcolor=lightyellow>\n"
		                    "<b>Summary of inlinks to %s "
		                    "%s</b>\n"
		                    "<br><br>"

		                    "<table cellpadding=3 "
		                    "border=1 width=100%%>\n"

		                    "<tr>"
		                    "<td>sample created</td>"
		                    "<td>%s</td>"
		                    "<td>when this info was last "
		                    "computed</td>"
		                    "</tr>\n"

		                    "<tr>"
		                    "<td>IP address</td>"
		                    "<td>%s</td>"
		                    "<td> "
		                    "</td>"
		                    "<td> </td>"
		                    "</tr>\n"

		                    "<tr>"
		                    "<td>total inlinkers</td>"
		                    "<td>%" PRId32"</td>"
		                    "<td>how many inlinks we have total. "
		                    "Max: %" PRId32"."
		                    //" Bad docids are removed so may be "
		                    //"less than that max."
		                    "</td>"
		                    "<td> </td>"
		                    "</tr>\n"

		                    "<tr>"
		                    "<td>unique cblocks</td>"
		                    "<td>%" PRId32"</td>"
		                    "<td>unique EXTERNAL cblock inlinks</td>"
		                    "<td> </td>"
		                    "</tr>\n"

		                    "<tr>"
		                    "<td>unique ips</td>"
		                    "<td>%" PRId32"</td>"
		                    "<td>unique EXTERNAL IP inlinks</td>"
		                    "<td> </td>"
		                    "</tr>\n"
		                    ,
		                    ss,
		                    m_url,
		                    buf, //m_lastUpdateTime,
		                    iptoa(m_ip,ipbuf),
		                    // the total # of inlinkers. we may not
		                    // have read all of them from disk though
		                    m_numDocIds ,
		                    (int32_t)READSIZE/(int32_t)LDBKS,
		                    m_cblocks,
		                    m_uniqueIps
		                    );

	if ( m_mode == MODE_SITELINKINFO && m_printInXml )
		m_pbuf->safePrintf("\t<siteRank>%" PRId32"</siteRank>\n" , siteRank );

	// print link spam types
	int32_t ns = m_table.getNumSlots();
	for ( int32_t i = 0 ; i < ns ; i++ ) {
		// skip empty slots
		if ( m_table.isEmpty(i) )
			continue;
		// who is in this slot
		NoteEntry *e = *(NoteEntry **)m_table.getValueFromSlot(i);
		const char *exp = getExplanation ( e->m_note );
		// show it
		if ( m_printInXml ) {
			m_pbuf->safePrintf ( "\t<inlinkStat>\n");
			m_pbuf->safePrintf ( "\t\t<name><![CDATA[" );
			//m_pbuf->htmlEncode(e->m_note,strlen(e->m_note),0);
			m_pbuf->safeStrcpy ( e->m_note );
			m_pbuf->safePrintf ( "]]></name>\n");
			if ( exp )
				m_pbuf->safePrintf ( "\t\t<explanation>"
				                     "<![CDATA[%s]]>"
				                     "</explanation>\n",
				                     exp);
			m_pbuf->safePrintf ( "\t\t<count>%" PRId32"</count>\n",
			                     e->m_count );
			m_pbuf->safePrintf ( "\t</inlinkStat>\n");
		} else {
			m_pbuf->safePrintf ( "<tr><td>%s", e->m_note );
			//if ( exp )
			//	m_pbuf->safePrintf ( " - %s", exp );
			m_pbuf->safePrintf("</td>");
			m_pbuf->safePrintf (
				"<td><font color=red>%" PRId32"</font>"
				"</td>"
				"<td>reason could not vote</td>"
				"<td>"
				, e->m_count );
		}
		// print some docids that had this problem
		for ( int32_t j = 0 ; j < MAX_ENTRY_DOCIDS ; j++ ) {
			if ( e->m_docIds[j] == -1LL ) break;
			if ( ! m_printInXml )
				m_pbuf->safePrintf ("<a href=\"/admin/titledb"
				                    "?c=%s&d=%" PRId64"\">"
				                    "%" PRId32"</a> ",
				                    coll,e->m_docIds[j],j);
		}
		if ( ! m_printInXml )
			m_pbuf->safePrintf ( " </td></tr>\n" );
	}

	if ( ! m_printInXml )
		m_pbuf->safePrintf(

			"<tr>"
			"<td>ip dup</td>"
			"<td><font color=red>%" PRId32"</font></td>"
			"<td>"
			"basically the same ip, but we looked "
			"it up anyway."
			"</td>"
			"</tr>\n"

			"<tr>"
			"<td>ip dup linkdb</td>"
			"<td>%" PRId32"</td>"
			"<td>"
			"linkdb saved us from having to "
			"look up this many title recs from the "
			"same ip C block.</td>"
			"</tr>\n"

			"<tr>"
			"<td>docid dup linkdb</td>"
			"<td>%" PRId32"</td>"
			"<td>"
			"linkdb saved us from having to "
			"look up this many title recs from the "
			"same docid.</td>"
			"</tr>\n"

			"<tr>"
			"<td>link spam linkdb</td>"
			"<td>%" PRId32"</td>"
			"<td>"
			"linkdb saved us from having to "
			"look up this many title recs because "
			"they were pre-identified as link "
			"spam.</td>"
			"</tr>\n"

			"<tr>"
			"<td><b>good</b></td>"
			"<td><b>%" PRId32"</b></td>"
			"<td>"
			//"may include anomalies and some "
			//"link farms discovered later. "
			"# inlinkers with positive weight"
			//"limited to MAX_LINKERS = %" PRId32
			"</td>"
			"<td> </td>"
			"</tr>\n"

			/*
			"<tr>"
			"<td>good extrapolated</td>"
			"<td>%" PRId32"</td>"
			"<td>extrapolate the good links to get "
			"around the MAX_LINKERS limitation</td>"
			"<td> </td>"
			"</tr>\n"

			"<tr>"
			"<td>X factor (not linear!)</td>"
			"<td>%" PRId32"%%</td>"
			"<td>~ good extrapolated / good = "
			"%" PRId32" / %" PRId32"</td>"
			"<td> </td>"
			"</tr>\n"
			*/
			,
			(int32_t)m_ipDups ,
			(int32_t)m_ipDupsLinkdb ,
			(int32_t)m_docIdDupsLinkdb ,
			(int32_t)m_linkSpamLinkdb ,
			info->m_numGoodInlinks
			// good and max
			//(int32_t)m_linkInfo->getNumInlinks() ,
			);

	if ( m_mode == MODE_SITELINKINFO && ! m_printInXml )
		m_pbuf->safePrintf(
			"<tr><td><b>siterank</b></td>"
			"<td><b>%" PRId32"</b></td>"
			"<td>based on # of good inlinks</td>"
			"</tr>",
			siteRank
			);

	if ( ! m_printInXml )
		m_pbuf->safePrintf("</table>"
		                   "<br>"
		                   "<br>" );

	// xml?
	if ( m_printInXml && m_ipDups ) {
		// ip dups
		m_pbuf->safePrintf ( "\t<inlinkStat>\n"
		                     "\t\t<name><![CDATA[");
		m_pbuf->safePrintf ( "duplicateIPCClass" );
		m_pbuf->safePrintf ( "]]></name>\n");

		m_pbuf->safePrintf ( "\t\t<explanation><![CDATA[");
		m_pbuf->safePrintf ( "inlinker is from the same C Block "
		                     "as another inlinker we processed");
		m_pbuf->safePrintf ( "]]></explanation>\n");

		m_pbuf->safePrintf ( "\t\t<count>%" PRId32"</count>\n",
		                     m_ipDups );
		m_pbuf->safePrintf ( "\t</inlinkStat>\n");
	}
	if ( m_printInXml && m_ipDupsLinkdb ) {
		// ip dups found in linkdb
		m_pbuf->safePrintf ( "\t<inlinkStat>\n"
		                     "\t\t<name><![CDATA[");
		m_pbuf->safePrintf ( "duplicateIPCClass" );
		m_pbuf->safePrintf ( "]]></name>\n");
		m_pbuf->safePrintf ( "\t\t<explanation><![CDATA[");
		m_pbuf->safePrintf ( "inlinker is from the same C Block "
		                     "as another inlinker we processed");
		m_pbuf->safePrintf ( "]]></explanation>\n");

		m_pbuf->safePrintf ( "\t\t<count>%" PRId32"</count>\n",
		                     m_ipDupsLinkdb );
		m_pbuf->safePrintf ( "\t</inlinkStat>\n");
	}
	if ( m_printInXml && m_docIdDupsLinkdb ) {
		// docid dups
		m_pbuf->safePrintf ( "\t<inlinkStat>\n"
		                     "\t\t<name><![CDATA[");
		m_pbuf->safePrintf ( "duplicateDocId" );
		m_pbuf->safePrintf ( "]]></name>\n");
		m_pbuf->safePrintf ( "\t\t<explanation><![CDATA[");
		m_pbuf->safePrintf ( "inlinker is on the same page "
		                     "as another inlinker we processed");
		m_pbuf->safePrintf ( "]]></explanation>\n");
		m_pbuf->safePrintf ( "\t\t<count>%" PRId32"</count>\n",
		                     m_docIdDupsLinkdb );
		m_pbuf->safePrintf ( "\t</inlinkStat>\n");
	}
	if ( m_printInXml && m_linkSpamLinkdb ) {
		// link spam
		m_pbuf->safePrintf ( "\t<inlinkStat>\n"
		                     "\t\t<name><![CDATA[");
		m_pbuf->safePrintf ( "generalLinkSpam" );
		m_pbuf->safePrintf ( "]]></name>\n");
		m_pbuf->safePrintf ( "\t\t<count>%" PRId32"</count>\n",
		                     m_linkSpamLinkdb );
		m_pbuf->safePrintf ( "\t</inlinkStat>\n");
	}


	const char *tt = "";
	if ( m_mode == MODE_SITELINKINFO ) tt = "site ";

	if ( ! m_printInXml ) {
		m_pbuf->safePrintf( "<table cellpadding=3 border=1>"
		                    "<tr>"
		                    "<td colspan=20>Inlinks "
		                    "to %s%s (IP=%s) " // pagePop=%" PRId32" "
		                    "</td>"
		                    "</tr>\n"
		                    "<tr>"
		                    "<td>#</td>"
		                    "<td>docId</td>"
		                    "<td>note</td>"
		                    "<td>url</td>"
		                    "<td>site</td>"
		                    "<td>title</td>"
		                    //"<td>reason</td>"
		                    "<td>IP</td>"
		                    "<td>firstIP</td>"
		                    //"<td>external</td>"
		                    "<td>lang</td>"
		                    "<td>discovered</td>"
		                    "<td>pubdate</td>"
		                    "<td>site rank</td>"
		                    "<td># words in link text</td>"
		                    "<td>link text bytes</td>"
		                    "<td>link text</td>",
		                    tt,m_url,iptoa(m_ip,ipbuf));
		if ( m_mode == MODE_SITELINKINFO )
			m_pbuf->safePrintf("<td>link url</td>");
		m_pbuf->safePrintf("<td>neighborhood</td>"
		                   "</tr>\n" );
	}

	//CollectionRec *cr = g_collectiondb.getRec ( m_coll );
	// print out each Inlink/Msg20Reply
	for ( int32_t i = 0 ; i < m_numReplyPtrs ; i++ ) {
		// point to a reply
		Msg20Reply *r = m_replyPtrs[i];
		// are we internal
		bool internal = false;
		if ( is_same_network_linkwise(r->m_ip,m_ip) )
			internal = true;
		if ( is_same_network_linkwise(r->m_firstIp,m_ip))
			internal = true;
		// the "external" string
		//char *ext = "Y"; if ( internal ) ext = "N";
		// are we an "anomaly"?
		const char *note = r->ptr_note;
		if ( r->m_isLinkSpam && !note )
			note = "unknown";
		// get our ip as a string
		//char *ips = iptoa(r->m_ip);
		// print the link text itself
		char *txt = r->ptr_linkText;
		// get length of link text
		int32_t tlen = r->size_linkText;
		if ( tlen > 0 ) tlen--;
		//float weight = 1.0;
		//if ( note ) weight = 0.0;
		//if ( internal ) weight = 0.0;
		// datedb date
		char dbuf[128];
		if ( r->m_datedbDate > 1 ) {
			time_t ttt = (time_t)r->m_datedbDate;
			struct tm tm_buf;
			char buf[64];
			sprintf(dbuf,"%s UTC",
			        asctime_r(gmtime_r(&ttt,&tm_buf),buf) );
		} else
			sprintf(dbuf,"---");

		char discBuf[128];
		time_t dd = (time_t)r->m_discoveryDate;
		if ( dd ) {
			struct tm tm_buf;
			struct tm *timeStruct = gmtime_r(&dd,&tm_buf);
			if ( timeStruct )
				strftime ( discBuf, 128 ,
				           "<nobr>%b %d %Y</nobr>" ,
				           timeStruct);
			else
				sprintf(discBuf,"UNKNOWN DATE");
		} else
			sprintf(discBuf,"---");

		const char *title = r->ptr_tbuf;
		if ( ! title ) title = "";

		// show the linking docid and its weight
		if ( m_printInXml ) {
			const char *ns = note;
			if ( ! note ) ns = "good";
			//if ( internal ) note = "internal";
			m_pbuf->safePrintf("\t<inlink>\n"

			                   "\t\t<docId>%" PRId64"</docId>\n"

			                   "\t\t<url><![CDATA[%s]]></url>\n"

			                   "\t\t<site><![CDATA[%s]]></site>\n"

			                   "\t\t<title><![CDATA[%s]]></title>\n"

			                   //"\t\t<weight>%.01f</weight>\n"

			                   "\t\t<note><![CDATA[%s]]>"
			                   "</note>\n"

			                   , r->m_docId
			                   , r->ptr_ubuf
			                   , r->ptr_site
			                   , title
			                   //, weight
			                   , ns
			                   );

			// get explanation of note
			const char *exp = getExplanation ( ns );
			if ( exp )
				m_pbuf->safePrintf("\t\t<explanation>"
				                   "<![CDATA[%s]]>"
				                   "</explanation>\n"
				                   , exp );

			m_pbuf->safePrintf("\t\t<ipAddress>"
			                   "<![CDATA[%s]]>"
			                   "</ipAddress>\n"
			                   ,iptoa(r->m_ip,ipbuf) );

			m_pbuf->safePrintf("\t\t<firstIpAddress>"
			                   "<![CDATA[%s]]>"
			                   "</firstIpAddress>\n"
			                   ,iptoa(r->m_firstIp,ipbuf) );

			m_pbuf->safePrintf(
				"\t\t<onSite>%" PRId32"</onSite>\n"

				"\t\t<discoveryDateUTC>%" PRIu32
				"</discoveryDateUTC>\n"

				"\t\t<language><![CDATA[%s]]>"
				"</language>\n"

				"\t\t<siteRank>%" PRId32"</siteRank>\n"
				, (int32_t)internal
				, (uint32_t)dd
				, getLanguageString(r->m_language)
				, (int32_t)r->m_siteRank
				);
			m_pbuf->safePrintf("\t\t<linkText><![CDATA[");
			m_pbuf->htmlEncode ( txt,tlen,0);
			m_pbuf->safePrintf("]]>"
			                   "</linkText>\n"
			                   );
			if ( m_mode == MODE_SITELINKINFO )
				m_pbuf->safePrintf("\t<linkUrl><![CDATA[%s]]>"
				                   "</linkUrl>\n",
				                   r->ptr_linkUrl);
			m_pbuf->safePrintf(
				"\t</inlink>\n"
				);
			continue;
		}

		m_pbuf->safePrintf( "<tr>"
		                    "<td>%" PRId32"</td>" // #i
		                    "<td>"
		                    "<a href=\"/print?page=1&d=%" PRId64"\">"
		                    "%" PRId64"</a></td>" // docid
		                    "<td><nobr>"//%.1f"
		                    ,i+1
		                    ,r->m_docId
		                    ,r->m_docId
		                    //,weight
		                    );
		if ( note )
			m_pbuf->safePrintf("%s", note );
		else
			m_pbuf->safePrintf("<b>good</b>");

		m_pbuf->safePrintf( "</nobr></td>"//wghtnte
		                    "<td>%s</td>" // url
		                    "<td>%s</td>" // site
		                    "<td>%s</td>", // title
		                    r->ptr_ubuf,
		                    r->ptr_site,
		                    title);
		m_pbuf->safePrintf("<td><a href=\"/search?q=ip%%3A"
		                   "%s&c=%s&n=200\">%s</a></td>" // ip
		                   , iptoa(r->m_ip,ipbuf)
		                   , coll
		                   , iptoa(r->m_ip,ipbuf)
		                   );
		m_pbuf->safePrintf("<td>%s</td>"
		                   , iptoa(r->m_firstIp,ipbuf)
		                   );
		m_pbuf->safePrintf( //"<td>%s</td>" // external
		                    "<td>%s</td>" // language
		                    "<td>%s</td>" // discoverydate
		                    "<td>%s</td>" // datedbdate
		                    "<td><font color=red><b>%" PRId32
		                    "</b></font></td>" // site rank
		                    "<td>%" PRId32"</td>" // nw
		                    "<td>%" PRId32"</td>" // textLen
		                    "<td><nobr>", // text
		                    //ext,
		                    getLanguageString(r->m_language),
		                    discBuf,
		                    dbuf,//r->m_datedbDate,
		                    (int32_t)r->m_siteRank, // docQuality,
		                    (int32_t)r->m_linkTextNumWords ,
		                    tlen );
		// only bold if good
		if ( ! note )
			m_pbuf->safePrintf("<b>");
		// this is in utf8 already
		m_pbuf->safeMemcpy ( txt , tlen );
		// only bold if good
		if ( ! note )
			m_pbuf->safePrintf("</b>");
		// wrap it up
		m_pbuf->safePrintf("</nobr></td>");
		// print url that is linked to in the case of site inlinks
		if ( m_mode == MODE_SITELINKINFO )
			m_pbuf->safePrintf("<td>%s</td>",r->ptr_linkUrl);
		// print the neighborhood
		m_pbuf->safePrintf("<td>");
		txt = r->ptr_surroundingText;
		tlen = r->size_surroundingText - 1;
		if(!txt) {
			m_pbuf->safePrintf("--\n");
			m_pbuf->safePrintf("</td></tr>\n");
			continue;
		}
		// this is utf8 already
		m_pbuf->safeMemcpy ( txt , tlen );
		m_pbuf->safePrintf("</td></tr>\n");
	}

	if ( ! m_printInXml ) {
		m_pbuf->safePrintf( "</table>\n<br>\n" );
		m_pbuf->safePrintf( "</td></table>\n<br><br>\n" );
	}

	// print site rank
	if ( m_printInXml && m_mode == MODE_SITELINKINFO )
		m_pbuf->safePrintf("\t<siteRankTable>\n");

	if ( ! m_printInXml && m_mode == MODE_SITELINKINFO )
		m_pbuf->safePrintf("<table border=1>"
		                   "<tr><td colspan=2>"
		                   "<center>siteRankTable</center>"
		                   "</td></tr>"
		                   "<tr><td># good inlinks</td>"
		                   "<td>siteRank</td></tr>"
		                   );

	// print site rank table
	int32_t lastsr = -1;
	for ( int32_t i = 0 ; i < 11000 && m_mode == MODE_SITELINKINFO ; i++ ) {
		int32_t sr = ::getSiteRank ( i );
		if ( sr == lastsr ) continue;
		lastsr = sr;
		if ( m_printInXml )
			m_pbuf->safePrintf("\t\t<row>"
			                   "\t\t\t<numInlinks>%" PRId32
			                   "</numInlinks>\n"
			                   "\t\t\t<siteRank>%" PRId32
			                   "</siteRank>\n"
			                   "\t\t</row>\n"
			                   ,i,sr);
		else
			m_pbuf->safePrintf("<tr><td>%" PRId32"</td><td>%" PRId32"</td></tr>"
			                   ,i,sr);
	}
	if ( m_printInXml && m_mode == MODE_SITELINKINFO )
		m_pbuf->safePrintf("\t</siteRankTable>\n");
	else if ( m_mode == MODE_SITELINKINFO )
		m_pbuf->safePrintf("</table>"
		                   "<br>" );


	//m_pbuf->safePrintf("<b>*</b><i>The maximum of these two weights "
	//                   "is used.</i><br><br>");

	return true;
}


// . return the "worst" of the two Msg20Replies
// . in the case of a dup, we use this to determine which one is kicked out
// . if one is NULL, return the other
Msg20Reply *Msg25::getLoser(Msg20Reply *r, Msg20Reply *p) {
	if ( ! p ) return r;
	if ( ! r ) return p;
	// if "r" is internal, but p is not, r is the loser
	bool rinternal = false;
	bool pinternal = false;
	if ( iptop(r->m_ip     ) == m_top ) rinternal = true;
	if ( iptop(r->m_firstIp) == m_top ) rinternal = true;
	if ( iptop(p->m_ip     ) == m_top ) pinternal = true;
	if ( iptop(p->m_firstIp) == m_top ) pinternal = true;
	if ( rinternal && ! pinternal )
		return r;
	// and vice versa
	if ( pinternal && ! rinternal )
		return p;
	if ( p->m_siteRank < r->m_siteRank ) return p;
	if ( r->m_siteRank < p->m_siteRank ) return r;
	// . if they had the same quality... check docid
	// . the lower the docid, the "better" it is. this behavior
	//   is opposite of the quality behavior.
	if ( p->m_docId > r->m_docId ) return p;
	// fall back to r then
	return r;
}
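// note: the ordering above prefers external linkers over internal ones,
// then higher siteRank, then lower docid; the "loser" is the reply that
// gets discarded when isDup() below flags a pair as duplicates.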


// this returns true if the two vecs are "percentSimilar" or more similar
static bool isSimilar_sorted ( int32_t *vec0 ,
			       int32_t *vec1 ,
			       int32_t nv0 , // how many int32_ts in vec?
			       int32_t nv1 , // how many int32_ts in vec?
			       // they must be this similar or more to return true
			       int32_t percentSimilar) {
	// if both empty, assume not similar at all
	if ( *vec0 == 0 && *vec1 == 0 ) return false;
	// if either is empty, return false to be on the safe side
	if ( *vec0 == 0 ) return false;
	if ( *vec1 == 0 ) return false;

	// do not include the terminating 0
	nv0--;
	nv1--;
	int32_t total = nv0 + nv1;

	// so if the "noMatched" count ever EXCEEDS (not equals) this
	// "brink" we can bail early because there's no chance of getting
	// the similarity "percentSimilar" provided. should save some time.
	int32_t brink = ((100-percentSimilar) * total) / 100;

	// scan each like doing a merge
	int32_t *p0 = vec0;
	int32_t *p1 = vec1;
	int32_t yesMatched = 0;
	int32_t noMatched  = 0;

 mergeLoop:

	// stop if both exhausted. we didn't bail on brink, so it's a match
	if ( *p0 == 0 && *p1 == 0 )
		return true;

	if ( *p0 < *p1 || *p1 == 0 ) {
		p0++;
		if ( ++noMatched > brink ) return false;
		goto mergeLoop;
	}

	if ( *p1 < *p0 || *p0 == 0 ) {
		p1++;
		if ( ++noMatched > brink ) return false;
		goto mergeLoop;
	}

	yesMatched += 2;
	p1++;
	p0++;
	goto mergeLoop;
}
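// worked example (hypothetical vectors): vec0={3,7,9,0} and vec1={3,9,12,0}
// give nv0=nv1=3 after dropping the terminators, so total=6 and, for
// percentSimilar=80, brink=(20*6)/100=1. the unmatched 7 and 12 push
// noMatched to 2 > brink, so the vectors are not 80% similar.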


/// @todo ALC move to unit test
/*
// test similarity
int32_t v1[] = {86845183, 126041601, 193138017, 194832692, 209041345, 237913907,
	253753116, 420176029, 425806029, 469664463, 474491119, 486025959, 524746875,
	565034969, 651889954, 723451712, 735373612, 740115430, 889005385,
	1104585188, 1180264907, 1190905206, 1555245401, 1585281138, 1775919002,
	1780336562, 1784029178, 1799261433, 2013337516, 2095261394, 2137774538, 0};
int32_t v2[] = {51207128, 126041601, 237913907, 253753116, 315255440, 394767298,
	420176029, 435382723, 469664463, 486025959, 536944585, 556667308, 565034969,
	615792190, 624608202, 629600018, 807226240, 1107373572, 1113238204,
	1134807359, 1135960080, 1200900964, 1527062593, 1585281138, 1634165777,
	1694464250, 1802457437, 1943916889, 1960218442, 2058631149, -2130866760, 0};

int32_t nv1 = sizeof(v1)/4;
int32_t nv2 = sizeof(v2)/4;
if ( isSimilar_sorted (v1,v2,nv1,nv2,80,0) ) {
	g_process.shutdownAbort(true);
}
*/

// . is "p" a dup of "r"?
// . we will kick out the worst one so it cannot vote
// . returns NULL if not a dup
// . returns NULL with g_errno set on error
const char *Msg25::isDup(Msg20Reply *r, Msg20Reply *p) {

	// reset this
	g_errno = 0;

	// do ip tops match? (top 2 bytes of ip addresses)
	bool internal = false;
	if ( iptop(p->m_ip)      == m_top ) internal = true;
	if ( iptop(p->m_firstIp) == m_top ) internal = true;
	if ( internal && m_oneVotePerIpDom )
		return "ip dup";

	// see if he is too similar to another, if so he is not a good voter
	int32_t *v1 = (int32_t *)r->ptr_vector1;
	int32_t *v2 = (int32_t *)r->ptr_vector2;
	int32_t *v3 = (int32_t *)r->ptr_vector3;

	int32_t nv1 = r->size_vector1 / 4;
	int32_t nv2 = r->size_vector2 / 4;
	int32_t nv3 = r->size_vector3 / 4;

	// get vectors for Msg20Reply "p"
	int32_t *x1 = (int32_t *)p->ptr_vector1;
	int32_t *x2 = (int32_t *)p->ptr_vector2;
	int32_t *x3 = (int32_t *)p->ptr_vector3;

	int32_t nx1 = p->size_vector1 / 4;
	int32_t nx2 = p->size_vector2 / 4;
	int32_t nx3 = p->size_vector3 / 4;

	// doc j is 0% to 100% similar to doc i
	// . but we need to remove the wordpairs found
	//   in common so they aren't used against
	//   another doc, so we say 'true' here
	// . returns -1 and sets g_errno on error
	// . vX vectors can be NULL if the linker was "linkSpam" because
	//   Msg20.cpp's handler does not set them in that case

	// these counts include the terminating 0 int32_t as a component
	if ( v1 && x1 && nv1 >= 2 && nx1 >= 2 ) {
		//p1 = (int32_t)computeSimilarity (v1,x1,NULL,NULL,NULL,ni);
		// are these two vecs 80% or more similar?
		if ( isSimilar_sorted (v1,x1,nv1,nx1,80) ) {
			//if ( p1 < 80 )
			//	log("test p1 failed");
			return "similar content";
		}
	}

	// only consider p2 if each vector is beefy. these
	// can be small because these vectors represent the
	// word pairs just to the right of the link in the
	// content, and before any "breaking" tag thereafter.
	// no, there are too many little ads, so disregard the beefy
	// requirement.
	if ( v2 && x2 && nv2 >= 2 && nx2 >= 2 ) {
		//p2 = (int32_t)computeSimilarity (v2,x2,NULL,NULL,NULL,ni);
		// are these two vecs 80% or more similar?
		if ( isSimilar_sorted (v2,x2,nv2,nx2,80) ) {
			//if ( p2 < 80 )
			//	log("test p2 failed");
			return "similar link desc";
		}
	}

	// compare tag id pair vectors
	if ( v3 && x3 && nv3 >= 2 && nx3 >= 2 ) {
		//p3 = (int32_t)computeSimilarity (v3,x3,NULL,NULL,NULL,ni);
		// are these two vecs 100% similar?
		if ( isSimilar_sorted (v3,x3,nv3,nx3,100) ) {
			//if ( p3 < 100 )
			//	log("test p3 failed");
			return "similar tag template";
		}
	}

	return NULL;
}
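// summary of the thresholds above: vector1 (document content) and
// vector2 (text near the outlink) trigger a dup at >= 80% similarity,
// while vector3 (the tag id pairs, i.e. the page template) must match
// 100% before we call it a "similar tag template".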


bool Msg25::addNote(const char *note, int32_t noteLen, int64_t docId) {
	// return right away if no note
	if ( ! note || noteLen <= 0 ) return true;
	// get hash
	int32_t h = hash32 ( note , noteLen );
	NoteEntry **pentry = (NoteEntry **)m_table.getValue ( &h );
	// if this "type" of note has not been recorded yet, add it
	if ( ! pentry && h ) {
		char *p = m_bufPtr;
		if ( p + sizeof(NoteEntry) + noteLen + 1 >= m_bufEnd ) {
			log("build: increase buf size in Msg25.");
			g_process.shutdownAbort(true);
		}
		// store the entry
		NoteEntry *e = (NoteEntry *)p;
		p += sizeof(NoteEntry);
		// init that entry
		e->m_count = 1;
		e->m_note = p;
		e->m_docIds[0] = docId;
		e->m_docIds[1] = -1LL;
		// store note into the buffer, NULL terminated
		gbmemcpy ( p , note , noteLen ); p += noteLen;
		*p++ = '\0';
		// add to the table
		int32_t slot = -1;
		if ( ! m_table.addKey(&h,&e,&slot))
			log("build: Msg25 could not add note.");
		// did we add successfully?
		if ( slot < 0 )
			return false;
		// advance to next spot
		m_bufPtr = p;
		return true;
	}

	if( pentry ) {
		// cast it
		NoteEntry *entry = *pentry;
		if( entry ) {
			// bump the count
			entry->m_count++;
			// add docids to the list
			for(int32_t i=0; i < MAX_ENTRY_DOCIDS; i++) {
				// skip if not empty
				if ( entry->m_docIds[i] != -1LL )
					continue;
				// take it over, it's empty if it is -1LL
				entry->m_docIds[i] = docId;
				// next one should be -1 now
				if ( i + 1 < MAX_ENTRY_DOCIDS )
					entry->m_docIds[i+1] = -1LL;
				// all done
				break;
			}
		}
	}
	// increase the count
	//if ( val ) *(int32_t *)val = *(int32_t *)val + 1;
	return true;
}
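// note: each NoteEntry keeps at most MAX_ENTRY_DOCIDS (10) example
// docids per note; once full, further docids for the same note only
// bump m_count. entries live in the fixed m_buf, so no frees are needed.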


//////////
//
// LINKINFO
//
//////////

// . after Msg25 calls LinkInfo::set() (was merge()) make it call
//   Msg25::print(SafeBuf *pbuf) to print the stats. it can access
//   LinkInfo's Inlinks to get their weights, etc.
// . returns the LinkInfo on success
// . returns NULL and sets g_errno on error
static LinkInfo *makeLinkInfo(int32_t ip,
			      Msg20Reply **replies,
			      int32_t numReplies,
			      int64_t linkeeDocId,
			      int32_t lastUpdateTime,
			      bool onlyNeedGoodInlinks,
			      Msg25 *msg25,
			      SafeBuf *linkInfoBuf)
{
	// we can estimate our quality here
	int32_t numGoodInlinks = 0;

	// set m_linkTextNumWords
	for ( int32_t i = 0 ; i < numReplies ; i++ ) {

		// get the reply
		Msg20Reply *r = replies[i];
		// replies are NULL if MsgE had an error, like ENOTFOUND
		if ( ! r ) continue;

		// get the link text itself
		char *txt = r->ptr_linkText;
		int32_t txtLen = r->size_linkText;
		// discount terminating \0
		if ( txtLen > 0 ) txtLen--;
		// get approx # of words in link text
		int32_t nw = 0;
		if ( txtLen > 0 )
			nw = getNumWords(txt,txtLen);
		// store it
		r->m_linkTextNumWords = nw;

		// linkspam?
		if ( r->ptr_note ) {
			r->m_isLinkSpam = 1;
			continue;
		}

		bool internal = false;
		if ( is_same_network_linkwise(r->m_ip,ip))
			internal = true;
		if ( is_same_network_linkwise(r->m_firstIp,ip))
			internal = true;

		// if it's internal do not count it towards good, but do
		// indeed store it!
		if ( internal )
			continue;

		// otherwise count as good
		numGoodInlinks++;
	}

	int32_t count = 0;
	// . now just store the Inlinks whose weights are > 0
	// . how much space do we need?
	int32_t need = 0;
	// how much space do we need?
	for ( int32_t i = 0 ; i < numReplies ; i++ ) {
		// get the reply
		Msg20Reply *r = replies[i];
		// replies are NULL if MsgE had an error, like ENOTFOUND
		if ( ! r ) continue;

		if ( r->m_isLinkSpam ) {
			// linkdb debug
			logDebug(g_conf.m_logDebugLinkInfo, "linkdb: inlink #%" PRId32" is link spam: %s", i,r->ptr_note);
			if ( onlyNeedGoodInlinks )
				continue;
		}
		// do a quick set
		Inlink k; k.set ( r );
		// get space
		need += k.getStoredSize ( );
		// count it
		count++;
	}

	// we need space for our header
	need += sizeof(LinkInfo);
	if ( ! linkInfoBuf->reserve ( need , "LinkInfo" ) )
		return NULL;
	// set ourselves to this new buffer
	LinkInfo *info = (LinkInfo *)(linkInfoBuf->getBufStart());
	memset(info,0,sizeof(*info));

	// set our header
	info->m_version = 0;
	info->m_lisize = need;
	info->m_lastUpdated = lastUpdateTime;//getTime();
	// how many Inlinks we stored in info->m_buf[]
	info->m_numStoredInlinks = count;
	// the gross total of inlinks we got, both internal and external
	info->m_totalInlinkingDocIds = msg25->m_numDocIds;

	// . only valid if titlerec version >= 119
	// . how many unique c blocks link to us?
	// . includes your own internal c block
	info->m_numUniqueCBlocks = msg25->m_cblocks;
	// . only valid if titlerec version >= 119
	// . how many unique ips link to us?
	// . this count includes internal IPs as well
	info->m_numUniqueIps = msg25->m_uniqueIps;
	// keep things consistent for the "qatest123" coll
	info->m_reserved1 = 0;
	info->m_reserved2 = 0;
	// how many total GOOD inlinks we got. does not include internal cblock
	info->m_numGoodInlinks = numGoodInlinks;

#ifdef _VALGRIND_
	VALGRIND_CHECK_MEM_IS_DEFINED(info,sizeof(*info));
#endif

	// point to our buf
	char *p = info->m_buf;
	char *pend = linkInfoBuf->getBufStart() + need;
	// count the ones we store that are internal
	int32_t icount3 = 0;
	// now set each inlink

	for ( int32_t i = 0 ; i < numReplies ; i++ ) {
		// get the reply
		Msg20Reply *r = replies[i];
		// replies are NULL if MsgE had an error, like ENOTFOUND
		if ( ! r ) continue;

		if ( r->m_isLinkSpam && onlyNeedGoodInlinks ) continue;
		// are we internal?
		bool internal = false;
		if ( is_same_network_linkwise(r->m_ip,ip) )
			internal = true;
		if ( is_same_network_linkwise(r->m_firstIp,ip) )
			internal = true;
		if ( internal ) icount3++;
		// set the Inlink
		Inlink k;
		// store it. our ptrs will reference into the Msg20Reply buf
		k.set ( r );

		// . this will copy itself into "p"
		// . "true" --> makePtrsRefNewBuf
		int32_t wrote = 0;

		char *s = k.serialize ( &wrote , p , pend - p , true );
		// sanity check
		if ( s != p ) { g_process.shutdownAbort(true); }
		// sanity check
		if ( k.getStoredSize() != wrote ) { g_process.shutdownAbort(true);}
		// note it if recycled
		if ( k.m_recycled )
			logf(LOG_DEBUG,"build: recycling Inlink %s for linkee "
			     "%" PRId64, k.getUrl(),linkeeDocId);
		// advance
		p += wrote;
	}

	// . sanity check, should have used up all the buf exactly
	// . so we can free the buf with k->getStoredSize() being the allocSize
	if ( p != pend ) { g_process.shutdownAbort(true); }

	// how many guys that we stored were internal?
	info->m_numInlinksInternal = (char)icount3;
#ifdef _VALGRIND_
	VALGRIND_CHECK_MEM_IS_DEFINED(info,pend-(char*)info);
#endif

	linkInfoBuf->setLength ( need );

	// success
	return info;
}
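// note: makeLinkInfo() sizes the buffer in a first pass (summing each
// Inlink's getStoredSize()) and then serializes in a second pass, so
// the p == pend sanity check above really does guarantee an exact fit.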


Inlink *LinkInfo::getNextInlink(Inlink *k) {
	// if none, return NULL
	if ( m_numStoredInlinks == 0 ) return NULL;
	// if k is NULL, return the first
	if ( ! k ) {
		// set it to the first one
		k = (Inlink *)m_buf;
		// done
		return k;
	}
	// point to next
	int32_t size = k->getStoredSize();
	// get the inlink to return
	Inlink *next = (Inlink *)((char *)k + size);
	// return NULL if breached
	int64_t x = (char *)next - (char *)this;
	// was that the end of them?
	if ( x >= m_lisize ) return NULL;
	// otherwise, we are still good
	return next;
}
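// typical iteration pattern (see LinkInfo::print() below): start with
// NULL and keep feeding the previous Inlink back in until NULL:
//   for ( Inlink *k = NULL ; (k = info->getNextInlink(k)) ; ) { ... }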

bool Inlink::setXmlFromRSS(Xml *xml) {
	// compute the length (excludes the \0's)
	int32_t len = size_rssItem - 1;

	// return false and set g_errno if this fails
	return xml->set( getRSSItem(), len, TITLEREC_CURRENT_VERSION, CT_XML );
}


bool LinkInfo::hasLinkText() const {
	// loop through the Inlinks
	for(const Inlink *k = NULL; (k = getNextInlink(k)) ; )
		if ( k->size_linkText > 1 ) return true;
	return false;
}


void Inlink::set ( const Msg20Reply *r ) {
	// set ourselves now
	m_ip               = r->m_ip;
	m_firstIp          = r->m_firstIp;
	m_wordPosStart     = r->m_wordPosStart;
	m_docId            = r->m_docId;
	m_firstSpidered    = r->m_firstSpidered;
	m_lastSpidered     = r->m_lastSpidered;
	m_datedbDate       = r->m_datedbDate;
	m_firstIndexedDate = r->m_firstIndexedDate;
	m_numOutlinks      = r->m_numOutlinks;

	m_isPermalink      = r->m_isPermalink;
	m_outlinkInContent = r->m_outlinkInContent;
	m_outlinkInComment = r->m_outlinkInComment;
	m_isLinkSpam       = r->m_isLinkSpam;
	m_recycled         = r->m_recycled;

	m_country  = r->m_country;
	m_language = r->m_language;
	m_siteRank = r->m_siteRank;

	// MDW: use a new way. construct m_buf. 64-bit stuff.
	int32_t poff = 0;
	char *p = m_buf;

	int32_t need =
		r->size_ubuf +
		r->size_linkText +
		r->size_surroundingText +
		r->size_rssItem +
		r->size_categories +
		r->size_templateVector;

	char *pend = p + need;
	// -10 to add \0's for remaining guys in case of breach
	pend -= 10;

	size_urlBuf          = r->size_ubuf;
	size_linkText        = r->size_linkText;
	size_surroundingText = r->size_surroundingText;
	size_rssItem         = r->size_rssItem;
	size_categories      = r->size_categories;
	size_gigabitQuery    = 0;
	size_templateVector  = r->size_templateVector;

	/////////////

	off_urlBuf = poff;
	gbmemcpy ( p , r->ptr_ubuf , size_urlBuf );
	poff += size_urlBuf;
	p    += size_urlBuf;

	/////////////

	off_linkText = poff;
	gbmemcpy ( p , r->ptr_linkText , size_linkText );
	poff += size_linkText;
	p    += size_linkText;

	/////////////

	off_surroundingText = poff;
	if ( p + r->size_surroundingText < pend ) {
		gbmemcpy (p,r->ptr_surroundingText , size_surroundingText );
	}
	else {
		size_surroundingText = 1;
		*p = '\0';
	}
	poff += size_surroundingText;
	p    += size_surroundingText;

	/////////////

	off_rssItem = poff;
	if ( p + r->size_rssItem < pend ) {
		gbmemcpy ( p , r->ptr_rssItem , size_rssItem );
	} else {
		size_rssItem = 1;
		*p = '\0';
	}
	poff += size_rssItem;
	p    += size_rssItem;

	/////////////

	off_categories = poff;
	if ( p + r->size_categories < pend ) {
		gbmemcpy ( p , r->ptr_categories , size_categories );
	} else {
		size_categories = 1;
		*p = '\0';
	}
	poff += size_categories;
	p    += size_categories;

	/////////////

	off_gigabitQuery = poff;
	size_gigabitQuery = 1;
	*p = '\0';
	poff += size_gigabitQuery;
	p    += size_gigabitQuery;

	/////////////

	off_templateVector = poff;
	if ( p + r->size_templateVector < pend ) {
		gbmemcpy (p , r->ptr_templateVector , size_templateVector );
	} else {
		size_templateVector = 1;
		*p = '\0';
	}
	poff += size_templateVector;
	p    += size_templateVector;
}
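// resulting m_buf layout (all off_* offsets are relative to m_buf):
//   [urlBuf][linkText][surroundingText][rssItem][categories]
//   [gigabitQuery, always just "\0"][templateVector]
// any trailing section that would breach the buffer is stored as a
// lone '\0' with its size_* forced to 1.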


// Msg25 calls this to make a "fake" msg20 reply for recycling Inlinks
// that are no longer there... preserves rssInfo, etc.
void Inlink::setMsg20Reply(Msg20Reply *r) {

	r->m_ip               = m_ip;
	r->m_firstIp          = m_firstIp;
	r->m_wordPosStart     = m_wordPosStart;
	r->m_docId            = m_docId;
	r->m_firstSpidered    = m_firstSpidered;

	r->m_lastSpidered     = m_lastSpidered;
	r->m_datedbDate       = m_datedbDate;
	r->m_firstIndexedDate = m_firstIndexedDate;
	r->m_numOutlinks      = m_numOutlinks;

	r->m_isPermalink      = m_isPermalink;
	r->m_outlinkInContent = m_outlinkInContent;
	r->m_outlinkInComment = m_outlinkInComment;

	r->m_isLinkSpam       = m_isLinkSpam;

	r->m_country          = m_country;
	r->m_language         = m_language;
	r->m_siteRank         = m_siteRank;
	r->m_isAdult          = false; //appears to be irrelevant when dealing with links

	r->ptr_ubuf            = const_cast<char*>(getUrl());
	r->ptr_linkText        = getLinkText();
	r->ptr_surroundingText = getSurroundingText();
	r->ptr_rssItem         = getRSSItem();
	r->ptr_categories      = getCategories();
	r->ptr_templateVector  = getTemplateVector();

	r->size_ubuf            = size_urlBuf;
	r->size_linkText        = size_linkText;
	r->size_surroundingText = size_surroundingText;
	r->size_rssItem         = size_rssItem;
	r->size_categories      = size_categories;
	r->size_templateVector  = size_templateVector;
}


void Inlink::reset() {
	// clear ourselves out
	memset ( (char *)this,0,sizeof(Inlink) - MAXINLINKSTRINGBUFSIZE);
}


// . set a new Inlink from an older versioned Inlink
// . this is how we handle versioning
void Inlink::set2(const Inlink *old) {
	// clear ourselves
	reset();

	int fullSize = old->getStoredSize();

	// copy over the old inlink in its entirety
	memcpy ( this, old, fullSize );
}


int32_t Inlink::getStoredSize() const {
	int32_t size = sizeof(Inlink) - MAXINLINKSTRINGBUFSIZE;

	size += size_urlBuf;
	size += size_linkText;
	size += size_surroundingText;
	size += size_rssItem;
	size += size_categories;
	size += size_gigabitQuery;
	size += size_templateVector;

	return size;
}


// . return ptr to the buffer we serialize into
// . return NULL and set g_errno on error
char *Inlink::serialize(int32_t *retSize ,
			char *userBuf ,
			int32_t userBufSize ,
			bool makePtrsRefNewBuf) const {
	// make a buffer to serialize into
	char *buf = NULL;
	int32_t need = getStoredSize();
	// big enough?
	if ( need <= userBufSize ) buf = userBuf;
	// alloc if we should
	if ( ! buf ) buf = (char *)mmalloc ( need , "Ra" );
	// bail on error, g_errno should be set
	if ( ! buf ) return NULL;
	// set how many bytes we will serialize into
	*retSize = need;
	// copy the easy stuff
	char *p = buf;
	char *pend = buf + need;
	gbmemcpy ( p, this, need );
	p += need;

	if ( p != pend ) { g_process.shutdownAbort(true); }

	return buf;
}
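// note: serialization is a single flat copy because Inlink keeps its
// strings inline in m_buf[] and addresses them via off_* offsets, not
// pointers, so no fixups are needed; makePtrsRefNewBuf is moot here.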


// used by PageTitledb.cpp
bool LinkInfo::print(SafeBuf *sb, const char *coll) const {
	int32_t count = 1;

	// loop through the link texts
	for(const Inlink *k = NULL; (k = getNextInlink(k)); count++) {
		const char *s = k->getLinkText();
		int32_t slen  = k->size_linkText - 1;
		const char *d = k->getSurroundingText();
		int32_t dlen  = k->size_surroundingText - 1;
		const char *r = k->getRSSItem();
		int32_t rlen  = k->size_rssItem - 1;
		const char *g = k->getGigabitQuery();
		int32_t glen  = k->size_gigabitQuery - 1;
		const char *c = k->getCategories();
		int32_t clen  = k->size_categories - 1;
		if ( slen < 0 ) slen = 0;
		if ( dlen < 0 ) dlen = 0;
		if ( rlen < 0 ) rlen = 0;
		if ( glen < 0 ) glen = 0;
		if ( clen < 0 ) clen = 0;
		if ( ! c || clen <= 0 ) c = "";

		// encode the rss item further
		StackBuf<128> buf3b;
		if ( rlen > 0 ) {
			buf3b.htmlEncode(r, rlen, true);
		}
		buf3b.nullTerm();

		// . print link text and score into table
		// . there is a ton more info in the Inlink class to print if
		//   you want to throw it in here...
		char ipbuf[16];
		sb->safePrintf(
			"<tr><td colspan=2>link #%04" PRId32" "
			"("
			//"baseScore=%010" PRId32", "
			"d=<a href=\"/admin/titledb?c=%s&"
			"d=%" PRId64"\">%016" PRId64"</a>, "
			"siterank=%" PRId32", "
			"outlinks=%05" PRId32", "
			"ip=%s "
			"numLinksToSite=%" PRId32" "
			//"anomaly=%" PRId32" "
			"<b>url</b>=\"%s\" "
			"<b>txt=</b>\""
			"%s\" "
			"<b>neigh=</b>\"%s\" "
			"<b>rssItem</b>=\"%s\" "
			"<b>gigabits</b>=\"%s\" "
			"<b>categories</b>=\"%s\" "
			//"<b>templateVec=\"</b>%s\" "
			"</td></tr>\n",
			count ,
			//(int32_t)k->m_baseScore ,
			coll ,
			k->m_docId,
			k->m_docId,
			//(int32_t)k->m_docQuality,
			(int32_t)k->m_siteRank,
			(int32_t)k->m_numOutlinks ,
			iptoa(k->m_ip,ipbuf),
			(int32_t)k->m_siteNumInlinks,
			//(int32_t)k->m_isAnomaly,
			k->getUrl(), // the linker url
			s,
			d,
			buf3b.getBufStart(),
			g,
			c
			);
	}
	return true;
}


bool LinkInfo::hasRSSItem() {
	for ( Inlink *k=NULL;(k=getNextInlink(k));)
		// rss item?
		if ( k->size_rssItem > 10 ) return true;
	return false;
}


///////////////
//
// LINKS CLASS
//
///////////////

// . use siteRec to set m_extractRedirects from <extractRedirectsFromLinks>
// . this is used because yahoo has links like:
//   yahoo.com/drst/pop/2/10/576995/37640326/*http://www.m.com/
// . we need yahoo's link: terms to be more precise for our ban/unban algorithm

static int32_t getLinkBufferSize(int32_t numLinks);


Links::Links() {
	m_allocBuf = NULL;
	m_allocSize = 0;
	m_linkBuf = NULL;
	m_allocLinks = 0;
	m_spamNote = NULL;
	m_numLinks = 0;
	m_baseUrl = NULL;
	m_numOutlinksAdded = 0;
	m_doQuickSet = false;

	// Coverity
	m_parentUrl = NULL;
	m_xml = NULL;
	m_parentIsPermalink = false;
	m_buf = NULL;
	m_bufPtr = NULL;
	m_linkPtrs = NULL;
	m_linkLens = NULL;
	m_linkNodes = NULL;
	m_linkHashes = NULL;
	m_hostHashes = NULL;
	m_domHashes = NULL;
	m_linkFlags = NULL;
	m_spamNotes = NULL;
	m_hasRSS = false;
	m_isFeedBurner = false;
	m_numNodes = 0;
	m_hasRelNoFollow = false;
	m_stripParams = false;
	m_addSiteRootFlags = false;
	m_coll = NULL;
	m_flagged = false;
	m_hasSelfPermalink = false;
	m_hasRSSOutlink = false;
	m_hasSubdirOutlink = false;
	m_rssOutlinkPtr = NULL;
	m_rssOutlinkLen = 0;
}


Links::~Links() {
	reset();
}


void Links::reset() {
	if (m_allocBuf) mfree(m_allocBuf, m_allocSize, "Links");
	m_allocBuf = NULL;
	m_allocSize = 0;
	if (m_linkBuf) mfree(m_linkBuf, getLinkBufferSize(m_allocLinks),
			     "Links");
	m_linkBuf = NULL;
	m_allocLinks = 0;
	m_spamNote = NULL;
	m_numLinks = 0;
	m_flagged = false;
	m_hasRSS = false;
	m_isFeedBurner = false;
	m_hasSelfPermalink = false;
	m_hasRSSOutlink = false;
	m_hasSubdirOutlink = false;
	m_rssOutlinkPtr = NULL;
}


bool Links::set(bool useRelNoFollow,
		Xml *xml, Url *parentUrl,
		// use null for this if you do not want to use it
		Url *baseUrl,
		int32_t version,
		bool parentIsPermalink,
		const Links *oldLinks,
		bool doQuickSet)
{
	reset();

	// always force this to true now since we need the hashes for linkdb
	bool setLinkHash = true;
	if ( doQuickSet ) setLinkHash = false;

	m_xml = xml;
	m_baseUrl = parentUrl;
	m_parentUrl = parentUrl;
	m_doQuickSet = doQuickSet;
	m_parentIsPermalink = parentIsPermalink;

	m_numLinks = 0;
	m_numNodes = xml->getNumNodes();
	m_bufPtr = NULL;

	m_hasRelNoFollow = false;

	// ok, let's remove it for the links: hashing, it just makes more
	// sense this way i think. we can normalize the links: terms in the
	// query if you are worried about it.
	m_stripParams = true;

	// get the <base href=> tag if any (12)
	if ( baseUrl ) m_baseUrl = baseUrl;

	// visit each node in the xml tree. a node can be a tag or a non-tag.
	for ( int32_t i=0; i < m_numNodes ; i++ ) {
		// . continue if this tag ain't an <a href> tag
		// . atom feeds have a <link href=""> field in them
		int32_t id = xml->getNodeId ( i );

		// reset
		linkflags_t flags = 0;

		if ( id != TAG_A &&
		     id != TAG_LINK && // rss feed url
		     id != TAG_LOC && // sitemap.xml url
		     id != TAG_AREA &&
		     id != TAG_ENCLOSURE &&
		     id != TAG_WEBLOG &&
		     id != TAG_URLFROM && // <UrlFrom> for ahrefs.com
		     id != TAG_FBORIGLINK )
			continue;

		const char *urlattr = "href";
		if (id == TAG_WEBLOG) {
			urlattr = "url";
		}

		if ( id == TAG_FBORIGLINK ) m_isFeedBurner = true;

		// if it's a back tag continue
		if ( xml->isBackTag ( i ) ) {
			continue;
		}

		// rel attribute
		int32_t slen = 0;
		const char *s = xml->getString ( i , "rel", &slen ) ;
		if (useRelNoFollow && slen == 8 && strncasecmp(s, "nofollow", 8) == 0) {
			// if this flag is set then ::hasSpamLinks() will always
			// return false. the site owner is taking the necessary
			// precautions to prevent link spam.
			m_hasRelNoFollow = true;

			// . do not ignore it now, just flag it
			// . fandango has its ContactUs with a nofollow!
			flags |= LF_NOFOLLOW;
		}

		// only allow specific link rel (whitelist)
		// http://microformats.org/wiki/existing-rel-values#HTML5_link_type_extensions
		if (id == TAG_LINK) {
			bool allowed = false;

			switch (slen) {
				case 4:
					if (strncasecmp(s, "feed", 4) == 0 ||
					    strncasecmp(s, "home", 4) == 0) {
						allowed = true;
					}
					break;
				case 7:
					if (strncasecmp(s, "sitemap", 7) == 0) {
						allowed = true;
					}
					break;
				case 9:
					if (strncasecmp(s, "alternate", 9) == 0 ||
					    strncasecmp(s, "canonical", 9) == 0) {
						allowed = true;
					}
					break;
				default:
					break;
			}

			if (!allowed) {
				continue;
			}
		}

		// get the href field of this anchor tag
		int32_t linkLen;
		const char *link = xml->getString ( i, urlattr, &linkLen );

		// if no href, but we are a <link> tag then the url may
		// follow, like in an rss feed.
		if ( linkLen==0 &&
		     (id == TAG_LINK ||
		      id == TAG_LOC || // sitemap.xml urls
		      id == TAG_URLFROM ||
		      id == TAG_FBORIGLINK) ) {
			// get the <link> node
			const char *node = xml->getNode(i);
			int32_t nodeLen = xml->getNodeLen(i);
			// but it must NOT end in "/>" then
			if ( node[nodeLen-2] == '/' ) continue;
			// expect the url like <link> url </link> then
			if ( i+2 >= m_numNodes ) continue;
			if ( xml->getNodeId(i+2) != id ) continue;
			if ( ! xml->isBackTag(i+2) ) continue;
			// ok assume url is next node
			link = xml->getNode(i+1);
			linkLen = xml->getNodeLen(i+1);
			// watch out for CDATA
			if ( linkLen > 12 &&
			     strncasecmp(link, "<![CDATA[", 9) == 0 ) {
				link += 9;
				linkLen -= 12;
			}
		}

		// . it doesn't have an "href" field (could be "name" field)
		// . "link" may not be NULL if empty, so use linkLen
		if ( linkLen == 0 )
			continue;

		// skip spaces in the front (should be utf8 compatible)
		while ( linkLen > 0 && is_wspace_a(*link) ) {
			link++;
			linkLen--;
		}

		// don't add this link if it begins with javascript:
		if ( linkLen >= 11 && strncasecmp (link,"javascript:",11) ==0){
			// well... a lot of times the provided function has
			// the url as an arg to a popup window
			int32_t oclen = 0;
			const char *oc = xml->getString(i,"onclick",&oclen);
			// if none, bail
			if ( ! oc ) continue;
			// set end
			const char *ocend = oc + oclen - 2;
			const char *ocurl = nullptr;
			// scan for "'/" which should indicate the url
			for ( ; oc < ocend ; oc++ ) {
				if ( *oc !='\'' ) continue;
				if ( oc[1]!='/' ) continue;
				// set the start
				ocurl = oc + 1;
				// and stop the scan
				break;
			}
			// if none, bail
			if ( ! ocurl ) continue;
			// now find the end of the url
			const char *ocurlend = ocurl + 1;
			for ( ; ocurlend < ocend ; ocurlend++ )
				if ( *ocurlend == '\'' ) break;
			// assign it now
			link = ocurl;
			linkLen = ocurlend - ocurl;
			// and continue
		}

		if ( linkLen == 0 )
			continue;

		// it's a page-relative link
		if ( link[0]=='#' ) continue;

		// ignore mailto: links
		if ( linkLen >= 7 && strncasecmp( link , "mailto:" , 7 ) == 0 )
			continue;

		// if we have a sequence of alnum chars (or hyphens) followed
		// by a ':' then that is a protocol. we only support http and
		// https protocols right now. let "p" point to the ':'.
		const char *p = link;
		int32_t pmaxLen = linkLen;
		if ( pmaxLen > 20 ) pmaxLen = 20;
		const char *pend = link + pmaxLen;
		while ( p < pend && (is_alnum_a(*p) || *p=='-') ) p++;

		// is the protocol, if it exists, a valid one like http or
		// https? if not, ignore it. we only support FQDNs
		// (fully qualified domain names) here really. so if you
		// have something like mylocalhostname:8000/ it is not going
		// to work anymore. you would need "use /etc/hosts" enabled
		// for that to work, too.
		bool proto = true;
		if ( p < pend && *p == ':' ) {
			proto = false;
			int32_t plen = p - link;
			if ( plen == 4 && strncasecmp(link,"http" ,plen) == 0 )
				proto = true;
			if ( plen == 5 && strncasecmp(link,"https",plen) == 0 )
				proto = true;
		}

		// skip if proto invalid like callto:+4355645998 or
		// mailto:jimbob@hoho.com
		if ( ! proto ) continue;

		// add it
		char ptmp [ MAX_URL_LEN + 1 + 1 ];
		// keep an underpad of 1 byte in case we need to prepend a /
		char *tmp = ptmp + 1;
		if ( linkLen > MAX_URL_LEN ) {
			// only log this once just so people know, don't spam
			// the log with it.
			static bool s_flag = true;
			if (s_flag) {
				s_flag = false;
				log(LOG_INFO, "build: Link len %" PRId32" is longer "
				    "than max of %" PRId32". Link will not "
				    "be added to spider queue or "
				    "indexed for link: search.",
				    linkLen,(int32_t)MAX_URL_LEN);
			}
			continue;
		}

		// see if the <link> tag has a "type" field
		bool isRSS = false;

		int32_t typeLen;
		const char *type = xml->getString(i, "type", &typeLen );
		// . MDW: imported from Xml.cpp:
		// . check for valid type:
		//   type="application/atom+xml" (atom)
		//   type="application/rss+xml"  (RSS 1.0/2.0)
		//   type="application/rdf+xml"  (RDF)
		//   type="text/xml"             (RSS .92) support?
		if (type) {
			if (strncasecmp(type, "application/atom+xml", 20) == 0 ||
			    strncasecmp(type, "application/rss+xml", 19) == 0 ||
			    strncasecmp(type, "text/xml", 8) == 0) {
				isRSS = true;
			}
		}

		// make sure we got rel='alternate' or rel="alternate", etc.
		if (isRSS) {
			int32_t relLen = 0;
			const char *rel = xml->getString(i, "rel", &relLen);

			if (rel) {
				if (strncasecmp(rel, "alternate", 9) != 0) {
					isRSS = false;
				}

				// skip if a reply! rss feeds have these links to comments so just ignore them for now
				// http://dancleary.blogspot.com/feeds/posts/default uses edit:
				if ((relLen == 7 && strncasecmp(rel, "replies", 7) == 0) ||
				    (relLen == 4 && strncasecmp(rel, "edit", 4) == 0)) {
					continue;
				}
			}
		}

		// store it
		if ( isRSS ) m_hasRSS = true;

		//TODO: should we urlEncode here?
		// i didn't know this, but links can have encoded html entities
		// like & and > etc. in them and we have to decode.
		// assign the new decoded length.
		// this is not compatible with m_doQuickSet because we store
		// the "link" ptr into the array of link ptrs, and this uses
		// the "tmp" buf.
		// nono, we need this now otherwise it hits that linkNode<0
		// error msg in XmlDoc.cpp. but for Msg13 spider compression
		// you might want to do something else then i guess...
		linkLen = htmlDecode( tmp, link, linkLen, false );

		// use tmp buf
		link = tmp;

		if (!addLink(link, linkLen, i, setLinkHash, version, isRSS, id, flags)) {
			return false;
		}
	}

	// . flag the links we have that are old (spidered last time)
	// . set LF_OLDLINK flag
	return flagOldLinks ( oldLinks );
}


// just a NULL-terminated text buffer/file of links to add
bool Links::set(const char *buf) {
	reset();
	// need "coll" for Url::isSiteRoot(), etc.
	//m_coll = coll;
	m_parentUrl = NULL;
	m_baseUrl = NULL;
	m_addSiteRootFlags = false;
	m_xml = NULL;
	const char *p = buf;
	while ( *p ) {
		// skip spaces
		while ( *p && is_wspace_a(*p) ) p++;
		// get the length of the link
		const char *q = p;
		while ( *q && ! is_wspace_a(*q) ) q++;
		int32_t len = q - p;
		// add the link
		if ( ! addLink ( p , len , -1 , true ,
				 TITLEREC_CURRENT_VERSION , false,
				 TAG_A , 0 ) )
			return false;
		// advance
		p = q;
	}
	// assume none are flagged as old, LF_OLDLINK
	m_flagged = true;
	return true;
}
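// usage sketch (hypothetical input): the buffer is just urls separated
// by whitespace, e.g.
//   Links links;
//   links.set("http://example.com/ http://example.com/a.html");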


bool Links::print(SafeBuf *sb) {
	sb->safePrintf(
		"<table cellpadding=3 border=1>\n"
		"<tr>"
		"<td>#</td>"
		"<td colspan=40>"
		// table header row
		"Outlink"
		"</td>"
		"</tr>"
		);
	// print each outlink
	int32_t i;
	for ( i = 0 ; i < m_numLinks ; i++ ) {
		char *link = getLinkPtr(i);
		int32_t linkLen = getLinkLen(i);
		sb->safePrintf("<tr><td>%" PRId32"</td><td>",i);
		sb->safeMemcpy(link,linkLen);
		sb->safePrintf("</td></tr>\n");
	}
	sb->safePrintf("</table>\n<br>\n");
	return true;
}


bool Links::addLink(const char *link, int32_t linkLen, int32_t nodeNum,
		    bool setLinkHash, int32_t titleRecVersion,
		    bool isRSS, int32_t tagId,
		    int32_t flagsArg )
{
#ifdef _VALGRIND_
	VALGRIND_CHECK_MEM_IS_DEFINED(link,linkLen);
#endif

	// don't add 0 length links
	if ( linkLen <= 0 ) return true;

	// do we need to alloc more link space?
	if (m_numLinks >= m_allocLinks) {
		int32_t newAllocLinks;

		if      (!m_allocLinks)       newAllocLinks = 10000;
		else if (m_allocLinks<100000) newAllocLinks = m_allocLinks*2;
		else                          newAllocLinks = m_allocLinks+100000;

		// how much mem do we need for newAllocLinks links?
		int32_t newAllocSize = getLinkBufferSize(newAllocLinks);

		char *newBuf = (char*)mmalloc(newAllocSize, "Links");
		if (!newBuf) return false;

		// a ptr to it
		char *p = newBuf;
		// debug msg
		log(LOG_DEBUG, "build: resizing Links ptr buffer to %" PRId32,
		    newAllocSize);

		char **newLinkPtrs = (char**)p;
		p += newAllocLinks * sizeof(char *) ;

		int32_t *newLinkLens = (int32_t*)p;
		p += newAllocLinks * sizeof(int32_t) ;

		int32_t *newLinkNodes = (int32_t*)p;
		p += newAllocLinks * sizeof(int32_t) ;

		uint64_t *newLinkHashes = (uint64_t *)p;
		p += newAllocLinks * sizeof(uint64_t) ;

		uint64_t *newHostHashes = (uint64_t *)p;
		p += newAllocLinks * sizeof(uint64_t) ;

		int32_t *newDomHashes = (int32_t *)p;
		p += newAllocLinks * sizeof(int32_t);

		linkflags_t *newLinkFlags = (linkflags_t *)p;
		p += newAllocLinks * sizeof(linkflags_t) ;

		const char **newSpamNotes = (const char **)p;
		p += newAllocLinks * sizeof(char **);

		// sanity check -- check for breach
		if ( p > newBuf + newAllocSize ) { g_process.shutdownAbort(true); }

		if (m_linkBuf){
			gbmemcpy(newLinkPtrs, m_linkPtrs,
				 m_numLinks * sizeof(char*));
			gbmemcpy(newLinkLens, m_linkLens,
				 m_numLinks * sizeof(int32_t));
			gbmemcpy(newLinkNodes, m_linkNodes,
				 m_numLinks * sizeof(int32_t));
			gbmemcpy(newLinkHashes, m_linkHashes,
				 m_numLinks * sizeof(uint64_t));
			gbmemcpy(newHostHashes, m_hostHashes,
				 m_numLinks * sizeof(uint64_t));
			gbmemcpy(newDomHashes, m_domHashes,
				 m_numLinks * sizeof(int32_t));
			gbmemcpy(newLinkFlags, m_linkFlags,
				 m_numLinks * sizeof(linkflags_t));
			gbmemcpy(newSpamNotes,m_spamNotes,
				 m_numLinks * sizeof(char *));
			int32_t oldSize = getLinkBufferSize(m_allocLinks);
			mfree(m_linkBuf, oldSize, "Links");
		}
		m_allocLinks = newAllocLinks;
		m_linkBuf    = newBuf;
		m_linkPtrs   = newLinkPtrs;
		m_linkLens   = newLinkLens;
		m_linkNodes  = newLinkNodes;
		m_linkHashes = newLinkHashes;
		m_hostHashes = newHostHashes;
		m_domHashes  = newDomHashes;
		m_linkFlags  = newLinkFlags;
		m_spamNotes  = newSpamNotes;
	}
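	// growth policy recap: start with 10000 link slots, double until
	// 100000, then grow by 100000 per resize; all eight parallel
	// arrays live in one allocation carved up by the pointers above.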

	// normalize the link and prepend base url if needed
	Url url;

	if (titleRecVersion >= 125) {
		// strip ending spaces
		while (linkLen > 0 && is_wspace_a(link[linkLen - 1])) {
			--linkLen;
		}
	}

	/////
	//
	// hack fix. if link has spaces in it convert to +'s.
	// will fix urls like those in anchor tags on
	// http://www.birmingham-boxes.co.uk/catagory.asp
	//
	/////
	bool hasSpaces = false;
	char tmp[MAX_URL_LEN+2];
	for ( int32_t k = 0 ; k < linkLen ; k++ ) {
		if ( link[k] == ' ' ) hasSpaces = true;
		// watch out for unterminated quotes
		if ( link[k] == '>' ) { hasSpaces = false; break; }
	}
	bool hitQuestionMark = false;
	int32_t src = 0;
	int32_t dst = 0;
	for ( ;hasSpaces && linkLen<MAX_URL_LEN && src<linkLen ; src++ ){
		// if not enough buffer then we couldn't do the conversion.
		if ( dst+3 >= MAX_URL_LEN ) { hasSpaces = false; break; }
		if ( link[src] == '?' )
			hitQuestionMark = true;
		if ( link[src] != ' ' ) {
			tmp[dst++] = link[src];
			continue;
		}
		// if we are part of the cgi stuff, use +
		if ( hitQuestionMark ) {
			tmp[dst++] = '+';
			continue;
		}
		// if before the '?' then use %20
		tmp[dst++] = '%';
		tmp[dst++] = '2';
		tmp[dst++] = '0';
	}
	if ( hasSpaces ) {
		link = tmp;
		linkLen = dst;
		tmp[dst] = '\0';
	}
|
|
|
|
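	// Worked example of the space fix above (hypothetical input): the
	// raw href value
	//     /photo gallery/view.asp?cat=big boxes
	// becomes
	//     /photo%20gallery/view.asp?cat=big+boxes
	// i.e. %20 in the path portion and '+' once a '?' has been seen.
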
	url.set( m_baseUrl, link, linkLen, false, m_stripParams,
		 // convert /index.html to /
		 // turned this back on per john's request
		 // will cause undeletable data in existing indexes.
		 true,
		 titleRecVersion );

	// sometimes there are links like
	// http://'+ycso[8]+ \n'commentsn?blog_id=... which sit within
	// <script></script> tags
	if ( url.getDomainLen() <= 0 || url.getHostLen() <= 0 ) return true;

	// stop http://0x0017.0000000000000000000000000000000000000024521276/
	// which somehow makes it through without this!!
	if ( ! url.isIp() && url.getTLDLen() <= 0 ) return true;

	// Allocate more link buffer space?
	int32_t bufSpace;
	if ( m_allocBuf ) bufSpace = m_allocSize - (m_bufPtr - m_allocBuf);
	else bufSpace = 0;
	// allocate a dynamic buffer to hold lots of links
	if ( url.getUrlLen() + 1 > bufSpace ) {
		// double the text buffer up to 1MB, then grow by 1MB at a time
		int32_t newAllocSize;
		if ( ! m_allocSize ) newAllocSize = LINK_BUF_SIZE;
		else if (m_allocSize < 1024*1024) newAllocSize = m_allocSize*2;
		else newAllocSize = m_allocSize + 1024*1024;
		// MDW: a realloc would be more efficient here.
		char *newBuf = (char*)mmalloc(newAllocSize, "Links");
		if ( ! newBuf ) {
			log(LOG_WARN, "build: Links failed to realloc.");
			return false;
		}
		log(LOG_DEBUG, "build: resizing Links text buffer to %" PRId32,
		    newAllocSize);
		if ( m_allocBuf ) {
			gbmemcpy ( newBuf , m_allocBuf , m_allocSize );
			// update pointers into the previous buffer
			int64_t offset = newBuf - m_allocBuf;
			char *allocEnd = m_allocBuf + m_allocSize;
			for (int32_t i = 0 ; i < m_numLinks ; i++ ) {
				if ( m_linkPtrs[i] < m_allocBuf ) continue;
				if ( m_linkPtrs[i] >= allocEnd ) continue;
				m_linkPtrs[i] += offset;
			}
			m_bufPtr += offset;
			mfree ( m_allocBuf , m_allocSize , "Links");
		}
		else m_bufPtr = newBuf;

		m_allocBuf = newBuf;
		m_allocSize = newAllocSize;
	}

	// record the ptr, length and node number for this link
	m_linkPtrs  [ m_numLinks ] = m_bufPtr;
	m_linkLens  [ m_numLinks ] = url.getUrlLen();
	m_linkNodes [ m_numLinks ] = nodeNum;
	// serialize the normalized link into the buffer
	gbmemcpy ( m_bufPtr , url.getUrl(), url.getUrlLen() );
	m_bufPtr += url.getUrlLen();

	// and NULL terminate it
	*m_bufPtr++ = '\0';

	logTrace(g_conf.m_logTraceMsg25, "Stored link: [%s]", m_linkPtrs[m_numLinks]);


	// . set link hash if we need to
	// . the Vector class uses these link hashes for determining the
	//   similarity of this document to another for purposes of fighting
	//   link spam
	// . we essentially compare the linking web pages against one another
	//   and if we find one that is similar to another we weight its
	//   link text down. The more similar, the bigger the penalty. We just
	//   see what links it has in common with the others for now...
	if ( setLinkHash ) {
		// sanity
		if ( m_doQuickSet ) { g_process.shutdownAbort(true); }
		// get url length
		int32_t ulen = url.getUrlLen();
		// subtract the cgi length
		if ( url.isCgi() ) ulen -= 1 + url.getQueryLen();
		// store its hashes
		m_linkHashes [ m_numLinks ] = url.getUrlHash64();
		m_hostHashes [ m_numLinks ] = url.getHostHash64();
		m_domHashes  [ m_numLinks ] = url.getDomainHash32();
#ifdef _VALGRIND_
		VALGRIND_CHECK_MEM_IS_DEFINED(&(m_linkHashes[m_numLinks]),sizeof(m_linkHashes[m_numLinks]));
		VALGRIND_CHECK_MEM_IS_DEFINED(&(m_hostHashes[m_numLinks]),sizeof(m_hostHashes[m_numLinks]));
		VALGRIND_CHECK_MEM_IS_DEFINED(&(m_domHashes[m_numLinks]), sizeof(m_domHashes[m_numLinks]));
#endif
	}

	// set the bits in the flags byte
	linkflags_t flags = flagsArg;
	// set LF_SAMEHOST if it is an "internal" link -- from the same hostname
	if ( m_baseUrl && url.getHostLen() == m_baseUrl->getHostLen() &&
	     strncmp(url.getHost(),m_baseUrl->getHost(),url.getHostLen())==0){
		flags |= LF_SAMEHOST; //0x01
		flags |= LF_SAMEDOM;  //0x02
	}
	else if (m_baseUrl && url.getDomainLen() == m_baseUrl->getDomainLen() &&
		 strncmp(url.getDomain(),m_baseUrl->getDomain(),
			 url.getDomainLen())==0) {
		flags |= LF_SAMEDOM;
		// . memoori was adding www.construction.com which redirected
		//   to construction.com/index.asp and it did not add the
		//   outlinks because "spider internal links only" was true.
		//   i.e. Msg16::m_sameHostLinks was true
		// . if not same host but domains match, consider it internal
		//   if hosts only differ by a www. this should fix that.
		if ( m_baseUrl->isHostWWW() && !url.hasSubdomain() )
			flags |= LF_SAMEHOST;
		if ( url.isHostWWW() && !m_baseUrl->hasSubdomain() )
			flags |= LF_SAMEHOST;
	}

	const char *tld = url.getTLD();
	int32_t tlen = url.getTLDLen();
	if ( tlen == 3 && ! strncmp(tld,"edu",3) ) flags |= LF_EDUTLD;
	if ( tlen == 3 && ! strncmp(tld,"gov",3) ) flags |= LF_GOVTLD;

	// rss?
	if ( isRSS ) {
		// flag it
		flags |= LF_RSS;
		// we had one
		m_hasRSSOutlink = true;
		// store the first one
		if ( ! m_rssOutlinkPtr ) {
			m_rssOutlinkPtr = m_linkPtrs[m_numLinks];
			m_rssOutlinkLen = m_linkLens[m_numLinks];
		}
	}

	if ( tagId == TAG_A ) flags |= LF_AHREFTAG;
	else if ( tagId == TAG_LINK ) flags |= LF_LINKTAG;
	else if ( tagId == TAG_FBORIGLINK ) flags |= LF_FBTAG;

	// a self link? (an exact match with the parent url)
	if ( m_parentUrl &&
	     url.getUrlLen() == m_parentUrl->getUrlLen() &&
	     strncmp(url.getUrl(), m_parentUrl->getUrl(),
		     m_parentUrl->getUrlLen())==0)
	{
		flags |= LF_SELFLINK;
		// turn this flag on
		if ( nodeNum >= 0 ) m_xml->getNodePtr(nodeNum)->m_isSelfLink = 1;
	}

	// same site?
	if (m_parentUrl) {
		SiteGetter parentSiteGetter;
		parentSiteGetter.getSite(m_parentUrl->getUrl(), nullptr, 0, 0);
		SiteGetter siteGetter;
		siteGetter.getSite(url.getUrl(), nullptr, 0, 0);

		if (strcmp(siteGetter.getSite(), parentSiteGetter.getSite()) == 0) {
			flags |= LF_SAMESITE;
		}
	}

	// now check for the "permalink" keyword or "permanent link" keyphrase
	// TEST CASES:
	// http://www.celebritybabyscoop.com/2008/12/28/jennifer-garner-is-still-pregnant/ + fp_1765421_garner_jennifer_znk_122808jpg/
	// http://www.celebritybabyscoop.com/2008/12/27/gwen-stefani-family-spread-holiday-cheer/
	// http://www.thetrendwatch.com/2008/12/22/how-big-shows-are-becoming-utterly-blaze/ + events-are-boring/
	// http://thinkprogress.org/2008/12/26/bush-pardon-campaign/
	if ( ( flags & LF_SELFLINK ) && ( flags & LF_AHREFTAG ) &&
	     // must be valid
	     nodeNum >= 0 )
	{
		XmlNode *nodes = m_xml->getNodes();
		// find the closing </a> (or the next <a>) within 20 nodes
		int32_t max = nodeNum + 20;
		if ( max > m_xml->getNumNodes() )
			max = m_xml->getNumNodes();
		int32_t nn = nodeNum + 1;
		while ( nn < max && nodes[nn].m_nodeId != TAG_A ) nn++;
		if ( nn < max ) {
			char *s = nodes[nodeNum].m_node;
			char *send = nodes[nn].m_node;
			for ( ; s < send ; s++ ) {
				if ( *s != 'p' && *s != 'P' ) continue;
				if ( ! strncasecmp(s,"permalink",9) )
					break;
				if ( ! strncasecmp(s,"permanent link",14) )
					break;
			}
			if ( s < send ) {
				flags |= LF_SELFPERMALINK;
				m_hasSelfPermalink = true;
			}
		}
	}

	// get each url length without the cgi
	int32_t len1 = url.getUrlLen() - url.getQueryLen();
	int32_t len2 = 0;
	if ( m_parentUrl )
		len2 = m_parentUrl->getUrlLen() - m_parentUrl->getQueryLen();
	// discount the '?' cuz it is not included in the queryLen right now
	if ( url.getQueryLen() ) len1--;
	if ( m_parentUrl && m_parentUrl->getQueryLen() ) len2--;

	// . is it in a subdir of us?
	// TEST CASES:
	// http://joedecie.livejournal.com/28834.html?thread=167074#t167074
	if ( m_parentUrl &&
	     // the parent url MUST be a PROPER prefix; links to itself do
	     // not count!
	     len1 > len2 &&
	     strncmp(url.getUrl(), m_parentUrl->getUrl(),len2)==0) {
		flags |= LF_SUBDIR;
		m_hasSubdirOutlink = true;
	}


	// FIXME:
	// http://www.packers.com/news/releases/2008/12/24/1/email_to_a_friend/

	// FIXME:
	// only has one hyphen but is indeed a permalink!
	// http://marccooper.com/xmas-vacation/

	// TEST CASES:
	// href="...ami.php?url=http://lewebpedagogique.com/blog/2008/11/16/
	// la-seconde-guerre-mondiale-cours ... will have its cgi ignored so
	// such links as this one will not be considered permalinks
	const char *pathOverride = NULL;
	bool ignoreCgi = false;
	if ( (flags & LF_SUBDIR) && m_parentIsPermalink ) {
		pathOverride = url.getUrl() + m_parentUrl->getUrlLen();
		// must be same host
		if ( m_parentUrl->getHostLen() != url.getHostLen() )
			pathOverride = NULL;
		// same host str check
		else if ( strncmp( m_parentUrl->getHost() ,
				   url.getHost() ,
				   url.getHostLen()) != 0 )
			pathOverride = NULL;
		// must be in bounds
		else if ( url.getUrlLen() <= m_parentUrl->getUrlLen() )
			pathOverride = NULL;
		// if the parent is a permalink, ignore the cgi when deciding
		// if this outlink is a permalink too
		if ( pathOverride ) ignoreCgi = true;
	}

	// . if it is a subset of a permalink parent, it is not a "true"
	//   permalink. if it concatenates the word "comment" onto the parent
	//   url, it is likely a permalink for a comment, which does not
	//   really count.
	// TEST CASES:
	// www.flickr.com/photos/korayem/2947977582/comment72157608088269210/
	// robertsquier.blogspot.com/2008/10/drawing-tv.html?showComment=.2..
	// profootballtalk.com/2008/12/28/vikings-win-nfc-north/comment-page-1/
	bool permCheck = true;
	if ( m_doQuickSet ) permCheck = false;
	if ( permCheck && ignoreCgi && strstr(pathOverride,"comment") )
		permCheck = false;
	if ( permCheck && ignoreCgi && strstr(pathOverride,"Comment") )
		permCheck = false;
	if ( permCheck && ignoreCgi && strstr(pathOverride,"COMMENT") )
		permCheck = false;

	// . are we probably a permalink?
	// . we do not have a TagRec at this point so we just can not
	//   tell whether we are siteRoot, and therefore NOT a permalink
	linkflags_t extraFlags = 0;
	if ( permCheck &&
	     ::isPermalink ( NULL ,          // Links ptr
			     &url ,          // the url
			     CT_HTML ,       // contentType
			     NULL ,          // LinkInfo ptr
			     isRSS ,
			     NULL ,          // note ptr
			     pathOverride ,
			     ignoreCgi ,
			     // might include LF_STRONGPERM
			     &extraFlags ) )
	{
		flags |= LF_PERMALINK;
		flags |= extraFlags;
	}

	// set in flag array
	m_linkFlags [ m_numLinks ] = flags;

	// set to NULL for now -- call setLinkSpam() later...
	m_spamNotes [ m_numLinks ] = NULL;

	// inc the count
	m_numLinks++;
	return true;
}

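// A minimal sketch of consuming the per-link data recorded above
// (hypothetical call site; the real consumers live elsewhere in the
// build pipeline): once the parser has fed every outlink in, the
// parallel arrays can be scanned by index.
//
//   for ( int32_t i = 0 ; i < links->m_numLinks ; i++ ) {
//       if ( links->m_linkFlags[i] & LF_SAMEHOST ) continue; // internal
//       // external outlink: m_linkPtrs[i] is a NUL-terminated url
//       log("found external outlink %s", links->m_linkPtrs[i]);
//   }
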
// . does link #n have link text?
// . link text must have at least one alnum in it
bool Links::hasLinkText(int32_t n) {
	// out of range?
	if ( n >= m_numLinks ) return false;
	// get the node range so we can call Xml::getText()
	int32_t node1 = m_linkNodes [ n ];

	// post-dating this change back to version 75, since it happened
	// sometime right before this version bump, it allows for
	// the least amount of docs to be indexed wrong
	// only for <a> tags
	if (node1 >= m_xml->getNumNodes()) return false;
	if (m_xml->getNodeId(node1) != TAG_A) return false;

	// find the </a> to this <a href> tag, or next <a href> tag
	int32_t node2 = m_xml->getNodeNum ( node1+1,9999999,"a",1);
	// if not found use the last node in the document
	if ( node2 < 0 ) node2 = m_xml->getNumNodes();
	// check for a text node in the (node1,node2) range
	for ( int32_t i = node1+1 ; i < node2 ; i++ ) {
		// continue if a tag
		if ( m_xml->isTag(i) ) continue;
		// otherwise, it's text
		char *s = m_xml->getNode (i);
		char *send = s + m_xml->getNodeLen(i);
		// . does it have any alnums in it?
		// . may be tricked by html entities like #187; or something
		for ( ; s < send ; s += getUtf8CharSize(s) )
			if ( is_alnum_utf8 ( s ) ) return true;
	}
	// otherwise, we found no text node with an alnum
	return false;
}

// . stores link text into "buf" and returns the length
// . TODO: speed up so we don't have to set Url for every link in doc
int32_t Links::getLinkText(const char *linkee,
			   bool getSiteLinkInfo,
			   char *buf,
			   int32_t bufMaxLen,
			   char **itemPtr,
			   int32_t *itemLen,
			   int32_t *retNode1,
			   int32_t *retLinkNum,
			   int32_t *errCode)
{
	log(LOG_DEBUG, "build: Links::getLinkText: linkee=%s, getSiteLinkInfo: %s", linkee, getSiteLinkInfo?"true":"false");

	// assume no node
	if( retNode1 ) {
		*retNode1 = -1;
	}

	// assume no link text
	buf[0] = '\0';

	// assume no item
	if ( itemPtr ) {
		*itemPtr = NULL;
	}
	if ( itemLen ) {
		*itemLen = 0;
	}
	// assume no error
	if( errCode ) {
		*errCode = 0;
	}


	const char *no_www_linkee = NULL;

	// if it is site based, skip the protocol because the site might
	// be just a domain and not a subdomain
	if ( getSiteLinkInfo ) {
		const char *pp = strstr ( linkee, "://");
		// skip scheme
		if( pp ) {
			linkee = pp + 3;
		}

		// If linkee starts with www., keep a pointer to the domain WITHOUT the "www." prefix,
		// as we have seen a problem with links without www not being
		// found for pages even though linkdb says it has a link
		// ("build: Got linknode=-1 < 0. Cached linker AAA does not have outlink to
		// www.domain.com like linkdb says it should.") - therefore, we try both
		// for siteinfo requests (SiteGetter [stupidly] adds www for domains without it).
		pp = strstr(linkee, "www.");
		if( pp == linkee ) {
			no_www_linkee = pp + 4;
		}
	}

	int32_t linkeeLen = strlen(linkee);

	// find the link pointing to our url
	int32_t i;
	for ( i = 0 ; i < m_numLinks ; i++ ) {
		char *link = getLinkPtr(i);
		int32_t linkLen = getLinkLen(i);
		logTrace(g_conf.m_logTraceMsg25, "  Check %" PRId32 ": %.*s", i, linkLen, link);

		// now see if it's a full match
		// special case if site
		if ( getSiteLinkInfo ) {
			// Let's skip the protocol in the link too, to make
			// the check more accurate and not just a simple
			// strstr with no position check
			const char *prot = strstr(link, "://");
			if( prot ) {
				link = (char*)prot + 3;
			}

			const char *linkee_used = linkee;
			// See if link matches linkee (destination) with "www." prefix
			char *match = strstr(link, linkee);
			if( !match && no_www_linkee ) {
				// Nope. See if link matches linkee (destination) without "www." prefix
				linkee_used = no_www_linkee;
				match = strstr(link, no_www_linkee);
			}

			if( match ) {
				if( link - match == 0 ) {
					logTrace(g_conf.m_logTraceMsg25, "match for site check [%s] [%s]. usedNoWWW=%s", link, linkee_used, linkee_used==no_www_linkee?"true":"false");
					break;
				}
				logTrace(g_conf.m_logTraceMsg25, "match but at wrong position (%d) for site check [%s] [%s]. usedNoWWW=%s", (int)(link-match), link, linkee_used, linkee_used==no_www_linkee?"true":"false");
			}

			logTrace(g_conf.m_logTraceMsg25, "no match site check [%s] [%s]", link, linkee);
			continue;
		}
		// continue if the lengths don't match
		if ( linkLen != linkeeLen ) {
			logTrace(g_conf.m_logTraceMsg25, "Length differs [%s] [%s]", link, linkee);
			continue;
		}
		// continue if the strings don't match
		if ( strcmp ( link , linkee ) != 0 ) {
			logTrace(g_conf.m_logTraceMsg25, "no match [%s] [%s]", link, linkee);
			continue;
		}
		// otherwise it's a hit
		logTrace(g_conf.m_logTraceMsg25, "MATCH [%s] [%s]", link, linkee);
		break;
	}

	// return 0 if no link to our "url"
	if ( i >= m_numLinks ) {
		logTrace(g_conf.m_logTraceMsg25, "NO match found, returning 0");
		return 0;
	}

	*retLinkNum = i;

	logTrace(g_conf.m_logTraceMsg25, "retLinkNum: %" PRId32 "", *retLinkNum);

	return getLinkText2(i,buf,bufMaxLen,itemPtr,itemLen,retNode1,errCode);
}

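// A minimal usage sketch (hypothetical call site): fetch the anchor text
// this document uses for a given target url. Note that retLinkNum is
// dereferenced unconditionally above, so it must not be NULL.
//
//   char text[256];
//   int32_t node1, linkNum, err;
//   int32_t tlen = links.getLinkText("http://example.com/page.html",
//                                    false,          // getSiteLinkInfo
//                                    text, sizeof(text)-1,
//                                    NULL, NULL,     // no rss item wanted
//                                    &node1, &linkNum, &err);
//   // tlen is 0 if this doc has no such outlink, or if the anchor
//   // text contains no alphanumeric characters.
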
int32_t Links::getLinkText2(int32_t i,
			    char *buf,
			    int32_t bufMaxLen,
			    char **itemPtr,
			    int32_t *itemLen,
			    int32_t *retNode1,
			    int32_t *errCode)
{
	logTrace(g_conf.m_logTraceMsg25, "Get link text for link %" PRId32 "", i);

	// get the node range so we can call Xml::getText()
	int32_t node1 = m_linkNodes [ i ];

	// . <area href=> tags have no link text
	// . fix for http://www.cs.umass.edu/%7Everts/index.html 's
	//   link to phdcomics.com . it was picking up bogus link text
	//   from the page tail.
	XmlNode *xmlNodes = m_xml->getNodes();
	if ( xmlNodes[node1].m_nodeId == TAG_AREA ) {
		logTrace(g_conf.m_logTraceMsg25, "END, Area tag link - ignoring");
		if( errCode ) {
			*errCode = ENOLINKTEXT_AREATAG;
		}
		return 0;
	}

	// what delimiter are we using? this only applies to rss/atom feeds.
	char del[16];
	int32_t dlen = 0;
	int32_t rss = m_xml->isRSSFeed();
	if ( rss == 1 ) {
		gbmemcpy(del, "item\0", 5);
		dlen = 4;
	}
	else if ( rss == 2 ) {
		gbmemcpy(del, "entry\0", 6);
		dlen = 5;
	}
	logTrace(g_conf.m_logTraceMsg25, "rss=%" PRId32 "", rss);

	// if rss or atom page, return the whole xml <item> or <entry>
	if ( dlen > 0 ) {
		// bail if not wanted
		if ( ! itemPtr ) {
			logTrace(g_conf.m_logTraceMsg25, "END, no itemPtr");
			return 0;
		}

		int32_t xmlNumNodes = m_xml->getNumNodes();

		// . must come from a <link> node, not an <a>
		// . can also be an <enclosure> tag now too
		if ( xmlNodes[node1].m_nodeId == TAG_A ) {
			goto skipItem;
		}

		// back-pedal over nodes until we hit the <item> or <entry> tag
		int32_t j;
		for ( j = node1 ; j > 0 ; j-- ) {
			// skip text nodes
			if ( xmlNodes[j].m_nodeId == TAG_TEXTNODE ) continue;
			// check the tag
			if(xmlNodes[j].m_tagNameLen != dlen) continue;
			if(strncasecmp(xmlNodes[j].m_tagName,del,dlen) != 0)
				continue;
			break;
		}
		// . if j is 0 we never found the <item> or <entry> tag
		//   because rss and atom feeds never start with such a tag
		// . but we could be in the <channel> section, which is ok
		//   so i commented this out
		//if ( j == 0 ) return 0;
		// ptr to the start of it
		char *s = xmlNodes[j].m_node;
		// save this
		if ( retNode1 ) *retNode1 = j;
		// the end ptr, defaulting to the end of the document
		const char *send = m_xml->getContent() + m_xml->getContentLen();
		// . start at the first tag in this element/item
		// . we will copy the blurb on the interval [j,k)
		for ( int32_t k = j+1 ; k < xmlNumNodes ; k++ ) {
			// get the next node in line
			XmlNode *nn = &xmlNodes[k];
			// break out if done (redundant with the loop
			// condition, kept for safety)
			if ( k >= xmlNumNodes ) break;
			// skip text nodes
			if ( nn->m_nodeId == TAG_TEXTNODE ) continue;
			// skip script sections, inside script tags
			if ( nn->m_nodeId == TAG_SCRIPTTEXT ) continue;
			if(nn->m_tagNameLen != dlen) continue;
			if(strncasecmp(nn->m_tagName,del,dlen) != 0) continue;
			// we got the end of the item, set "send"
			send = nn->m_node + nn->m_nodeLen;
			// and we're done, break out
			break;
		}
		// . if no closing tag was found, "send" stays at the end of
		//   the document
		// . so if the feed just had a <channel> section and no items
		//   we use the whole thing
		// this is a blurb, send it back as such
		*itemPtr = s;
		if( itemLen ) {
			*itemLen = send - s;
		}
		// rss feeds do not have conventional link text
		return 0;
	}
 skipItem:
	// find the </a> to this <a href> tag, or next <a href> tag
	int32_t node2 = m_xml->getNodeNum ( node1+1,9999999,"a",1);
	// if not found use the last node in the document
	if ( node2 < 0 ) node2 = 99999999;

	logTrace(g_conf.m_logTraceMsg25, "Getting text for Xml node %" PRId32 "", node1);

	int32_t bufLen = m_xml->getText( buf, bufMaxLen, node1, node2, false );
#ifdef _VALGRIND_
	VALGRIND_CHECK_MEM_IS_DEFINED(buf,bufLen);
#endif

	// set it
	if ( retNode1 ) {
		*retNode1 = node1;
		logTrace(g_conf.m_logTraceMsg25, "Setting retNode1 to %" PRId32 "", *retNode1);
	}

	// hunt for an alnum in the link text.
	// Slightly unusual looping because we may have cut the last utf8
	// character in half by using the limited buffer. This happens quite
	// often when the closing </a> tag is missing and the text is utf8.
	char *p = buf;
	char *pend = buf + bufLen;
	while ( p < pend ) {
		int character_len = getUtf8CharSize(p);
		if ( p+character_len > pend ) //truncated utf8 character
			break;
		if ( is_alnum_utf8(p) )
			break;
		p += character_len;
	}
	// if no alnum then return 0 as the link text len
	if ( p >= pend ) {
		logTrace(g_conf.m_logTraceMsg25, "END, No alnum match - returning 0");
		return 0;
	}

	// find the last non-space char
	char *q = p;
	char *last = NULL;
	while ( q < pend ) {
		int character_len = getUtf8CharSize(q);
		if ( q+character_len > pend ) //truncated utf8 character
			break;
		if ( ! is_wspace_utf8(q) )
			last = q;
		q += character_len;
	}
	// hack off trailing spaces
	if ( last ) pend = last + getUtf8CharSize(last);
	// shift left if we expunged some leading non-alnums
	memmove ( buf , p , pend - p );
	// reset buflen
	bufLen = pend - p;
	// null terminate
	buf [ bufLen ] = '\0';
	// return length
	logTrace(g_conf.m_logTraceMsg25, "END, returning length %" PRId32 "", bufLen);
	return bufLen;
}

int32_t Links::findLinkNum(char* url, int32_t urlLen) {
	for(int32_t i = 0; i < m_numLinks; i++) {
		if(m_linkLens[i] == urlLen &&
		   strncmp(url, m_linkPtrs[i], urlLen) == 0)
			return i;
	}
	return -1;
}

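// Usage note: findLinkNum() is a plain linear scan, so it costs
// O(m_numLinks) per call. E.g. (hypothetical):
//
//   int32_t n = links.findLinkNum(url, urlLen);
//   if ( n >= 0 ) flags = links.m_linkFlags[n];
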
// helper function for sizing the shared link ptr buffer
static int32_t getLinkBufferSize(int32_t numLinks){
	return numLinks *
	       (sizeof(char*       ) +  // linkPtrs
		sizeof(int32_t     ) +  // linkLens
		sizeof(int32_t     ) +  // linkNodes
		sizeof(uint64_t    ) +  // linkHashes
		sizeof(uint64_t    ) +  // hostHashes
		sizeof(int32_t     ) +  // domHashes
		sizeof(linkflags_t ) +  // linkFlags
		sizeof(char*       )    // spamNotes
	       );
}

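// The link "ptr buffer" sized by getLinkBufferSize() is one allocation
// holding eight parallel arrays, laid out back to back in the order the
// sizeof() terms appear above:
//
//   [ linkPtrs | linkLens | linkNodes | linkHashes | hostHashes |
//     domHashes | linkFlags | spamNotes ]
//
// addLink() carves the block up with a walking pointer in exactly this
// order, so the two functions must be kept in sync.
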
// returns false and sets g_errno on error
bool Links::flagOldLinks(const Links *old) {
	// do not double call
	if ( m_flagged ) return true;
	// only call once
	m_flagged = true;
	// skip if null
	if ( ! old ) return true;
	// hash the old links into a table
	HashTable ht;
	for ( int32_t i = 0 ; i < old->m_numLinks ; i++ ) {
		// get the url
		char *u = old->m_linkPtrs[i];
		int32_t ulen = old->m_linkLens[i];
		// hash it
		int64_t uh = hash32 ( u , ulen );
		// it does not like keys of 0, that means empty slot
		if ( uh == 0 ) uh = 1;
		// add to hash table
		if ( ! ht.addKey ( uh , 1 ) ) return false;
	}
	// set the flags
	for ( int32_t i = 0 ; i < m_numLinks ; i++ ) {
		// get the url
		char *u = m_linkPtrs[i];
		int32_t ulen = m_linkLens[i];
		// get our hash
		int64_t uh = hash32 ( u , ulen );
		// it does not like keys of 0, that means empty slot
		if ( uh == 0 ) uh = 1;
		// if our hash is not in the table, then it is a new link,
		// so skip it
		if ( ht.getSlot ( uh ) < 0 ) continue;
		// it was in the old doc too, flag it as an old link
		m_linkFlags[i] |= LF_OLDLINK;
	}
	return true;
}

// . are we a permalink?
// . known false positives that still register as permalinks:
//   http://www.dawn.com/2009/01/04/rss.htm
//   http://www.msnbc.msn.com/id/3032072
bool isPermalink(Links *links,
		 Url *u,
		 char contentType,
		 LinkInfo *linkInfo,
		 bool isRSS,
		 const char **note,
		 const char *pathOverride,
		 bool ignoreCgi,
		 linkflags_t *retFlags) {

	// reset. caller will OR these into its flags
	if ( retFlags ) *retFlags = 0;

	// how can this happen?
	if ( ! u ) return false;

	// rss feeds cannot be permalinks
	if ( isRSS ) {
		if ( note ) *note = "url is rss feed.";
		return false;
	}

	// root pages don't get to be permalinks
	if ( u->isRoot() ) {
		if ( note ) *note = "url is a site root";
		return false;
	}

	// are we a "site root" i.e. hometown.com/users/fred/ etc.
	//if ( u->isSiteRoot ( coll ) ) {
	//	if ( note ) *note = "url is a site root"; return false; }

	// only html (atom feeds link to themselves)
	if ( contentType != CT_HTML) {
		if ( note ) *note = "content is not html";
		return false;
	}

	// techcrunch has links like this in the rss:
	// http://feedproxy.google.com/~r/Techcrunch/~3/pMaRh78u1W8/
	if ( strncmp(u->getHost(),"feedproxy.",10)==0 ) {
		if ( note ) *note = "from feedproxy host";
		return true;
	}

	// might want to get <feedburner:origLink> instead of <link> if
	// we can. that would save a redirect through evil g
	if ( strncmp(u->getHost(),"feeds.feedburner.com/~",22)==0 ) {
		if ( note ) *note = "feedburner tilde url";
		return true;
	}


	// . BUT if it has a link to itself on digg, reddit, etc. then it
	//   doesn't need to have the digits or the underscores...
	// . this helps us distinguish between
	//   science.howstuffworks.com/fantasy-football.html (permalink) and
	//   science.howstuffworks.com/space-channel.htm (NOT a permalink)
	// . i guess this includes "post a comment" links that are just
	//   anchor links to the textarea at the page bottom so that will fix:
	//   http://workandplay.vox.com/library/post/running-again.html
	// . includes trackbacks, comments feed, etc.
	// . returns -1 if unknown whether it is a permalink or not
	char status = -1;
	if ( links ) status = links->isPermalink ( note );
	if ( status == 1 ) return true;
	if ( status == 0 ) return false;

	const char *pathStart = u->getPath();
	// a hack set by Links.cpp after setting LF_SUBDIR
	if ( pathOverride ) pathStart = pathOverride;

	// computed below
	linkflags_t extraFlags = 0;

	// we must have a sequence of 3 or more digits in the path
	const char *p = pathStart;
	int32_t plen = u->getPathLen();
	const char *pend = u->getPath() + plen;
	int32_t dcount = 0;
	// now we scan the cgi stuff too!!
	// http://www.rocklintoday.com/news/templates/sierra_college.asp?articleid=6848&zoneid=51
	// http://www.freemarketnews.com/WorldNews.asp?nid=57373
	const char *uend = u->getUrl() + u->getUrlLen();
	// halt at the path end if we should ignore the cgi
	if ( ignoreCgi ) uend -= u->getQueryLen();
	// see if we find the digits in the cgi part
	bool digitsInCgi = false;
	// start scanning at the path
	for ( ; p < uend ; p++ ) {
		if ( *p == '?' ) digitsInCgi = true;
		// if not a digit, reset count
		if ( ! is_digit(*p) ) { dcount = 0; continue; }
		// . check if it is a "strong permalink"
		// . i.e. contains /yyyy/mm/ in the PATH (not cgi)
		if ( p + 9 < pend &&
		     *(p-1)=='/' &&
		     is_digit(p[0]) &&
		     is_digit(p[1]) &&
		     is_digit(p[2]) &&
		     is_digit(p[3]) &&
		     p[4] == '/' &&
		     is_digit(p[5]) &&
		     is_digit(p[6]) &&
		     p[7] == '/' ) {
			// http://www.it.com.cn/f/office/091/4/722111.htm
			// was thought to have strong outlinks, but they were
			// not! this should fix it...
			int32_t y = atoi(p+0);
			int32_t m = atoi(p+5);
			// make sure the year and the month (or day) are in
			// a sane range
			if ( y >= 1990 && y <= 2050 && m >= 1 && m <= 31 )
				extraFlags |= LF_STRONGPERM;
		}
		// count it if a digit
		if ( ++dcount == 3 ) break;
	}
	// it can also have 2+ hyphens or 2+ underscores in a single
	// path component to be a permalink
	int32_t hcount = 0;
	p = pathStart;
	for ( ; p < pend ; p++ ) {
		// at a new path component, reset the count
		if ( *p == '/' ) { hcount = 0; continue; }
		// is it a hyphen or underscore?
		if ( *p != '_' && *p != '-' ) continue;
		// count it
		if ( ++hcount == 2 ) break;
	}

	// we can have a cgi of "?p=<digit>" and be a permalink
	p = u->getQuery();
	bool hasp = ( p && p[0]=='p' && p[1]=='=' && is_digit(p[2]) );
	// fix for http://proglobalbusiness.org/?m=200806 being detected as
	// a permalink... it has ?p=xxx outlinks.
	if ( hasp ) extraFlags |= LF_STRONGPERM;

	// return these if the caller wants them
	if ( retFlags ) *retFlags = extraFlags;

	// . if we have none of those signals, we are not a permalink
	// . THIS STILL FAILS on some urls. BUT we could fix that by doing
	//   url pattern analysis: each domain can have a tag that is the
	//   permalink subdir, so that any url in that subdir is a permalink.
	if ( ! hasp && dcount < 3 && hcount < 2 ) {
		if ( note )
			*note = "path has no digits, underscores or hyphens";
		return false;
	}

	// if it has a self link with the link text "permalink" then we are
	// probably very strongly a permalink.
	// http://www.5minutesformom.com/5225/wordless-wednesday-angel/
	// has a /promote-your-site tack-on which causes the LF_SUBDIR
	// algo to call the parent a NON-permalink. this should fix that
	// because it has a link to itself with the word "permalink"
	if ( links && links->hasSelfPermalink() ) {
		if ( note ) *note = "has permalink text to itself";
		return true;
	}

	// http://proglobalbusiness.org/?m=200806 is never a permalink
	p = u->getQuery();
	if ( p && p[0]=='m' && p[1]=='=' && is_digit(p[2]) ) {
		int32_t n = atoi(p+2);
		if ( n > 199000 && n < 205000 ) {
			if ( note ) *note = "has ?m=<year><month> cgi";
			return false;
		}
	}

	// . if we have an internal outlink that is a permalink and is
	//   in a subdirectory of us, THEN we are not a permalink
	// . fixes andrewsullivan.atlanticmonthly.com/the_daily_dish/
	linkflags_t mf = (LF_PERMALINK | LF_SAMEHOST | LF_SUBDIR);
	// loop over all outlinks
	int32_t no = 0;
	// make sure we got them
	if ( links ) no = links->m_numLinks;
	// practically all internal outlinks have LF_SUBDIR set for permalinks
	// like http://www.breitbart.tv/?p=249453, so do not run this outlink
	// algo on such urls! basically, anytime our permalink indicator is in
	// the cgi portion of the url, skip this subdir algorithm.
	if ( hcount < 2 && dcount < 3 && hasp ) no = 0;
	// or if we only got digits and they were in the cgi
	if ( hcount < 2 && ! hasp && digitsInCgi ) no = 0;
	// do the outlink loop
	for ( int32_t i = 0 ; links && i < no ; i++ ) {
		// get the flags
		linkflags_t flags = links->m_linkFlags[i];
		// skip if not a match. match the match flags = "mf"
		if ( (flags & mf) != mf ) continue;
		// allow /print/ "printer view" pages
		//if ( strstr ( links->m_linkPtrs[i],"/print" ) ) continue;
		if ( note ) *note = "has subdir permalink outlink";
		// ok, we are not a permalink now
		return false;
	}


	// now check for strong outlinks on the same host when we are not strong
	if ( links ) no = links->m_numLinks;
	// if we are strong, forget it
	if ( extraFlags & LF_STRONGPERM ) no = 0;
	// look for strong permalink outlinks
	mf = (LF_STRONGPERM | LF_SAMEHOST);
	// loop over all outlinks we have
	for ( int32_t i = 0 ; links && i < no ; i++ ) {
		// get the flags
		linkflags_t flags = links->m_linkFlags[i];
		// . if we are NOT a "strong permalink" but we have a same-host
		//   outlink that is, then we are not a permalink
		// . fixes: http://blog.makezine.com/archive/kids/
		//   ?CMP=OTC-0D6B48984890
		if ( (flags & mf) != mf ) continue;
		// allow /print/ "printer view" pages
		//if ( strstr ( links->m_linkPtrs[i],"/print" ) ) continue;
		if ( note ) *note = "has strong permalink outlink";
		// ok, we are not a permalink now
		return false;
	}


	// no permalinks for archive directories
	if ( (gb_strcasestr(u->getPath(),"/archive") ||
	      u->getPathDepth(false)==0) &&
	     gb_strcasestr(u->getPath(), "/index.") &&
	     !u->isCgi()) {
		if ( note ) *note = "has /archive and /index. and not cgi";
		return false;
	}
	// no, /tag/ is ok --> http://www.makeuseof.com/
	// BUT technorati.com/tag/search-engine-optimization is not a
	// permalink!!! i took technorati.com|jp out of ruleset 36 for now.
	// ah, but a ton of the urls have /tags/ and are NOT permalinks!!!
	if (gb_strcasestr(u->getPath(), "/tag/")){
		if ( note ) *note = "has /tag/";
		return false;
	}

	// no forums or category indexes
	if (gb_strcasestr(u->getPath(), "/category")){
		if ( note ) *note = "has /category";
		return false;
	}

	if (gb_strcasestr(u->getPath(), "/cat_")){
		if ( note ) *note = "has /cat_";
		return false;
	}

	// http://www.retailerdaily.com/cat/search-engine-marketing/
	if (gb_strcasestr(u->getPath(), "/cat/")){
		if ( note ) *note = "has /cat/";
		return false;
	}

	if (gb_strcasestr(u->getPath(), "/comment.html")){
		if ( note ) *note = "has /comment.html";
		return false;
	}

	if (gb_strcasestr(u->getPath(), "/comments/")){
		if ( note ) *note = "has /comments/";
		return false;
	}


	const char *pos;
	// category or tag page detection
	pos = gb_strcasestr(u->getUrl(), "cat=");
	if ( pos && pos > u->getUrl() && !is_alpha_a(*(pos-1))){
		if ( note ) *note = "has cat= not preceded by a letter";
		return false;
	}

	pos = gb_strcasestr(u->getUrl(), "tag=");
	if ( pos && pos > u->getUrl() && !is_alpha_a(*(pos-1))){
		if ( note ) *note = "has tag= not preceded by a letter";
		return false;
	}

	pos = gb_strcasestr(u->getUrl(), "tags=");
	if ( pos && pos > u->getUrl() && !is_alpha_a(*(pos-1))){
		if ( note ) *note = "has tags= not preceded by a letter";
		return false;
	}

	// more forum detection
	if (gb_strcasestr(u->getUrl(), "forum")){
		if ( note ) *note = "has forum";
		return false;
	}

	if (gb_strcasestr(u->getPath(), "thread")){
		if ( note ) *note = "has thread";
		return false;
	}

	if (gb_strcasestr(u->getPath(), "topic") &&
	    !gb_strcasestr(u->getPath(), "/topics/")) {
		if ( note ) *note = "has topic";
		return false;
	}

	// more index page detection
	if (gb_strcasestr(u->getPath(), "/default.")){
		if ( note ) *note = "has /default.";
		return false;
	}

	if (gb_strcasestr(u->getPath(), "/profile.")){
		if ( note ) *note = "has /profile.";
		return false;
	}

	if (gb_strcasestr(u->getPath(), "/archives.")){
		if ( note ) *note = "has /archives.";
		return false;
	}

	if (gb_strcasestr(u->getPath(), "_archive.")){
		if ( note ) *note = "has _archive.";
		return false;
	}

	if (gb_strcasestr(u->getPath(), "/search.")){
		if ( note ) *note = "has /search.";
		return false;
	}

	if (gb_strcasestr(u->getPath(), "/search/")){
		if ( note ) *note = "has /search/";
		return false;
	}

	// get path end
	p = u->getPath() + u->getPathLen();
	plen = u->getPathLen();
	// back up over index.html
	if ( plen > 10 && strncmp(p-10,"index.html",10)==0 ) {
		plen -= 10; p -= 10; }
	// hack off the /
	if ( p[-1]=='/' ) { plen--; p--; }

	// ends in /trackback means not a permalink
	if ( plen >= 10 && strncasecmp(p-10,"/trackback",10)==0) {
		if ( note ) *note = "ends in /trackback";
		return false;
	}

	// ends in /dddd/dd usually means an archive date
	if ( plen >= 8 &&
	     is_digit(p[-1]) &&
	     is_digit(p[-2]) &&
	     p[-3] == '/' &&
	     is_digit(p[-4]) &&
	     is_digit(p[-5]) &&
	     is_digit(p[-6]) &&
	     is_digit(p[-7]) &&
	     p[-8] == '/' ) {
		// ensure the numbers are in range for a date
		int32_t year  = atoi(p-7);
		int32_t month = atoi(p-2);
		if ( year > 1990 && year <= 2015 &&
		     month > 0 && month <= 12 ) {
			if ( note ) *note = "ends in /dddd/dd";
			return false;
		}
	}

	// ending in a /20xx year is usually not a permalink
	if ( plen >= 5 &&
	     p[-5] == '/' &&
	     p[-4] == '2' &&
	     p[-3] == '0' &&
	     atoi(p-2) < 50 ) {
		if ( note ) *note = "ends in year /20xx";
		return false;
	}
	// /199x too
	if ( plen >= 5 &&
	     p[-5] == '/' &&
	     p[-4] == '1' &&
	     p[-3] == '9' &&
	     atoi(p-2) > 90 ) {
		if ( note ) *note = "ends in year /19xx";
		return false;
	}


	// TODO ideas for improving this classifier:
	// . look for a repetitive sequence of html tags. each must contain
	//   an outlink! (there are some blog entries that have excerpts of
	//   an email chain.) if the repetition intersects the main content
	//   section, then it is an index page.
	// . make sure that SCORES can detect comment sections. very often
	//   the comment section is bigger than the main section!
	// . or we can subtract the repetitive sections and see if we have
	//   any beefy content left over... then we don't have to worry
	//   about comment identification.
	// . index tag pairs: look at header tags, div, p, index level #
	//   before the pair.
	// . find the delimiter between the blurbs. the delimiter must touch
	//   the beefy content section. go by "strings" of tagids (a tagid of
	//   0 means text, but eliminate it if pure punctuation) and have a
	//   subtagid field, which is the hash of a tag's attributes, BUT in
	//   the case of a text tag, a hash of the alpha chars.
	// . how many delimiters can we find that start at level X?

	// every check passed, so call it a permalink
	if ( note ) *note = "is permalink";

	return true;
}

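// Worked examples of the heuristic above (urls taken from the test-case
// comments in this function and in addLink()):
//
//   http://thinkprogress.org/2008/12/26/bush-pardon-campaign/
//       -> true  (/yyyy/mm/dd/ date path plus a hyphenated slug)
//   http://www.dawn.com/2009/01/04/rss.htm
//       -> true, a known false positive noted in the header comment
//   http://proglobalbusiness.org/?m=200806
//       -> false (?m=<year><month> archive cgi)
//   technorati.com/tag/search-engine-optimization
//       -> false (/tag/ path component)
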
int32_t getSiteRank(int32_t sni) {
	if ( sni <= 0 ) return 0;
	if ( sni <= 1 ) return 1;
	if ( sni <= 2 ) return 2;
	if ( sni <= 3 ) return 3;
	if ( sni <= 4 ) return 4;
	if ( sni <= 5 ) return 5;
	if ( sni <= 9 ) return 6;
	if ( sni <= 19 ) return 7;
	if ( sni <= 39 ) return 8;
	if ( sni <= 79 ) return 9;
	if ( sni <= 200-1 ) return 10;
	if ( sni <= 500-1 ) return 11;
	if ( sni <= 2000-1 ) return 12;
	if ( sni <= 5000-1 ) return 13;
	if ( sni <= 10000-1 ) return 14;
	return 15;
}

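// Example mapping (straight from the thresholds above): a site with 0
// inlinks gets rank 0, 10 inlinks -> 7, 120 inlinks -> 10, 5000 inlinks
// -> 14, and anything over 10000 caps at 15. The buckets widen roughly
// logarithmically with the number of inlinking sites ("sni").
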
// . get the # of words in this string
static int32_t getNumWords(const char *s, int32_t len) {

	int32_t wordCount = 0;
	bool inWord = false;
	for ( int32_t i = 0 ; i < len ; i++ ) {
		// apostrophes count as part of a word; any other
		// non-alphanumeric char ends the current word
		if ( ! is_alnum_a ( s[i] ) && s[i] != '\'' ) {
			inWord = false;
			continue;
		}
		if ( ! inWord ) {
			inWord = true;
			wordCount++;
		}
	}
	return wordCount;
}
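
// Example (hypothetical input): getNumWords("don't stop me now", 17)
// returns 4 -- the apostrophe is treated as part of a word, so "don't"
// counts once, and runs of other non-alphanumerics just separate words.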