Removed all mention of ping servers

This commit is contained in:
Ivan Skytte Jørgensen
2018-01-22 15:15:56 +01:00
parent 6bc3608cc2
commit 417d04fd50
6 changed files with 3 additions and 52 deletions

@ -1653,16 +1653,7 @@ bool Msg25::gotLinkText(Msg20Request *msg20req) {
// . linkText->getLinkTextLen()
if ( msg20reply && good &&
msg20reply->size_linkText <= 0 &&
msg20reply->size_rssItem <= 0 &&
// allow if from a ping server because like
// rpc.weblogs.com/shortChanges.xml so we can use
// "inlink==xxx" in the url filters to assign any page linked
// to by a pingserver into a special spider queue. then we can
// spider that page quickly and get its xml feed url, and then
// spider that to get new outlinks of permalinks.
// Well now we use "inpingserver" instead of having to specify
// the "inlink==xxx" expression for every ping server we know.
! linker.isPingServer() ) {
msg20reply->size_rssItem <= 0 ) {
good = false;
m_noText++;
note = "no link text";

@ -11382,8 +11382,8 @@ static bool printUrlExpressionExamples ( SafeBuf *sb ) {
"<tr class=poo><td>hopcount</td>"
"<td>All root urls, those that have only a single "
"slash for their path, and no cgi parms, have a "
"hop count of 0. Also, all RSS urls, ping "
"server urls and site roots (as defined in the "
"hop count of 0. Also, all RSS urls "
"and site roots (as defined in the "
"site rules table) have a hop count of 0. Their "
"outlinks have a hop count of 1, and the outlinks "
"of those outlinks a hop count of 2, etc."
@ -11536,16 +11536,6 @@ static bool printUrlExpressionExamples ( SafeBuf *sb ) {
"being discovered from the spider. "
"</td></tr>"
"<tr class=poo><td><nobr>inpingserver | !inpingserver"
"</nobr></td>"
"<td>"
"This is true if the url has an inlink from "
"a recognized ping server. Ping server urls are "
"hard-coded in Url.cpp. <b><font color=red> "
"pingserver urls are assigned a hop count of 0"
"</font></b>"
"</td></tr>"
"<tr class=poo><td>isindexed | !isindexed</td>"
"<td>"
"This url matches this if in the index already. "

@ -2421,14 +2421,6 @@ const char *getDomFast ( const char *url , int32_t *domLen , bool hasHttp ) {
return udom;
}
// Is it a ping server? It might respond with huge documents with thousands of
// links, which would normally be detected as link spam. This function is kept
// around until we have a better way of handling it than hardcoded URLs in a
// source file.
bool Url::isPingServer ( ) const {
return false;
}
// "s" point to the start of a normalized url (includes http://, etc.)
const char *getHost(const char *s, int32_t *hostLen) {

3
Url.h

@ -92,9 +92,6 @@ public:
//badExtensions - extensions not to be parsed
bool hasNonIndexableExtension(int32_t xxx) const;
// is it http://rpc.weblogs.com/shortChanges.xml, etc.?
bool isPingServer ( ) const;
int32_t getSubUrlLen(int32_t i) const;
int32_t getSubPathLen(int32_t i) const;

@ -11402,8 +11402,6 @@ int8_t *XmlDoc::getHopCount ( ) {
setStatus ( "getting hop count" );
// the unredirected url
Url *f = getFirstUrl();
// get url as string, skip "http://" or "https://"
//char *u = f->getHost();
// if we match site, we are a site root, so hop count is 0
@ -11414,13 +11412,6 @@ int8_t *XmlDoc::getHopCount ( ) {
// m_hopCountValid = true;
// return &m_hopCount;
//}
// ping servers have 0 hop counts
if ( f->isPingServer() ) {
// log("xmldoc: hc2 is 0 (pingserver) %s",m_firstUrl.m_url);
m_hopCount = 0;
m_hopCountValid = true;
return &m_hopCount;
}
char *isRSS = getIsRSS();
if ( ! isRSS || isRSS == (char *)-1) return (int8_t *)isRSS;
// check for site root
@ -15108,7 +15099,6 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
bool issiteroot = isSiteRootFunc3 ( s , linkSiteHashes[i] );
// get it quick
bool ispingserver = url.isPingServer();
int32_t domHash32 = url.getDomainHash32();
// is link rss?
@ -15154,7 +15144,6 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
}
if ( issiteroot ) ksr.m_hopCount = 0;
if ( ispingserver ) ksr.m_hopCount = 0;
// validate it
ksr.m_hopCountValid = true;

@ -373,10 +373,6 @@ bool setLinkSpam ( int32_t ip ,
Xml *xml ,
Links *links ,
bool isContentTruncated ) {
// it is critical to get inlinks from all pingserver xml
// pages regardless if they are often large pages. we
// have to manually hard-code the ping servers in for now.
if ( linker->isPingServer() ) return false;
// if the doc got truncated we may be missing valuable identifiers
// that identify the doc as a guestbook or something
if ( isContentTruncated ) {
@ -629,10 +625,6 @@ bool isLinkSpam ( const Url *linker,
const Url *linkee ,
// node position of the linkee in the linker's content
int32_t linkNode ) {
// it is critical to get inlinks from all pingserver xml
// pages regardless if they are often large pages. we
// have to manually hard-code the ping servers in for now.
if ( linker->isPingServer() ) return false;
// same host linkers can be link spam (TODO: make same ip block)
// because we only allow up to 10 to vote as a single voter
if ( linkee ) {