diff --git a/Msg25.cpp b/Msg25.cpp index f6fa2cac..fec5e9ab 100644 --- a/Msg25.cpp +++ b/Msg25.cpp @@ -1653,16 +1653,7 @@ bool Msg25::gotLinkText(Msg20Request *msg20req) { // . linkText->getLinkTextLen() if ( msg20reply && good && msg20reply->size_linkText <= 0 && - msg20reply->size_rssItem <= 0 && - // allow if from a ping server because like - // rpc.weblogs.com/shortChanges.xml so we can use - // "inlink==xxx" in the url filters to assign any page linked - // to by a pingserver into a special spider queue. then we can - // spider that page quickly and get its xml feed url, and then - // spider that to get new outlinks of permalinks. - // Well now we use "inpingserver" instead of having to specify - // the "inlink==xxx" expression for every ping server we know. - ! linker.isPingServer() ) { + msg20reply->size_rssItem <= 0 ) { good = false; m_noText++; note = "no link text"; diff --git a/Parms.cpp b/Parms.cpp index 2dbc4082..1bf5d97c 100644 --- a/Parms.cpp +++ b/Parms.cpp @@ -11382,8 +11382,8 @@ static bool printUrlExpressionExamples ( SafeBuf *sb ) { "<tr class=poo><td>hopcount</td>" "<td>All root urls, those that have only a single " "slash for their path, and no cgi parms, have a " - "hop count of 0. Also, all RSS urls, ping " - "server urls and site roots (as defined in the " + "hop count of 0. Also, all RSS urls " + "and site roots (as defined in the " "site rules table) have a hop count of 0. Their " "outlinks have a hop count of 1, and the outlinks " "of those outlinks a hop count of 2, etc." @@ -11536,16 +11536,6 @@ static bool printUrlExpressionExamples ( SafeBuf *sb ) { "being discovered from the spider. " "</td></tr>" - "<tr class=poo><td><nobr>inpingserver | !inpingserver" - "</nobr></td>" - "<td>" - "This is true if the url has an inlink from " - "a recognized ping server. Ping server urls are " - "hard-coded in Url.cpp. <b><font color=red> " - "pingserver urls are assigned a hop count of 0" - "</font></b>" - "</td></tr>" - "<tr class=poo><td>isindexed | !isindexed</td>" "<td>" "This url matches this if in the index already. " diff --git a/Url.cpp b/Url.cpp index 7aa2c1c3..0db5d510 100644 --- a/Url.cpp +++ b/Url.cpp @@ -2421,14 +2421,6 @@ const char *getDomFast ( const char *url , int32_t *domLen , bool hasHttp ) { return udom; } -// Is it a ping server? It might respond with huge documents with thousands of -// links, which would normally be detected as link spam. This function is kept -// around until we have a better way of handling it than hardcoded URLs in a -// source file. -bool Url::isPingServer ( ) const { - return false; -} - // "s" point to the start of a normalized url (includes http://, etc.) const char *getHost(const char *s, int32_t *hostLen) { diff --git a/Url.h b/Url.h index 96d99077..e531814f 100644 --- a/Url.h +++ b/Url.h @@ -92,9 +92,6 @@ public: //badExtensions - extensions not to be parsed bool hasNonIndexableExtension(int32_t xxx) const; - // is it http://rpc.weblogs.com/shortChanges.xml, etc.? - bool isPingServer ( ) const; - int32_t getSubUrlLen(int32_t i) const; int32_t getSubPathLen(int32_t i) const; diff --git a/XmlDoc.cpp b/XmlDoc.cpp index baaf88f2..aa4812ba 100644 --- a/XmlDoc.cpp +++ b/XmlDoc.cpp @@ -11402,8 +11402,6 @@ int8_t *XmlDoc::getHopCount ( ) { setStatus ( "getting hop count" ); - // the unredirected url - Url *f = getFirstUrl(); // get url as string, skip "http://" or "https://" //char *u = f->getHost(); // if we match site, we are a site root, so hop count is 0 @@ -11414,13 +11412,6 @@ int8_t *XmlDoc::getHopCount ( ) { // m_hopCountValid = true; // return &m_hopCount; //} - // ping servers have 0 hop counts - if ( f->isPingServer() ) { - // log("xmldoc: hc2 is 0 (pingserver) %s",m_firstUrl.m_url); - m_hopCount = 0; - m_hopCountValid = true; - return &m_hopCount; - } char *isRSS = getIsRSS(); if ( ! isRSS || isRSS == (char *)-1) return (int8_t *)isRSS; // check for site root @@ -15108,7 +15099,6 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) { bool issiteroot = isSiteRootFunc3 ( s , linkSiteHashes[i] ); // get it quick - bool ispingserver = url.isPingServer(); int32_t domHash32 = url.getDomainHash32(); // is link rss? @@ -15154,7 +15144,6 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) { } if ( issiteroot ) ksr.m_hopCount = 0; - if ( ispingserver ) ksr.m_hopCount = 0; // validate it ksr.m_hopCountValid = true; diff --git a/linkspam.cpp b/linkspam.cpp index 02d3a5cf..4fe4f288 100644 --- a/linkspam.cpp +++ b/linkspam.cpp @@ -373,10 +373,6 @@ bool setLinkSpam ( int32_t ip , Xml *xml , Links *links , bool isContentTruncated ) { - // it is critical to get inlinks from all pingserver xml - // pages regardless if they are often large pages. we - // have to manually hard-code the ping servers in for now. - if ( linker->isPingServer() ) return false; // if the doc got truncated we may be missing valuable identifiers // that identify the doc as a guestbook or something if ( isContentTruncated ) { @@ -629,10 +625,6 @@ bool isLinkSpam ( const Url *linker, const Url *linkee , // node position of the linkee in the linker's content int32_t linkNode ) { - // it is critical to get inlinks from all pingserver xml - // pages regardless if they are often large pages. we - // have to manually hard-code the ping servers in for now. - if ( linker->isPingServer() ) return false; // same host linkers can be link spam (TODO: make same ip block) // because we only allow up to 10 to vote as a single voter if ( linkee ) {