mirror of
https://github.com/privacore/open-source-search-engine.git
synced 2025-06-26 00:06:07 -04:00
Removed all mention of ping servers
This commit is contained in:
11
Msg25.cpp
11
Msg25.cpp
@ -1653,16 +1653,7 @@ bool Msg25::gotLinkText(Msg20Request *msg20req) {
|
||||
// . linkText->getLinkTextLen()
|
||||
if ( msg20reply && good &&
|
||||
msg20reply->size_linkText <= 0 &&
|
||||
msg20reply->size_rssItem <= 0 &&
|
||||
// allow if from a ping server because like
|
||||
// rpc.weblogs.com/shortChanges.xml so we can use
|
||||
// "inlink==xxx" in the url filters to assign any page linked
|
||||
// to by a pingserver into a special spider queue. then we can
|
||||
// spider that page quickly and get its xml feed url, and then
|
||||
// spider that to get new outlinks of permalinks.
|
||||
// Well now we use "inpingserver" instead of having to specify
|
||||
// the "inlink==xxx" expression for every ping server we know.
|
||||
! linker.isPingServer() ) {
|
||||
msg20reply->size_rssItem <= 0 ) {
|
||||
good = false;
|
||||
m_noText++;
|
||||
note = "no link text";
|
||||
|
14
Parms.cpp
14
Parms.cpp
@ -11382,8 +11382,8 @@ static bool printUrlExpressionExamples ( SafeBuf *sb ) {
|
||||
"<tr class=poo><td>hopcount</td>"
|
||||
"<td>All root urls, those that have only a single "
|
||||
"slash for their path, and no cgi parms, have a "
|
||||
"hop count of 0. Also, all RSS urls, ping "
|
||||
"server urls and site roots (as defined in the "
|
||||
"hop count of 0. Also, all RSS urls "
|
||||
"and site roots (as defined in the "
|
||||
"site rules table) have a hop count of 0. Their "
|
||||
"outlinks have a hop count of 1, and the outlinks "
|
||||
"of those outlinks a hop count of 2, etc."
|
||||
@ -11536,16 +11536,6 @@ static bool printUrlExpressionExamples ( SafeBuf *sb ) {
|
||||
"being discovered from the spider. "
|
||||
"</td></tr>"
|
||||
|
||||
"<tr class=poo><td><nobr>inpingserver | !inpingserver"
|
||||
"</nobr></td>"
|
||||
"<td>"
|
||||
"This is true if the url has an inlink from "
|
||||
"a recognized ping server. Ping server urls are "
|
||||
"hard-coded in Url.cpp. <b><font color=red> "
|
||||
"pingserver urls are assigned a hop count of 0"
|
||||
"</font></b>"
|
||||
"</td></tr>"
|
||||
|
||||
"<tr class=poo><td>isindexed | !isindexed</td>"
|
||||
"<td>"
|
||||
"This url matches this if in the index already. "
|
||||
|
8
Url.cpp
8
Url.cpp
@ -2421,14 +2421,6 @@ const char *getDomFast ( const char *url , int32_t *domLen , bool hasHttp ) {
|
||||
return udom;
|
||||
}
|
||||
|
||||
// Is it a ping server? It might respond with huge documents with thousands of
|
||||
// links, which would normally be detected as link spam. This function is kept
|
||||
// around until we have a better way of handling it than hardcoded URLs in a
|
||||
// source file.
|
||||
bool Url::isPingServer ( ) const {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
// "s" point to the start of a normalized url (includes http://, etc.)
|
||||
const char *getHost(const char *s, int32_t *hostLen) {
|
||||
|
3
Url.h
3
Url.h
@ -92,9 +92,6 @@ public:
|
||||
//badExtensions - extensions not to be parsed
|
||||
bool hasNonIndexableExtension(int32_t xxx) const;
|
||||
|
||||
// is it http://rpc.weblogs.com/shortChanges.xml, etc.?
|
||||
bool isPingServer ( ) const;
|
||||
|
||||
int32_t getSubUrlLen(int32_t i) const;
|
||||
int32_t getSubPathLen(int32_t i) const;
|
||||
|
||||
|
11
XmlDoc.cpp
11
XmlDoc.cpp
@ -11402,8 +11402,6 @@ int8_t *XmlDoc::getHopCount ( ) {
|
||||
|
||||
setStatus ( "getting hop count" );
|
||||
|
||||
// the unredirected url
|
||||
Url *f = getFirstUrl();
|
||||
// get url as string, skip "http://" or "https://"
|
||||
//char *u = f->getHost();
|
||||
// if we match site, we are a site root, so hop count is 0
|
||||
@ -11414,13 +11412,6 @@ int8_t *XmlDoc::getHopCount ( ) {
|
||||
// m_hopCountValid = true;
|
||||
// return &m_hopCount;
|
||||
//}
|
||||
// ping servers have 0 hop counts
|
||||
if ( f->isPingServer() ) {
|
||||
// log("xmldoc: hc2 is 0 (pingserver) %s",m_firstUrl.m_url);
|
||||
m_hopCount = 0;
|
||||
m_hopCountValid = true;
|
||||
return &m_hopCount;
|
||||
}
|
||||
char *isRSS = getIsRSS();
|
||||
if ( ! isRSS || isRSS == (char *)-1) return (int8_t *)isRSS;
|
||||
// check for site root
|
||||
@ -15108,7 +15099,6 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
|
||||
bool issiteroot = isSiteRootFunc3 ( s , linkSiteHashes[i] );
|
||||
|
||||
// get it quick
|
||||
bool ispingserver = url.isPingServer();
|
||||
int32_t domHash32 = url.getDomainHash32();
|
||||
|
||||
// is link rss?
|
||||
@ -15154,7 +15144,6 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
|
||||
}
|
||||
|
||||
if ( issiteroot ) ksr.m_hopCount = 0;
|
||||
if ( ispingserver ) ksr.m_hopCount = 0;
|
||||
|
||||
// validate it
|
||||
ksr.m_hopCountValid = true;
|
||||
|
@ -373,10 +373,6 @@ bool setLinkSpam ( int32_t ip ,
|
||||
Xml *xml ,
|
||||
Links *links ,
|
||||
bool isContentTruncated ) {
|
||||
// it is critical to get inlinks from all pingserver xml
|
||||
// pages regardless if they are often large pages. we
|
||||
// have to manually hard-code the ping servers in for now.
|
||||
if ( linker->isPingServer() ) return false;
|
||||
// if the doc got truncated we may be missing valuable identifiers
|
||||
// that identify the doc as a guestbook or something
|
||||
if ( isContentTruncated ) {
|
||||
@ -629,10 +625,6 @@ bool isLinkSpam ( const Url *linker,
|
||||
const Url *linkee ,
|
||||
// node position of the linkee in the linker's content
|
||||
int32_t linkNode ) {
|
||||
// it is critical to get inlinks from all pingserver xml
|
||||
// pages regardless if they are often large pages. we
|
||||
// have to manually hard-code the ping servers in for now.
|
||||
if ( linker->isPingServer() ) return false;
|
||||
// same host linkers can be link spam (TODO: make same ip block)
|
||||
// because we only allow up to 10 to vote as a single voter
|
||||
if ( linkee ) {
|
||||
|
Reference in New Issue
Block a user