privacore-open-source-searc…/linkspam.cpp

// http://www.propeciauk.co.uk/links.htm
// http://www.hendersonvillehomepro.com/FavoriteLinks/Default.aspx
// http://www.viacreme-viacream-viagra.com/health/pharmacies.htm
// are the same description for viagrapunch.com. why did they not cancel?

#include "linkspam.h"
#include "Url.h"
#include "Linkdb.h"
#include "Xml.h"
#include "matches2.h"
#include "fctypes.h"
#include "utf8_fast.h"

static bool isLinkChain ( Xml *xml, const Url *linker, const Url *linkee, int32_t linkNode,
		   const char **note ) ;

// . here's some additional things to mark it as a log page, but these
//   depend on the content of the page, not the url itself.
// . fields: string, stringLen, id, section?
// . section is "1" if the substring identifies the start of a comment
//   section, so that any links above that identifier should be
//   consider good, and any below, should be considered bad links.
//   Otherwise, if section is 0, if the match occurs anywhere on the
//   page then all links on the page should be considered bad.
static const Needle s_needles1[] = {
	{"open.thumbshots.org"          , 0 , 0  } ,
	//{"google-ad"                    , 0 , 0  } ,
	// indicates search results page
	// this often directly precedes the comment section
	{"[trackback"                   , 1 , 1  } ,
	{"class=\"comtext"              , 8 , 1  } ,
	{"class=\"comment"              , 8 , 1  } ,
	{"class=\"coment"               , 8 , 1  } ,
	{"class=\"trackback"            , 8 , 1  } ,
	{"class=\"ping"                 , 8 , 1  } ,
	{"class=\"followup"             , 8 , 1  } ,
	{"class=\"response"             , 8 , 1  } ,
	// this can signify a blog entry, not just a comment
	//{"class=\"entry"              , 8 , 1  } ,
	// these seem to be more indicative of posted comments
	{"class=\"posted"               , 8 , 1  },
	{"id=\"posted"                  , 8 , 1  },
	{"name=\"posted"                , 8 , 1  },
	// annoying little textbox thingy
	{"class=\"shoutbox"             , 8 , 1  } ,
	{"id=\"comment"                 , 8 , 1  } ,
	{"id=\"coment"                  , 8 , 1  } ,
	{"id=\"trackback"               , 8 , 1  } ,
	{"id=\"ping"                    , 8 , 1  } ,
	{"id=\"followup"                , 8 , 1  } ,
	{"id=\"response"                , 8 , 1  } ,
	{"name=\"comment"               , 8 , 1  } ,
	{"name=\"coment"                , 8 , 1  } ,
	{"name=\"trackback"             , 8 , 1  } ,
	{"name=\"ping"                  , 8 , 1  } ,
	{"name=\"followup"              , 8 , 1  } ,
	{"name=\"response"              , 8 , 1  } ,
	// a lot of the comment boards can be identified because
	// they have a bunch of mailto links, one before each comment
	//{"href=\"mailto"                , 8 , 1 },
	//{"href=mailto"                  , 8 , 1 },
	// wikipedias
	{"div class=\"editsection"      , 10, 1  } ,
	{"action=edit"                  , 10, 1  } ,
	// message boards
	{"anonymous user"               , 10, 1  } ,
	{"anonymer user"                , 10, 1  } ,
	{"date posted"                  , 10, 1  } ,
	{"post your notice"             , 10, 1  } ,
	{"edit this page"               , 10, 1  } ,
	// edit</a><br>
	{"edit<a]br"                    , 10, 1  } ,
	// link to edit a comment
	{">edit</a"                     , 10, 1  } ,
	// these often indicate blog entries, not just comments
	//{"postedon"                     , 10, 1 },
	//{"posted by "                   , 10, 1 },
	//{"posted at "                   , 10, 1 },
	{"reply with quote"             , 9 , 0  } ,
	{">post a reply"                , 10, 0  } ,
	{"post reply"                   , 10, 0  } ,
	{"submit post"                  , 10, 0  } ,
	{">post message"                , 10, 0  } ,
	{">post a comment"              , 10, 0  } ,
	{">leave a comment"             , 10, 0  } ,
	{">post comments"               , 10, 0  } ,
	// Comments</font> (0) after each posted entry...
	//{">comments<"                 , 10, 1  } ,
	{"comments: <"                  , 10, 1  } ,
	{"comments:<"                   , 10, 1  } ,
	//{"comment:"                   , 10, 1  } ,
	{"reacties:"                    , 10, 1  } ,
	{"comentarios:"                 , 10, 1  } ,
	{"comentários:"                 , 10, 1  } ,
	{">message:"                    , 10, 0  } ,
	{">mensagem:"                   , 10, 0  } ,
	{">faca seu comentario"         , 10, 0  } ,
	{">faça seu comentário"         , 10, 0  } ,
	// comment add in german
	{">Kommentar hinzuf"            , 10, 0  } ,
	{"rate this link"               , 10, 0  } ,
	{"link submit"                  , 10, 0  } ,
	{"links directory"              , 10, 0  } ,
	{">add my comment"              , 10, 0  } ,
	// title of the text area box
	{">your comment"                , 10, 0  } ,
	{"your comment<"                , 10, 0  } ,
	{">comment by"                  , 10, 1  } ,
	{">scrivi un commento"          , 10, 0  } ,
	{">scrivi il tuo commento"      , 10, 0  } ,
	{"add comment"                  , 10, 0  } ,
	{"trackbacks for the art"       , 12, 1  } ,
	{"these trackbacks have been re", 13, 1  } ,
	{"trackback pings"              , 13, 1  } ,
	{"read the rest of this com"    , 13, 1  } ,
	// that was the opinion of ...
	{"das war die meinung von"      , 13, 1  } ,
	{"resource partner"             , 49, 0  } ,
	{"partner link"                 , 50, 0  } ,
	{"partner site"                 , 51, 0  } ,
	{"sign the guestbook"           , 43, 0  } ,
	//{"add new comment"              , 14, 0 },
	//{"add message"                  , 14, 0 },
	// tagboard software allows free submits. it has this in
	// an html comment tag...
	{"2002 natali ardianto"         , 14, 0  } ,
	// guestbooks
	{"guestbook</title"             , 13, 0  } ,
	{"gastenboek</title"            , 13, 0  } ,
	// link management software puts a search box on there
	{"search our links"             , 14, 0  } ,
	{"find all words option"        , 14, 0  } ,
	// link exchange indicators
	{"link you want to share"       , 14, 0  } ,
	{"link trader"                  , 14, 0  } ,
	{"link exchange"                , 15, 0  } ,
	{"link partner"                 , 16, 0  } ,
	{"link xchange"                 , 17, 0  } ,
	{"link swap"                    , 18, 0  } ,
	{"links trader"                 , 19, 0  } ,
	{"links exchange"               , 20, 0  } ,
	{"links partner"                , 21, 0  } ,
	{"links xchange"                , 22, 0  } ,
	{"links swap"                   , 23, 0  } ,
	{"list your site"               , 26, 0  } ,
	{"add your web site"            , 24, 0  } ,
	{"add your website"             , 25, 0  } ,
	{"add your site"                , 26, 0  } ,
	{"add your link"                , 27, 0  } ,
	{"add your url"                 , 28, 0  } ,
	{"add site"                     , 28, 0  } ,
	// email the webmaster to have your link on this page
	{"have your link"               , 28, 0  } ,
	{"add a web site"               , 29, 0  } ,
	{"add a website"                , 30, 0  } ,
	{"add a site"                   , 31, 0  } ,
	{"add a link"                   , 32, 0  } ,
	{"add a url"                    , 33, 0  } ,
	{"adding your web site"         , 34, 0  } ,
	{"adding your website"          , 35, 0  } ,
	{"adding your site"             , 36, 0  } ,
	{"adding your link"             , 37, 0  } ,
	{"adding your url"              , 38, 0  } ,
	{"adding a web site"            , 39, 0  } ,
	{"adding a website"             , 40, 0  } ,
	{"adding a site"                , 41, 0  } ,
	{"adding a link"                , 42, 0  } ,
	{"adding a url"                 , 43, 0  } ,
	{"add url"                      , 43, 0  } ,
	{"add resource"                 , 43, 0  } ,
	{"add link"                     , 43, 0  } ,
	{"add free link"                , 43, 0  } ,
	{"addlink"                      , 43, 0  } ,
	{"suggest a site"               , 43, 0  } ,
	{"swap links"                   , 43, 0  } ,
	{"considered for addition"      , 43, 0  } ,
	{"we are not affiliated"        , 43, 0  } ,
	{"have a site to add"           , 43, 0  } ,
	{"submit your web site"         , 34, 0  } ,
	{"submit your website"          , 35, 0  } ,
	{"submit your site"             , 36, 0  } ,
	{"submit your link"             , 37, 0  } ,
	{"submit your url"              , 38, 0  } ,
	{"submit a web site"            , 39, 0  } ,
	{"submit a website"             , 40, 0  } ,
	{"submit a site"                , 41, 0  } ,
	{"submit a link"                , 42, 0  } ,
	{"submit link"                  , 42, 0  } ,
	{"submit a url"                 , 43, 0  } ,
	// . article spammers using article-emporium.com, etc.
	// . these articles get circulated into regular websites
	{"submit your article"          , 43, 0  } ,
	{"submit articles"              , 43, 0  } ,
	{"submit an article"            , 43, 0  } ,
	{"for any feedback contact"     , 43, 0  } ,
	{"for any feedback mail"        , 43, 0  } ,
	{"for any feedback email"       , 43, 0  } ,
	{"other articles that might"    , 43, 0  } ,
	{"is a freelance"               , 43, 0  } ,
	{"author is an amateur"         , 43, 0  } ,
	{"article source"               , 43, 0  } ,
	{"word count:"                  , 43, 0  } ,
	{"for additional information on", 43, 1  } ,
	{"for more information on"      , 43, 1  } ,
	{"for further assistance visit" , 43, 1  } ,
	{"article submitted on"         , 43, 0  } ,
	{"please rate this"             , 43, 0  } ,
	{"rate the article"             , 43, 0  } ,
	//{"how would you rate"         , 43, 0  } ,
	{"add rating"                   , 43, 0  } ,
	{"trade text link"              , 44, 0  } ,
	{"trade link"                   , 45, 0  } ,
	{"exchange link"                , 46, 0  } ,
	{"exchanging link"              , 47, 0  } ,
	{"reciprocal link"              , 48, 0  } ,

	// new stuff
	{">sponsors<"                   , 48, 0  } ,
	{">sponsor<"                    , 48, 0  } ,
	{">sponsored<"                  , 48, 0  } ,
	{">submit site<"                , 48, 0  } ,
	{": sponsor"                    , 48, 0  } ,
	{"/sponsor/"                    , 48, 0  } ,
	{"*sponsors*"                   , 48, 0  } ,
	{">payperpost"                  , 48, 0  } ,
	{"sponsored post"               , 48, 0  } ,
	{"sponsored flag"               , 48, 0  } ,
	{"sponsoredflag"                , 48, 0  } ,
	{"sponsored listing"            , 48, 1  } ,
	{"sponsored link"               , 48, 1  } ,
	{"post is sponsor"              , 48, 0  } ,
	{"paid post"                    , 48, 0  } ,
	{"powered by"                   , 48, 0  } , // wordpress
	{"suggest your website"         , 48, 0  } ,
	{"advertisement:"               , 48, 1  }
};

static constexpr int32_t numNeedles1 = sizeof(s_needles1)/sizeof(Needle);

// now check outlinks on the page for these substrings
static const Needle s_needles2[] = {
	{"cyber-robotics.com" , 0 , 0  } ,
	{"cyberspacehq.com"   , 0 , 0  } ,
	{"links4trade.com"    , 0 , 0  } ,
	{"searchfeed.com"     , 0 , 0  } ,
	{"marketnex.com"      , 0 , 0  } ,
	{"partnersignup"      , 0 , 0  } ,
	{"publisher-network"  , 0 , 0  } ,
	//{"amazon.com"       , 0 , 0  } ,
	//{"dmoz.org"         , 0 , 0  } ,
	//{"dmoz.com"         , 0 , 0  } ,
	{"linksmanager"       , 0 , 0  } ,
	{"changinglinks"      , 0 , 0  }
};

static constexpr int32_t numNeedles2 = sizeof(s_needles2)/sizeof(Needle);

//Check if a path is likely to contain uncontrolled links
//Eg. guestbooks, blog comments, link-trade, etc. There is nothing from with them per see
//but often a lot of them are unmonitored/unmoderated and link spammers insert links in them.
static bool isLinkfulPath(const char *path, size_t pathLen, const char **note) {
	if(pathLen<=1)
		return false;
	if(strncasestr(path,"guest",pathLen,5)) {
		*note = "path has guest";
		return true;
	} else if(strncasestr(path,"cgi",pathLen,3)) {
		*note = "path has cgi";
		return true;
	} else if(strncasestr(path,"gast",pathLen,4)) { // german
		*note = "path has gast";
		return true;
	} else if(strncasestr(path,"gaest",pathLen,5)) { //danish
		*note = "path has gaest";
		return true;
	} else if(strncasestr(path,"gbook",pathLen,5)) {
		*note = "path has gbook";
		return true;
	} else if(strncasestr(path,"akobook",pathLen,7)) { // vietnamese?
		*note = "path has akobook";
		return true;
	} else if(strncasestr(path,"/gb",pathLen,3)) {
		*note = "path has /gb";
		return true;
	} else if(strncasestr(path,"msg",pathLen,3 )) {
		*note = "path has msg";
		return true;
	} else if(strncasestr(path,"messag",pathLen,6)) {
		*note = "path has messag";
		return true;
	} else if(strncasestr(path,"board",pathLen,5)) {
		*note = "path has board";
		return true;
	} else if(strncasestr(path,"coment",pathLen,6)) {
		*note = "path has coment";
		return true;
	} else if(strncasestr(path,"comment",pathLen,7)) {
		*note = "path has comment";
		return true;
	} else if(strncasestr(path,"linktrader",pathLen,10)) {
		*note = "path has linktrader";
		return true;
	} else if(strncasestr(path,"tradelinks",pathLen,10)) {
		*note = "path has tradelinks";
		return true;
	} else if(strncasestr(path,"trade-links",pathLen,11)) {
		*note = "path has trade-links";
		return true;
	} else if(strncasestr(path,"linkexchange",pathLen,12)) {
		*note = "path has linkexchange";
		return true;
	} else if(strncasestr(path,"link-exchange",pathLen,13)) {
		*note = "path has link-exchange";
		return true;
	} else if(strncasestr(path,"reciprocal-link",pathLen,15)) {
		*note = "path has reciprocal-link";
		return true;
	} else if(strncasestr(path,"reciprocallink",pathLen,14)) {
		*note = "path has reciprocallink";
		return true;
	} else if(strncasestr(path,"/trackbacks/",pathLen,12)) {
		*note = "path has /trackbacks/";
		return true;
	}
	return false;
}


//Check if the document looks like a web statistics page
static bool isWebstatisticsPage(const Xml *xml) {
	// does title contain "web statistics for"?
	int32_t titleLen;
	const char *title = xml->getString("title", &titleLen);
	if(title && titleLen > 0) {
		// normalize title into buffer, remove non alnum chars
		char buf[256];
		char *dst    = buf;
		char *dstEnd = buf + 250;
		const char *src    = title;
		const char *srcEnd = title + titleLen;
		while(dst < dstEnd && src < srcEnd) {
			// remove punct
			if(is_alnum_a(*src) )
				*dst++ = to_lower_a(*src);
			src++;
		}
		*dst = '\0';
		// see if it matches some catch phrases
		bool val = false;
		if      ( strstr (buf,"webstatisticsfor"      )) val = true;
		if      ( strstr (buf,"webserverstatisticsfor")) val = true;
		else if ( strstr (buf,"usagestatisticsfor"    )) val = true;
		else if ( strstr (buf,"siteusageby"           )) val = true;
		else if ( strstr (buf,"surfstatsloganal"      )) val = true;
		else if ( strstr (buf,"webstarterhelpstats"   )) val = true;
		else if ( strstr (buf,"sitestatistics"        )) val = true;
		return val;
	}
	return false;
}


// . we set the bit in linkdb for a doc if this returns true
// . it precludes a doc from voting if its bits is set in linkdb
// . this saves resources
// . the isLinkSpam() function is used when we have the linkee url
// . note is only set if the whole doc can not vote for some reason
// . otherwise, each outlink in "links" is assigned a "note" to indicate if
//   the outlink is a spam link or not
// . returns true on success, false on error
bool setLinkSpam (const Url       *linker             ,
		   int32_t       siteNumInlinks     ,
		   Xml       *xml                ,
		   Links     *links              ,
		   bool       isContentTruncated ) {
	// if the doc got truncated we may be missing valuable identifiers
	// that identify the doc as a guestbook or something
	if ( isContentTruncated ) {
		links->setAllSpamBits("doc too big");
		return true;
	}
	// get linker quality
	//int32_t q = tr->getDocQuality();
	// do not allow .info or .biz to vote ever for now
	const char *tld    = linker->getTLD();
	int32_t  tldLen = linker->getTLDLen();
	if ( tldLen == 4 && strncmp ( tld, "info" , tldLen) == 0 && //q < 55 )
	     siteNumInlinks < 20 ) {
		links->setAllSpamBits("low quality .info linker");
		return true;
	}
	if ( tldLen == 3 && strncmp ( tld, "biz" , tldLen) == 0 && //q < 55 )
	     siteNumInlinks < 20 ) {
		links->setAllSpamBits("low quality .biz linker");
		return true;
	}

	// guestbook in hostname - domain?
	const char *hd  = linker->getHost();
	const char *hd2 = linker->getDomain();
	int32_t  hdlen = hd2 - hd;
	if ( hd && hd2 && hdlen < 30 ) {
		bool hasIt = false;
		if ( strnstr ( hd , "guestbook", hdlen ) ) hasIt = true;
		if ( hasIt ) {
			links->setAllSpamBits("guestbook in hostname");
			return true;
		}
	}

	// do not allow any cgi url to vote
	if ( linker->isCgi() ) {
		links->setAllSpamBits("path is cgi");
		return true;
	}

	// if the page has just one rel=nofollow tag then we know they
	// are not a guestbook
	//if ( links->hasRelNoFollow() ) plen = 0;
	const char *note = NULL;
	if(isLinkfulPath(linker->getPath(),linker->getPathLen(),&note)) {
		links->setAllSpamBits(note);
		return true;
	}

	// does title contain "web statistics for"?
	if(isWebstatisticsPage(xml)) {
		links->setAllSpamBits("stats page");
		return true;
	}

	/////////////////////////////////////////////////////
	//
	// check content for certain keywords and phrases
	//
	/////////////////////////////////////////////////////

	const char *haystack     = xml->getContent();
	int32_t  haystackSize = xml->getContentLen();

	// do not call them "bad links" if our link occurs before any
	// comment section. our link's position therefore needs to be known,
	// that is why we pass in linkPos.
	// "n" is the number it matches.
	NeedleMatch needleMatches1[numNeedles1];

	bool hadPreMatch;
	getMatches2(s_needles1, needleMatches1, numNeedles1, haystack, haystackSize, NULL, &hadPreMatch);

	// see if we got a hit
	char *minPtr = NULL;
	note = NULL;
	for ( int32_t i = 0 ; i < numNeedles1 ; i++ ) {
		// open.thumbshots.org needs multiple counts
		if ( i == 0 && needleMatches1[i].m_count < 5 ) continue;
		// skip if no matches on this string
		if ( needleMatches1[i].m_count <= 0  ) continue;
		// ok, if it had its section bit set to 0 that means the
		// whole page is link spam!
		if ( s_needles1[i].m_isSection == 0 ) {
			links->setAllSpamBits(s_needles1[i].m_string );
			return true;
		}
		// get the char ptr
		char *ptr = needleMatches1[i].m_firstMatch;
		// set to the min
		if ( ! minPtr || ptr < minPtr ) {
			note   = s_needles1[i].m_string;
			minPtr = ptr;
		}
	}

	// convert the char ptr into a link node following it
	int32_t aa = 0;
	if ( minPtr ) aa = links->getNumLinks();
	int32_t mini = -1;
	for ( int32_t i = 0 ; i < aa ; i++ ) {
		// get the link's char ptr into the content
		int32_t  linkNode = links->getNodeNum(i);
		char *linkPos  = NULL;
		if ( linkNode >= 0 ) linkPos = xml->getNode ( linkNode );
		// now we can compare, if BEFORE this comment section
		// indicating tag, we are NOT link spam, so continue
		if ( linkPos < minPtr ) continue;
		// otherwise, we are the first, stop.
		mini = i;
		break;
	}

	// now count all the links BELOW this match as link spam
	// but everyone else is ok!
	if ( minPtr && mini >= 0 )
		links->setSpamBits ( note , mini );

	// now check outlinks on the page for these substrings
	haystack     = links->getLinkBuf();
	haystackSize = links->getLinkBufLen();
	NeedleMatch needleMatches2[numNeedles2];
	getMatches2(s_needles2, needleMatches2, numNeedles2, haystack, haystackSize, NULL, NULL);

	// see if we got a hit
	for ( int32_t i = 0 ; i < numNeedles2 ; i++ ) {
		// skip if did not match
		if ( needleMatches2[i].m_count <= 0 ) continue;
		// the whole doc is considered link spam
		links->setAllSpamBits(s_needles2[i].m_string);
		return true;
	}

	//skiplinks:
	// check for certain post tag, indicative of a comment-friendly blog
	// <form method=post ... action=*comments*cgi-bin>
	// <form method="post"
	//       action="http://www.mydomain.com/cgi-bin/mt-comments.cgi"
	//       name="comments_form" ...>
	// <form method=POST
	//  action="http://peaceaction.org/wboard/wwwboard.cgi">
	int32_t nn = xml->getNumNodes();
	bool gotTextArea = false;
	bool gotSubmit   = false;
	for ( int32_t i=0; i < nn ; i++ ) {
		// <textarea> tags are bad... but only if we have not
		// matched "track" or whatever from above... check for that
		// if you uncommment this... otherwise you disable all blogs!
		// Only do this check if we did match a comment related phrase
		// in s_needles1[] BUT it was BEFORE our outlink. That
		// basically means that we do *not* recognize the format of
		// the comment page and so therefore need to be more
		// restrictive about allowing this page to vote.
		if ( ! hadPreMatch ) {
			// is it a <textarea> tag?
			if ( xml->getNodeId ( i ) == TAG_TEXTAREA )
				gotTextArea = true;
			// is it an <input> tag?
			int32_t len = 0;
			if ( xml->getNodeId ( i ) == TAG_INPUT &&
			     xml->getString(i,"submit",&len)) gotSubmit = true;
		}

		if ( xml->getNodeId ( i ) != TAG_FORM ) continue;

		// get the method field of this base tag
		int32_t  slen;
		char *s = (char *) xml->getString(i,"method",&slen);
		// if not thee, skip it
		if ( ! s || slen <= 0 ) continue;
		// get the action url
		s = (char *) xml->getString(i,"action",&slen);
		if ( ! s || slen <= 0 ) continue;
		char c = s[slen];
		s[slen]='\0';
		bool val = false;
		// this is a bit too strong, but i'ev seen an action of
		// "cgi-bin/mt-leaveone.cgi" so we can't rely on "mt-comment"
		if      ( strstr ( s , "comment" ) ) val = true;
		else if ( strstr ( s , "/MT/" ) ) val = true;
		else if ( strstr ( s , "/mt/" ) ) val = true;
		// they can have these search boxes though
		if ( val && strstr ( s , "/mt/mt-search" ) ) val = false;
		s[slen] = c;
		if ( val ) {
			links->setAllSpamBits("post page");
			return true;
		}
	}

	if ( gotTextArea && gotSubmit ) {
		links->setAllSpamBits("textarea tag");
		return true;
	}

	// edu, gov, etc. can have link chains
	if ( tldLen >= 3 && strncmp ( tld, "edu" , 3) == 0 ) return true;
	if ( tldLen >= 3 && strncmp ( tld, "gov" , 3) == 0 ) return true;

	// if linker is naughty, he cannot vote... how did he make it in?
	if ( linker->isAdult() ) {
		links->setAllSpamBits("linker is sporny");
		return true;
	}

	// . if they link to any adult site, consider them link spam
	// . just consider a 100 link radius around linkNode
	int32_t nl = links->getNumLinks();
	for ( int32_t i = 0 ; i < nl ; i++ ) {
		// skip if this link is internal, we will add it to linkdb
		// anyway... this will save us some processing time
		if ( links->isInternalDom(i) ) continue;
		// otherwise, normalize it...
		Url uu;
		uu.set( links->getLinkPtr( i ), links->getLinkLen( i ) );

		// . is it near sporny links? (naughty domains or lotsa -'s)
		// . if we are in a list of ads, chances are good the true
		//   nature of the ads will emerge...
		if ( uu.isAdult() ) {
			links->setAllSpamBits("has sporny outlinks");
			log(LOG_DEBUG,"build: %s has sporny outlinks.",
			    uu.getUrl());
			return true;
		}

		// check if this url is a link chain
		//if ( q >= 60 ) continue;
		if ( siteNumInlinks >= 50 ) continue;
		const char *np = NULL;
		// get the xml node of link #i
		int32_t xmlNode = links->getNodeNum ( i );
		if ( isLinkChain ( xml , linker, &uu, xmlNode, &np ))
			links->setSpamBit ( np , i );
	}
	return true;
}


bool isLinkSpam ( const Url *linker,
		  int32_t siteNumInlinks ,
		  Xml *xml,
		  Links *links ,
		  int32_t maxDocLen ,
		  const char **note ,
		  const Url *linkee ,
		  // node position of the linkee in the linker's content
		  int32_t  linkNode ) {
	// same host linkers can be link spam (TODO: make same ip block)
	// because we only allow up to 10 to vote as a single voter
	if ( linkee ) {
		const char *h1    = linkee->getHost();
		int32_t  h1len = linkee->getHostLen();
		const char *h2    = linker->getHost();
		int32_t h2len = linker->getHostLen();
		if ( h1len == h2len && strncmp ( h1 , h2 , h1len ) == 0 )
			return false;
	}

	// do not allow .info or .biz to vote ever for now
	const char *tld    = linker->getTLD();
	int32_t  tldLen = linker->getTLDLen();
	if ( tldLen == 4 && strncmp ( tld, "info" , tldLen) == 0 ) {
		*note = ".info tld";
		return true;
	}
	if ( tldLen == 3 && strncmp ( tld, "biz" , tldLen) == 0 ) {
		*note = ".biz tld";
		return true;
	}

	// i saw a german doc get its textarea cut out because of this, so
	// we need this here
	if ( xml && xml->getContentLen() > maxDocLen ) {
		*note ="doc too big";
		return true;
	}

	// guestbook in hostname - domain?
	const char *hd  = linker->getHost();
	const char *hd2 = linker->getDomain();
	int32_t  hdlen = hd2 - hd;
	if ( hd && hd2 && hdlen < 30 ) {
		bool hasIt = false;
		if ( strnstr ( hd , "guestbook", hdlen ) ) hasIt = true;
		if ( hasIt ) {
			*note = "guestbook in hostname";
			return true;
		}
	}

	// do not allow any cgi url to vote
	if ( linker->isCgi() ) { *note = "path is cgi"; return true; }

	if(isLinkfulPath(linker->getPath(),linker->getPathLen(),note))
		return true;

	if( !xml ) {
		return false;
	}

	// does title contain "web statistics for"?
	if(isWebstatisticsPage(xml)) {
		*note = "stats page";
		return true;
	}

	/////////////////////////////////////////////////////
	//
	// check content for certain keywords and phrases
	//
	/////////////////////////////////////////////////////

	const char *haystack     = xml->getContent();
	int32_t  haystackSize = xml->getContentLen();

	char *linkPos = NULL;
	if ( linkNode >= 0 ) linkPos = xml->getNode ( linkNode );

	// do not call them "bad links" if our link occurs before any
	// comment section. our link's position therefore needs to be known,
	// that is why we pass in linkPos.
	NeedleMatch needleMatches1[numNeedles1];
	bool hadPreMatch;
	getMatches2(s_needles1, needleMatches1, numNeedles1, haystack, haystackSize, linkPos, &hadPreMatch);

	// see if we got a hit
	for ( int32_t i = 0 ; i < numNeedles1 ; i++ ) {
		int32_t need = 1;
		// open.thumbshots.org needs multiple counts
		if ( i == 0 ) need = 5;
		if ( needleMatches1[i].m_count < need ) continue;
		*note = s_needles1[i].m_string;
		return true;
	}

	// now check outlinks on the page for these substrings
	haystack     = links->getLinkBuf();
	haystackSize = links->getLinkBufLen();
	NeedleMatch needleMatches2[numNeedles2];
	getMatches2(s_needles2, needleMatches2, numNeedles2, haystack, haystackSize, NULL, NULL);

	// see if we got a hit
	for ( int32_t i = 0 ; i < numNeedles2 ; i++ ) {
		int32_t need = 1;
		// open.thumbshots.org needs multiple counts
		//if ( i == 9 ) need = 5;
		if ( needleMatches2[i].m_count < need ) continue;
		*note = s_needles2[i].m_string;
		return true;
	}

	//skiplinks:
	// check for certain post tag, indicative of a comment-friendly blog
	// <form method=post ... action=*comments*cgi-bin>
	// <form method="post"
	//       action="http://www.mydomain.com/cgi-bin/mt-comments.cgi"
	//       name="comments_form" ...>
	// <form method=POST
	//  action="http://peaceaction.org/wboard/wwwboard.cgi">
	int32_t nn = xml->getNumNodes();
	bool gotTextArea = false;
	bool gotSubmit   = false;
	for ( int32_t i=0; i < nn ; i++ ) {
		// <textarea> tags are bad... but only if we have not
		// matched "track" or whatever from above... check for that
		// if you uncommment this... otherwise you disable all blogs!
		// Only do this check if we did match a comment related phrase
		// in s_needles1[] BUT it was BEFORE our outlink. That
		// basically means that we do *not* recognize the format of
		// the comment page and so therefore need to be more
		// restrictive about allowing this page to vote.
		if ( ! hadPreMatch ) {
			// is it a <textarea> tag?
			if ( xml->getNodeId ( i ) == TAG_TEXTAREA )
				gotTextArea = true;
			// is it an <input> tag?
			int32_t len = 0;
			if ( xml->getNodeId ( i ) == TAG_INPUT &&
			     xml->getString(i,"submit",&len)) gotSubmit = true;
		}

		if ( xml->getNodeId ( i ) != TAG_FORM ) continue;

		// get the method field of this base tag
		int32_t  slen;
		char *s = (char *) xml->getString(i,"method",&slen);
		// if not thee, skip it
		if ( ! s || slen <= 0 ) continue;
		// get the action url
		s = (char *) xml->getString(i,"action",&slen);
		if ( ! s || slen <= 0 ) continue;
		char c = s[slen];
		s[slen]='\0';
		bool val = false;
		// this is a bit too strong, but i'ev seen an action of
		// "cgi-bin/mt-leaveone.cgi" so we can't rely on "mt-comment"
		if      ( strstr ( s , "comment" ) ) val = true;
		else if ( strstr ( s , "/MT/" ) ) val = true;
		else if ( strstr ( s , "/mt/" ) ) val = true;
		// they can have these search boxes though
		if ( val && strstr ( s , "/mt/mt-search" ) ) val = false;
		s[slen] = c;
		if ( val ) { *note = "post page"; return true; }
	}

	if ( gotTextArea && gotSubmit ) {
		*note = "textarea tag";
		return true;
	}

	// edu, gov, etc. can have link chains
	if ( tldLen >= 3 && strncmp ( tld, "edu" , 3) == 0 ) return false;
	if ( tldLen >= 3 && strncmp ( tld, "gov" , 3) == 0 ) return false;

	// if linker is naughty, he cannot vote
	if ( linker->isAdult() )
		return true;

	// if being called from PageTitledb.cpp for displaying a titlerec,
	// then do not call this, because no linkee is provided in that case.
	if ( !linkee ) {
		*note = "linkee not found";
		return false;//true;
	}

	// . if they link to any adult site, consider them link spam
	// . just consider a 100 link radius around linkNode
	int32_t nl = links->getNumLinks();

	// init these before the loop
	int32_t  hlen  = linkee->getHostLen();
	const char *host  = linkee->getHost();
	const char *uu    = linkee->getUrl();
	const char *uuend = host + hlen;
	int32_t  uulen = uuend - uu;
	int32_t  x     = linkNode;
 loop:

	// return true right away if it is a link chain
	if ( siteNumInlinks < 1000 &&
	     isLinkChain ( xml , linker, linkee , x , note ) )
		return true;

	// if no domain, that's it
	if ( ! uu || uulen <= 0 ) return false;

	// . see if this domain is linked to in other areas of the document.
	// . if any of those areas are not link chains, then assume we are
	//   not a link chain
	for ( x++ ; x < nl ; x++ ) {
		char *link = links->getLinkPtr(x);
		int32_t  linkLen = links->getLinkLen(x);
		if ( ! link          ) continue;
		if ( linkLen <= 0    ) continue;
		if ( linkLen > uulen ) continue;
		if ( strncmp ( link , uu , uulen ) != 0 ) continue;
		// got a match, is it a link chain? if not, them we are not
		goto loop;
	}

	return false;

}

// Criteria for being a link chain:
//
// 1. the "linkee" is in a chain of outlinks to external domains
// 2. all outlinks to the same hostname as "linkee" are in link chains
// 3. no plain text is present between "linkee" and one of the other
//    outlinks in the chain
// 4. this might hurt blogrolls, and resource pages, but such links
//    are kind of low quality anyway.
static bool isLinkChain ( Xml *xml, const Url *linker, const Url *linkee, int32_t linkNode, const char **note ) {

	//log(LOG_DEBUG,"build: doing %s",linker->m_url);

	// if the linkee is internal (by domain) then not a link chain
	if ( linkee->getDomainLen() == linker->getDomainLen() &&
	     strncmp ( linkee->getDomain() , linker->getDomain(),linkee->getDomainLen())==0)
		return false;

	const char *linkPos = NULL;
	if ( linkNode >= 0 ) linkPos = xml->getNode ( linkNode );

	// did we have text to the left/right of this link and after/before
	// the neighboring link? assume not.
	bool leftText  = false;
	bool rightText = false;

	// the links on the left and right
	Url  leftUrl;
	Url  rightUrl;
	bool leftMalformed  = false;
	bool rightMalformed = false;
	// these do not have constructors so we must reset them
	leftUrl.reset();
	rightUrl.reset();
	int32_t i ;
	// . see if we are alone in a table or not
	// . table must occur before/after our left/right neighbor link
	bool tableLeft   = false;
	bool tableRight  = false;

	// going backwards from linkNode we are not in a link
	bool inLink = false;

	// get the start of an anchor tag on our immediate left
	for ( i = linkNode - 1 ; i >= 0 ; i-- ) {
		// do not look too far
		if ( linkPos - xml->getNode(i) >= 1500 ) break;
		if ( linkNode - i >= 90                ) break;
		// NOTE: if you add more tags to this list, then also add
		// to Vector::setPairHashes() as well
		// stop at <title> or </title> tags
		if ( xml->getNodeId(i) == TAG_TITLE ) break;
		// stop at <ul> or </ul> tags
		// no, otherwise, these lists are always "link chain left"
		//if ( xml->getNodeId(i) == TAG_UL ) break;
		// stop at <table> or </table> tags
		if ( xml->getNodeId(i) == TAG_TABLE ) {
			if ( ! xml->isBackTag(i) ) tableLeft = true;
			break;
		}

		// check for *plain* text
		if ( ! inLink && xml->getNodeId(i) == TAG_TEXTNODE ) {
			// get the node as a string
			char *p    = xml->getNode(i);
			char *pend = p + xml->getNodeLen(i);
			// check for elipsis, that is a sign that we are a serp
			for ( char *s = p ; s+2 < pend ; s++ ) {
				//if ( is_alnum(*s) ) break;
				if ( *s != '.' ) continue;
				s++;
				if ( *s != '.' ) continue;
				s++;
				if ( *s != '.' ) continue;
				// ok, got it
				*note = "search result right";
				return true;
			}
			// if we already got text, but searching still for ...
			if ( leftText ) continue;
			// does it have alnum
			if ( ! has_alpha_utf8 ( p , pend ) ) continue;
			leftText = true;
			// do not break yet, cont search for ellipsis!
		}

		// keep chugging if not an anchor tag, <a> or </a>
		if ( xml->getNodeId(i) != TAG_A ) continue;
		// if we are </a> then we are now in a link since we are moving
		// backwards
		if ( xml->isBackTag(i) ) { inLink = true; continue; }
		// if we hit a forward tag and inLink was false... we had
		// no corresponding back tag, so disconsider any text
		if ( ! inLink ) rightText = false;
		// no longer in an <a> tag
		inLink = false;

		// ok, get the url from this anchor tag
		int32_t  ulen = 0;
		char *u = (char *) xml->getString ( i, "href", &ulen );
		// if we did not get one, that means it could have been
		// malformed... like the href had a quote right b4 it
		if ( ulen == 0 ) leftMalformed = true;
		// normalize
		if ( ulen > 0 )
			leftUrl.set( linker, u, ulen );
		// . if NOT from the same domain, break out, otherwise continue
		// . this helps us find the <table> tag in ad tables with
		//   multiple links to the same domain
		// . this helps us accept a list of links to the same domain if
		//   there is left/right text, like the guy that had a list
		//   to 3 different gigablast.com links in a row with no
		//   text in between
		if ( leftUrl.getDomainLen() != linkee->getDomainLen()  )
			break;
		if( strncmp(leftUrl.getDomain(), linkee->getDomain(), linkee->getDomainLen()) != 0 )
			break;
	}

	// we start off in link text, since linkNode is an <a> tag
	inLink = true;
	// now loop through all the nodes after us
	for ( i = linkNode + 1 ; i < xml->getNumNodes() ; i++ ) {
		// stop if we've gone too far
		if ( xml->getNode(i) - linkPos >= 1580 ) break;
		if ( i - linkNode >= 95                ) break;
		// stop at <title> or </title> tags
		if ( xml->getNodeId(i) == TAG_TITLE ) break;
		// stop at <table> or </table> tags
		if ( xml->getNodeId(i) == TAG_TABLE ) {
			// note it for table ads
			if ( xml->isBackTag(i) ) tableRight = true;
			break;
		}

		// check for *plain* text
		if ( ! inLink && xml->getNodeId(i) == TAG_TEXTNODE ) {
			// get the node as a string
			char *p    = xml->getNode(i);
			char *pend = p + xml->getNodeLen(i);
			// check for elipsis, that is a sign that we are a serp
			for ( char *s = p ; s+2 < pend ; s++ ) {
				//if ( is_alnum(*s) ) break;
				if ( *s != '.' ) continue;
				s++;
				if ( *s != '.' ) continue;
				s++;
				if ( *s != '.' ) continue;
				// ok, got it
				*note = "search result right";
				return true;
			}
			// if we already got text, but searching still for ...
			if ( rightText ) continue;
			// does it have alnum
			if ( ! has_alpha_utf8 ( p , pend ) ) continue;
			rightText = true;
			// do not break yet, cont search for ellipsis!
		}

		// keep chugging if not an anchor tag, <a> or </a>
		if ( xml->getNodeId(i) != TAG_A ) continue;
		// skip if not a forward tag
		if ( xml->isBackTag(i) ) { inLink = false; continue; }
		// we are now in a link
		inLink = true;
		// stop text here
		//stopTextScan = i;
		// ok, get the url
		int32_t  ulen = 0;
		char *u = (char *) xml->getString ( i, "href", &ulen );
		// if we did not get one, that means it could have been
		// malformed... like the href had a quote right b4 it
		if ( ulen == 0 ) rightMalformed = true;
		// normalize
		if ( ulen > 0 )
			rightUrl.set( linker, u, ulen );
		// . if NOT from the same domain, break out, otherwise continue
		// . this helps us find the <table> tag in ad tables with
		//   multiple links to the same domain
		// . this helps us accept a list of links to the same domain if
		//   there is left/right text, like the guy that had a list
		//   to 3 different gigablast.com links in a row with no
		//   text in between
		if ( rightUrl.getDomainLen() != linkee->getDomainLen() )
			break;
		if ( strncmp(rightUrl.getDomain(), linkee->getDomain(), linkee->getDomainLen()) != 0 )
			break;
	}

	if ( tableLeft && tableRight ) {
		*note = "ad table";
		return true;
	}

	// if we had text on both sides of us, we are not a link chain
	if ( leftText && rightText ) return false;

	if      ( ! leftText  && rightText ) *note = "link chain left";
	else if ( ! rightText && leftText  ) *note = "link chain right";
	else                                 *note = "link chain middle";

	return true;
}