| #include "SafeBuf.h"
 | |
| #include "HttpRequest.h"
 | |
| #include "HttpServer.h"
 | |
| #include "SearchInput.h"
 | |
| #include "PageCrawlBot.h"
 | |
| #include "Collectiondb.h"
 | |
| #include "Pages.h"
 | |
| #include "Parms.h"
 | |
| #include "Spider.h"
 | |
| #include "SpiderColl.h"
 | |
| #include "SpiderLoop.h"
 | |
| #include "PageResults.h" // for RESULT_HEIGHT
 | |
| #include "Stats.h"
 | |
| #include "PageRoot.h"
 | |
| 
 | |
| 
 | |
// widget reload interval, in milliseconds (1 second)
#define DEFAULT_WIDGET_RELOAD 1000
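
// Illustrative only (the widget markup itself is emitted elsewhere, not in
// this section): a millisecond interval like DEFAULT_WIDGET_RELOAD would
// typically be dropped into the widget's reload script, e.g.
//
//   sb.safePrintf("<script>setTimeout(function(){location.reload();},%i);"
//                 "</script>", DEFAULT_WIDGET_RELOAD);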

///////////
//
// main > Basic > Settings
//
///////////

bool printSitePatternExamples ( SafeBuf *sb , HttpRequest *hr ) {

	// true = useDefault?
	CollectionRec *cr = g_collectiondb.getRec ( hr , true );
	if ( ! cr ) return true;

	/*
	// it is a safebuf parm
	char *siteList = cr->m_siteListBuf.getBufStart();
	if ( ! siteList ) siteList = "";

	SafeBuf msgBuf;
	char *status = "";
	int32_t max = 1000000;
	if ( cr->m_siteListBuf.length() > max ) {
		msgBuf.safePrintf( "<font color=red><b>"
				   "Site list is over %" PRId32" bytes large, "
				   "too many to "
				   "display on this web page. Please use the "
				   "file upload feature only for now."
				   "</b></font>"
				   , max );
		status = " disabled";
	}
	*/

	/*
	sb->safePrintf(
		       "On the command line you can issue a command like "

		       "<i>"
		       "gb addurls < fileofurls.txt"
		       "</i> or "

		       "<i>"
		       "gb addfile < *.html"
		       "</i> or "

		       "<i>"
		       "gb injecturls < fileofurls.txt"
		       "</i> or "

		       "<i>"
		       "gb injectfile < *.html"
		       "</i> "

		       "to schedule downloads or inject content directly "
		       "into Gigablast."

		       "</td><td>"

		       "<input "
		       "size=20 "
		       "type=file "
		       "name=urls>"
		       "</td></tr>"

		       );
	*/

	// example table
	sb->safePrintf ( "<a name=examples></a>"
			 "<table %s>"
			 "<tr class=hdrow><td colspan=2>"
			 "<center><b>Site List Examples</b></center></td></tr>"
			 //"<tr bgcolor=#%s>"
			 //"<td>"
			 ,TABLE_STYLE );//, DARK_BLUE);

	sb->safePrintf(
		       //"*"
		       //"</td>"
		       //"<td>Spider all urls encountered. If you just submit "
		       //"this by itself, then Gigablast will initiate spidering "
		       //"automatically at dmoz.org, an internet "
		       //"directory of good sites.</td>"
		       //"</tr>"

		      "<tr>"
		      "<td>goodstuff.com</td>"
		      "<td>"
		      "Spider the url <i>goodstuff.com/</i> and spider "
		      "any links we harvest that have the domain "
		      "<i>goodstuff.com</i>"
		      "</td>"
		      "</tr>"

		      // protocol and subdomain match
		      "<tr>"
		      "<td>http://www.goodstuff.com/</td>"
		      "<td>"
		      "Spider the url "
		      "<i>http://www.goodstuff.com/</i> and spider "
		      "any links we harvest that start with "
		      "<i>http://www.goodstuff.com/</i>. NOTE: if the url "
		      "www.goodstuff.com redirects to foo.goodstuff.com then "
		      "foo.goodstuff.com still gets spidered "
		      "because it is considered to be manually added, but "
		      "no other urls from foo.goodstuff.com will be spidered."
		      "</td>"
		      "</tr>"

		      // protocol and subdomain match
		      "<tr>"
		      "<td>http://justdomain.com/foo/</td>"
		      "<td>"
		      "Spider the url "
		      "<i>http://justdomain.com/foo/</i> and spider "
		      "any links we harvest that start with "
		      "<i>http://justdomain.com/foo/</i>. "
		      "Urls that start with "
		      "<i>http://<b>www.</b>justdomain.com/</i>, for example, "
		      "will NOT match this."
		      "</td>"
		      "</tr>"

		      "<tr>"
		      "<td>seed:www.goodstuff.com/myurl.html</td>"
		      "<td>"
		      "Spider the url <i>www.goodstuff.com/myurl.html</i>. "
		      "Add any outlinks we find into the "
		      "spider queue, but those outlinks will only be "
		      "spidered if they "
		      "match ANOTHER line in this site list."
		      "</td>"
		      "</tr>"


		      // protocol and subdomain match
		      "<tr>"
		      "<td>site:http://www.goodstuff.com/</td>"
		      "<td>"
		      "Allow any urls starting with "
		      "<i>http://www.goodstuff.com/</i> to be spidered "
		      "if encountered."
		      "</td>"
		      "</tr>"

		      // subdomain match
		      "<tr>"
		      "<td>site:www.goodstuff.com</td>"
		      "<td>"
		      "Allow any urls starting with "
		      "<i>www.goodstuff.com/</i> to be spidered "
		      "if encountered."
		      "</td>"
		      "</tr>"

		      "<tr>"
		      "<td>-site:bad.goodstuff.com</td>"
		      "<td>"
		      "Do not allow any urls starting with "
		      "<i>bad.goodstuff.com/</i> to be spidered "
		      "if encountered."
		      "</td>"
		      "</tr>"

		      // domain match
		      "<tr>"
		      "<td>site:goodstuff.com</td>"
		      "<td>"
		      "Allow any urls starting with "
		      "<i>goodstuff.com/</i> to be spidered "
		      "if encountered."
		      "</td>"
		      "</tr>"

		      // spider this subdir
		      "<tr>"
		      "<td><nobr>site:"
		      "http://www.goodstuff.com/goodir/anotherdir/</nobr></td>"
		      "<td>"
		      "Allow any urls starting with "
		      "<i>http://www.goodstuff.com/goodir/anotherdir/</i> "
		      "to be spidered "
		      "if encountered."
		      "</td>"
		      "</tr>"


		      // exact match

		      //"<tr>"
		      //"<td>exact:http://xyz.goodstuff.com/myurl.html</td>"
		      //"<td>"
		      //"Allow this specific url."
		      //"</td>"
		      //"</tr>"

		      /*
		      // local subdir match
		      "<tr>"
		      "<td>file://C/mydir/mysubdir/"
		      "<td>"
		      "Spider all files in the given subdirectory or lower. "
		      "</td>"
		      "</tr>"

		      "<tr>"
		      "<td>-file://C/mydir/mysubdir/baddir/"
		      "<td>"
		      "Do not spider files in this subdirectory."
		      "</td>"
		      "</tr>"
		      */

		      // connect to a device and index it as a stream
		      //"<tr>"
		      //"<td>stream:/dev/eth0"
		      //"<td>"
		      //"Connect to a device and index it as a stream. "
		      //"It will be treated like a single huge document for "
		      //"searching purposes with chunks being indexed in "
		      //"realtime. Or chunk it up into individual document "
		      //"chunks, but proximity term searching will have to "
		      //"be adjusted to compute query term distances "
		      //"inter-document."
		      //"</td>"
		      //"</tr>"

		      // substring match
		      "<tr>"
		      "<td>contains:goodstuff</td>"
		      "<td>Spider any url containing <i>goodstuff</i>."
		      "</td>"
		      "</tr>"

		      "<tr>"
		      "<td>-contains:badstuff</td>"
		      "<td>Do not spider any url containing <i>badstuff</i>."
		      "</td>"
		      "</tr>"

		      /*
		      "<tr>"
		      "<td>regexp:-pid=[0-9A-Z]+/</td>"
		      "<td>Url must match this regular expression. "
		      "Try to avoid using these if possible; they can slow "
		      "things down and are confusing to use."
		      "</td>"
		      "</tr>"
		      */

		      // tag match
		      "<tr><td>"
		      //"<td>tag:boots contains:boots<br>"
		      "<nobr>tag:boots site:www.westernfootwear."
		      "</nobr>com<br>"
		      "tag:boots cowboyshop.com<br>"
		      "tag:boots contains:/boots<br>"
		      "tag:boots site:www.moreboots.com<br>"
		      "<nobr>tag:boots http://lotsoffootwear.com/"
		      "</nobr><br>"
		      //"<td>t:boots -contains:www.cowboyshop.com/shoes/</td>"
		      "</td><td>"
		      "Advanced users only. "
		      "Tag any urls matching these 5 url patterns "
		      "so we can use "
		      "the expression <i>tag:boots</i> in the "
		      "<a href=\"/admin/filters\">url filters</a> and perhaps "
		      "give such urls higher spider priority. "
		      "This gives more "
		      "precise spidering control over url subsets. "
		      "Precede any pattern with the tagname followed by a "
		      "space to tag it."
		      "</td>"
		      "</tr>"


		      "<tr>"
		      "<td># This line is a comment.</td>"
		      "<td>Empty lines and lines starting with # are "
		      "ignored."
		      "</td>"
		      "</tr>"

		      "</table>"
		      );

	return true;
}
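
// For reference, a site list combining the patterns documented in the
// examples table above might look like the following (hypothetical domains;
// each line is independent, and # starts a comment):
//
//   # spider this whole domain
//   goodstuff.com
//   # seed one url; its outlinks are only spidered if they match another line
//   seed:www.goodstuff.com/myurl.html
//   # allow a subdirectory, but block a bad subdomain
//   site:http://www.goodstuff.com/goodir/anotherdir/
//   -site:bad.goodstuff.com
//   # substring filters
//   contains:goodstuff
//   -contains:badstuff
//   # tag matching urls for use as tag:boots in the url filters
//   tag:boots site:www.moreboots.com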

///////////
//
// main > Basic > Status
//
///////////
bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
	StackBuf<128000> sb;

	char format = hr->getReplyFormat();

	// true = useDefault coll?
	CollectionRec *cr = g_collectiondb.getRec ( hr , true );
	if ( ! cr ) {
		g_httpServer.sendErrorReply(socket,500,"invalid collection");
		return true;
	}

	if ( format == FORMAT_JSON || format == FORMAT_XML ) {
		// this is in PageCrawlBot.cpp
		printCrawlDetails2 ( &sb , cr , format );
		const char *ct = "text/xml";
		if ( format == FORMAT_JSON ) ct = "application/json";
		return g_httpServer.sendDynamicPage (socket,
						     sb.getBufStart(),
						     sb.length(),
						     0, // cachetime
						     false, // POSTReply
						     ct);
	}
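
	// Illustrative only: assuming this handler is registered under a
	// path like /admin/status in Pages.cpp (hypothetical here) and the
	// collection is named "main", the machine-readable variants could be
	// fetched with e.g.
	//   curl "http://host:port/admin/status?c=main&format=json"
	//   curl "http://host:port/admin/status?c=main&format=xml"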

	// print standard header
	if ( format == FORMAT_HTML ) {
		// this prints the <form tag as well
		g_pages.printAdminTop ( &sb , socket , hr );

		// table to split between widget and stats in left and right panes
		sb.safePrintf("<TABLE id=pane>"
			      "<TR><TD valign=top>");
	}

	int32_t savedLen1, savedLen2;

	//
	// widget
	//
	// put the widget in here, just sort results by spidered date
	//
	// the scripts do "infinite" scrolling both up and down.
	// but if you are at the top then new results will load above
	// you and we try to maintain your current visual state even though
	// the scrollbar position will change.
	//
	if ( format == FORMAT_HTML ) {

		// save position so we can output the widget code
		// so user can embed it into their own web page
		savedLen1 = sb.length();

		savedLen2 = sb.length();

		// the right table pane is the crawl stats
		sb.safePrintf("</TD><TD valign=top>");

		//
		// show stats
		//
		const char *crawlMsg;
		spider_status_t crawlStatus;
		getSpiderStatusMsg ( cr , &crawlMsg, &crawlStatus );

		sb.safePrintf(
			      "<table id=stats border=0 cellpadding=5>"

			      "<tr>"
			      "<td><b>Crawl Status Code:</b></td>"
			      "<td>%" PRId32"</td>"
			      "</tr>"

			      "<tr>"
			      "<td><b>Crawl Status Msg:</b></td>"
			      "<td>%s</td>"
			      "</tr>"
			      , (int)crawlStatus
			      , crawlMsg);

		sb.safePrintf("</table>\n\n");

		// end the right table pane
		sb.safePrintf("</TD></TR></TABLE>");
	}

	//if ( format != FORMAT_JSON )
	//	// wrap up the form, print a submit button
	//	g_pages.printAdminBottom ( &sb );

	return g_httpServer.sendDynamicPage (socket,
					     sb.getBufStart(),
					     sb.length(),
					     0); // cachetime
}