2018-01-12 15:24:24 +01:00

412 lines
10 KiB

#include "SafeBuf.h"
#include "HttpRequest.h"
#include "HttpServer.h"
#include "SearchInput.h"
#include "PageCrawlBot.h"
#include "Collectiondb.h"
#include "Pages.h"
#include "Parms.h"
#include "Spider.h"
#include "SpiderColl.h"
#include "SpiderLoop.h"
#include "PageResults.h" // for RESULT_HEIGHT
#include "Stats.h"
#include "PageRoot.h"
// 5 seconds
// main > Basic > Settings
//
// printSitePatternExamples
//
// Appends the "Site List Examples" help table (anchored as
// <a name=examples>) to *sb for the collection selected by the request.
//
//   sb - output buffer the HTML is printed into
//   hr - incoming request; used only to look up the CollectionRec
//
// Returns true on every path visible here, including when no collection
// can be resolved (nothing is printed in that case).
//
// NOTE(review): this copy of the function is missing lines -- several
// safePrintf() calls are not closed, some runs of string literals are not
// attached to any visible call, and closing braces are absent. The comments
// below describe only what is visible; confirm against the full file.
bool printSitePatternExamples ( SafeBuf *sb , HttpRequest *hr ) {
// true = useDefault?
CollectionRec *cr = g_collectiondb.getRec ( hr , true );
// no collection: nothing to print, but still report success
if ( ! cr ) return true;
// it is a safebuf parm
// NOTE(review): siteList is not referenced again in the visible lines.
// It is also a non-const char * that may be pointed at a string literal;
// const char * would be safer -- confirm usage in the full function.
char *siteList = cr->m_siteListBuf.getBufStart();
if ( ! siteList ) siteList = "";
// msgBuf collects a warning for the user; status holds an extra HTML
// attribute string (" disabled" when the list is too big, else empty)
SafeBuf msgBuf;
char *status = "";
// byte threshold above which the site list is considered too large
// to display on the page
int32_t max = 1000000;
if ( cr->m_siteListBuf.length() > max ) {
msgBuf.safePrintf( "<font color=red><b>"
"Site list is over %" PRId32" bytes large, "
"too many to "
"display on this web page. Please use the "
"file upload feature only for now."
, max );
status = " disabled";
// NOTE(review): the literals below (command-line usage help and the start
// of a file-upload <input>) have no visible enclosing call in this
// excerpt; the surrounding statements appear to have been lost.
"On the command like you can issue a command like "
"gb addurls &lt; fileofurls.txt"
"</i> or "
"gb addfile &lt; *.html"
"</i> or "
"gb injecturls &lt; fileofurls.txt"
"</i> or "
"gb injectfile &lt; *.html"
"</i> or "
"to schedule downloads or inject content directly "
"into Gigablast."
"<input "
"size=20 "
"type=file "
// example table
// Each description below documents one kind of site-list line; the
// example patterns themselves (inside the <i></i> pairs) are blank in
// this excerpt.
sb->safePrintf ( "<a name=examples></a>"
"<table %s>"
"<tr class=hdrow><td colspan=2>"
"<center><b>Site List Examples</b></tr></tr>"
//"<tr bgcolor=#%s>"
//"<td>Spider all urls encountered. If you just submit "
//"this by itself, then Gigablast will initiate spidering "
//"automatically at, an internet "
//"directory of good sites.</td>"
"Spider the url <i></i> and spider "
"any links we harvest that have the domain "
// protocol and subdomain match
"Spider the url "
"<i></i> and spider "
"any links we harvest that start with "
"<i></i>. NOTE: if the url "
" redirects to then "
" still gets spidered "
"because it is considered to be manually added, but "
"no other urls from will be spidered."
// protocol and subdomain match
"Spider the url "
"<i></i> and spider "
"any links we harvest that start with "
"<i></i>. "
"Urls that start with "
"<i>http://<b>www.</b></i>, for example, "
"will NOT match this."
"Spider the url <i></i>. "
"Add any outlinks we find into the "
"spider queue, but those outlinks will only be "
"spidered if they "
"match ANOTHER line in this site list."
// protocol and subdomain match
"Allow any urls starting with "
"<i></i> to be spidered "
"if encountered."
// subdomain match
"Allow any urls starting with "
"<i></i> to be spidered "
"if encountered."
"Do not spider any urls starting with "
"<i></i> to be spidered "
"if encountered."
// domain match
"Allow any urls starting with "
"<i></i> to be spidered "
"if encountered."
// spider this subdir
"Allow any urls starting with "
"<i></i> "
"to be spidered "
"if encountered."
// exact match
//"Allow this specific url."
// local subdir match
"Spider all files in the given subdirectory or lower. "
"Do not spider files in this subdirectory."
// connect to a device and index it as a stream
//"Connect to a device and index it as a stream. "
//"It will be treated like a single huge document for "
//"searching purposes with chunks being indexed in "
//"realtime. Or chunk it up into individual document "
//"chunks, but proximity term searching will have to "
//"be adjusted to compute query term distances "
// negative subdomain match
// (the next two rows describe "contains" style matches: positive, then
// negative)
"<td>Spider any url containing <i>goodstuff</i>."
"<td>Do not spider any url containing <i>badstuff</i>."
// regular expression match
"<td>Url must match this regular expression. "
"Try to avoid using these if possible; they can slow "
"things down and are confusing to use."
// tag match
//"<td>tag:boots contains:boots<br>"
"<nobr>tag:boots site:www.westernfootwear."
"tag:boots contains:/boots<br>"
"Advance users only. "
"Tag any urls matching these 5 url patterns "
"so we can use "
"the expression <i>tag:boots</i> in the "
"<a href=\"/admin/filters\">url filters</a> and perhaps "
"give such urls higher spider priority. "
"For more "
"precise spidering control over url subsets. "
"Preceed any pattern with the tagname followed by "
"space to tag it."
// comment / blank line handling
"<td># This line is a comment.</td>"
"<td>Empty lines and lines starting with # are "
// NOTE(review): the end of the safePrintf() call (closing literal,
// arguments for the "%s" in "<table %s>", and the ");") is not visible.
return true;
// main > Basic > Status
//
// sendPageBasicStatus
//
// Builds the Basic > Status admin page for the collection selected by the
// request and sends it on the given socket.
//
//   socket - client connection the reply is sent on
//   hr     - incoming request; supplies the reply format and the collection
//
// Visible return paths:
//   - unknown collection: sends a 500 "invalid collection" error reply and
//     returns true
//   - otherwise: returns whatever g_httpServer.sendDynamicPage() returns
//
// For FORMAT_JSON / FORMAT_XML the body comes from printCrawlDetails2();
// for FORMAT_HTML the page is a two-pane table (widget on the left, crawl
// stats on the right).
//
// NOTE(review): this copy of the function is missing lines -- closing
// braces, the argument lists of both sendDynamicPage() calls, and the start
// of the stats safePrintf() are not visible. Confirm against the full file.
bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
// reply is accumulated here before being sent
StackBuf<128000> sb;
char format = hr->getReplyFormat();
// true = usedefault coll?
CollectionRec *cr = g_collectiondb.getRec ( hr , true );
if ( ! cr ) {
g_httpServer.sendErrorReply(socket,500,"invalid collection");
return true;
// machine-readable replies: print crawl details and send right away
if ( format == FORMAT_JSON || format == FORMAT_XML) {
// this is in PageCrawlBot.cpp
printCrawlDetails2 ( &sb , cr , format );
// content type defaults to XML and is switched for JSON
const char *ct = "text/xml";
if ( format == FORMAT_JSON ) ct = "application/json";
return g_httpServer.sendDynamicPage (socket,
0, // cachetime
false,//POSTReply ,
// print standard header
if ( format == FORMAT_HTML ) {
// this prints the <form tag as well
g_pages.printAdminTop ( &sb , socket , hr );
// table to split between widget and stats in left and right panes
sb.safePrintf("<TABLE id=pane>"
"<TR><TD valign=top>");
// buffer offsets recorded around the widget output
// NOTE(review): in the visible lines these are assigned only inside the
// FORMAT_HTML branch below and are otherwise uninitialized -- confirm
// they are not read on other paths.
int32_t savedLen1, savedLen2;
// widget
// put the widget in here, just sort results by spidered date
// the scripts do "infinite" scrolling both up and down.
// but if you are at the top then new results will load above
// you and we try to maintain your current visual state even though
// the scrollbar position will change.
if ( format == FORMAT_HTML ) {
// save position so we can output the widget code
// so user can embed it into their own web page
savedLen1 = sb.length();
// NOTE(review): both offsets are taken back to back here; the widget
// printing that presumably sat between them is not visible.
savedLen2 = sb.length();
// the right table pane is the crawl stats
sb.safePrintf("</TD><TD valign=top>");
// show stats
// getSpiderStatusMsg() fills in both the message and the status code
const char *crawlMsg;
spider_status_t crawlStatus;
getSpiderStatusMsg ( cr , &crawlMsg, &crawlStatus );
// NOTE(review): the opening of this print call is not visible. The visible
// format uses PRId32 while the argument is cast to (int), and no specifier
// for crawlMsg is visible -- verify the specifiers match the arguments.
"<table id=stats border=0 cellpadding=5>"
"<td><b>Crawl Status Code:</td>"
"<td>%" PRId32"</td>"
"<td><b>Crawl Status Msg:</td>"
, (int)crawlStatus
, crawlMsg);
// end the right table pane
//if ( format != FORMAT_JSON )
// // wrap up the form, print a submit button
// g_pages.printAdminBottom ( &sb );
return g_httpServer.sendDynamicPage (socket,
0); // cachetime