privacore-open-source-searc.../PageAddUrl.cpp
Ivan Skytte Jørgensen beeddcf35d Got rid of gb-include.h
2018-07-26 17:29:51 +02:00

185 lines
5.1 KiB
C++

#include "Pages.h"
#include "HttpServer.h"
#include "Collectiondb.h"
#include "Msg4Out.h"
#include "Spider.h"
#include "Parms.h"
#include "GigablastRequest.h"
#include "PageCrawlBot.h"
#include "Conf.h"
#include "Mem.h"
#include "Errno.h"
static bool sendReply( void *state );

// Completion callback passed to Msg4Out's addMetaList() by sendPageAddUrl2().
// "state" is the GigablastRequest that was handed to addMetaList(); all this
// does is forward it to sendReply(), which sends the page and frees the state.
// A callback has nobody to report to, so sendReply()'s result is dropped.
static void addedUrlsToSpiderdbWrapper ( void *state ) {
	static_cast<void>( sendReply ( state ) );
}
// . admin-side "add url" page; regular users go through sendPageAddUrl()
//   in PageRoot.cpp
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool sendPageAddUrl2 ( TcpSocket *sock , HttpRequest *hr ) {
	// nothing may be added while the server is in read-only mode
	if ( g_conf.m_readOnlyMode ) {
		g_errno = EREADONLYMODE;
		return g_httpServer.sendErrorReply(sock,500,mstrerror(g_errno));
	}

	// pull the cgi parms we need out of the request
	int32_t urlListLen = 0;
	const char *urlList = hr->getString ( "urls" , &urlListLen , NULL /*default*/);
	const char replyFormat = hr->getReplyFormat();
	const char *coll = hr->getString("c");

	// XML/JSON callers must supply both "c" and "urls"; the html page
	// is allowed to come in without them
	const bool apiReply = ( replyFormat == FORMAT_XML || replyFormat == FORMAT_JSON );
	if ( apiReply && ! coll ) {
		g_errno = EMISSINGINPUT;
		return g_httpServer.sendErrorReply(sock,500,"missing c parm. See /admin/api to see parms.");
	}
	if ( apiReply && ! urlList ) {
		g_errno = EMISSINGINPUT;
		return g_httpServer.sendErrorReply(sock,500,"missing urls parm. See /admin/api to see parms.");
	}

	// look up the collection the request refers to
	CollectionRec *collRec = g_collectiondb.getRec ( hr );
	if ( ! collRec ) {
		g_errno = ENOCOLLREC;
		return g_httpServer.sendErrorReply(sock,500,mstrerror(g_errno));
	}

	// allocate the per-request state; it is freed by sendReply() or by
	// the error path below
	GigablastRequest *req;
	try {
		req = new (GigablastRequest);
	}
	catch(std::bad_alloc&) {
		g_errno = ENOMEM;
		log( LOG_WARN, "PageAddUrl: new(%i): %s", (int)sizeof(GigablastRequest),mstrerror(g_errno) );
		return g_httpServer.sendErrorReply(sock, 500, mstrerror(g_errno));
	}
	mnew ( req , sizeof(GigablastRequest) , "PageAddUrl" );

	// fill in the request state (including req->m_hr) from the parms
	g_parms.setGigablastRequest ( sock , hr , req );

	// no urls given: just show the (blank) page
	if ( ! urlList ) {
		return sendReply ( req );
	}

	// turn the url list into a meta list of spider requests
	const bool built = getSpiderRequestMetaList ( urlList, &req->m_listBuf , req->m_harvestLinks, NULL );
	const int32_t listSize = req->m_listBuf.length();

	// building failed, or it produced nothing to add
	if ( ! built || listSize == 0 ) {
		// an empty list is reported as missing input
		if ( listSize == 0 ) {
			g_errno = EMISSINGINPUT;
		}
		const bool sent = g_httpServer.sendErrorReply(req);
		mdelete ( req , sizeof(*req) , "PageAddUrl" );
		delete req;
		return sent;
	}

	// add to spiderdb; a false return means we blocked and
	// addedUrlsToSpiderdbWrapper() will send the reply later
	const bool added = req->m_msg4.addMetaList(&(req->m_listBuf), collRec->m_collnum, req, addedUrlsToSpiderdbWrapper);
	if ( ! added ) {
		return false;
	}

	// did not block, so send the page right away
	sendReply ( req );
	return true;
}
bool sendReply ( void *state ) {
GigablastRequest *gr = (GigablastRequest *)state;
// in order to see what sites are being added log it, then we can
// more easily remove sites from sitesearch.gigablast.com that are
// being added but not being searched
SafeBuf xb;
if ( gr->m_urlsBuf ) {
xb.safeTruncateEllipsis ( gr->m_urlsBuf , 200 );
log( LOG_INFO, "http: add url %s (%s)", xb.getBufStart(), mstrerror( g_errno ) );
}
char format = gr->m_hr.getReplyFormat();
TcpSocket *sock = gr->m_socket;
if ( format == FORMAT_JSON || format == FORMAT_XML ) {
bool status = g_httpServer.sendSuccessReply ( gr );
// nuke state
mdelete ( gr , sizeof(*gr) , "PageAddUrl" );
delete (gr);
return status;
}
int32_t ulen = 0;
const char *url = gr->m_urlsBuf;
if ( url ) ulen = strlen (url);
// re-null it out if just http://
bool printUrl = true;
if ( ulen == 0 ) printUrl = false;
if ( ! gr->m_urlsBuf ) printUrl = false;
if ( ulen==7 && printUrl && !strncasecmp(gr->m_url,"http://",7))
printUrl = false;
if ( ulen==8 && printUrl && !strncasecmp(gr->m_url,"https://",8))
printUrl = false;
// page is not more than 32k
StackBuf<1024*32+MAX_URL_LEN*2> sb;
g_pages.printAdminTop ( &sb , sock , &gr->m_hr );
// if there was an error let them know
SafeBuf mbuf;
if ( g_errno ) {
mbuf.safePrintf("<center><font color=red>");
mbuf.safePrintf("Error adding url(s): <b>%s[%i]</b>", mstrerror(g_errno) , g_errno);
mbuf.safePrintf("</font></center>");
} else if ( printUrl ) {
mbuf.safePrintf("<center><font color=red>");
mbuf.safePrintf("<b><u>");
mbuf.safeTruncateEllipsis(gr->m_urlsBuf,200);
mbuf.safePrintf("</u></b></font> added to spider queue successfully<br><br>");
mbuf.safePrintf("</font></center>");
}
if ( mbuf.length() ) {
sb.safeStrcpy( mbuf.getBufStart() );
}
g_parms.printParmTable ( &sb , sock , &gr->m_hr );
// print the final tail
g_pages.printTail ( &sb, true ); // admin?
// clear g_errno, if any, so our reply send goes through
g_errno = 0;
// nuke state
mdelete ( gr , sizeof(GigablastRequest) , "PageAddUrl" );
delete (gr);
return g_httpServer.sendDynamicPage( sock, sb.getBufStart(), sb.length(), -1 ); // cachetime
}