Files
privacore-open-source-searc…/PageAddUrl.cpp

184 lines
5.1 KiB
C++
Raw Normal View History

2013-08-02 13:12:24 -07:00
#include "gb-include.h"
#include "Pages.h"
#include "Collectiondb.h"
2017-03-16 16:04:27 +01:00
#include "Msg4Out.h"
2013-08-02 13:12:24 -07:00
#include "Spider.h"
2014-02-10 21:45:03 -07:00
#include "Parms.h"
#include "GigablastRequest.h"
#include "Conf.h"
2016-12-08 16:56:09 +01:00
#include "Mem.h"
2013-08-02 13:12:24 -07:00
static bool sendReply( void *state );
// . callback passed to gr->m_msg4.addMetaList() by sendPageAddUrl2()
// . "state" is handed straight to sendReply(), which builds and sends the
//   reply; its bool result is intentionally dropped since this callback
//   returns void
static void addedUrlsToSpiderdbWrapper ( void *state ) {
	(void)sendReply ( state );
}
2013-08-02 13:12:24 -07:00
// . returns false if blocked, true otherwise
// . sets g_errno on error
2014-02-09 12:38:40 -07:00
// . add url page for admin, users use sendPageAddUrl() in PageRoot.cpp
bool sendPageAddUrl2 ( TcpSocket *sock , HttpRequest *hr ) {
// or if in read-only mode
2014-07-10 16:42:22 -07:00
if ( g_conf.m_readOnlyMode ) {
g_errno = EREADONLYMODE;
const char *msg = mstrerror(g_errno);
return g_httpServer.sendErrorReply(sock,500,msg);
}
2013-08-02 13:12:24 -07:00
// . get fields from cgi field of the requested url
// . get the search query
2014-11-10 14:45:11 -08:00
int32_t urlLen = 0;
const char *urls = hr->getString ( "urls" , &urlLen , NULL /*default*/);
char format = hr->getReplyFormat();
const char *c = hr->getString("c");
if ( ! c && (format == FORMAT_XML || format == FORMAT_JSON) ) {
g_errno = EMISSINGINPUT;
const char *msg = "missing c parm. See /admin/api to see parms.";
return g_httpServer.sendErrorReply(sock,500,msg);
2013-08-02 13:12:24 -07:00
}
2014-02-09 12:38:40 -07:00
if ( ! urls && (format == FORMAT_XML || format == FORMAT_JSON) ) {
g_errno = EMISSINGINPUT;
const char *msg = "missing urls parm. See /admin/api to see parms.";
return g_httpServer.sendErrorReply(sock,500,msg);
}
2013-08-02 13:12:24 -07:00
// get collection rec
CollectionRec *cr = g_collectiondb.getRec ( hr );
2013-08-02 13:12:24 -07:00
// bitch if no collection rec found
if ( ! cr ) {
g_errno = ENOCOLLREC;
const char *msg = mstrerror(g_errno);
return g_httpServer.sendErrorReply(sock,500,msg);
2013-08-02 13:12:24 -07:00
}
2014-02-09 12:38:40 -07:00
2013-08-02 13:12:24 -07:00
// make a new state
GigablastRequest *gr;
try { gr = new (GigablastRequest); }
2017-05-07 20:51:33 +02:00
catch(std::bad_alloc&) {
2013-08-02 13:12:24 -07:00
g_errno = ENOMEM;
log( LOG_WARN, "PageAddUrl: new(%i): %s", (int)sizeof(GigablastRequest),mstrerror(g_errno) );
return g_httpServer.sendErrorReply(sock, 500, mstrerror(g_errno));
2013-08-02 13:12:24 -07:00
}
mnew ( gr , sizeof(GigablastRequest) , "PageAddUrl" );
2013-08-02 13:12:24 -07:00
// this will fill in GigablastRequest so all the parms we need are set
// set this. also sets gr->m_hr
g_parms.setGigablastRequest ( sock , hr , gr );
2013-08-02 13:12:24 -07:00
// if no url given, just print a blank page
if ( ! urls ) return sendReply ( gr );
2013-08-02 13:12:24 -07:00
// do not spider links for spots
bool status = getSpiderRequestMetaList ( (char*)urls, &gr->m_listBuf , gr->m_harvestLinks, NULL );
2014-11-10 14:45:11 -08:00
int32_t size = gr->m_listBuf.length();
// error / not list
if ( ! status || !size ) {
// nuke it
if ( !size ) {
g_errno = EMISSINGINPUT;
}
2016-03-10 00:02:26 +01:00
bool rc = g_httpServer.sendErrorReply(gr);
mdelete ( gr , sizeof(*gr) , "PageAddUrl" );
2016-03-10 00:02:26 +01:00
delete gr;
return rc;
2013-08-02 13:12:24 -07:00
}
// add to spiderdb
if (!gr->m_msg4.addMetaList(&(gr->m_listBuf), cr->m_collnum, gr, addedUrlsToSpiderdbWrapper)) {
// blocked!
2013-08-02 13:12:24 -07:00
return false;
}
2013-08-02 13:12:24 -07:00
// did not block, print page!
sendReply ( gr );
return true;
2013-08-02 13:12:24 -07:00
}
bool sendReply ( void *state ) {
GigablastRequest *gr = (GigablastRequest *)state;
2013-08-02 13:12:24 -07:00
// in order to see what sites are being added log it, then we can
// more easily remove sites from sitesearch.gigablast.com that are
// being added but not being searched
SafeBuf xb;
if ( gr->m_urlsBuf ) {
xb.safeTruncateEllipsis ( gr->m_urlsBuf , 200 );
log( LOG_INFO, "http: add url %s (%s)", xb.getBufStart(), mstrerror( g_errno ) );
}
char format = gr->m_hr.getReplyFormat();
TcpSocket *sock = gr->m_socket;
if ( format == FORMAT_JSON || format == FORMAT_XML ) {
bool status = g_httpServer.sendSuccessReply ( gr );
// nuke state
mdelete ( gr , sizeof(*gr) , "PageAddUrl" );
delete (gr);
return status;
}
2014-11-10 14:45:11 -08:00
int32_t ulen = 0;
2016-03-10 00:20:03 +01:00
const char *url = gr->m_urlsBuf;
if ( url ) ulen = strlen (url);
2013-08-02 13:12:24 -07:00
// re-null it out if just http://
bool printUrl = true;
if ( ulen == 0 ) printUrl = false;
if ( ! gr->m_urlsBuf ) printUrl = false;
if ( ulen==7 && printUrl && !strncasecmp(gr->m_url,"http://",7))
printUrl = false;
if ( ulen==8 && printUrl && !strncasecmp(gr->m_url,"https://",8))
2013-08-02 13:12:24 -07:00
printUrl = false;
2013-08-02 13:12:24 -07:00
// page is not more than 32k
StackBuf<1024*32+MAX_URL_LEN*2> sb;
2013-08-02 13:12:24 -07:00
g_pages.printAdminTop ( &sb , sock , &gr->m_hr );
2013-08-02 13:12:24 -07:00
// if there was an error let them know
SafeBuf mbuf;
2013-08-02 13:12:24 -07:00
if ( g_errno ) {
mbuf.safePrintf("<center><font color=red>");
mbuf.safePrintf("Error adding url(s): <b>%s[%i]</b>", mstrerror(g_errno) , g_errno);
mbuf.safePrintf("</font></center>");
} else if ( printUrl ) {
mbuf.safePrintf("<center><font color=red>");
mbuf.safePrintf("<b><u>");
mbuf.safeTruncateEllipsis(gr->m_urlsBuf,200);
mbuf.safePrintf("</u></b></font> added to spider queue successfully<br><br>");
mbuf.safePrintf("</font></center>");
2013-08-02 13:12:24 -07:00
}
if ( mbuf.length() ) {
sb.safeStrcpy( mbuf.getBufStart() );
}
g_parms.printParmTable ( &sb , sock , &gr->m_hr );
2014-02-09 12:38:40 -07:00
2013-08-02 13:12:24 -07:00
// print the final tail
2014-02-09 12:38:40 -07:00
g_pages.printTail ( &sb, true ); // admin?
2013-08-02 13:12:24 -07:00
// clear g_errno, if any, so our reply send goes through
g_errno = 0;
2014-02-09 12:38:40 -07:00
2013-08-02 13:12:24 -07:00
// nuke state
mdelete ( gr , sizeof(GigablastRequest) , "PageAddUrl" );
delete (gr);
2013-08-02 13:12:24 -07:00
return g_httpServer.sendDynamicPage( sock, sb.getBufStart(), sb.length(), -1 ); // cachetime
2013-08-02 13:12:24 -07:00
}