#include "Collectiondb.h" #include "HttpServer.h" #include "Stats.h" #include "Statistics.h" #include "Query.h" #include "Speller.h" #include "Msg40.h" #include "Pages.h" #include "Highlight.h" #include "SearchInput.h" #include <math.h> #include "SafeBuf.h" #include "Pos.h" #include "Bits.h" #include "sort.h" #include "CountryCode.h" #include "Posdb.h" #include "PosdbTable.h" #include "PageResults.h" #include "PageRoot.h" #include "Proxy.h" #include "Json.h" #include "Images.h" //Thumbnail* #include "HttpMime.h" #include "Process.h" #include "Linkdb.h" #include "XmlDoc.h" #include "ip.h" #include "GbUtil.h" #include "Conf.h" #include "Mem.h" #include "RobotsBlockedResultOverride.h" #include "QueryLanguage.h" #include "FxLanguage.h" #include "Errno.h" #ifdef _VALGRIND_ #include <valgrind/memcheck.h> #endif static bool printSearchResultsHeader(State0 *st); static bool printResult(State0 *st, int32_t ix, int32_t *numPrintedSoFar); static bool printSearchResultsTail(State0 *st); static bool printSearchFiltersBar ( SafeBuf *sb , HttpRequest *hr ) ; static bool printMenu ( SafeBuf *sb , int32_t menuNum , HttpRequest *hr ) ; static void gotQueryLanguageWrapper(void *state, const std::vector<std::pair<lang_t, double>> &languages); static bool gotQueryLanguage(State0 *st, const std::vector<std::pair<lang_t, double>> &languages); static void gotResultsWrapper ( void *state ) ; static bool gotResults ( void *state ) ; static bool replaceParm ( const char *cgi , SafeBuf *newUrl , HttpRequest *hr ) ; static bool replaceParm2 ( const char *cgi , SafeBuf *newUrl , const char *oldUrl , int32_t oldUrlLen ) ; static bool printPairScore(SafeBuf *sb, const SearchInput *si, const PairScore *ps, Msg20Reply *mr); static bool printScoresHeader ( SafeBuf *sb ) ; static bool printMetaContent ( Msg40 *msg40 , int32_t i ,State0 *st, SafeBuf *sb ); static bool printSingleScore(SafeBuf *sb, const SearchInput *si, const SingleScore *ss, Msg20Reply *mr); static bool printSingleTerm(SafeBuf *sb, const Query *q, const SingleScore *ss); static bool printTermPairs(SafeBuf *sb, const Query *q, const PairScore *ps); static bool printLogoAndSearchBox (SafeBuf *sb , class HttpRequest *hr, const SearchInput *si ); State0::State0() : m_sb() , m_header(true) , m_collnum(-1) , m_si() , m_msg40() , m_socket(nullptr) , m_startTime(0) , m_gotResults(false) , m_errno(0) , m_numDocIds(0) , m_took(0) , m_hr() , m_qesb() , m_xd(nullptr) , m_socketStartTimeHack(0) , m_primaryQueryLanguage(langUnknown) { } static bool sendReply(State0 *st, const char *reply, int32_t rlen) { int32_t savedErr = g_errno; TcpSocket *sock = st->m_socket; if ( ! sock ) { log("results: not sending back results on an empty socket." "socket must have closed on us abruptly."); //g_process.shutdownAbort(true); } } SearchInput *si = &st->m_si; const char *ct = "text/html"; if ( si->m_format == FORMAT_XML ) ct = "text/xml"; if ( si->m_format == FORMAT_JSON ) ct = "application/json"; const char *charset = "utf-8"; char format = si->m_format; // . filter anything < 0x20 to 0x20 to keep XML legal // . except \t, \n and \r, they're ok // . gotta set "f" down here in case it realloc'd the buf if ( format == FORMAT_XML && reply ) { unsigned char *f = (unsigned char *)reply; for ( ; *f ; f++ ) if ( *f < 0x20 && *f!='\t' && *f!='\n' && *f!='\r' ) *f = 0x20; } logf(LOG_DEBUG,"gb: sending back %" PRId32" bytes",rlen); Statistics::register_query_time(si->m_q.m_numWords, si->m_queryLangId, savedErr, (gettimeofdayInMilliseconds() - st->m_startTime)); // . log the time // . do not do this if g_errno is set lest m_sbuf1 be bogus b/c // it failed to allocate its buf to hold terminating \0 in // SearchInput::setQueryBuffers() if ( ! g_errno && st->m_took >= g_conf.m_logQueryTimeThreshold ) { logf(LOG_WARN,"Query took %" PRId64" ms for %s. results=%" PRId32, st->m_took, si->m_sbuf1.getBufStart(), st->m_msg40.getNumResults()); } if ( ! savedErr ) { // . one hour cache time... no 1000 hours, basically infinite // . no because if we redo the query the results are cached //int32_t cacheTime = 3600;//*1000; // no... do not use cache int32_t cacheTime = -1; // the "Check it" link on add url uses &usecache=0 to tell // the browser not to use its cache... //if ( hr->getLong("usecache",-1) == 0 ) cacheTime = 0; // // send back the actual search results // if ( sock ) g_httpServer.sendDynamicPage(sock, reply, rlen,//strlen(reply), // don't let the ajax re-gen // if they hit the back button! // so make this 1 hour, not 0 cacheTime, // cachetime in secs false, // POSTReply? ct, -1, // httpstatus -1 -> 200 NULL, // cookieptr charset ); // free st after sending reply since "st->m_sb" = "reply" mdelete(st, sizeof(State0), "PageResults2"); delete st; return true; } mdelete(st, sizeof(State0), "PageResults2"); delete st; // if we had a broken pipe from the browser while sending // them the search results, then we end up closing the socket fd // in TcpServer::sendChunk() > sendMsg() > destroySocket() if ( sock && sock->m_numDestroys ) { log("results: not sending back error on destroyed socket " "sd=%" PRId32,sock->m_sd); return true; } int32_t status = 500; if (savedErr == EBADREQUEST || savedErr == ENOPERM || savedErr == ENOCOLLREC) status = 400; if ( sock ) g_httpServer.sendQueryErrorReply(sock, status, mstrerror(savedErr), format,//xml, savedErr, "There was an error!"); return true; } static bool printCSSHead(SafeBuf *sb) { sb->safePrintf( "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML " "4.01 Transitional//EN\">\n" //"<meta http-equiv=\"Content-Type\" " //"content=\"text/html; charset=utf-8\">\n" "<html>\n" "<head>\n" "<title>Gigablast Search Results</title>\n" "<style><!--" "body {" "font-family:Arial, Helvetica, sans-serif;" ); sb->safePrintf( "color: #000000;" "font-size: 12px;" //"margin: 20px 5px;" "}" "a:link {color:#00c}" "a:visited {color:#551a8b}" "a:active {color:#f00}" ".bold {font-weight: bold;}" ".bluetable {background:#d1e1ff;" "margin-bottom:15px;font-size:12px;}" ".url {color:#008000;}" ".cached, .cached a {font-size: 10px;" "color: #666666;" "}" "table {" "font-family:Arial, Helvetica, sans-serif;" "color: #000000;" "font-size: 12px;" "}" ".directory {font-size: 16px;}" "-->\n" "</style>\n" "</head>\n" ); return true; } // . returns false if blocked, true otherwise // . sets g_errno on error // . "msg" will be inserted into the access log for this request bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) { // make a new state State0 *st; try { st = new State0(); } catch(std::bad_alloc&) { g_errno = ENOMEM; log(LOG_ERROR, "query: Query failed. " "Could not allocate %" PRId32" bytes for query. " "Returning HTTP status of 500.",(int32_t)sizeof(State0)); Statistics::register_query_time(0, langUnknown, g_errno, 0); return g_httpServer.sendQueryErrorReply(s, 500, mstrerror(g_errno), hr->getReplyFormat(), g_errno, "Query failed. Could not allocate memory to execute a search. Please try later."); } mnew(st, sizeof(State0), "PageResults2"); // copy yhits if (!st->m_hr.copy(hr)) { return sendReply(st,nullptr,0); } // set this in case SearchInput::set fails! st->m_socket = s; // record timestamp so we know if we got our socket closed and swapped st->m_socketStartTimeHack = s->m_startTime; // you have to say "&header=1" to get back the header for json now. // later on maybe it will default to on. st->m_header = hr->getLong("header", 1); // get query parameters int32_t queryLen = 0; const char *query = hr->getString("q", &queryLen, ""); const char *fx_qlang = hr->getString("fx_qlang", nullptr, ""); const char *fx_blang = hr->getString("fx_blang", nullptr, ""); const char *fx_country = hr->getString("fx_country", nullptr, ""); const char *fx_fetld = hr->getString("fx_fetld", nullptr, ""); // First try our built-in langauge detection which uses CLD2 and the top-quality accept-language item std::string contentLanguage; if (strlen(fx_qlang) && getLangIdFromAbbr(fx_qlang) != langUnknown) { contentLanguage = fx_qlang; } else if (strlen(fx_blang)) { // en-GB,en;q=0.8,fr;q=0.6,en-US;q=0.4 std::string blang_hint(fx_blang); std::vector<std::pair<std::string, double>> blang_quality_pairs; // split between entries auto items = split(blang_hint, ','); for (const auto &item : items) { // en-US;q=0.4 auto pairs = split(item, ';'); double q = 1; if (pairs.size() == 2) { // has language quality factor auto nvp = split(pairs[1], '='); if (nvp.size() == 2) { q = strtod(nvp[1].c_str(), nullptr); } } // en-US auto tokens = split(pairs[0], '-'); blang_quality_pairs.emplace_back(pairs[0], q); } std::sort(blang_quality_pairs.begin(), blang_quality_pairs.end(), [](const std::pair<std::string, double> &a, const std::pair<std::string, double> &b) { return a.second > b.second; }); auto it = std::unique(blang_quality_pairs.begin(), blang_quality_pairs.end(), [](const std::pair<std::string, double> &a, const std::pair<std::string, double> &b) { return a.first == b.first; }); blang_quality_pairs.erase(it, blang_quality_pairs.end()); if (blang_quality_pairs.size()) { double max_weight = blang_quality_pairs[0].second; for (auto blang_quality_pair : blang_quality_pairs) { if (!contentLanguage.empty()) { contentLanguage.append(","); } // append all blang with max weight if (blang_quality_pair.second == max_weight) { contentLanguage.append(blang_quality_pair.first); continue; } break; } } } const char *tld_hint = fx_fetld; if (!tld_hint || strlen(tld_hint) == 0 || strlen(tld_hint) != 2) { tld_hint = fx_country; } st->m_primaryQueryLanguage = FxLanguage::getLangIdCLD2(true, query, queryLen, contentLanguage.c_str(), contentLanguage.size(), tld_hint, strlen(tld_hint), true); //We can't use st->m_si.m_baseScoringParameters.allLanguageWeightsAreTheSame() because the cgi parameters have not set the members in SearchInput yet. //So we have to check if any CGI parameter using the "lw_" prefix has been specified. bool any_specific_language_weight_specified = false; for(int32_t i = 0; i < hr->getNumFields(); i++) { if(!hr->getValue(i)) continue; const char *full_field_name = hr->getField(i); if(strncmp(full_field_name,"lw_",3)==0) { any_specific_language_weight_specified = true; break; } } if(!any_specific_language_weight_specified) { // Then try the external language detection server if (g_queryLanguage.getLanguage(st, gotQueryLanguageWrapper, fx_qlang, fx_blang, fx_country, fx_fetld, query)) { // blocked return false; } } else log(LOG_DEBUG,"Explicit language weight specified. Not contacting external language detection server"); return gotQueryLanguage(st, {}); } static void gotQueryLanguageWrapper(void *state, const std::vector<std::pair<lang_t, double>> &language_weights) { State0 *st = reinterpret_cast<State0 *>(state); if (!language_weights.empty()) { st->m_primaryQueryLanguage = language_weights.front().first; } gotQueryLanguage(st,language_weights); } static bool gotQueryLanguage(State0 *st, const std::vector<std::pair<lang_t, double>> &language_weights) { // . parse it up // . this returns false and sets g_errno and, maybe, g_msg on error SearchInput *si = &st->m_si; // si just copies the ptr into the httprequest // into stuff like SearchInput::m_defaultSortLanguage // so do not use the "hr" on the stack. SearchInput:: // m_hr points to the hr we pass into // SearchInput::set if (!si->set(st->m_socket, &st->m_hr, st->m_primaryQueryLanguage, language_weights)) { log("query: set search input: %s",mstrerror(g_errno)); if ( ! g_errno ) g_errno = EBADENGINEER; return sendReply(st,NULL,0); } // save collnum now st->m_collnum = si->m_cr ? si->m_cr->m_collnum : -1; st->m_numDocIds = si->m_docsWanted; // add query stat st->m_startTime = gettimeofdayInMilliseconds(); // debug msg log(LOG_DEBUG, "query: Getting search results for q=%s", st->m_si.m_displayQuery); // for now disable queries if (!g_conf.m_queryingEnabled) { g_errno = EQUERYINGDISABLED; return sendReply(st, nullptr,0); } // LAUNCH RESULTS // . get some results from it // . this returns false if blocked, true otherwise // . it also sets g_errno on error // . use a niceness of 0 for all queries so they take precedence // over the indexing process // . this will copy our passed "query" and "coll" to it's own buffer // . we print out matching docIds to int32_t if m_isDebug is true // . no longer forward this, since proxy will take care of evenly // distributing its msg 0xfd "forward" requests now st->m_gotResults = st->m_msg40.getResults(si, false, st, gotResultsWrapper); // save error st->m_errno = g_errno; // wait for results? if (!st->m_gotResults) { return false; } // otherwise call gotResults which returns false if blocked, true else // and sets g_errno on error return gotResults(st); } static void gotResultsWrapper ( void *state ) { // cast our State0 class from this State0 *st = (State0 *) state; // save error st->m_errno = g_errno; // mark as gotten st->m_gotResults = true; gotResults(st); } // . make a web page from results stored in msg40 // . send it on TcpSocket "s" when done // . returns false if blocked, true otherwise // . sets g_errno on error static bool gotResults ( void *state ) { // cast our State0 class from this State0 *st = (State0 *) state; int64_t nowMS = gettimeofdayInMilliseconds(); // log the time int64_t took = nowMS - st->m_startTime; // record that st->m_took = took; //log("results: debug: in gotResults state=%" PTRFMT,(PTRTYPE)st); // grab the query Msg40 *msg40 = &(st->m_msg40); //char *q = msg40->getQuery(); //int32_t qlen = msg40->getQueryLen(); SearchInput *si = &st->m_si; // if we lost the socket because we were streaming and it // got closed from a broken pipe or something, then Msg40.cpp // will set st->m_socket to NULL if the fd ends up ending closed // because someone else might be using it and we do not want to // mess with their TcpSocket settings. if ( ! st->m_socket ) { log("results: socket is NULL. sending failed."); return sendReply(st,NULL,0); } // if we skipped a shard because it was dead, usually we provide // the results anyway, but if this switch is true then return an // error code instead. this is the 'all or nothing' switch. if ( msg40->m_msg3a.m_skippedShards > 0 && ! g_conf.m_returnResultsAnyway ) { char reply[256]; sprintf ( reply , "%" PRId32" shard(s) out of %" PRId32" did not " "respond to query." , msg40->m_msg3a.m_skippedShards , g_hostdb.m_numShards ); g_errno = ESHARDDOWN; return sendReply(st,reply,strlen(reply)); } // collection rec must still be there since SearchInput references // into it, and it must be the SAME ptr too! CollectionRec *cr = si->m_cr;//g_collectiondb.getRec ( collnum ); if ( ! cr ) { // || cr != si->m_cr ) { g_errno = ENOCOLLREC; return sendReply(st,NULL,0); } // this causes ooms everywhere, not a good fix if ( ! msg40->m_msg20 && ! si->m_docIdsOnly && msg40->m_errno ) { log("msg40: failed to get results q=%s",si->m_q.originalQuery()); //g_errno = ENOMEM; g_errno = msg40->m_errno; return sendReply(st,NULL,0); } int32_t numResults = msg40->getNumResults(); SafeBuf *sb = &st->m_sb; // print logo, search box, results x-y, ... into st->m_sb printSearchResultsHeader ( st ); // then print each result // don't display more than docsWanted results int32_t count = msg40->getDocsWanted(); bool hadPrintError = false; int32_t numPrintedSoFar = 0; for ( int32_t i = 0 ; count > 0 && i < numResults ; i++ ) { ////////// // // prints in xml or html // ////////// if ( ! printResult ( st , i , &numPrintedSoFar ) ) { hadPrintError = true; break; } // limit it count--; } if ( hadPrintError ) { if ( ! g_errno ) g_errno = EBADENGINEER; log("query: had error: %s",mstrerror(g_errno)); } // wrap it up with Next 10 etc. printSearchResultsTail ( st ); // END SERP DIV if ( si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_AJAX ) sb->safePrintf("</div>"); // send it off sendReply(st, st->m_sb.getBufStart(), st->m_sb.length()); return true; } static bool printLeftNavColumn ( SafeBuf &sb, State0 *st ) { SearchInput *si = &st->m_si; CollectionRec *cr = si->m_cr; char format = si->m_format; if ( format == FORMAT_HTML ) { const char *title = "Search Results"; sb.safePrintf("<title>Gigablast - %s</title>\n",title); sb.safePrintf("<style><!--\n"); sb.safePrintf("body {\n"); sb.safePrintf("font-family:Arial, Helvetica, sans-serif;\n"); sb.safePrintf("color: #000000;\n"); sb.safePrintf("font-size: 12px;\n"); sb.safePrintf("margin: 0px 0px;\n"); sb.safePrintf("letter-spacing: 0.04em;\n"); sb.safePrintf("}\n"); sb.safePrintf("a {text-decoration:none;}\n"); sb.safePrintf(".bold {font-weight: bold;}\n"); sb.safePrintf(".bluetable {background:#d1e1ff;" "margin-bottom:15px;font-size:12px;}\n"); sb.safePrintf(".url {color:#008000;}\n"); sb.safePrintf(".cached, .cached a {font-size: 10px;" "color: #666666;\n"); sb.safePrintf("}\n"); sb.safePrintf("table {\n"); sb.safePrintf("font-family:Arial, Helvetica, sans-serif;\n"); sb.safePrintf("color: #000000;\n"); sb.safePrintf("font-size: 12px;\n"); sb.safePrintf("}\n"); sb.safePrintf(".directory {font-size: 16px;}\n" ".nav {font-size:20px;align:right;}\n" ); sb.safePrintf("-->\n"); sb.safePrintf("</style>\n"); sb.safePrintf("\n"); sb.safePrintf("</head>\n"); sb.safePrintf("<script>\n"); sb.safePrintf("<!--\n"); sb.safePrintf("var openmenu=''; var inmenuclick=0;"); sb.safePrintf("function x(){document.f.q.focus();}\n"); sb.safePrintf("// --></script>\n"); sb.safePrintf("<body " "onmousedown=\"" "if (openmenu != '' && inmenuclick==0) {" "document.getElementById(openmenu)." "style.display='none'; openmenu='';" "}" "inmenuclick=0;" "\" " "onload=\"x()\">\n"); // // DIVIDE INTO TWO PANES, LEFT COLUMN and MAIN COLUMN // sb.safePrintf("<TABLE border=0 height=100%% cellpadding=0 " "cellspacing=0>" "\n<TR>\n"); // // first the nav column // // . also prints <TD>...</TD>. true=isSearchresults // . tabName = "search" printLeftColumnRocketAndTabs ( &sb , true , cr , "search" ); // // now the MAIN column // sb.safePrintf("\n</TD>" "<TD valign=top style=padding-left:30px;>\n"); } return true; } static bool printIgnoredWords ( SafeBuf *sb , const SearchInput *si ) { // mention ignored query terms // we need to set another Query with "keepAllSingles" set to false const Query *qq2 = &si->m_q; bool firstIgnored = true; for ( int32_t i = 0 ; i < qq2->m_numWords ; i++ ) { //if ( si->m_xml ) break; const QueryWord *qw = &qq2->m_qwords[i]; // only print out words ignored cuz they were stop words if ( qw->m_ignoreWord != IGNORE_QSTOP ) continue; // print header -- we got one if ( firstIgnored ) { if ( si->m_format == FORMAT_XML ) sb->safePrintf ("\t<ignoredWords><![CDATA["); else if ( si->m_format == FORMAT_JSON ) sb->safePrintf ("\t\"ignoredWords\":\""); else if ( si->m_format == FORMAT_HTML ) sb->safePrintf ("<br><font " "color=\"#707070\">The " "following query words " "were ignored: " "<b>"); firstIgnored = false; } // print the word sb->utf8Encode2(qw->m_word, qw->m_wordLen); sb->safePrintf (" "); } // print tail if we had ignored terms if ( ! firstIgnored ) { sb->incrementLength(-1); if ( si->m_format == FORMAT_XML ) sb->safePrintf("]]></ignoredWords>\n"); else if ( si->m_format == FORMAT_JSON ) sb->safePrintf("\",\n"); else if ( si->m_format == FORMAT_HTML ) sb->safePrintf ("</b>. Preceed each with a '+' or " "wrap in " "quotes to not ignore.</font>"); } return true; } static bool printSearchResultsHeader(State0 *st) { const SearchInput *si = &st->m_si; // grab the query Msg40 *msg40 = &(st->m_msg40); const char *q = msg40->getQuery(); int32_t qlen = msg40->getQueryLen(); //char local[ 128000 ]; //SafeBuf sb(local, 128000); SafeBuf *sb = &st->m_sb; // reserve 1.5MB now! if ( ! sb->reserve(1500000 ,"pgresbuf" ) ) // 128000) ) return false; // just in case it is empty, make it null terminated sb->nullTerm(); // print first [ for json if ( si->m_format == FORMAT_JSON ) { if ( st->m_header ) sb->safePrintf("{\n"); // this is just for diffbot really... else sb->safePrintf("[\n"); } CollectionRec *cr = si->m_cr; HttpRequest *hr = &st->m_hr; // if there's a ton of sites use the post method otherwise // they won't fit into the http request, the browser will reject // sending such a large request with "GET" const char *method = "GET"; if ( si->m_sites && strlen(si->m_sites)>800 ) method = "POST"; if ( si->m_format == FORMAT_HTML && cr->m_htmlHead.length() ) { return expandHtml ( *sb , cr->m_htmlHead.getBufStart(), cr->m_htmlHead.length(), q, qlen, hr, si, method, cr); } if ( si->m_format == FORMAT_HTML ) { printCSSHead (sb); sb->safePrintf("<body>"); } if ( si->m_format==FORMAT_WIDGET_IFRAME ) { printCSSHead(sb); sb->safePrintf("<body style=padding:0px;margin:0px;>"); int32_t refresh = hr->getLong("refresh",0); if ( refresh ) sb->safePrintf("<meta http-equiv=\"refresh\" " "content=%" PRId32">",refresh); } // lead with user's widget header which usually has custom style tags if ( si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_AJAX ) { const char *header = hr->getString("header",NULL); if ( header ) sb->safeStrcpy ( header ); } // if we are xml/json we call this below otherwise we lose // the header of <?xml...> or whatever if ( si->m_format == FORMAT_HTML ) { printLeftNavColumn ( *sb,st ); printLogoAndSearchBox ( sb, &st->m_hr, si ); } // the calling function checked this so it should be non-null const char *coll = cr->m_coll; int32_t collLen = strlen(coll); if ( si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_AJAX ) { int32_t widgetwidth = hr->getLong("widgetwidth",150); int32_t widgetHeight = hr->getLong("widgetheight",400); //int32_t iconWidth = 25; // put image in this div which will have top:0px JUST like // the div holding the search results we print out below // so that the image does not scroll when you use the // scrollbar. holds the magifying glass img and searchbox. sb->safePrintf("<div class=magglassdiv " "style=\"position:absolute;" "right:15px;" "z-index:10;" "top:0px;\">"); //int32_t refresh = hr->getLong("refresh",15); const char *oq = hr->getString("q",NULL); if ( ! oq ) oq = ""; const char *prepend = hr->getString("prepend"); if ( ! prepend ) prepend = ""; const char *displayStr = "none"; if ( prepend[0] ) displayStr = ""; // to do a search we need to re-call the ajax, // just call reload like the one that is called every 15s or so sb->safePrintf("<form "//method=get action=/search " // use "1" as arg to force reload "onsubmit=\"widget123_reload(1);" // let user know we are loading "var w=document.getElementById(" "'widget123_scrolldiv');" // just set the widget content to the reply "if (w) " "w.innerHTML='<br><br><b>Loading Results..." "</b>';" // prevent it from actually submitting "return false;\">"); sb->safePrintf("<img " "style=\"" //"position:absolute;" // absolute or relative? // put it on TOP of the other stuff "z-index:10;" "margin-top:3px;" //"right:10px;" //"right:2px;" //"width:%" PRId32"px;" // so we are to the right of the searchbox "float:right;" "\" " "onclick=\"" "var e=document.getElementById('sbox');" "if(e.style.display == 'none') {" "e.style.display = '';" // give it focus "var qb=document.getElementById('qbox');" "qb.focus();" "} else {" "e.style.display = 'none';" "}" "\" " // end function " " "width=35 " "height=31 " "src=\"/magglass.png\">" ); //char *origq = hr->getString("q"); // we sort all results by spider date now so PREPEND // the actual user query const char *origq = hr->getString("prepend"); if ( ! origq ) origq = ""; sb->safePrintf("<div id=sbox style=\"float:left;" "display:%s;" "opacity:0.83;" //"background-color:gray;" //"padding:5px;" "\">" // the box that holds the query "<input type=text id=qbox name=qbox " "size=%" PRId32" " //name=prepend " "value=\"%s\" " "style=\"z-index:10;" "font-weight:bold;" "font-size:18px;" "border:4px solid black;" "margin:3px;" "\">" , displayStr , widgetwidth / 23 , origq ); sb->safePrintf("</div>" "</form>\n" ); // . BEGIN SERP DIV // . div to hold the search results // . this will have the scrollbar to just scroll the serps // and not the magnifying glass sb->safePrintf("</div>" "<div id=widget123_scrolldiv " "onscroll=widget123_append(); " "style=\"position:absolute;" "top:0px;" "overflow-y:auto;" "overflow-x:hidden;" "width:%" PRId32"px;" "height:%" PRId32"px;\">" , widgetwidth , widgetHeight); } // xml if ( si->m_format == FORMAT_XML ) sb->safePrintf("<?xml version=\"1.0\" " "encoding=\"UTF-8\" ?>\n" "<response>\n" ); int64_t nowMS = gettimeofdayInMilliseconds(); // show current time if ( si->m_format == FORMAT_XML ) { sb->safePrintf("\t<currentTimeUTC>%" PRIu32"</currentTimeUTC>\n", (uint32_t)(nowMS/1000)); } else if ( st->m_header && si->m_format == FORMAT_JSON ) { sb->safePrintf("\"currentTimeUTC\":%" PRIu32",\n", (uint32_t)(nowMS/1000)); } // show result validity time if ( si->m_format == FORMAT_XML ) { int32_t expireTimeUTC = nowMS/1000 + g_conf.m_defaultQueryResultsValidityTime; sb->safePrintf("\t<expireTimeUTC>%" PRIu32"</expireTimeUTC>\n", expireTimeUTC); } else if ( st->m_header && si->m_format == FORMAT_JSON ) { int32_t expireTimeUTC = nowMS/1000 + g_conf.m_defaultQueryResultsValidityTime; sb->safePrintf("\"expireTimeUTC\":%" PRIu32",\n", expireTimeUTC); } // show response time if not doing Quality Assurance if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t<responseTimeMS>%" PRId64"</responseTimeMS>\n", st->m_took); else if ( st->m_header && si->m_format == FORMAT_JSON ) sb->safePrintf("\"responseTimeMS\":%" PRId64",\n", st->m_took); // out of memory allocating msg20s? if ( st->m_errno ) { log("query: Query failed. Had error processing query: %s", mstrerror(st->m_errno)); g_errno = st->m_errno; return false; } if ( si->m_format == FORMAT_XML ) { sb->safePrintf("\t<numResultsOmitted>%" PRId32 "</numResultsOmitted>\n", msg40->m_omitCount); sb->safePrintf("\t<numShardsSkipped>%" PRId32"</numShardsSkipped>\n", msg40->m_msg3a.m_skippedShards); sb->safePrintf("\t<totalShards>%" PRId32"</totalShards>\n", g_hostdb.m_numShards ); sb->safePrintf("\t<pctSearched>%f</pctSearched>\n", msg40->m_msg3a.m_pctSearched); } if ( st->m_header && si->m_format == FORMAT_JSON ) { sb->safePrintf("\"numResultsOmitted\":%" PRId32",\n", msg40->m_omitCount); sb->safePrintf("\"numShardsSkipped\":%" PRId32",\n", msg40->m_msg3a.m_skippedShards); sb->safePrintf("\"totalShards\":%" PRId32",\n", g_hostdb.m_numShards ); sb->safePrintf("\"pctSearched\":%f,\n", msg40->m_msg3a.m_pctSearched); } // save how many docs are in this collection int64_t docsInColl = -1; RdbBase *base = getRdbBase ( RDB_CLUSTERDB , st->m_collnum ); // estimate it if ( base ) { docsInColl = base->estimateNumGlobalRecs(); } // include number of docs in the collection corpus if ( docsInColl >= 0LL ) { if ( si->m_format == FORMAT_XML) { sb->safePrintf ( "\t<docsInCollection>%" PRId64"</docsInCollection>\n", docsInColl ); } else if ( st->m_header && si->m_format == FORMAT_JSON) { sb->safePrintf("\"docsInCollection\":%" PRId64",\n", docsInColl); } } int32_t numResults = msg40->getNumResults(); bool moreFollow = msg40->moreResultsFollow(); // an estimate of the # of total hits int64_t totalHits = msg40->getNumTotalHits(); // only adjust upwards for first page now so it doesn't keep chaning if ( totalHits < numResults ) totalHits = numResults; if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t<hits>%" PRId64"</hits>\n",(int64_t)totalHits); else if ( st->m_header && si->m_format == FORMAT_JSON ) sb->safePrintf("\"hits\":%" PRId64",\n", (int64_t)totalHits); if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t<moreResultsFollow>%" PRId32 "</moreResultsFollow>\n" ,(int32_t)moreFollow); else if ( st->m_header && si->m_format == FORMAT_JSON ) sb->safePrintf("\"moreResultsFollow\":%" PRId32",\n", (int32_t)moreFollow); // print individual query term info if ( si->m_format == FORMAT_XML ) { const Query *q = &si->m_q; sb->safePrintf("\t<queryInfo>\n"); sb->safePrintf("\t\t<fullQuery><![CDATA["); cdataEncode(sb, q->originalQuery()); sb->safePrintf("]]></fullQuery>\n"); sb->safePrintf("\t\t<queryLanguageAbbr>" "<![CDATA[%s]]>" "</queryLanguageAbbr>\n" , getLanguageAbbr(si->m_queryLangId) ); sb->safePrintf("\t\t<queryLanguage>" "<![CDATA[%s]]>" "</queryLanguage>\n" , getLanguageString(si->m_queryLangId) ); // print query words we ignored, like stop words printIgnoredWords ( sb , si ); sb->safePrintf("\t\t<queryNumTermsTotal>" "%" PRId32 "</queryNumTermsTotal>\n" , q->m_numTermsUntruncated ); sb->safePrintf("\t\t<queryNumTermsUsed>" "%" PRId32 "</queryNumTermsUsed>\n" , q->m_numTerms ); int32_t tval = 0; if ( q->m_numTerms < q->m_numTermsUntruncated ) tval = 1; sb->safePrintf("\t\t<queryWasTruncated>" "%" PRId32 "</queryWasTruncated>\n" , tval ); for ( int i = 0 ; i < q->m_numTerms ; i++ ) { sb->safePrintf("\t\t<term>\n"); const QueryTerm *qt = &q->m_qterms[i]; sb->safePrintf("\t\t\t<termNum>%i</termNum>\n",i); const char *term = qt->m_term; sb->safePrintf("\t\t\t<termStr><![CDATA["); const char *printTerm = qt->m_term; if ( is_wspace_a(term[0])) printTerm++; cdataEncode(sb, printTerm,qt->m_termLen); sb->safePrintf("]]>" "</termStr>\n"); // syn? const QueryTerm *sq = qt->m_synonymOf; // what language did synonym come from? if ( sq ) { // language map from wiktionary sb->safePrintf("\t\t\t<termLang>" "<![CDATA["); bool first = true; for ( int i = 0 ; i < langLast ; i++ ) { uint64_t bit = (uint64_t)1 << i; if ( ! (qt->m_langIdBits&bit))continue; const char *str = getLanguageAbbr(i); if ( ! first ) sb->pushChar(','); first = false; sb->safeStrcpy ( str ); } sb->safePrintf("]]></termLang>\n"); } if ( sq ) { const char *term = sq->m_term; const char *printTerm = term; if ( is_wspace_a(term[0])) printTerm++; sb->safePrintf("\t\t\t<synonymOf>" "<![CDATA[%*.*s]]>" "</synonymOf>\n" ,(int)sq->m_termLen,(int)sq->m_termLen,printTerm); } sb->safePrintf("\t\t\t<termFreq>%" PRId64"</termFreq>\n" ,qt->m_termFreq); sb->safePrintf("\t\t\t<termFreqWeight>%.2f</termFreqWeight>\n" ,qt->m_termFreqWeight); sb->safePrintf("\t\t\t<termHash48>%" PRId64"</termHash48>\n" ,qt->m_termId); sb->safePrintf("\t\t\t<termHash64>%" PRIu64"</termHash64>\n" ,qt->m_rawTermId); const QueryWord *qw = qt->m_qword; sb->safePrintf("\t\t\t<prefixHash64>%" PRIu64"</prefixHash64>\n" ,qw->m_prefixHash); sb->safePrintf("\t\t</term>\n"); } sb->safePrintf("\t</queryInfo>\n"); } // print individual query term info if ( si->m_format == FORMAT_JSON && st->m_header ) { const Query *q = &si->m_q; sb->safePrintf("\"queryInfo\":{\n"); sb->safePrintf("\t\"fullQuery\":\""); sb->jsonEncode(q->originalQuery()); sb->safePrintf("\",\n"); sb->safePrintf("\t\"queryLanguageAbbr\":\""); sb->jsonEncode ( getLanguageAbbr(si->m_queryLangId) ); sb->safePrintf("\",\n"); sb->safePrintf("\t\"queryLanguage\":\""); sb->jsonEncode ( getLanguageString(si->m_queryLangId) ); sb->safePrintf("\",\n"); //If the language weights are dissimilar then print the top 5 //make a temporari sorted-by-weight list of languages struct langweight_t { lang_t lang; double weight; } langweights[MAX_LANGUAGES]; for(int i=0; i<MAX_LANGUAGES; i++) { langweights[i].lang = (lang_t)i; langweights[i].weight = si->m_baseScoringParameters.m_languageWeights[i]; } std::sort(langweights, langweights+MAX_LANGUAGES, [](const langweight_t& a, const langweight_t& b) { return a.weight>b.weight; }); //if the weigths are not all identical then print the top 5 if(langweights[0].weight!=langweights[MAX_LANGUAGES-1].weight) { sb->safePrintf("\t\"languageWeights\": [ "); for(int i=0; i<7; i++) { const char *l = getLanguageAbbr(langweights[i].lang); sb->safePrintf("{\"lang\":\"%s\", \"weight\":%.3f}", l, langweights[i].weight); if(i+1<7) sb->safePrintf(", "); } sb->safePrintf(" ],\n"); } // print query words we ignored, like stop words printIgnoredWords ( sb , si ); sb->safePrintf("\t\"queryNumTermsTotal\":" "%" PRId32",\n" , q->m_numTermsUntruncated ); sb->safePrintf("\t\"queryNumTermsUsed\":" "%" PRId32",\n" , q->m_numTerms ); int32_t tval = 0; if ( q->m_numTerms < q->m_numTermsUntruncated ) tval = 1; sb->safePrintf("\t\"queryWasTruncated\":" "%" PRId32",\n" , tval ); sb->safePrintf("\t\"terms\":[\n"); for ( int i = 0 ; i < q->m_numTerms ; i++ ) { const QueryTerm &qt = q->m_qterms[i]; sb->safePrintf("\t\t{\n"); sb->safePrintf("\t\t\t\"termNum\":%i,\n",i); sb->safePrintf("\t\t\t\"termStr\":\""); sb->jsonEncode(qt.m_term, qt.m_termLen); sb->safePrintf("\",\n"); sb->safePrintf("\t\t\t\"isPhrase\":%s,\n", qt.m_isPhrase?"true":"false"); sb->safePrintf("\t\t\t\"termHash48\":%" PRId64",\n", qt.m_termId); sb->safePrintf("\t\t\t\"termHash64\":%" PRIu64",\n", qt.m_rawTermId); //m_termSign? sb->safePrintf("\t\t\t\"termFreq\":%" PRId64",\n", qt.m_termFreq); sb->safePrintf("\t\t\t\"termFreqWeight\":%.2f,\n", qt.m_termFreqWeight); sb->safePrintf("\t\t\t\"queryStopWord\":%s,\n", qt.m_isQueryStopWord?"true":"false"); sb->safePrintf("\t\t\t\"termWeight\":%.2f,\n", qt.m_termWeight); sb->safePrintf("\t\t\t\"userWeight\":%.2f,\n", qt.m_userWeight); sb->safePrintf("\t\t\t\"userNotRequired\":%s,\n", qt.m_userNotRequired?"true":"false"); sb->safePrintf("\t\t\t\"ignored\":%s,\n", qt.m_ignored?"true":"false"); const QueryTerm *synqt = qt.m_synonymOf; if(synqt) { if(qt.m_langIdBitsValid) { // what language did synonym come from? // language map from wiktionary sb->safePrintf("\t\t\t\"termLang\":\""); bool first = true; for(int i = 0; i < langLast; i++) { uint64_t bit = (uint64_t)1 << i; if(!(qt.m_langIdBits&bit)) continue; const char *str = getLanguageAbbr(i); if(!first) sb->pushChar(','); first = false; sb->jsonEncode(str); } sb->safePrintf("\",\n"); } sb->safePrintf("\t\t\t\"synonymOf\":\""); sb->jsonEncode(synqt->m_term, synqt->m_termLen); sb->safePrintf("\",\n"); } const char *fieldCodeName = getFieldCodeName(qt.m_fieldCode); if(fieldCodeName) sb->safePrintf("\t\t\t\"fieldCodeName\":\"%s\",\n", fieldCodeName); sb->safePrintf("\t\t\t\"required\":%s,\n", qt.m_isRequired?"true":"false"); sb->safePrintf("\t\t\t\"prefixHash64\":%" PRIu64"\n", qt.m_qword->m_prefixHash); sb->safePrintf("\t\t}"); // don't end last query term attr on a omma if( i + 1 < q->m_numTerms) sb->pushChar(','); sb->pushChar('\n'); } sb->safePrintf("\t]\n"); // end "terms":[] sb->safePrintf("},\n"); //end "queryInfo":{} } if ( si->m_format == FORMAT_JSON && st->m_header ) { sb->safePrintf("\"results\":[\n"); return true; } // debug if ( si->m_debug ) { logf(LOG_DEBUG,"query: Displaying up to %" PRId32" results.", numResults); } // get some result info from msg40 int32_t firstNum = msg40->getFirstResultNum() ; // numResults may be more than we requested now! int32_t n = msg40->getDocsWanted(); if ( n > numResults ) n = numResults; // . make the query class here for highlighting // . keepAllSingles means to convert all individual words into // QueryTerms even if they're in quotes or in a connection (cd-rom). // we use this for highlighting purposes Query qq; qq.set(si->m_displayQuery, langUnknown, 1.0, 1.0, &si->m_word_variations_config, false, si->m_allowHighFrequencyTermCache, ABS_MAX_QUERY_TERMS); //syn-todo: in the call above si->m_queryExpansion was used for both 'queryExpansion' and 'useQueryStopWords'. Why? if ( g_errno ) return false; DocIdScore *dpx = NULL; if ( numResults > 0 ) dpx = msg40->getScoreInfo(0); if ( si->m_format == FORMAT_XML && dpx ) { float max = 0.0; // max pairwise float lw = getHashGroupWeight(HASHGROUP_INLINKTEXT); // square that location weight lw *= lw; // assume its an inlinker's text, who has rank 15!!! lw *= getLinkerWeight(MAXSITERANK); // single weights float maxtfw1 = 0.0; int32_t maxi1; // now we can have multiple SingleScores for the same term! // because we take the top MAX_TOP now and add them to // get the term's final score. for ( int32_t i = 0 ; i< dpx->m_numSingles ; i++ ) { SingleScore *ssi = &dpx->m_singleScores[i]; float tfwi = ssi->m_tfWeight; if ( tfwi <= maxtfw1 ) continue; maxtfw1 = tfwi; maxi1 = i; } float maxtfw2 = 0.0; for ( int32_t i = 0 ; i< dpx->m_numSingles ; i++ ) { if ( i == maxi1 ) continue; SingleScore *ssi = &dpx->m_singleScores[i]; float tfwi = ssi->m_tfWeight; if ( tfwi <= maxtfw2 ) continue; maxtfw2 = tfwi; } // only 1 term? if ( almostEqualFloat(maxtfw2, 0.0) ) maxtfw2 = maxtfw1; // best term freqs max *= maxtfw1 * maxtfw2; // site rank effect max *= MAXSITERANK*si->m_baseScoringParameters.m_siteRankMultiplier + 1; sb->safePrintf ("\t\t<theoreticalMaxFinalScore>%f" "</theoreticalMaxFinalScore>\n", max ); } // debug msg log ( LOG_TIMING , "query: Got %" PRId32" search results in %" PRId64" ms for q=%s", numResults,gettimeofdayInMilliseconds()-st->m_startTime, qq.getQuery()); st->m_qesb.nullTerm(); // encode query buf const char *dq = si->m_displayQuery; if ( dq ) { urlEncode(&st->m_qesb,dq); } // print it with commas into "thbuf" and null terminate it char thbuf[64]; ulltoa ( thbuf , totalHits ); char inbuf[128]; ulltoa ( inbuf , docsInColl ); // otherwise, we had no error if ( numResults == 0 && si->m_format == FORMAT_HTML ) { sb->safePrintf ( "No results found in <b>%s</b> collection.", cr->m_coll); } // the token is currently in the collection name so do not show that else if ( numResults == 0 && ( si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_AJAX ) ) { sb->safePrintf ( "No results found. Wait for spider to " "kick in."); } else if ( moreFollow && si->m_format == FORMAT_HTML ) { sb->safePrintf ("Results <b>%" PRId32"</b> to <b>%" PRId32"</b> of " "exactly <b>%s</b> from an index " "of about %s pages" , firstNum + 1 , firstNum + n , thbuf , inbuf ); } // otherwise, we didn't get enough results to show this page else if ( si->m_format == FORMAT_HTML ) { sb->safePrintf ("Results <b>%" PRId32"</b> to <b>%" PRId32"</b> of " "exactly <b>%s</b> from an index " "of about %s pages" , firstNum + 1 , firstNum + n , thbuf , inbuf ); } if ( si->m_format == FORMAT_HTML ) sb->safePrintf(" in %.02f seconds",((float)st->m_took)/1000.0); // // if query was a url print add url msg // const char *url = NULL; if ( !strncmp(q,"url:" ,4) && qlen > 4 ) url = q+4; if ( !strncmp(q,"http://" ,7) && qlen > 7 ) url = q; if ( !strncmp(q,"https://",8) && qlen > 8 ) url = q; if ( !strncmp(q,"www." ,4) && qlen > 4 ) url = q; // find end of url const char *ue = url; for ( ; ue && *ue && ! is_wspace_a(*ue) ; ue++ ) ; if ( numResults == 0 && si->m_format == FORMAT_HTML && url ) { sb->safePrintf("<br><br>" "Could not find that url in the " "index. Try <a href=\"/addurl?u="); urlEncode(sb,url,ue-url,false,false); sb->safePrintf("\">Adding it.</a>"); } // sometimes ppl search for "www.whatever.com" so ask them if they // want to search for url:www.whatever.com if ( numResults > 0 && si->m_format == FORMAT_HTML && url && url == q){ sb->safePrintf("<br><br>" "Did you mean to " "search for the url " "<a href=\"/search?q=url%%3A"); urlEncode(sb,url,ue-url,false,false); sb->safePrintf("\">"); sb->safeMemcpy(url,ue-url); sb->safePrintf("</a> itself?"); } // is it the main collection? bool isMain = false; if ( collLen == 4 && strncmp ( coll, "main", 4) == 0 ) isMain = true; // print "in collection ***" if we had a collection if (collLen>0 && numResults>0 && !isMain && si->m_format==FORMAT_HTML ) sb->safePrintf (" in collection <b>%s</b>",coll); printIgnoredWords ( sb , si ); if ( si->m_format == FORMAT_HTML ) { sb->safePrintf("<br><br>"); sb->safePrintf("<table cellpadding=0 cellspacing=0>" "<tr><td valign=top>"); } // debug if ( si->m_debug ) logf(LOG_DEBUG,"query: Printing up to %" PRId32" results. " "bufStart=%p", numResults, sb->getBufPtr()); return true; } bool printSearchResultsTail(State0 *st) { SafeBuf *sb = &st->m_sb; SearchInput *si = &st->m_si; Msg40 *msg40 = &(st->m_msg40); CollectionRec *cr = si->m_cr; const char *coll = cr->m_coll; if ( si->m_format == FORMAT_JSON ) { // remove last },\n if there and replace with just \n const char *e = sb->getBufPtr() - 2; if ( sb->length()>=2 && e[0]==',' && e[1]=='\n') { sb->m_length -= 2; sb->safePrintf("\n"); } // print ending ] for json search results sb->safePrintf("]\n"); if ( st->m_header ) sb->safePrintf("}\n"); ////////////////////// // for some reason if we take too long to write out this // tail we get a SIGPIPE on a firefox browser. ////////////////////// // all done for json return true; } // grab the query const char *q = msg40->getQuery(); int32_t qlen = msg40->getQueryLen(); HttpRequest *hr = &st->m_hr; // get some result info from msg40 int32_t firstNum = msg40->getFirstResultNum() ; // end the two-pane table if ( si->m_format == FORMAT_HTML) sb->safePrintf("</td></tr></table>"); // // PRINT PREV 10 NEXT 10 links! // // center everything below here if ( si->m_format == FORMAT_HTML ) sb->safePrintf ( "<br><center>" ); int32_t remember = sb->length(); // now print "Prev X Results" if we need to if ( firstNum < 0 ) firstNum = 0; StackBuf<300> args; // show banned? if ( si->m_showBanned && ! si->m_isMasterAdmin ) args.safePrintf("&sb=1"); if ( ! si->m_showBanned && si->m_isMasterAdmin ) args.safePrintf("&sb=0"); // collection args.safePrintf("&c=%s",coll); // formatting info if ( si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_AJAX ) { args.safePrintf("&format=widget"); int32_t widgetwidth = hr->getLong("widgetwidth",250); args.safePrintf("&widgetwidth=%" PRId32,widgetwidth); } // carry over the sites we are restricting the search results to if ( si->m_sites ) //whiteListBuf.getBufStart()); args.safePrintf("&sites=%s",si->m_sites); if ( si->m_format == FORMAT_HTML && msg40->m_omitCount ) { // && firstNum == 0 ) { // . add our cgi to the original url // . so if it has &qlang=de and they select &qlang=en // we have to replace it... etc. StackBuf<> newUrl; // show banned results replaceParm2 ("sb=1", &newUrl, hr->getOrigUrlRequest(), hr->getOrigUrlRequestLen() ); // no deduping by summary or content hash etc. StackBuf<> newUrl2; replaceParm2("dr=0",&newUrl2,newUrl.getBufStart(), newUrl.length()); // and no site clustering StackBuf<> newUrl3; replaceParm2 ( "sc=0", &newUrl3 , newUrl2.getBufStart(), newUrl2.length()); // start at results #0 again StackBuf<> newUrl4; replaceParm2 ( "s=0", &newUrl4 , newUrl3.getBufStart(), newUrl3.length()); // show errors StackBuf<> newUrl5; replaceParm2 ( "showerrors=1", &newUrl5 , newUrl4.getBufStart(), newUrl4.length()); sb->safePrintf("<center>" "<i>" "%" PRId32" results were omitted because they " "were considered duplicates, banned, errors " "<br>" "or " "from the same site as other results. " "<a href=%s>Click here to show all results</a>." "</i>" "</center>" "<br><br>" , msg40->m_omitCount , newUrl5.getBufStart() ); } if ( firstNum > 0 && (si->m_format == FORMAT_HTML || si->m_format == FORMAT_WIDGET_IFRAME //|| //si->m_format == FORMAT_WIDGET_AJAX ) ) { int32_t ss = firstNum - msg40->getDocsWanted(); //sb->safePrintf("<a href=\"/search?s=%" PRId32"&q=",ss); // our current query parameters //sb->safeStrcpy ( st->m_qe ); // print other args if not zero //sb->safeMemcpy ( &args ); // make the cgi parm to add to the original url char nsbuf[128]; sprintf(nsbuf,"s=%" PRId32,ss); // get the original url and add/replace in &s=xxx StackBuf<> newUrl; replaceParm ( nsbuf , &newUrl , hr ); // close it up sb->safePrintf ("<a href=\"%s\"><b>" "<font size=+0>Prev %" PRId32" Results</font>" "</b></a>" , newUrl.getBufStart() , msg40->getDocsWanted() ); } // now print "Next X Results" if ( msg40->moreResultsFollow() && (si->m_format == FORMAT_HTML || si->m_format == FORMAT_WIDGET_IFRAME //si->m_format == FORMAT_WIDGET_AJAX )) { int32_t ss = firstNum + msg40->getDocsWanted(); // print a separator first if we had a prev results before us if ( sb->length() > remember ) sb->safePrintf ( " " ); // add the query //sb->safePrintf ("<a href=\"/search?s=%" PRId32"&q=",ss); // our current query parameters //sb->safeStrcpy ( st->m_qe ); // print other args if not zero //sb->safeMemcpy ( &args ); // make the cgi parm to add to the original url char nsbuf[128]; sprintf(nsbuf,"s=%" PRId32,ss); // get the original url and add/replace in &s=xxx StackBuf<> newUrl; replaceParm ( nsbuf , &newUrl , hr ); // close it up sb->safePrintf("<a href=\"%s\"><b>" "<font size=+0>Next %" PRId32" Results</font>" "</b></a>" , newUrl.getBufStart() , msg40->getDocsWanted() ); } // print try this search on... // an additional <br> if we had a Next or Prev results link if ( sb->length() > remember && si->m_format == FORMAT_HTML ) sb->safeMemcpy ("<br>" , 4 ); // // END PRINT PREV 10 NEXT 10 links! // if ( si->m_format == FORMAT_HTML ) { sb->safePrintf("<input name=c type=hidden value=\"%s\">",coll); } bool isAdmin = (si->m_isMasterAdmin || si->m_isCollAdmin); if ( si->m_format != FORMAT_HTML ) isAdmin = false; // TODO: print cache line in light gray here // TODO: "these results were cached X minutes ago" if ( msg40->getCachedTime() > 0 && si->m_format == FORMAT_HTML ) { sb->safePrintf("<br><br><font size=1 color=707070>" "<b><center>"); sb->safePrintf ( " These results were cached " ); // this cached time is this local cpu's time int32_t diff = getTime() - msg40->getCachedTime(); if ( diff < 60 ) sb->safePrintf ("%" PRId32" seconds", diff ); else if ( diff < 2*60 ) sb->safePrintf ("1 minute"); else sb->safePrintf ("%" PRId32" minutes",diff/60); sb->safePrintf ( " ago. [<a href=\"/pageCache.html\">" "<font color=707070>Info</font></a>]"); sb->safePrintf ( "</center></font>"); } if ( si->m_format == FORMAT_XML ) { sb->safePrintf("</response>\n"); } if ( si->m_format == FORMAT_HTML && cr->m_htmlTail.length() == 0 ) { sb->safePrintf ( "<br>" "<center>" "<font color=gray>" "Copyright © 2014. All Rights " "Reserved.<br/>" "Powered by the <a href=\"http://www." "gigablast.com/\">GigaBlast</a> open source " "search engine." "</font>" "</center>\n" "<br>\n" ); } // if we did not use ajax, print this tail here now if ( si->m_format == FORMAT_HTML ) { sb->safePrintf( "</body>\n" "</html>\n" ); } // ajax widgets will have this outside the downloaded content if ( si->m_format == FORMAT_WIDGET_IFRAME ) { sb->safePrintf ( "<br>" "<center>" "<font color=gray>" // link to edit the list of widget sites // or various other widget content properties // because we can't edit the width/height // of the widget like this. "<a href=\"/widget?inlineedit=1\">edit</a> " "• " //"Copyright © 2014. All Rights " //"Reserved.<br/>" "Powered by <a href=http://www.diffbot.com>" "Diffbot</a>." "</font>" "</center>\n" "</body>\n" "</html>\n" ); } if ( sb->length() == 0 && si->m_format == FORMAT_JSON ) sb->safePrintf("[]\n"); if ( sb->length() == 0 ) { sb->pushChar('\n'); sb->nullTerm(); } if ( si->m_format == FORMAT_HTML && cr->m_htmlTail.length() && ! expandHtml ( *sb , cr->m_htmlTail.getBufStart(), cr->m_htmlTail.length(), q, qlen, hr, si, NULL, // method, cr) ) return false; return true; } static int linkSiteRankCmp (const void *v1, const void *v2) { Inlink *i1 = *(Inlink **)v1; Inlink *i2 = *(Inlink **)v2; return i2->m_siteRank - i1->m_siteRank; } static bool printInlinkText ( SafeBuf *sb , Msg20Reply *mr , SearchInput *si , int32_t *numPrinted ) { *numPrinted = 0; // . show the "LinkInfo" // . Msg20.cpp will have "computed" the LinkInfo if we set // Msg20Request::m_computeLinkInfo to true, but if we set // Msg20Request::m_getLinkInfo to true it will just get it // from the TitleRec, which is much faster but more stale. // . "&inlinks=1" is slow and fresh, "&inlinks=2" is fast // and stale. Both are really only for BuzzLogic. LinkInfo *info = (LinkInfo *)mr->ptr_linkInfo;//inlinks; // Corrupted linkinfo has been seen. In that case just log an error and do not try to print any link texts if(info && mr->size_linkInfo!=info->m_lisize) { log(LOG_ERROR, "results: mr->size_linkInfo(%d) != info->m_lisize (%d)", mr->size_linkInfo, info->m_lisize); return false; } // NULLify if empty if ( mr->size_linkInfo <= 0 ) info = NULL; // do not both if none if ( info && ! info->m_numStoredInlinks ) info = NULL; // bail? if ( ! info ) return true; // now sort them up Inlink *ptrs[3000]; //could use #define from Msg25/linkdb.h but we don't really have to match int32_t numLinks = 0; for(Inlink *k = info->getNextInlink(NULL); k; k = info->getNextInlink(k)) { ptrs[numLinks++] = k; if ( numLinks >= 3000 ) break; } // sort them gbsort ( ptrs , numLinks , sizeof(Inlink *) , linkSiteRankCmp ); // print xml starter if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t\t<inlinks>\n"); // loop through the inlinks bool printedInlinkText = false; bool firstTime = true; int32_t inlinkId = 0; int64_t starttime = gettimeofdayInMilliseconds(); for ( int32_t i = 0 ; i < numLinks ; i++ ) { Inlink *k = ptrs[i]; if ( ! k->getLinkText() ) continue; if ( ! si->m_doQueryHighlighting && si->m_format == FORMAT_HTML ) continue; const char *str = k->getLinkText();//ptr_linkText; int32_t strLen = strnlen(k->getLinkText(),k->size_linkText); const char *frontTag = "<font style=\"color:black;background-color:yellow\">" ; const char *backTag = "</font>"; if ( si->m_format == FORMAT_XML ) { frontTag = "<b>"; backTag = "</b>"; } if ( si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_AJAX ) { frontTag = "<font style=\"background-color:yellow\">" ; } Highlight hi; SafeBuf hb; int32_t hlen = hi.set ( &hb, str, strLen , &si->m_hqq, frontTag, backTag ); if ( hlen <= 0 ) { continue; } // skip it if nothing highlighted if ( hi.getNumMatches() == 0 ) continue; if ( si->m_format == FORMAT_XML ) { sb->safePrintf("\t\t\t<inlink " "docId=\"%" PRId64"\" " "url=\"", k->m_docId ); // encode it for xml sb->htmlEncode ( k->getUrl(),//ptr_urlBuf, k->size_urlBuf - 1 , false ); sb->safePrintf("\" " //"hostId=\"%" PRIu32"\" " "firstindexed=\"%" PRIu32"\" " // not accurate! //"lastspidered=\"%" PRIu32"\" " "wordposstart=\"%" PRId32"\" " "id=\"%" PRId32"\" " "siterank=\"%" PRId32"\" " "text=\"", //hh , //(int32_t)k->m_datedbDate, (uint32_t)k->m_firstIndexedDate, //(uint32_t)k->m_lastSpidered, (int32_t)k->m_wordPosStart, inlinkId, //linkScore); (int32_t)k->m_siteRank ); // HACK!!! k->m_siteHash = inlinkId; // inc it inlinkId++; // encode it for xml if ( !sb->htmlEncode ( hb.getBufStart(), hb.length(), false)) return false; sb->safePrintf("\"/>\n"); continue; } if ( firstTime ) { sb->safePrintf("<font size=-1>"); sb->safePrintf("<table border=1>" "<tr><td colspan=10>" "<center>" "<b>Inlinks with Query Terms</b>" "</center>" "</td></tr>" "<tr>" "<td>Inlink Text</td>" "<td>From Site</td>" "<td>Site IP</td>" "<td>Site Rank</td>" "</tr>" ); } firstTime = false; sb->safePrintf("<tr><td>" "<a href=\"/get?c=%s&d=%" PRId64"&cnsp=0\">" //"<a href=\"/print?" //"page=7&" //"c=%s&" //"d=%" PRId64"\">" //k->getUrl()); ,si->m_cr->m_coll ,k->m_docId); if ( ! sb->safeMemcpy(&hb) ) return false; int32_t hostLen = 0; const char *host = getHostFast(k->getUrl(),&hostLen,NULL); sb->safePrintf("</td><td>"); if ( host ) sb->safeMemcpy(host,hostLen); sb->safePrintf("</td><td>"); char ipbuf[16]; sb->safePrintf("<a href=\"/search?c=%s&q=ip%%3A%s" "&n=100\">" ,si->m_cr->m_coll,iptoa(k->m_ip,ipbuf)); sb->safePrintf("%s</a>",iptoa(k->m_ip,ipbuf)); sb->safePrintf("</td><td>%" PRId32"</td></tr>" ,(int32_t)k->m_siteRank); //sb->safePrintf("<br>"); printedInlinkText = true; *numPrinted = *numPrinted + 1; } int64_t took = gettimeofdayInMilliseconds() - starttime; if ( took > 2 ) log("timing: took %" PRId64" ms to highlight %" PRId32" links." ,took,numLinks); // closer for xml if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t\t</inlinks>\n"); //if ( printedInlinkText ) sb->safePrintf("<br>\n"); if ( printedInlinkText ) sb->safePrintf("</font>" "</table>" "<br>"); return true; } // use this for xml as well as html static bool printResult(State0 *st, int32_t ix, int32_t *numPrintedSoFar) { SafeBuf *sb = &st->m_sb; HttpRequest *hr = &st->m_hr; CollectionRec *cr = g_collectiondb.getRec ( st->m_collnum ); if ( ! cr ) { log("query: printResult: collnum %" PRId32" gone", (int32_t)st->m_collnum); return true; } // shortcuts SearchInput *si = &st->m_si; Msg40 *msg40 = &st->m_msg40; // ensure not all cluster levels are invisible if ( si->m_debug ) logf(LOG_DEBUG,"query: result #%" PRId32" clusterlevel=%" PRId32, ix, (int32_t)msg40->getClusterLevel(ix)); int64_t d = msg40->getDocId(ix); // this is normally a double, but cast to float float docScore = (float)msg40->getScore(ix); unsigned docFlags = msg40->getFlags(ix); if ( si->m_docIdsOnly ) { if ( si->m_format == FORMAT_XML ) { sb->safePrintf("\t<result>\n" "\t\t<docId>%" PRId64"</docId>\n" "\t</result>\n", d ); } else if ( si->m_format == FORMAT_JSON ) { sb->safePrintf("\t{\n" "\t\t\"docId\":%" PRId64"\n" "\t},\n", d ); } else { sb->safePrintf("%" PRId64"<br/>\n", d ); } // inc it *numPrintedSoFar = *numPrintedSoFar + 1; return true; } Msg20 *m20 = msg40->m_msg20[ix]; // get the reply Msg20Reply *mr = m20->m_r; // . sometimes the msg20reply is NULL so prevent it coring // . i think this happens if all hosts in a shard are down or timeout // or something if ( ! mr ) { if(si->m_format == FORMAT_HTML) sb->safePrintf("<i>getting summary for docid %" PRId64" had error: %s</i><br>\n", d, mstrerror(m20->m_errno)); return true; } int32_t cursor = -1; if ( si->m_format == FORMAT_XML ) cursor = sb->length(); if ( si->m_format == FORMAT_JSON ) cursor = sb->length(); if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t<result>\n" ); if ( si->m_format == FORMAT_JSON ) { if ( *numPrintedSoFar != 0 ) { sb->safePrintf(",\n"); } sb->safePrintf("\t{\n" ); } if ( mr->ptr_content && si->m_format == FORMAT_XML ) { sb->safePrintf("\t\t<content><![CDATA["); cdataEncode(sb, mr->ptr_content); sb->safePrintf("]]></content>\n"); } if ( mr->ptr_content && si->m_format == FORMAT_JSON ) { sb->safePrintf("\t\t\"content\":\""); sb->jsonEncode(mr->ptr_content); sb->safePrintf("\",\n"); } Highlight hi; // get the url const char *url = mr->ptr_ubuf; int32_t urlLen = mr->size_ubuf - 1; int32_t err = mr->m_errno; // . remove any session ids from the url // . for speed reasons, only check if its a cgi url Url uu; uu.set( url, urlLen, false, true ); url = uu.getUrl(); urlLen = uu.getUrlLen(); bool isAdmin = (si->m_isMasterAdmin || si->m_isCollAdmin); if ( si->m_format == FORMAT_XML ) isAdmin = false; if ( si->m_format == FORMAT_HTML ) { sb->safePrintf("<table><tr><td>"); } if ( si->m_showBanned ) { if ( err == EDOCBANNED ) err = 0; if ( err == EDOCFILTERED ) err = 0; } // if this msg20 had an error print "had error" if ( err || urlLen <= 0 || ! url ) { // revert back so we do not break the json/xml if ( cursor >= 0 ) sb->m_length = cursor; // it's unprofessional to display this in browser // so just let admin see it if ( isAdmin && si->m_format == FORMAT_HTML ) { sb->safePrintf("<i>docId %" PRId64" had error: %s</i><br><br>", mr->m_docId, mstrerror(err)); } log("query: docId %" PRId64" had error: %s.", mr->m_docId,mstrerror(err)); // wrap it up if clustered // DO NOT inc it otherwise puts a comma in there and // screws up the json //*numPrintedSoFar = *numPrintedSoFar + 1; return true; } // if we have a thumbnail show it next to the search result, // base64 encoded. do NOT do this for the WIDGET, only for search // results in html/xml. if ( (si->m_format == FORMAT_HTML || si->m_format == FORMAT_XML ) && si->m_showImages && mr->ptr_imgData ) { ThumbnailArray *ta = (ThumbnailArray *)mr->ptr_imgData; ThumbnailInfo *ti = ta->getThumbnailInfo(0); if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t\t"); ti->printThumbnailInHtml ( sb , 100 , // max width 100 , // max height true , // add <a href> NULL , " style=\"margin:10px;\" ", si->m_format ); if ( si->m_format == FORMAT_XML ) { sb->safePrintf("\t\t<imageHeight>%" PRId32"</imageHeight>\n", ti->m_dy); sb->safePrintf("\t\t<imageWidth>%" PRId32"</imageWidth>\n", ti->m_dx); sb->safePrintf("\t\t<origImageHeight>%" PRId32 "</origImageHeight>\n", ti->m_origDY); sb->safePrintf("\t\t<origImageWidth>%" PRId32 "</origImageWidth>\n", ti->m_origDX); sb->safePrintf("\t\t<imageUrl><![CDATA["); cdataEncode(sb, ti->getUrl()); sb->safePrintf("]]></imageUrl>\n"); } if ( si->m_format == FORMAT_JSON ) { sb->safePrintf("\t\t\"imageHeight\":%" PRId32",\n", ti->m_dy); sb->safePrintf("\t\t\"imageWidth\":%" PRId32",\n", ti->m_dx); sb->safePrintf("\t\t\"origImageHeight\":%" PRId32",\n", ti->m_origDY); sb->safePrintf("\t\t\"origImageWidth\":%" PRId32",\n", ti->m_origDX); sb->safePrintf("\t\t\"imageUrl\":\""); sb->jsonEncode(ti->getUrl()); sb->safePrintf("\",\n"); } } bool isWide = false; int32_t newdx = 0; // print image for widget if ( ( si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_AJAX || si->m_format == FORMAT_WIDGET_APPEND ) ) { int32_t widgetWidth = hr->getLong("widgetwidth",200); // prevent coring if ( widgetWidth < 1 ) widgetWidth = 1; // each search result in widget has a div around it sb->safePrintf("<div " "class=result " // we need the docid and score of last result // when we append new results to the end // of the widget for infinite scrolling // using the scripts in PageBasic.cpp "docid=%" PRId64" " "score=%f " // double "style=\"" "width:%" PRId32"px;" "min-height:%" PRId32"px;" "height:%" PRId32"px;" "padding:%" PRId32"px;" "position:relative;" // summary overflows w/o this! "overflow-y:hidden;" "overflow-x:hidden;" "\"" ">" , mr->m_docId // this is a double now. this won't work // for streaming... , msg40->m_msg3a.getScores()[ix] // subtract 8 for scrollbar on right , widgetWidth - 2*8 - 8 // padding is 8px , (int32_t)RESULT_HEIGHT , (int32_t)RESULT_HEIGHT , (int32_t)PADDING ); if ( mr->ptr_imgData ) { ThumbnailArray *ta = (ThumbnailArray *)mr->ptr_imgData; ThumbnailInfo *ti = ta->getThumbnailInfo(0); // account for scrollbar on the right int32_t maxWidth = widgetWidth - (int32_t)SCROLLBAR_WIDTH; int32_t maxHeight = (int32_t)RESULT_HEIGHT; // false = do not print <a href> link on image ti->printThumbnailInHtml ( sb , maxWidth , maxHeight , false , // add <a href> &newdx ); } sb->safePrintf ( "<a " "target=_blank " "style=\"text-decoration:none;" // don't let scroll bar obscure text "margin-right:%" PRId32"px;" ,(int32_t)SCROLLBAR_WIDTH ); // if thumbnail is wide enough put text on top of it, otherwise // image is to the left and text is to the right of image if ( newdx > .5 * widgetWidth ) { isWide = true; sb->safePrintf("position:absolute;" "bottom:%" PRId32";" "left:%" PRId32";" , (int32_t) PADDING , (int32_t) PADDING ); } // to align the text verticall we gotta make a textbox div // otherwise it wraps below image! mdw //else // sb->safePrintf("vertical-align:middle;"); else sb->safePrintf("position:absolute;" "bottom:%" PRId32";" "left:%" PRId32";" , (int32_t) PADDING , (int32_t) PADDING + newdx + 10 ); // close the style and begin the url sb->safePrintf( "\" " "href=\"" ); // truncate off -diffbotxyz%" PRId32" int32_t newLen = urlLen; // print the url in the href tag sb->safeMemcpy ( url , newLen ); // then finish the a href tag and start a bold for title sb->safePrintf ( "\">"); sb->safePrintf("<b style=\"" "text-decoration:none;" "font-size: 15px;" "font-weight:bold;" "background-color:rgba(0,0,0,.5);" "color:white;" "font-family:arial;" "text-shadow: 2px 2px 0 #000 " ",-2px -2px 0 #000 " ",-2px 2px 0 #000 " ", 2px -2px 0 #000 " ", 2px -2px 0 #000 " ", 0px -2px 0 #000 " ", 0px 2px 0 #000 " ", -2px 0px 0 #000 " ", 2px 0px 0 #000 " ";" "\">"); // then title over image } // only do link here if we have no thumbnail so no bg image if ( (si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_APPEND || si->m_format == FORMAT_WIDGET_AJAX ) && ! mr->ptr_imgData ) { sb->safePrintf ( "<a style=text-decoration:none;" "color:white; " "href=" ); // truncate off -diffbotxyz%" PRId32" int32_t newLen = urlLen; // print the url in the href tag sb->safeMemcpy ( url , newLen ); // then finish the a href tag and start a bold for title sb->safePrintf ( ">"); } // the a href tag if ( si->m_format == FORMAT_HTML ) sb->safePrintf ( "\n\n" ); // then if it is banned if ( mr->m_isBanned && si->m_format == FORMAT_HTML ) sb->safePrintf("<font color=red><b>BANNED</b></font> "); /////// // // PRINT THE TITLE // /////// // the a href tag if ( si->m_format == FORMAT_HTML ) { sb->safePrintf ( "<a href=\"" ); // truncate off -diffbotxyz%" PRId32" int32_t newLen = urlLen; // print the url in the href tag sb->safeMemcpy ( url , newLen ); // then finish the a href tag and start a bold for title sb->safePrintf ( "\">"); } const char *str = nullptr; int32_t strLen = 0; // . then the title (should be NULL terminated) // . the title can be NULL // . highlight it first // . the title itself should not have any tags in it! str = mr->ptr_tbuf; strLen = mr->size_tbuf - 1; if (!str || strLen < 0) { strLen = 0; } std::string preferredResultLang = si->getPreferredResultLanguage(); std::string overriddenTitle; // override title if (strLen == 0 && mr->m_indexCode == EDOCDISALLOWEDROOT) { overriddenTitle = g_robotsBlockedResultOverride.getTitle(preferredResultLang, uu); str = overriddenTitle.c_str(); strLen = overriddenTitle.length(); } const char *frontTag = "<font style=\"color:black;background-color:yellow\">" ; const char *backTag = "</font>"; if ( si->m_format == FORMAT_XML ) { frontTag = "<b>"; backTag = "</b>"; } if ( si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_APPEND || si->m_format == FORMAT_WIDGET_AJAX ) { frontTag = "<font style=\"background-color:yellow\">" ; } int32_t cols = si->m_summaryMaxWidth; // url encode title StackBuf<> tmpTitle; if ( str && strLen ) { tmpTitle.htmlEncode(str, strLen, false); } StackBuf<> hb; if ( str && strLen && si->m_doQueryHighlighting ) { hi.set ( &hb, tmpTitle.getBufStart(), tmpTitle.length(), &si->m_hqq, frontTag, backTag); // reassign! str = hb.getBufStart(); strLen = hb.length(); } // . use "UNTITLED" if no title // . msg20 should supply the dmoz title if it can if ( strLen == 0 && si->m_format != FORMAT_XML && si->m_format != FORMAT_JSON ) { str = "<i>UNTITLED</i>"; strLen = strlen(str); } if ( str && strLen && ( si->m_format == FORMAT_HTML || si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_APPEND || si->m_format == FORMAT_WIDGET_AJAX ) ) { // determine if TiTle wraps, if it does add a <br> count for each wrap if ( !sb->brify( str, strLen, cols ) ) { return false; } } // close up the title tag if ( si->m_format == FORMAT_XML ) { sb->safePrintf("\t\t<title><![CDATA["); if ( str ) { cdataEncode(sb, str); } sb->safePrintf("]]></title>\n"); } if ( si->m_format == FORMAT_JSON ) { sb->safePrintf("\t\t\"title\":\""); if ( str ) { sb->jsonEncode(str); } sb->safePrintf("\",\n"); } if ( si->m_format == FORMAT_HTML ) sb->safePrintf ("</a><br>\n" ) ; // close the title tag stuf if ( si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_APPEND || si->m_format == FORMAT_WIDGET_AJAX ) sb->safePrintf("</b></a>\n"); // // print <h1> tag contents. hack for client. // char *hp = mr->ptr_htag; char *hpend = hp + mr->size_htag; for ( ; hp && hp < hpend ; ) { if ( si->m_format == FORMAT_XML ) { sb->safePrintf("\t\t<h1Tag><![CDATA["); cdataEncode(sb, hp); sb->safePrintf("]]></h1Tag>\n"); } if ( si->m_format == FORMAT_JSON ) { sb->safePrintf("\t\t\"h1Tag\":\""); sb->jsonEncode(hp); sb->safePrintf("\",\n"); } // it is a \0 separated list of headers generated from XmlDoc::getHeaderTagBuf() hp += strlen(hp) + 1; } // print the [cached] link? bool printCached; if ( mr->m_contentLen <= 0 ) printCached = false; //nothing to show else if ( isAdmin ) printCached = true; //admin can bypass noarchive tag else if( mr->m_noArchive ) printCached = false; //page doesn't want to be archived. honour that. else printCached = true; ///// // // print content type after title // ///// unsigned char ctype = mr->m_contentType; const char *cs = g_contentTypeStrings[ctype]; if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t\t<contentType><![CDATA[%s]]></contentType>\n", cs); if ( si->m_format == FORMAT_JSON ) sb->safePrintf("\t\t\"contentType\":\"%s\",\n",cs); if ( si->m_format == FORMAT_HTML && ctype != CT_HTML && ctype != CT_UNKNOWN ) { sb->safePrintf(" <b><font style=color:white;background-color:maroon;>"); const char *p = cs; for ( ; *p ; p++ ) { char c = to_upper_a(*p); sb->pushChar(c); } sb->safePrintf("</font></b> "); } //////////// // // print the summary // //////////// // do the normal summary str = mr->ptr_displaySum; // sometimes the summary is longer than requested because for // summary deduping purposes (see "pss" parm in Parms.cpp) we do not // get it as short as requested. so use mr->m_sumPrintSize here // not mr->size_sum strLen = mr->size_displaySum - 1; // this includes the terminating \0 or \0\0 so back up if ( strLen < 0 ) { strLen = 0; } // override summary std::string overriddenSummary; if (strLen == 0 && mr->m_indexCode == EDOCDISALLOWEDROOT) { overriddenSummary = g_robotsBlockedResultOverride.getSummary(preferredResultLang, uu); str = overriddenSummary.c_str(); strLen = overriddenSummary.length(); } bool printSummary = true; // do not print summaries for widgets by default unless overridden with &summary=1 int32_t defSum = 0; // if no image then default the summary to on if ( ! mr->ptr_imgData ) { defSum = 1; } if ( (si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_APPEND || si->m_format == FORMAT_WIDGET_AJAX ) && hr->getLong("summaries",defSum) == 0 ) { printSummary = false; } if ( printSummary && (si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_APPEND || si->m_format == FORMAT_WIDGET_AJAX ) ) { int32_t sumLen = strLen; if ( sumLen > 150 ) sumLen = 150; if ( sumLen ) { sb->safePrintf("<br>"); sb->safeTruncateEllipsis ( str , sumLen ); } } if ( si->m_format == FORMAT_HTML ) { if ( printSummary ) { sb->brify( str, strLen, cols ); } // new line if not xml. even summary is empty we need it too like // when showing xml docs - MDW 9/28/2014 sb->safePrintf( "<br>\n" ); } else if ( si->m_format == FORMAT_XML ) { sb->safePrintf( "\t\t<sum><![CDATA[" ); cdataEncode(sb, str); sb->safePrintf( "]]></sum>\n" ); } else if ( si->m_format == FORMAT_JSON ) { sb->safePrintf( "\t\t\"sum\":\"" ); sb->jsonEncode( str ); sb->safePrintf( "\",\n" ); } ///////// // // meta tag values for &dt=keywords ... // ///////// if ( mr->ptr_dbuf && mr->size_dbuf > 1 ) printMetaContent ( msg40 , ix,st,sb); //////////// // // print the URL // //////////// StackBuf<> tmpBuf; char* displayUrl = Url::getDisplayUrl(url, &tmpBuf); uint32_t displayUrlLen = tmpBuf.length(); // hack off the http:// if any for displaying it on screen if ( displayUrlLen > 8 && strncmp ( displayUrl , "http://" , 7 )==0 ) { displayUrl += 7; displayUrlLen -= 7; } // . remove trailing / // . only remove from root urls in case user cuts and // pastes it for link: search if ( displayUrl [ displayUrlLen - 1 ] == '/' ) { // see if any other slash before us int32_t j; for ( j = displayUrlLen - 2 ; j >= 0 ; j-- ) if ( displayUrl[j] == '/' ) break; // if there wasn't, we must have been a root url // so hack off the last slash if ( j < 0 ) displayUrlLen--; } if ( si->m_format == FORMAT_HTML ) { sb->safePrintf ("<font color=gray>" ); //sb->htmlEncode ( url , strlen(url) , false ); // 20 for the date after it sb->safeTruncateEllipsis ( displayUrl , 50 ); // cols - 30 ); // turn off the color sb->safePrintf ( "</font>\n" ); } // print url for widgets now if ( (si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_APPEND || si->m_format == FORMAT_WIDGET_AJAX ) ) { //sb->safePrintf ("<br><font color=gray size=-1>" ); // print url for widgets in top left if we have a wide image // otherwise it gets truncated below the title for some reason if ( isWide ) sb->safePrintf ("<br><font color=white size=-1 " "style=position:absolute;left:10px;" "top:10px;background-color:black;>" ); else if ( mr->ptr_imgData ) sb->safePrintf ("<br><font color=gray size=-1 " "style=position:absolute;left:%" PRId32"px;" "top:10px;>" , (int32_t) PADDING + newdx + 10 ); else sb->safePrintf ("<br><font color=gray size=-1>"); // print the url now, truncated to 50 chars sb->safeTruncateEllipsis ( url , 50 ); // cols - 30 ); sb->safePrintf ( "</font>\n" ); } if ( si->m_format == FORMAT_XML ) { sb->safePrintf("\t\t<url><![CDATA["); sb->safeMemcpy ( displayUrl , displayUrlLen ); sb->safePrintf("]]></url>\n"); } if ( si->m_format == FORMAT_JSON ) { sb->safePrintf("\t\t\"url\":\""); sb->jsonEncode ( displayUrl , displayUrlLen ); sb->safePrintf("\",\n"); } // now the last spidered date of the document time_t ts = mr->m_lastSpidered; if(si->m_format == FORMAT_HTML && ts>0) { sb->safePrintf(" - indexed: "); time_t now = getTime(); printTimeAgo(sb, now-ts, now, false); } // the date it was last modified ts = mr->m_lastModified; if(si->m_format == FORMAT_HTML && ts>0) { sb->safePrintf(" - modified: "); time_t now = getTime(); printTimeAgo(sb, now-ts, now, false); } // // more xml stuff // if ( si->m_format == FORMAT_XML ) { // doc size in Kilobytes sb->safePrintf ("\t\t<size><![CDATA[%4.0fk]]></size>\n", (float)mr->m_contentLen/1024.0); sb->safePrintf ( "\t\t<sizeInBytes>%" PRId32"</sizeInBytes>\n", mr->m_contentLen); // . docId for possible cached link // . might have merged a bunch together sb->safePrintf("\t\t<docId>%" PRId64"</docId>\n",mr->m_docId ); sb->safePrintf("\t\t<docScore>%f</docScore>\n",docScore); // . show the site root // . for hompages.com/users/fred/mypage.html this will be // homepages.com/users/fred/ // . for www.xyz.edu/~foo/burp/ this will be // www.xyz.edu/~foo/ etc. // seems like this isn't the way to do it, cuz Tagdb.cpp // adds the "site" tag itself and we do not always have it // in the XmlDoc::ptr_tagRec... so do it this way: const char *site = mr->ptr_site; int32_t siteLen = mr->size_site-1; //char *site=uu.getSite( &siteLen , si->m_coll, false, tagRec); sb->safePrintf("\t\t<site><![CDATA["); if ( site && siteLen > 0 ) sb->safeMemcpy ( site , siteLen ); sb->safePrintf("]]></site>\n"); //int32_t sh = hash32 ( site , siteLen ); //sb->safePrintf ("\t\t<siteHash32>%" PRIu32"</siteHash32>\n",sh); //int32_t dh = uu.getDomainHash32 (); //sb->safePrintf ("\t\t<domainHash32>%" PRIu32"</domainHash32>\n",dh); // spider date sb->safePrintf ( "\t\t<spidered>%" PRIu32"</spidered>\n", (uint32_t)mr->m_lastSpidered); // backwards compatibility for buzz sb->safePrintf ( "\t\t<firstIndexedDateUTC>%" PRIu32 "</firstIndexedDateUTC>\n", (uint32_t)mr->m_firstIndexedDate); sb->safePrintf( "\t\t<contentHash32>%" PRIu32 "</contentHash32>\n", (uint32_t)mr->m_contentHash32); // pub date int32_t datedbDate = mr->m_datedbDate; // show the datedb date as "<pubDate>" for now if ( datedbDate != -1 ) sb->safePrintf ( "\t\t<pubdate>%" PRIu32"</pubdate>\n", (uint32_t)datedbDate); } if ( si->m_format == FORMAT_JSON ) { // doc size in Kilobytes sb->safePrintf ( "\t\t\"size\":\"%4.0fk\",\n", (float)mr->m_contentLen/1024.0); sb->safePrintf ( "\t\t\"sizeInBytes\":%" PRId32 ",\n", mr->m_contentLen); // . docId for possible cached link // . might have merged a bunch together sb->safePrintf("\t\t\"docId\":%" PRId64",\n", mr->m_docId ); sb->safePrintf("\t\t\"docScore\":%f,\n", docScore); sb->safePrintf("\t\t\"flags\":%u,\n", docFlags); sb->safePrintf("\t\t\"cacheAvailable\":%s,\n", printCached?"true":"false"); sb->safePrintf("\t\t\"isAdult\":%s,\n", mr->m_isAdult?"true":"false"); // . show the site root // . for hompages.com/users/fred/mypage.html this will be // homepages.com/users/fred/ // . for www.xyz.edu/~foo/burp/ this will be // www.xyz.edu/~foo/ etc. // seems like this isn't the way to do it, cuz Tagdb.cpp // adds the "site" tag itself and we do not always have it // in the XmlDoc::ptr_tagRec... so do it this way: const char *site = mr->ptr_site; int32_t siteLen = mr->size_site-1; //char *site=uu.getSite( &siteLen , si->m_coll, false, tagRec); sb->safePrintf("\t\t\"site\":\""); if ( site && siteLen > 0 ) sb->safeMemcpy ( site , siteLen ); sb->safePrintf("\",\n"); // spider date sb->safePrintf ( "\t\t\"spidered\":%" PRIu32 ",\n", (uint32_t)mr->m_lastSpidered); // backwards compatibility for buzz sb->safePrintf ( "\t\t\"firstIndexedDateUTC\":%" PRIu32 ",\n", (uint32_t) mr->m_firstIndexedDate); sb->safePrintf( "\t\t\"contentHash32\":%" PRIu32 ",\n", (uint32_t)mr->m_contentHash32); // pub date int32_t datedbDate = mr->m_datedbDate; // show the datedb date as "<pubDate>" for now if ( datedbDate != -1 ) sb->safePrintf ( "\t\t\"pubdate\":%" PRIu32 ",\n", (uint32_t)datedbDate); } // . we also store the outlinks in a linkInfo structure // . we can call LinkInfo::set ( Links *outlinks ) to set it // in the msg20 LinkInfo *outlinks = (LinkInfo *)mr->ptr_outlinks; // NULLify if empty if ( mr->size_outlinks <= 0 ) outlinks = NULL; // only for xml for now if ( si->m_format == FORMAT_HTML ) outlinks = NULL; Inlink *k = NULL; // do we need absScore2 for outlinks? while ( outlinks && (k =outlinks->getNextInlink(k))) // print it out sb->safePrintf("\t\t<outlink docId=\"%" PRId64"\" hostId=\"%" PRIu32"\" indexed=\"%" PRId32"\" pubdate=\"%" PRId32"\" ", k->m_docId , (uint32_t)k->m_ip,//hostHash, but use ip for now (int32_t)k->m_firstIndexedDate , (int32_t)k->m_datedbDate ); if ( si->m_format == FORMAT_XML ) { // result sb->safePrintf("\t\t<language><![CDATA[%s]]></language>\n", getLanguageString(mr->m_language)); sb->safePrintf("\t\t<langAbbr>%s</langAbbr>\n", getLanguageAbbr(mr->m_language)); } if ( si->m_format == FORMAT_JSON ) { // result sb->safePrintf("\t\t\"language\":\"%s\",\n", getLanguageString(mr->m_language)); sb->safePrintf("\t\t\"langAbbr\":\"%s\",\n", getLanguageAbbr(mr->m_language)); } // // end more xml stuff // if ( si->m_format == FORMAT_HTML ) { int32_t lang = mr->m_language; if ( lang ) sb->safePrintf(" - %s",getLanguageString(lang)); sb->safePrintf("<br>\n"); } // if searching multiple collections - federated search CollectionRec *scr = g_collectiondb.getRec ( mr->m_collnum ); const char *coll = "UNKNOWN"; if ( scr ) coll = scr->m_coll; if ( si->m_format == FORMAT_HTML && printCached ) { sb->safePrintf ( "<a href=\"/get?q=%s&qlang=%s&c=%s&d=%" PRId64 "&cnsp=0\">cached</a>\n", st->m_qesb.getBufStart() , si->m_defaultSortLang, // "qlang" parm coll , mr->m_docId ); } // unhide the divs on click int32_t placeHolder = -1; int32_t placeHolderLen = 0; if ( si->m_format == FORMAT_HTML && si->m_getDocIdScoringInfo ) { // place holder for backlink table link placeHolder = sb->length(); sb->safePrintf (" - <a onclick=" "\"" "var e = document.getElementById('bl%" PRId32"');" "if ( e.style.display == 'none' ){" "e.style.display = '';" "}" "else {" "e.style.display = 'none';" "}" "\"" " " "style=" "cursor:hand;" "cursor:pointer;" "color:blue;>" "<u>00000 backlinks</u>" "</a>\n" , ix ); placeHolderLen = sb->length() - placeHolder; // unhide the scoring table on click sb->safePrintf (" - <a onclick=" "\"" "var e = document.getElementById('sc%" PRId32"');" "if ( e.style.display == 'none' ){" "e.style.display = '';" "}" "else {" "e.style.display = 'none';" "}" "\"" " " "style=" "cursor:hand;" "cursor:pointer;" "color:blue;>" "scoring" "</a>\n" ,ix ); } if ( si->m_format == FORMAT_HTML ) { // reindex sb->safePrintf(" - <a style=color:blue; href=\"/addurl?urls="); urlEncode(sb, url, strlen(url), false); uint64_t rand64 = gettimeofdayInMilliseconds(); sb->safePrintf("&c=%s&rand64=%" PRIu64"\">respider</a>\n", coll,rand64); sb->safePrintf (" - " "<a style=color:blue; " "href=\"/search?sb=1&c=%s&" "q=gbfieldmatch%%3AgbssUrl%%3A" , coll ); // do not include ending \0 urlEncode(sb, mr->ptr_ubuf, mr->size_ubuf-1, false); sb->safePrintf("\">spider info</a>\n"); // // show rainbow sections link // sb->safePrintf ( " - <a style=color:blue; href=\"" "/get?" "page=4&" // show rainbow sections "q=%s&" "qlang=%s&" "c=%s&" "d=%" PRId64"&" "cnsp=0\">" "sections</a>\n", st->m_qesb.getBufStart() , si->m_defaultSortLang, // "qlang" parm coll , mr->m_docId ); sb->safePrintf ( " - <a style=color:blue; href=\"" "/get?" "page=1&" // show rainbow sections //"q=%s&" //"qlang=%s&" "c=%s&" "d=%" PRId64"&" "cnsp=0\">" "page info</a>\n", //st->m_qe , // "qlang" parm //si->m_defaultSortLang, coll , mr->m_docId ); sb->safePrintf ( " - <a style=color:blue; href=\"" "/get?" "page=5&" // show rainbow sections //"q=%s&" //"qlang=%s&" "c=%s&" "d=%" PRId64"&" "cnsp=0\">" "term info</a>\n", //st->m_qe , // "qlang" parm //si->m_defaultSortLang, coll , mr->m_docId ); char ipbuf1[16]; char ipbuf2[16]; sb->safePrintf(//"<br>" " - <a style=color:blue; href=\"/search?" "c=%s&sc=1&dr=0&q=ip:%s&" "n=100&usecache=0\">%s</a>\n", coll,iptoa(mr->m_ip,ipbuf1), iptoa(mr->m_ip,ipbuf2) ); // ip domain link unsigned char *us = (unsigned char *)&mr->m_ip;//urlip; sb->safePrintf (" - <a style=color:blue; " "href=\"/search?c=%s&sc=1&dr=0&n=100&" "q=ip:%" PRId32".%" PRId32".%" PRId32"&" "usecache=0\">%" PRId32".%" PRId32".%" PRId32"</a>\n", coll, (int32_t)us[0],(int32_t)us[1],(int32_t)us[2], (int32_t)us[0],(int32_t)us[1],(int32_t)us[2]); } char dbuf [ MAX_URL_LEN ]; int32_t dlen = uu.getDomainLen(); if ( si->m_format == FORMAT_HTML ) { memcpy ( dbuf , uu.getDomain() , dlen ); dbuf [ dlen ] = '\0'; // newspaperarchive urls have no domain if ( dlen == 0 ) { dlen = uu.getHostLen(); memcpy ( dbuf , uu.getHost() , dlen ); dbuf [ dlen ] = '\0'; } // admin always gets the site: option so he can ban sb->safePrintf (" - " " <a style=color:blue; href=\"/search?" "q=site%%3A%s&sc=0&c=%s\">" "domain</a>\n" , dbuf , coll ); } if ( si->m_format == FORMAT_HTML && si->m_doSiteClustering ) { char hbuf [ MAX_URL_LEN ]; int32_t hlen = uu.getHostLen(); memcpy ( hbuf , uu.getHost() , hlen ); hbuf [ hlen ] = '\0'; // make the cgi parm to add to the original url StackBuf<512> qq; qq.safePrintf("q="); urlEncode(&qq, "site:"); urlEncode(&qq, hbuf); urlEncode(&qq, " | "); qq.safeStrcpy(st->m_qesb.getBufStart()); qq.nullTerm(); // get the original url and add/replace in query StackBuf<512> newUrl; replaceParm ( qq.getBufStart() , &newUrl , hr ); // put show more results from this site link sb->safePrintf (" - <nobr><a href=\"%s\">" "more from this site</a></nobr>" , newUrl.getBufStart() ); } if (si->m_format == FORMAT_HTML && isAdmin ){ const char *un = ""; int32_t banVal = 1; if ( mr->m_isBanned ) { un = "UN"; banVal = 0; } // don't put on a separate line because then it is too // easy to mis-click on it sb->safePrintf(//"<br>" " - " " <a style=color:green; href=\"/admin/tagdb?" "user=admin&" "tagtype0=manualban&" "tagdata0=%" PRId32"&" "u=%s&c=%s\">" "<nobr>%sBAN %s" "</nobr></a>\n" , banVal , dbuf , coll , un , dbuf ); dlen = uu.getHostLen(); memcpy ( dbuf , uu.getHost() , dlen ); dbuf [ dlen ] = '\0'; sb->safePrintf(" - " " <a style=color:green; href=\"/admin/tagdb?" "user=admin&" "tagtype0=manualban&" "tagdata0=%" PRId32"&" "u=%s&c=%s\">" "<nobr>%sBAN %s</nobr></a>\n" , banVal , dbuf , coll , un , dbuf ); } if ( mr->size_metadataBuf && si->m_format == FORMAT_JSON) { sb->safePrintf("\t\t\"metadata\":["); //sb->safeMemcpy(mr->ptr_metadataBuf, mr->size_metadataBuf); sb->safeStrcpy(mr->ptr_metadataBuf); // without this \n we seem to lose our ] i guess it gets // backed up over sb->safePrintf("],\n"); } if ( mr->size_metadataBuf && si->m_format == FORMAT_HTML) { sb->safePrintf("<br>"); Json md; JsonItem *ji = md.parseJsonStringIntoJsonItems(mr->ptr_metadataBuf); StackBuf<1024> nameBuf; for ( ; ji ; ji = ji->m_next ) { if(ji->isInArray()) continue; if(ji->m_type == JT_ARRAY) continue; ji->getCompoundName ( nameBuf ) ; if(nameBuf.length() == 0) { continue; } //nameBuf.replaceChar('-', '_'); nameBuf.nullTerm(); int32_t valLen; const char* valBuf = ji->getValueAsString(&valLen); StackBuf<1024> queryBuf; // log("compound name is %s %d %d",nameBuf.getBufStart(), // nameBuf.length(), valLen); queryBuf.safePrintf("/search?q=%s:%%22",nameBuf.getBufStart()); urlEncode(&queryBuf, valBuf, valLen); queryBuf.safePrintf("%%22&c=%s",coll); queryBuf.nullTerm(); sb->safePrintf(" - <a href=\"%s\">%s:\"", queryBuf.getBufStart(), nameBuf.getBufStart()); sb->safeMemcpy(valBuf, valLen); sb->safeStrcpy("\"</a>"); } } // end serp div if ( si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_APPEND || si->m_format == FORMAT_WIDGET_AJAX ) sb->safePrintf("</div><hr>"); if ( si->m_format == FORMAT_HTML ) sb->safePrintf ( "<br><br>\n"); // search result spacer if ( si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_APPEND || si->m_format == FORMAT_WIDGET_AJAX ) sb->safePrintf("<div style=line-height:%" PRId32"px;><br></div>", (int32_t)SERP_SPACER); // inc it *numPrintedSoFar = *numPrintedSoFar + 1; // done? DocIdScore *dp = msg40->getScoreInfo(ix); if ( ! dp ) { if ( si->m_format == FORMAT_XML ) sb->safePrintf ("\t</result>\n\n"); if ( si->m_format == FORMAT_JSON ) { // remove last ,\n sb->m_length -= 2; sb->safePrintf ("\n\t}\n"); } // wtf? //g_process.shutdownAbort(true); // at least close up the table if ( si->m_format != FORMAT_HTML ) return true; sb->safePrintf("</table>\n"); //blank out the "xxxxx backlinks" part memset(sb->getBufStart() + placeHolder, ' ', placeHolderLen); return true; } // // scoring info tables // int32_t nr = dp->m_numRequiredTerms; if ( nr == 1 ) nr = 0; // print breakout tables here for distance matrix // final score calc StackBuf<1024> ft; // put in a hidden div so you can unhide it if ( si->m_format == FORMAT_HTML ) sb->safePrintf("<div id=bl%" PRId32" style=display:none;>\n", ix ); // print xml and html inlinks int32_t numInlinks = 0; printInlinkText ( sb , mr , si , &numInlinks ); if ( si->m_format == FORMAT_HTML ) { sb->safePrintf("</div>"); sb->safePrintf("<div id=sc%" PRId32" style=display:none;>\n", ix ); } // if pair changes then display the sum int32_t lastTermNum1 = -1; int32_t lastTermNum2 = -1; float minScore = -1; // display all the PairScores for ( int32_t i = 0 ; i < dp->m_numPairs ; i++ ) { float totalPairScore = 0.0; // print all the top winners for this pair PairScore *fps = &dp->m_pairScores[i]; // if same combo as last time skip if ( fps->m_qtermNum1 == lastTermNum1 && fps->m_qtermNum2 == lastTermNum2 ) continue; lastTermNum1 = fps->m_qtermNum1; lastTermNum2 = fps->m_qtermNum2; bool firstTime = true; // print all pairs for this combo for ( int32_t j = i ; j < dp->m_numPairs ; j++ ) { // get it PairScore *ps = &dp->m_pairScores[j]; // stop if different pair now if ( ps->m_qtermNum1 != fps->m_qtermNum1 ) break; if ( ps->m_qtermNum2 != fps->m_qtermNum2 ) break; // skip if 0. neighborhood terms have weight of 0 now if ( almostEqualFloat(ps->m_finalScore, 0.0) ) continue; // first time? if ( firstTime && si->m_format == FORMAT_HTML ) { const Query *q = &si->m_q; printTermPairs ( sb , q , ps ); printScoresHeader ( sb ); firstTime = false; } // print it printPairScore ( sb , si , ps , mr ); // add it up totalPairScore += ps->m_finalScore; } if ( ft.length() ) ft.safePrintf(" , "); ft.safePrintf("%f",totalPairScore); // min? if ( minScore < 0.0 || totalPairScore < minScore ) minScore = totalPairScore; // we need to set "ft" for xml stuff below if ( si->m_format != FORMAT_HTML ) continue; sb->safePrintf("<tr><td><b>%.04f</b></td><td colspan=20>total of above scores</td></tr>", totalPairScore); // close table from printScoresHeader if ( ! firstTime ) sb->safePrintf("</table><br>"); } int32_t lastTermNum = -1; int32_t numSingles = dp->m_numSingles; // do not print this if we got pairs if ( dp->m_numPairs ) numSingles = 0; for ( int32_t i = 0 ; i < numSingles ; i++ ) { float totalSingleScore = 0.0; // print all the top winners for this single SingleScore *fss = &dp->m_singleScores[i]; // if same combo as last time skip if ( fss->m_qtermNum == lastTermNum ) continue; // do not reprint for this query term num lastTermNum = fss->m_qtermNum; bool firstTime = true; // print all singles for this combo for ( int32_t j = i ; j < dp->m_numSingles ; j++ ) { // get it SingleScore *ss = &dp->m_singleScores[j]; // stop if different single now if ( ss->m_qtermNum != fss->m_qtermNum ) break; // skip if 0. skip neighborhoods i guess if ( almostEqualFloat(ss->m_finalScore, 0.0) ) continue; // first time? if ( firstTime && si->m_format == FORMAT_HTML ) { const Query *q = &si->m_q; printSingleTerm ( sb , q , ss ); printScoresHeader ( sb ); firstTime = false; } // print it printSingleScore ( sb , si , ss , mr ); if(si->m_format == FORMAT_JSON && j+1 < dp->m_numSingles) sb->safePrintf(",\n"); else sb->safePrintf("\n"); // add up totalSingleScore += ss->m_finalScore; } if ( ft.length() ) ft.safePrintf(" , "); ft.safePrintf("%f",totalSingleScore); // min? if ( minScore < 0.0 || totalSingleScore < minScore ) minScore = totalSingleScore; // we need to set "ft" for xml stuff below if ( si->m_format != FORMAT_HTML ) continue; sb->safePrintf("<tr><td><b>%.04f</b></td><td colspan=20>total of above scores</td></tr>", totalSingleScore); // close table from printScoresHeader if ( ! firstTime ) sb->safePrintf("</table><br>"); } if ( si->m_format == FORMAT_HTML ) { sb->safePrintf("<table border=1 cellpadding=3>"); sb->safePrintf("<tr><td colspan=10><b><center>Ranking bits</center></b></td></tr>"); sb->safePrintf("<tr><td>Bit</td><td>Score multiplier</td><td>Rank adjustment</td></tr>"); if(docFlags) { for(int bit=0; bit < 26; bit++) { if( (1<<bit) & docFlags ) { sb->safePrintf("<tr><td>%d</td><td>%f</td><td>%d</td></tr>", bit, si->m_baseScoringParameters.m_flagScoreMultiplier[bit], si->m_baseScoringParameters.m_flagRankAdjustment[bit]); } } } else { sb->safePrintf("<tr><td colspan=10>None set for document</td></tr>"); } sb->safePrintf("</table><br>"); } const char *ff = ""; const char *ff2 = "sum"; // final score!!! if ( si->m_format == FORMAT_XML ) { sb->safePrintf ("\t\t<siteRank>%" PRId32"</siteRank>\n", (int32_t)dp->m_siteRank ); sb->safePrintf ("\t\t<numGoodSiteInlinks>%" PRId32 "</numGoodSiteInlinks>\n", (int32_t)mr->m_siteNumInlinks ); struct tm *timeStruct3; time_t pageInlinksLastUpdated = mr->m_pageInlinksLastUpdated; struct tm tm_buf; timeStruct3 = gmtime_r(&pageInlinksLastUpdated, &tm_buf); char tmp3[64]; strftime(tmp3, 64, "%b-%d-%Y(%H:%M:%S)", timeStruct3); // -1 means unknown if ( mr->m_pageNumInlinks >= 0 ) { // how many inlinks, external and internal, we have // to this page not filtered in any way!!! sb->safePrintf("\t\t<numTotalPageInlinks>%" PRId32 "</numTotalPageInlinks>\n", mr->m_pageNumInlinks); } // how many inlinking ips we got, including our own if // we link to ourself sb->safePrintf("\t\t<numUniqueIpsLinkingToPage>%" PRId32 "</numUniqueIpsLinkingToPage>\n", mr->m_pageNumUniqueIps); // how many inlinking cblocks we got, including our own if // we link to ourself sb->safePrintf("\t\t<numUniqueCBlocksLinkingToPage>%" PRId32 "</numUniqueCBlocksLinkingToPage>\n", mr->m_pageNumUniqueCBlocks); // how many "good" inlinks. i.e. inlinks whose linktext we // count and index. sb->safePrintf("\t\t<numGoodPageInlinks>%" PRId32 "</numGoodPageInlinks>\n", mr->m_pageNumGoodInlinks); sb->safePrintf("\t\t<pageInlinksLastComputedUTC>%" PRIu32 "</pageInlinksLastComputedUTC>\n", (uint32_t)mr->m_pageInlinksLastUpdated); float score = msg40->getScore(ix); sb->safePrintf("\t\t<finalScore>%f</finalScore>\n", score); sb->safePrintf ("\t\t<finalScoreEquationCanonical>" "<![CDATA[" "Final Score = (siteRank*%.01f+1) * " "(%s of above matrix scores)" "]]>" "</finalScoreEquationCanonical>\n" , si->m_baseScoringParameters.m_siteRankMultiplier, ff2); sb->safePrintf ("\t\t<finalScoreEquation>" "<![CDATA[" "<b>%.03f</b> = (%" PRId32"*%.01f+1) " , dp->m_finalScore, (int32_t)dp->m_siteRank, si->m_baseScoringParameters.m_siteRankMultiplier); sb->safePrintf(" * %.01f", si->m_baseScoringParameters.m_languageWeights[mr->m_language]); // the actual min then sb->safePrintf(" * %.03f",minScore); // no longer list all the scores sb->safePrintf("]]></finalScoreEquation>\n"); sb->safePrintf ("\t</result>\n\n"); return true; } if ( si->m_format != FORMAT_HTML ) return true; const char *cc = getCountryCode ( mr->m_country ); if ( mr->m_country == 0 ) cc = "Unknown"; float langBoost = si->m_baseScoringParameters.m_languageWeights[mr->m_language]; sb->safePrintf("<table border=1 cellpadding=3>"); sb->safePrintf("<tr><td colspan=10><b><center>Final score</center></b></td></tr>"); sb->safePrintf("<tr><td>DocId</td><td>%" PRId64"</td></tr>", dp->m_docId); sb->safePrintf("<tr><td>Site</td><td>%s</td></tr>", mr->ptr_site); sb->safePrintf("<tr><td>Language</td><td><font color=green><b>%s</b></font></td></tr>", getLanguageString(mr->m_language)); // use page language sb->safePrintf("<tr><td>Language boost</td><td><font color=green><b>%.01f</b></font></td></tr>", langBoost); sb->safePrintf("<tr><td>Country</td><td>%s</td></tr>", cc); sb->safePrintf("<tr><td>Original SiteRank</td><td><font color=blue>%" PRId32 "</font></td></tr>", (int32_t)dp->m_siteRank); sb->safePrintf("<tr><td>Adjusted SiteRank</td><td><font color=blue>%f</font></td></tr>", dp->m_adjustedSiteRank); if( dp->m_usePageTemperature ) { sb->safePrintf("<tr><td>Page temperature</td><td>%f</td></tr>", dp->m_pageTemperature); } else { sb->safePrintf("<tr><td>Page temperature</td><td>Not used</td></tr>"); } sb->safePrintf("<tr><td colspan=100>"); // list all final scores starting with pairs sb->safePrintf("<b>%f</b> = (<font color=blue>%" PRId32"</font>*%.01f+1)", dp->m_finalScore, (int32_t)dp->m_siteRank, si->m_baseScoringParameters.m_siteRankMultiplier); // Check if user specified a query language if ( si->m_queryLangId != 0 ) { // Query language same as document language? if( si->m_queryLangId == mr->m_language ) { sb->safePrintf(" * <font color=green><b>%.01f</b></font>", langBoost); } else if( mr->m_language == 0 ) { // Document language unknown, use the unknown language weight sb->safePrintf(" * <font color=green><b>%.01f</b></font>", langBoost); } } // list all final scores starting with pairs sb->safePrintf(" * %s(", ff); sb->safeMemcpy ( &ft ); sb->safePrintf(")</td></tr>"); sb->safePrintf("</table><br>"); // put in a hidden div so you can unhide it sb->safePrintf("</div>\n"); // result is in a table so we can put the result # in its own column sb->safePrintf("</td></tr></table>"); // space out 0000 backlinks char *p = sb->getBufStart() + placeHolder; if ( numInlinks == 0 ) memset(p, ' ', placeHolderLen); else if ( numInlinks > 0 && numInlinks < 99999 ) { char *ss = strstr ( p, "00000" ); if ( ss ) { char c = ss[5]; sprintf(ss,"%5" PRId32,numInlinks); ss[5] = c; } // print "1 backlink" not "1 backlinks" if ( numInlinks == 1 ) { char *xx = strstr(p,"backlinks"); if ( xx ) xx[8] = ' '; } } return true; } static bool printPairScore(SafeBuf *sb, const SearchInput *si, const PairScore *ps, Msg20Reply *mr) { // shortcut const Query *q = &si->m_q; int32_t qtn1 = ps->m_qtermNum1; int32_t qtn2 = ps->m_qtermNum2; unsigned char de1 = ps->m_densityRank1; unsigned char de2 = ps->m_densityRank2; float dnw1 = getDensityWeight(de1); float dnw2 = getDensityWeight(de2); int32_t hg1 = ps->m_hashGroup1; int32_t hg2 = ps->m_hashGroup2; float hgw1 = getHashGroupWeight(hg1); float hgw2 = getHashGroupWeight(hg2); int32_t wp1 = ps->m_wordPos1; int32_t wp2 = ps->m_wordPos2; unsigned char wr1 = ps->m_wordSpamRank1; float wsw1 = getWordSpamWeight(wr1); unsigned char wr2 = ps->m_wordSpamRank2; float wsw2 = getWordSpamWeight(wr2); // HACK for inlink text! if ( hg1 == HASHGROUP_INLINKTEXT ) wsw1 = getLinkerWeight(wr1); if ( hg2 == HASHGROUP_INLINKTEXT ) wsw2 = getLinkerWeight(wr2); const char *syn1 = "no"; const char *syn2 = "no"; float sw1 = 1.0; float sw2 = 1.0; if ( ps->m_isSynonym1 ) { syn1 = "yes"; sw1 = g_conf.m_baseScoringParameters.m_synonymWeight; } if ( ps->m_isSynonym2 ) { syn2 = "yes"; sw2 = g_conf.m_baseScoringParameters.m_synonymWeight; } const char *bs1 = "no"; const char *bs2 = "no"; if ( ps->m_isHalfStopWikiBigram1 ) bs1 = "yes"; if ( ps->m_isHalfStopWikiBigram2 ) bs2 = "yes"; float wbw1 = 1.0; float wbw2 = 1.0; if ( ps->m_isHalfStopWikiBigram1 ) wbw1 = WIKI_BIGRAM_WEIGHT; if ( ps->m_isHalfStopWikiBigram2 ) wbw2 = WIKI_BIGRAM_WEIGHT; QueryTerm *qt1 = &q->m_qterms[qtn1]; QueryTerm *qt2 = &q->m_qterms[qtn2]; int64_t tf1 = qt1->m_termFreq; int64_t tf2 = qt2->m_termFreq; float tfw1 = ps->m_tfWeight1; float tfw2 = ps->m_tfWeight2; const char *wp = "no"; float wiw = 1.0; if ( ps->m_inSameWikiPhrase ) { wp = "yes"; wiw = WIKI_WEIGHT; // 0.50; } int32_t a = ps->m_wordPos2; int32_t b = ps->m_wordPos1; const char *bes = ""; if ( a < b ) { a = ps->m_wordPos1; b = ps->m_wordPos2; // out of query order penalty! bes = "+ <b>1.0</b>"; } if ( si->m_format == FORMAT_XML ) { sb->safePrintf("\t\t<pairInfo>\n"); sb->safePrintf("\t\t\t<densityRank1>%" PRId32 "</densityRank1>\n", (int32_t)de1); sb->safePrintf("\t\t\t<densityRank2>%" PRId32 "</densityRank2>\n", (int32_t)de2); sb->safePrintf("\t\t\t<densityWeight1>%f" "</densityWeight1>\n", dnw1); sb->safePrintf("\t\t\t<densityWeight2>%f" "</densityWeight2>\n", dnw2); sb->safePrintf("\t\t\t<term1><![CDATA["); sb->safeMemcpy ( q->m_qterms[qtn1].m_term , q->m_qterms[qtn1].m_termLen ); sb->safePrintf("]]></term1>\n"); sb->safePrintf("\t\t\t<term2><![CDATA["); sb->safeMemcpy ( q->m_qterms[qtn2].m_term , q->m_qterms[qtn2].m_termLen ); sb->safePrintf("]]></term2>\n"); sb->safePrintf("\t\t\t<location1><![CDATA[%s]]>" "</location1>\n", getHashGroupString(hg1)); sb->safePrintf("\t\t\t<location2><![CDATA[%s]]>" "</location2>\n", getHashGroupString(hg2)); sb->safePrintf("\t\t\t<locationWeight1>%.01f" "</locationWeight1>\n", hgw1 ); sb->safePrintf("\t\t\t<locationWeight2>%.01f" "</locationWeight2>\n", hgw2 ); sb->safePrintf("\t\t\t<wordPos1>%" PRId32 "</wordPos1>\n", wp1 ); sb->safePrintf("\t\t\t<wordPos2>%" PRId32 "</wordPos2>\n", wp2 ); sb->safePrintf("\t\t\t<isSynonym1>" "<![CDATA[%s]]>" "</isSynonym1>\n", syn1); sb->safePrintf("\t\t\t<isSynonym2>" "<![CDATA[%s]]>" "</isSynonym2>\n", syn2); sb->safePrintf("\t\t\t<synonymWeight1>%.01f" "</synonymWeight1>\n", sw1); sb->safePrintf("\t\t\t<synonymWeight2>%.01f" "</synonymWeight2>\n", sw2); // word spam / link text weight const char *r1 = "wordSpamRank1"; const char *r2 = "wordSpamRank2"; const char *t1 = "wordSpamWeight1"; const char *t2 = "wordSpamWeight2"; if ( hg1 == HASHGROUP_INLINKTEXT ) { r1 = "inlinkSiteRank1"; t1 = "inlinkTextWeight1"; } if ( hg2 == HASHGROUP_INLINKTEXT ) { r2 = "inlinkSiteRank2"; t2 = "inlinkTextWeight2"; } sb->safePrintf("\t\t\t<%s>%" PRId32"</%s>\n", r1,(int32_t)wr1,r1); sb->safePrintf("\t\t\t<%s>%" PRId32"</%s>\n", r2,(int32_t)wr2,r2); sb->safePrintf("\t\t\t<%s>%.02f</%s>\n", t1,wsw1,t1); sb->safePrintf("\t\t\t<%s>%.02f</%s>\n", t2,wsw2,t2); // if offsite inlink text show the inlinkid for matching // to an <inlink> LinkInfo *info = (LinkInfo *)mr->ptr_linkInfo;//inlinks; Inlink *k = info ? info->getNextInlink(NULL) : NULL; for (;k&&hg1==HASHGROUP_INLINKTEXT ; k=info->getNextInlink(k)){ if ( ! k->getLinkText() ) continue; if ( k->m_wordPosStart > wp1 ) continue; if ( k->m_wordPosStart + 50 < wp1 ) continue; // got it. we HACKED this to put the id // in k->m_siteHash sb->safePrintf("\t\t\t<inlinkId1>%" PRId32 "</inlinkId1>\n", k->m_siteHash); } k = info ? info->getNextInlink(NULL) : NULL; for (;k&&hg2==HASHGROUP_INLINKTEXT ; k=info->getNextInlink(k)){ if ( ! k->getLinkText() ) continue; if ( k->m_wordPosStart > wp2 ) continue; if ( k->m_wordPosStart + 50 < wp2 ) continue; // got it. we HACKED this to put the id // in k->m_siteHash sb->safePrintf("\t\t\t<inlinkId2>%" PRId32 "</inlinkId2>\n", k->m_siteHash); } // term freq sb->safePrintf("\t\t\t<termFreq1>%" PRId64 "</termFreq1>\n",tf1); sb->safePrintf("\t\t\t<termFreq2>%" PRId64 "</termFreq2>\n",tf2); sb->safePrintf("\t\t\t<termFreqWeight1>%f" "</termFreqWeight1>\n",tfw1); sb->safePrintf("\t\t\t<termFreqWeight2>%f" "</termFreqWeight2>\n",tfw2); sb->safePrintf("\t\t\t<isWikiBigram1>" "%" PRId32"</isWikiBigram1>\n", (int32_t)(ps->m_isHalfStopWikiBigram1)); sb->safePrintf("\t\t\t<isWikiBigram2>" "%" PRId32"</isWikiBigram2>\n", (int32_t)(ps->m_isHalfStopWikiBigram2)); sb->safePrintf("\t\t\t<wikiBigramWeight1>%.01f" "</wikiBigramWeight1>\n", wbw1); sb->safePrintf("\t\t\t<wikiBigramWeight2>%.01f" "</wikiBigramWeight2>\n", wbw2); sb->safePrintf("\t\t\t<inSameWikiPhrase>" "<![CDATA[%s]]>" "</inSameWikiPhrase>\n", wp); sb->safePrintf("\t\t\t<queryDist>" "%" PRId32 "</queryDist>\n", ps->m_qdist ); sb->safePrintf("\t\t\t<wikiWeight>" "%.01f" "</wikiWeight>\n", wiw ); sb->safePrintf("\t\t\t<score>%f</score>\n", ps->m_finalScore); sb->safePrintf("\t\t\t<equationCanonical>" "<![CDATA[" "score = " " 100 * " " locationWeight1" // hgw " * " " locationWeight2" // hgw " * " " synonymWeight1" // synweight " * " " synonymWeight2" // synweight " * " " wikiBigramWeight1" " * " " wikiBigramWeight2" " * " //"diversityWeight1" //" * " //"diversityWeight2" //" * " "densityWeight1" //density weight " * " "densityWeight2" //density weight " * " "%s" // wordspam weight " * " "%s" // wordspam weight " * " "termFreqWeight1" // tfw " * " "termFreqWeight2" // tfw " / ( ||wordPos1 - wordPos2| " " - queryDist| + 1.0 ) * " "wikiWeight" "]]>" "</equationCanonical>\n" , t1 , t2 ); sb->safePrintf("\t\t\t<equation>" "<![CDATA[" "%f=" "100*" "<font color=orange>%.1f</font>"//hashgroupweight "*" "<font color=orange>%.1f</font>"//hashgroupweight "*" "<font color=blue>%.1f</font>" // syn weight "*" "<font color=blue>%.1f</font>" // syn weight "*" "<font color=green>%.1f</font>"//wikibigramweight "*" "<font color=green>%.1f</font>"//wikibigramweight "*" "<font color=purple>%.02f</font>"//density weight "*" "<font color=purple>%.02f</font>"//density weight "*" "<font color=red>%.02f</font>" // wordspam weight "*" "<font color=red>%.02f</font>" // wordspam weight "*" "<font color=magenta>%.02f</font>"//tf weight "*" "<font color=magenta>%.02f</font>"//tf weight , ps->m_finalScore , hgw1 , hgw2 , sw1 , sw2 , wbw1 , wbw2 , dnw1 , dnw2 , wsw1 , wsw2 , tfw1 , tfw2 ); if ( ps->m_fixedDistance ) sb->safePrintf( "/<b>%" PRId32"</b> " , (int32_t)FIXED_DISTANCE ); else sb->safePrintf( "/" "(((<font color=darkgreen>%" PRId32"</font>" "-<font color=darkgreen>%" PRId32"</font>" ")-<font color=lime>%" PRId32"</font>)+1.0%s)" , a,b,ps->m_qdist,bes); // wikipedia weight if ( !almostEqualFloat(wiw, 1.0) ) sb->safePrintf("*%.01f", wiw ); sb->safePrintf("]]>" "</equation>\n" ); sb->safePrintf("\t\t</pairInfo>\n"); return true; // continue; } // // print first term in first row // sb->safePrintf("<tr><td rowspan=3>"); sb->safePrintf("<a onclick=\"" "var e = document.getElementById('poo');" "if ( e.style.display == 'none' ){" "e.style.display = '';" "}" "else {" "e.style.display = 'none';" "}" "\">" ); sb->safePrintf("%.04f</a></td>",ps->m_finalScore); sb->safePrintf("<td>" "%s <font color=orange>" "%.01f</font></td>" , getHashGroupString(hg1) , hgw1 ); // the word position sb->safePrintf("<td>"); sb->safePrintf("<a href=\"/get?d="); sb->safePrintf("<a href=\"/get?d="); sb->safePrintf("%" PRId64 "&page=4" //"&page=sections&" "&hipos=%" PRId32 "&c=%s#hipos\">" "%" PRId32"</a></td>" "</a></td>" ,mr->m_docId ,(int32_t)ps->m_wordPos1 ,si->m_cr->m_coll ,(int32_t)ps->m_wordPos1); sb->safePrintf("<td>%s <font color=blue>%.02f</font></td>",syn1,sw1); sb->safePrintf("<td>%s <font color=green>%.02f</font></td>",bs1,wbw1); // density sb->safePrintf("<td>%" PRId32" <font color=purple>" "%.02f</font></td>", (int32_t)de1,dnw1); // word spam if ( hg1 == HASHGROUP_INLINKTEXT ) { sb->safePrintf("<td> </td>"); sb->safePrintf("<td>%" PRId32" <font color=red>" "%.02f</font></td>", (int32_t)wr1,wsw1); } else { sb->safePrintf("<td>%" PRId32, (int32_t)wr1); //if ( wsw1 != 1.0 ) sb->safePrintf( " <font color=red>" "%.02f</font>", wsw1); sb->safePrintf("</td>"); sb->safePrintf("<td> </td>"); } // term freq sb->safePrintf("<td id=tf>%" PRId64" <font color=magenta>" "%.02f</font></td>", tf1,tfw1); // inSamePhraseId distInQuery phraseWeight sb->safePrintf("<td>%s</td><td>%" PRId32"</td><td>%.01f</td>" ,wp,ps->m_qdist,wiw); // end the row sb->safePrintf("</tr>"); // // print 2nd term in 2nd row // sb->safePrintf("<tr><td>"); sb->safePrintf( "%s <font color=orange>" "%.01f</font></td>" , getHashGroupString(hg2) , hgw2 ); // the word position sb->safePrintf("<td>"); sb->safePrintf("<a href=\"/get?d="); sb->safePrintf("%" PRId64 "&page=4&" "hipos=%" PRId32"&c=%s#hipos\">" "%" PRId32"</a></td>" "</a></td>" ,mr->m_docId ,(int32_t)ps->m_wordPos2 ,si->m_cr->m_coll ,(int32_t)ps->m_wordPos2); sb->safePrintf("<td>%s <font color=blue>%.02f</font></td>",syn2,sw2); sb->safePrintf("<td>%s <font color=green>%.02f</font></td>",bs2,wbw2); // density sb->safePrintf("<td>%" PRId32" <font color=purple>" "%.02f</font></td>", (int32_t)de2,dnw2); // word spam if ( hg2 == HASHGROUP_INLINKTEXT ) { sb->safePrintf("<td> </td>"); sb->safePrintf("<td>%" PRId32" <font color=red>" "%.02f</font></td>", (int32_t)wr2,wsw2); } else { sb->safePrintf("<td>%" PRId32, (int32_t)wr2); //if ( wsw2 != 1.0 ) sb->safePrintf( " <font color=red>" "%.02f</font>", wsw2); sb->safePrintf("</td>"); sb->safePrintf("<td> </td>"); } // term freq sb->safePrintf("<td id=tf>%" PRId64" <font color=magenta>" "%.02f</font></td>", tf2,tfw2); // inSamePhraseId distInQuery phraseWeight sb->safePrintf("<td>%s</td><td>%" PRId32"</td><td>%.01f</td>" ,wp,ps->m_qdist,wiw); // end the row sb->safePrintf("</tr>"); sb->safePrintf("<tr><td "); sb->safePrintf("colspan=50>" // style=\"display:none\">" "%.03f " "= " //" ( " "100*" "<font color=orange>%.1f" "</font>" "*" "<font color=orange>%.1f" "</font>" "*" //"(%" PRId32" - " , ps->m_finalScore //, idstr , hgw1 , hgw2 //, (int32_t)MAXWORDPOS+1 ); sb->safePrintf("<font color=blue>%.1f</font>" "*" " <font color=blue>%.1f</font>" "*" // wiki bigram weight "<font color=green>%.02f</font>" "*" "<font color=green>%.02f</font>" "*" "<font color=purple>%.02f</font>" "*" "<font color=purple>%.02f</font>" "*" "<font color=red>%.02f</font>" "*" " <font color=red>%.02f</font>" "*" "<font color=magenta>%.02f</font>" "*" "<font color=magenta>%.02f</font>" , sw1 , sw2 , wbw1 , wbw2 , dnw1 , dnw2 , wsw1 , wsw2 , tfw1 , tfw2 ); if ( ps->m_fixedDistance ) sb->safePrintf( "/<b>%" PRId32"</b> " , (int32_t)FIXED_DISTANCE ); else sb->safePrintf( "/" "(((<font color=darkgreen>%" PRId32"</font>" "-<font color=darkgreen>%" PRId32"</font>)-" "<font color=lime>%" PRId32"</font>) + 1.0%s)" , a,b,ps->m_qdist,bes); // wikipedia weight if ( !almostEqualFloat(wiw, 1.0) ) sb->safePrintf("*%.01f", wiw ); sb->safePrintf( // end formula "</td></tr>" //"</table>" //"<br>"); ); return true; } static bool printSingleTerm(SafeBuf *sb, const Query *q, const SingleScore *ss) { int32_t qtn = ss->m_qtermNum; sb->safePrintf("<table border=1 cellpadding=3>"); sb->safePrintf("<tr><td colspan=50><center><b>"); // link to rainbow page //sb->safePrintf("<a href=\"/print?u="); //sb->urlEncode( mr->ptr_ubuf ); //sb->safePrintf("&page=4&recycle=1&c=%s\">",coll); if ( q->m_qterms[qtn].m_isPhrase ) sb->pushChar('\"'); sb->safeMemcpy ( q->m_qterms[qtn].m_term , q->m_qterms[qtn].m_termLen ); if ( q->m_qterms[qtn].m_isPhrase ) sb->pushChar('\"'); //sb->safePrintf("</a>"); sb->safePrintf("</b></center></td></tr>"); return true; } static bool printTermPairs(SafeBuf *sb, const Query *q, const PairScore *ps) { // print pair text int32_t qtn1 = ps->m_qtermNum1; int32_t qtn2 = ps->m_qtermNum2; sb->safePrintf("<table cellpadding=3 border=1>" "<tr><td colspan=20><center><b>"); if ( q->m_qterms[qtn1].m_isPhrase ) sb->pushChar('\"'); sb->safeMemcpy ( q->m_qterms[qtn1].m_term , q->m_qterms[qtn1].m_termLen ); if ( q->m_qterms[qtn1].m_isPhrase ) sb->pushChar('\"'); sb->safePrintf("</b> vs <b>"); if ( q->m_qterms[qtn2].m_isPhrase ) sb->pushChar('\"'); sb->safeMemcpy ( q->m_qterms[qtn2].m_term , q->m_qterms[qtn2].m_termLen ); if ( q->m_qterms[qtn2].m_isPhrase ) sb->pushChar('\"'); return true; } static bool printScoresHeader ( SafeBuf *sb ) { sb->safePrintf("<tr>" "<td>score</td>" "<td>location</td>" "<td>wordPos</td>" "<td>synonym</td>" "<td>wikibigram</td>" //"<td>diversityRank</td>" "<td>density</td>" "<td>spam</td>" "<td>inlinkPR</td>" // nlinkSiteRank</td>" "<td>termFreq</td>" "<td>inSamePhrase</td>" "<td>distInQuery</td>" "<td>phraseWeight</td>" "</tr>\n" ); return true; } static bool printSingleScore(SafeBuf *sb, const SearchInput *si, const SingleScore *ss, Msg20Reply *mr) { // shortcut const Query *q = &si->m_q; //SafeBuf ft; // store in final score calc //if ( ft.length() ) ft.safePrintf(" + "); //ft.safePrintf("%f",ss->m_finalScore); const char *syn = "no"; float sw = 1.0; if ( ss->m_isSynonym ) { syn = "yes"; sw = g_conf.m_baseScoringParameters.m_synonymWeight; } //char bf = ss->m_bflags; float wbw = 1.0; const char *bs = "no"; if ( ss->m_isHalfStopWikiBigram ) { bs = "yes"; wbw = WIKI_BIGRAM_WEIGHT; } float hgw = getHashGroupWeight(ss->m_hashGroup); float dnw = getDensityWeight(ss->m_densityRank); float wsw = getWordSpamWeight(ss->m_wordSpamRank); // HACK for inlink text! if ( ss->m_hashGroup == HASHGROUP_INLINKTEXT ) wsw = getLinkerWeight(ss->m_wordSpamRank); //int64_t tf = ss->m_termFreq;//ss->m_listSize; int32_t qtn = ss->m_qtermNum; //int64_t tf = msg40->m_msg3a.m_termFreqs[qtn]; QueryTerm *qt = &q->m_qterms[qtn]; int64_t tf = qt->m_termFreq; float tfw = ss->m_tfWeight; if ( si->m_format == FORMAT_JSON ) { sb->safePrintf("\t\t\"terminfo\": {\n"); sb->safePrintf("\t\t\t\"densityRank\": %" PRId32",\n", (int32_t)ss->m_densityRank); sb->safePrintf("\t\t\t\"densityWeight\": %f,\n", dnw); sb->safePrintf("\t\t\t\"term\": \"%*.*s\",\n", (int)q->m_qterms[qtn].m_termLen, (int)q->m_qterms[qtn].m_termLen,q->m_qterms[qtn].m_term); sb->safePrintf("\t\t\t\"location\": \"%s\",\n", getHashGroupString(ss->m_hashGroup)); sb->safePrintf("\t\t\t\"locationWeight\": %.01f,\n", hgw ); sb->safePrintf("\t\t\t\"wordPos\": %" PRId32",\n", (int32_t)ss->m_wordPos ); sb->safePrintf("\t\t\t\"isSynonym\": \"%s\",\n", syn); sb->safePrintf("\t\t\t\"synonymWeight\": %.01f,\n", sw); sb->safePrintf("\t\t\t\"isWikiBigram\": %" PRId32",\n", (int32_t)(ss->m_isHalfStopWikiBigram) ); sb->safePrintf("\t\t\t\"wikiBigramWeight\": %.01f,\n", (float)WIKI_BIGRAM_WEIGHT); // word spam if ( ss->m_hashGroup == HASHGROUP_INLINKTEXT ) { sb->safePrintf("\t\t\t\"inlinkSiteRank>%" PRId32",\n", (int32_t)ss->m_wordSpamRank); sb->safePrintf("\t\t\t\"inlinkTextWeight\": %.02f,\n", wsw); } else { sb->safePrintf("\t\t\t\"wordSpamRank\": %" PRId32",\n", (int32_t)ss->m_wordSpamRank); sb->safePrintf("\t\t\t\"wordSpamWeight\": %.02f,\n", wsw); } // if offsite inlink text show the inlinkid for matching // to an <inlink> LinkInfo *info = (LinkInfo *)mr->ptr_linkInfo;//inlinks; Inlink *k = info ? info->getNextInlink(NULL) : NULL; sb->safePrintf("\t\t\t\"inlinkIds\": [,\n"); for ( ; k && ss->m_hashGroup==HASHGROUP_INLINKTEXT ; k=info->getNextInlink(k)){ if ( ! k->getLinkText() ) continue; if ( k->m_wordPosStart > ss->m_wordPos ) continue; if ( k->m_wordPosStart + 50 < ss->m_wordPos ) continue; // got it. we HACKED this to put the id // in k->m_siteHash sb->safePrintf("\t\t\t\t\"%" PRId32",\n", k->m_siteHash); } sb->safePrintf("\t\t\t],\n"); // term freq sb->safePrintf("\t\t\t\"termFreq\": %" PRId64",\n", tf); sb->safePrintf("\t\t\t\"termFreqWeight\": %f,\n", tfw); sb->safePrintf("\t\t\t\"score\": %f,\n", ss->m_finalScore); sb->safePrintf("\t\t\t\"equationCanonical\": \"" "score = " " 100 * " " locationWeight" // hgw " * " " locationWeight" // hgw " * " " synonymWeight" // synweight " * " " synonymWeight" // synweight " * " " wikiBigramWeight" " * " " wikiBigramWeight" " * " //" diversityWeight" // divweight //" * " //" diversityWeight" // divweight //" * " "densityWeight" // density weight " * " "densityWeight" // density weight " * " "wordSpamWeight" // wordspam weight " * " "wordSpamWeight" // wordspam weight " * " "termFreqWeight" // tfw " * " "termFreqWeight" // tfw "\",\n" ); sb->safePrintf("\t\t\t\"equation>\": \"" "%f=" "100*" "%.1f" // hgw "*" "%.1f" // hgw "*" "%.1f" // synweight "*" "%.1f" // synweight "*" "%.02f" // wikibigram weight "*" "%.02f" // wikibigram weight "*" "%.02f" // density weight "*" "%.02f" // density weight "*" "%.02f" // wordspam weight "*" "%.02f" // wordspam weight "*" "%.02f" // tfw "*" "%.02f" // tfw "\"\n" , ss->m_finalScore , hgw , hgw , sw , sw , wbw , wbw , dnw , dnw , wsw , wsw , tfw , tfw ); sb->safePrintf("\t\t}"); return true; } if ( si->m_format == FORMAT_XML ) { sb->safePrintf("\t\t<termInfo>\n"); sb->safePrintf("\t\t\t<densityRank>%" PRId32 "</densityRank>\n", (int32_t)ss->m_densityRank); sb->safePrintf("\t\t\t<densityWeight>%f" "</densityWeight>\n", dnw); sb->safePrintf("\t\t\t<term><![CDATA["); sb->safeMemcpy ( q->m_qterms[qtn].m_term , q->m_qterms[qtn].m_termLen ); sb->safePrintf("]]></term>\n"); sb->safePrintf("\t\t\t<location><![CDATA[%s]]>" "</location>\n", getHashGroupString(ss->m_hashGroup)); sb->safePrintf("\t\t\t<locationWeight>%.01f" "</locationWeight>\n", hgw ); sb->safePrintf("\t\t\t<wordPos>%" PRId32 "</wordPos>\n", (int32_t)ss->m_wordPos ); sb->safePrintf("\t\t\t<isSynonym>" "<![CDATA[%s]]>" "</isSynonym>\n", syn); sb->safePrintf("\t\t\t<synonymWeight>%.01f" "</synonymWeight>\n", sw); sb->safePrintf("\t\t\t<isWikiBigram>%" PRId32 "</isWikiBigram>\n", (int32_t)(ss->m_isHalfStopWikiBigram) ); sb->safePrintf("\t\t\t<wikiBigramWeight>%.01f" "</wikiBigramWeight>\n", (float)WIKI_BIGRAM_WEIGHT); // word spam if ( ss->m_hashGroup == HASHGROUP_INLINKTEXT ) { sb->safePrintf("\t\t\t<inlinkSiteRank>%" PRId32 "</inlinkSiteRank>\n", (int32_t)ss->m_wordSpamRank); sb->safePrintf("\t\t\t<inlinkTextWeight>%.02f" "</inlinkTextWeight>\n", wsw); } else { sb->safePrintf("\t\t\t<wordSpamRank>%" PRId32 "</wordSpamRank>\n", (int32_t)ss->m_wordSpamRank); sb->safePrintf("\t\t\t<wordSpamWeight>%.02f" "</wordSpamWeight>\n", wsw); } // if offsite inlink text show the inlinkid for matching // to an <inlink> LinkInfo *info = (LinkInfo *)mr->ptr_linkInfo;//inlinks; Inlink *k = info ? info->getNextInlink(NULL) : NULL; for ( ; k && ss->m_hashGroup==HASHGROUP_INLINKTEXT ; k=info->getNextInlink(k)){ if ( ! k->getLinkText() ) continue; if ( k->m_wordPosStart > ss->m_wordPos ) continue; if ( k->m_wordPosStart + 50 < ss->m_wordPos ) continue; // got it. we HACKED this to put the id // in k->m_siteHash sb->safePrintf("\t\t\t<inlinkId>%" PRId32 "</inlinkId>\n", k->m_siteHash); } // term freq sb->safePrintf("\t\t\t<termFreq>%" PRId64 "</termFreq>\n",tf); sb->safePrintf("\t\t\t<termFreqWeight>%f" "</termFreqWeight>\n",tfw); sb->safePrintf("\t\t\t<score>%f</score>\n", ss->m_finalScore); sb->safePrintf("\t\t\t<equationCanonical>" "<![CDATA[" "score = " " 100 * " " locationWeight" // hgw " * " " locationWeight" // hgw " * " " synonymWeight" // synweight " * " " synonymWeight" // synweight " * " " wikiBigramWeight" " * " " wikiBigramWeight" " * " //" diversityWeight" // divweight //" * " //" diversityWeight" // divweight //" * " "densityWeight" // density weight " * " "densityWeight" // density weight " * " "wordSpamWeight" // wordspam weight " * " "wordSpamWeight" // wordspam weight " * " "termFreqWeight" // tfw " * " "termFreqWeight" // tfw //" / ( 3.0 )" "]]>" "</equationCanonical>\n" ); sb->safePrintf("\t\t\t<equation>" "<![CDATA[" "%f=" "100*" "%.1f" // hgw "*" "%.1f" // hgw "*" "%.1f" // synweight "*" "%.1f" // synweight "*" "%.02f" // wikibigram weight "*" "%.02f" // wikibigram weight "*" "%.02f" // density weight "*" "%.02f" // density weight "*" "%.02f" // wordspam weight "*" "%.02f" // wordspam weight "*" "%.02f" // tfw "*" "%.02f" // tfw //" / ( 3.0 )" "]]>" "</equation>\n" , ss->m_finalScore , hgw , hgw , sw , sw , wbw , wbw , dnw , dnw , wsw , wsw , tfw , tfw ); sb->safePrintf("\t\t</termInfo>\n"); return true; } sb->safePrintf("<tr>" "<td rowspan=2>%.03f</td>\n" "<td>%s <font color=orange>%.1f" "</font></td\n>" // wordpos "<td>" "<a href=\"/get?d=" , ss->m_finalScore , getHashGroupString(ss->m_hashGroup) , hgw ); //sb->urlEncode( mr->ptr_ubuf ); sb->safePrintf("%" PRId64,mr->m_docId ); sb->safePrintf("&page=4&" "hipos=%" PRId32"&c=%s#hipos\">" ,(int32_t)ss->m_wordPos ,si->m_cr->m_coll); sb->safePrintf("%" PRId32"</a></td>\n" "<td>%s <font color=blue>%.1f" "</font></td>\n" // syn // wikibigram?/weight "<td>%s <font color=green>%.02f</font></td>\n" //"<td>%" PRId32"/<font color=green>%f" //"</font></td>" // diversity "<td>%" PRId32" <font color=purple>" "%.02f</font></td>\n" // density , (int32_t)ss->m_wordPos , syn , sw // synonym weight , bs , wbw //, (int32_t)ss->m_diversityRank //, dvw , (int32_t)ss->m_densityRank , dnw ); if ( ss->m_hashGroup == HASHGROUP_INLINKTEXT ) { sb->safePrintf("<td> </td>" "<td>%" PRId32" <font color=red>%.02f" "</font></td>\n" // wordspam , (int32_t)ss->m_wordSpamRank , wsw ); } else { sb->safePrintf("<td>%" PRId32" <font color=red>%.02f" "</font></td>" // wordspam "<td> </td>\n" , (int32_t)ss->m_wordSpamRank , wsw ); } sb->safePrintf("<td id=tf>%" PRId64" <font color=magenta>" "%.02f</font></td>\n" // termfreq "</tr>\n" , tf , tfw ); // last row is the computation of score sb->safePrintf("<tr><td colspan=50>" "%.03f " " = " //" %" PRId32" * " "100 * " " <font color=orange>%.1f</font>" " * " " <font color=orange>%.1f</font>" " * " " <font color=blue>%.1f</font>" " * " " <font color=blue>%.1f</font>" " * " " <font color=green>%.02f</font>"//wikibigramwght " * " " <font color=green>%.02f</font>" " * " "<font color=purple>%.02f</font>" " * " "<font color=purple>%.02f</font>" " * " "<font color=red>%.02f</font>" " * " "<font color=red>%.02f</font>" " * " "<font color=magenta>%.02f</font>" " * " "<font color=magenta>%.02f</font>" //" / ( 3.0 )" // end formula "</td></tr>\n" , ss->m_finalScore //, (int32_t)MAXWORDPOS+1 , hgw , hgw , sw , sw , wbw , wbw //, dvw //, dvw , dnw , dnw , wsw , wsw , tfw , tfw ); //sb->safePrintf("</table>" // "<br>"); return true; } // if catId >= 1 then print the dmoz radio button static bool printLogoAndSearchBox(SafeBuf *sb, HttpRequest *hr, const SearchInput *si) { const char *coll = hr->getString("c"); if ( ! coll ) coll = ""; // if there's a ton of sites use the post method otherwise // they won't fit into the http request, the browser will reject // sending such a large request with "GET" const char *method = "GET"; if ( si && si->m_sites && strlen(si->m_sites)>800 ) { method = "POST"; } sb->safePrintf( // // search box // "<form name=f method=%s action=/search>\n\n" // propagate the collection if they re-search "<input name=c type=hidden value=\"%s\">" , method , coll ); // propagate prepend const char *prepend = hr->getString("prepend"); if ( prepend ) { sb->safePrintf("<input name=prepend type=hidden value=\""); sb->htmlEncode ( prepend, strlen(prepend), false); sb->safePrintf("\">"); } // put search box in a box sb->safePrintf( "<br>" "<br>" "<br>" "<div style=" "background-color:#fcc714;" "border-style:solid;" "border-width:3px;" "border-color:blue;" //"background-color:blue;" "padding:20px;" "border-radius:20px;" ">"); sb->safePrintf ( //"<div style=margin-left:5px;margin-right:5px;> "<input size=40 type=text name=q " "style=\"" //"width:%" PRId32"px;" "height:26px;" "padding:0px;" "font-weight:bold;" "padding-left:5px;" //"border-radius:10px;" "margin:0px;" "border:1px inset lightgray;" "background-color:#ffffff;" "font-size:18px;" "\" " "value=\"" ); // contents of search box int32_t qlen; const char *qstr = hr->getString("q",&qlen,"",NULL); sb->htmlEncode ( qstr , qlen , false ); // if it was an advanced search, this can be empty if ( qlen == 0 && si && si->m_displayQuery ) sb->htmlEncode ( si->m_displayQuery ); sb->safePrintf ("\">" " " "<div onclick=document.f.submit(); " " onmouseover=\"" "this.style.backgroundColor='lightgreen';" "this.style.color='black';\"" " onmouseout=\"" "this.style.backgroundColor='green';" "this.style.color='white';\" " "style=border-radius:28px;" "cursor:pointer;" "cursor:hand;" "border-color:white;" "border-style:solid;" "border-width:3px;" "padding:12px;" "width:20px;" "height:20px;" "display:inline-block;" "background-color:green;color:white;>" "<b style=margin-left:-5px;font-size:18px;" ">GO</b>" "</div>" ); sb->safePrintf( "</div>" "<br>" "<br>" ); printSearchFiltersBar ( sb , hr ); sb->safePrintf( "</form>\n" ); return true; } class MenuItem { public: int32_t m_menuNum; const char *m_title; // we append this to the url const char *m_cgi; char m_tmp[25]; }; const static int MAX_MENU_ENTRIES = 200; static MenuItem s_mi[MAX_MENU_ENTRIES]; static int32_t s_num = 0; static int getNextMenuEntryIndex(int curr_index) { int i = curr_index+1; if( i >= MAX_MENU_ENTRIES ) { g_process.shutdownAbort(true); } return i; } static bool printSearchFiltersBar ( SafeBuf *sb , HttpRequest *hr ) { SafeBuf cu; hr->getCurrentUrl ( cu ); sb->safePrintf("<script>" "function show(id){" "var e = document.getElementById(id);" "if ( e.style.display == 'none' ){" "e.style.display = '';" "}" "else {" "e.style.display = 'none';" "}" "}" "</script>" ); static bool s_init = false; if ( ! s_init ) { int32_t menuNum = 0; int32_t n = 0; s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "Any time"; s_mi[n].m_cgi = "secsback=0"; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "Past 24 hours"; s_mi[n].m_cgi = "secsback=86400"; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "Past week"; s_mi[n].m_cgi = "secsback=604800"; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "Past month"; s_mi[n].m_cgi = "secsback=2592000"; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "Past year"; s_mi[n].m_cgi = "secsback=31536000"; n = getNextMenuEntryIndex(n); ++menuNum; // sort by s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "Sorted by relevance"; s_mi[n].m_cgi = "sortby=0"; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "Sorted by date"; s_mi[n].m_cgi = "sortby=1"; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "Reverse sorted by date"; s_mi[n].m_cgi = "sortby=2"; n = getNextMenuEntryIndex(n); ++menuNum; // languages s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "Any language"; s_mi[n].m_cgi = "qlang=xx"; n = getNextMenuEntryIndex(n); for ( int32_t i = 0 ; i < langLast ; i++ ) { s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = getLanguageString(i); const char *abbr = getLanguageAbbr(i); snprintf(s_mi[n].m_tmp,10,"qlang=%s",abbr); s_mi[n].m_cgi = s_mi[n].m_tmp; n = getNextMenuEntryIndex(n); } ++menuNum; // filetypes s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "Any filetype"; s_mi[n].m_cgi = "filetype=any"; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "HTML"; s_mi[n].m_cgi = "filetype=html"; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "TEXT"; s_mi[n].m_cgi = "filetype=txt"; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "PDF"; s_mi[n].m_cgi = "filetype=pdf"; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "Microsoft Word"; s_mi[n].m_cgi = "filetype=doc"; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "XML"; s_mi[n].m_cgi = "filetype=xml"; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "JSON"; s_mi[n].m_cgi = "filetype=json"; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "Excel"; s_mi[n].m_cgi = "filetype=xls"; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "PostScript"; s_mi[n].m_cgi = "filetype=ps"; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "Spider Status"; s_mi[n].m_cgi = "filetype=status"; n = getNextMenuEntryIndex(n); ++menuNum; // output s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "Output HTML"; s_mi[n].m_cgi = "format=html"; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "Output XML"; s_mi[n].m_cgi = "format=xml"; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "Output JSON"; s_mi[n].m_cgi = "format=json"; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "Output CSV"; s_mi[n].m_cgi = "format=csv"; n = getNextMenuEntryIndex(n); ++menuNum; // show/hide banned s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "Hide banned results"; s_mi[n].m_cgi = "sb=0"; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "Show banned results"; s_mi[n].m_cgi = "sb=1"; n = getNextMenuEntryIndex(n); ++menuNum; // spider status s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "Hide Spider Log"; s_mi[n].m_cgi = "splog=0"; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "Show Spider Log"; s_mi[n].m_cgi = "q=type:status"; n = getNextMenuEntryIndex(n); ++menuNum; // family filter s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "Family Filter Off"; s_mi[n].m_cgi = "ff=0"; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "Family Filter On"; s_mi[n].m_cgi = "ff=1"; n = getNextMenuEntryIndex(n); ++menuNum; // META TAGS s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "No Meta Tags"; s_mi[n].m_cgi = "dt="; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "Show Meta Tags"; s_mi[n].m_cgi = "dt=keywords+description"; n = getNextMenuEntryIndex(n); ++menuNum; // ADMIN s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "Show Admin View"; s_mi[n].m_cgi = "admin=1"; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "Show User View"; s_mi[n].m_cgi = "admin=0"; n = getNextMenuEntryIndex(n); ++menuNum; // fx_country s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "fx_country (none)"; s_mi[n].m_cgi = "fx_country="; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "de"; s_mi[n].m_cgi = "fx_country=de"; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "dk"; s_mi[n].m_cgi = "fx_country=dk"; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "fr"; s_mi[n].m_cgi = "fx_country=fr"; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "no"; s_mi[n].m_cgi = "fx_country=no"; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "se"; s_mi[n].m_cgi = "fx_country=se"; n = getNextMenuEntryIndex(n); ++menuNum; // fx_blang s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "fx_blang (none)"; s_mi[n].m_cgi = "fx_blang="; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "da"; s_mi[n].m_cgi = "fx_blang=da"; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "de"; s_mi[n].m_cgi = "fx_blang=de"; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "en"; s_mi[n].m_cgi = "fx_blang=en"; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "en-US"; s_mi[n].m_cgi = "fx_blang=en-US"; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "no"; s_mi[n].m_cgi = "fx_blang=no"; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "se"; s_mi[n].m_cgi = "fx_blang=se"; n = getNextMenuEntryIndex(n); ++menuNum; // fx_fetld s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "fx_fetld (none)"; s_mi[n].m_cgi = "fx_fetld="; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "com"; s_mi[n].m_cgi = "fx_fetld=com"; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "de"; s_mi[n].m_cgi = "fx_fetld=de"; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "dk"; s_mi[n].m_cgi = "fx_fetld=dk"; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "no"; s_mi[n].m_cgi = "fx_fetld=no"; n = getNextMenuEntryIndex(n); s_mi[n].m_menuNum = menuNum; s_mi[n].m_title = "se"; s_mi[n].m_cgi = "fx_fetld=se"; n = getNextMenuEntryIndex(n); s_num = n; s_init = true; } // we'll print the admin menu custom since it's mostly off-page links // bar of drop down menus sb->safePrintf("<div style=color:gray;>"); for ( int32_t i = 0 ; i <= s_mi[s_num-1].m_menuNum ; i++ ) { // after 5 make a new line if ( i % 5 == 0 ) sb->safePrintf("<br><br>"); printMenu ( sb , i , hr ); } sb->safePrintf("</div>\n"); sb->safePrintf("<br>\n"); return true; } static bool printMenu ( SafeBuf *sb , int32_t menuNum , HttpRequest *hr ) { bool firstOne = true; MenuItem *first = NULL; const char *src = hr->getOrigUrlRequest(); int32_t srcLen = hr->getOrigUrlRequestLen(); const char *frontTag = ""; const char *backTag = ""; bool isDefaultHeader = true; // try to set first based on what's in the url for ( int32_t i = 0 ; i < s_num ; i++ ) { // shortcut MenuItem *mi = &s_mi[i]; // skip if not our item if ( mi->m_menuNum != menuNum ) continue; // is it in the url const char *match = strnstr ( src, mi->m_cgi, srcLen ); // or if empty quotes it is the true header like // for 'hide spider log' option if ( ! match ) { isDefaultHeader = false; continue; } // ensure ? or & preceeds if ( match > src && match[-1] != '?' && match[-1] != '&' ) continue; // and \0 or & follows int32_t milen = strlen(mi->m_cgi); if ( match+milen > src+srcLen ) continue; if ( ! is_wspace_a(match[milen]) && match[milen] != '&' ) continue; // got it first = mi; // do not highlight the orig header if ( isDefaultHeader ) break; frontTag = "<b style=color:maroon;>"; backTag = "</b>"; break; } for ( int32_t i = 0 ; i < s_num ; i++ ) { // shortcut MenuItem *mi = &s_mi[i]; // skip if not our item if ( mi->m_menuNum != menuNum ) continue; if ( ! first ) first = mi; if ( ! firstOne ) goto skip; firstOne = false; // for centering the dropdown sb->safePrintf("<span style=position:relative;></span>"); // print hidden drop down menu sb->safePrintf( "<span id=menu%" PRId32" style=\"display:none;" "position:absolute;" //"margin-left:-20px;" "margin-top:15px;" "width:150px;" "max-height:300px;" "overflow-y:auto;" "background-color:white;" "padding:10px;" "width=80px;border-width:1px;" "border-color:lightgray;" "box-shadow: -.5px 1px 1px gray;" "border-style:solid;color:gray;\" " //" onmouseout=\"" //"this.style.display='none';\"" // if clicking on scrollbar do not hide menu! " onmousedown=\"inmenuclick=1;\" " ">" , mi->m_menuNum ); skip: // . add our cgi to the original url // . so if it has &qlang=de and they select &qlang=en // we have to replace it... etc. StackBuf<512> newUrl; replaceParm ( mi->m_cgi , &newUrl , hr ); newUrl += '\0'; // print each item in there sb->safePrintf("<a href=%s>" "<div style=cursor:pointer;cursor:hand;" "padding-top:10px;" "padding-bottom:10px;" "color:gray;" " onmouseover=\"" "this.style.backgroundColor='#e0e0e0';\" " " onmouseout=\"" "this.style.backgroundColor='white';\" " // prevent the body onmousedown from // hiding the menu " onmousedown=\"inmenuclick=1;\" " ">" "<nobr>" , newUrl.getBufStart() ); // print checkmark (check mark) next to selected one // if not the default (trueHeader) if ( mi == first ) // ! isDefaultHeader && mi == first ) sb->safePrintf("<b style=color:black;>✓</b>"); else sb->safePrintf(" "); sb->safePrintf(" %s</nobr>" "</div>" "</a>" , mi->m_title ); //sb->safePrintf("<br><br>"); } // wrap up the drop down sb->safePrintf("</span>"); // print heading or current selection i guess sb->safePrintf( // separate menus with these two spaces " " // print the menu header that when clicked // will show the drop down "<span style=cursor:pointer;" "cursor:hand; " "onmousedown=\"this.style.color='red';" "inmenuclick=1;" "\" " "onmouseup=\"this.style.color='gray';" // close any other open menu "if ( openmenu !='') {" "document.getElementById(openmenu)." "style.display='none'; " "var saved=openmenu;" "openmenu='';" // don't reopen our same menu below! "if ( saved=='menu%" PRId32"') return;" "}" // show our menu "show('menu%" PRId32"'); " // we are now open "openmenu='menu%" PRId32"'; " "\"" ">" "%s%s%s %c%c%c" "</span>" , first->m_menuNum , first->m_menuNum , first->m_menuNum , frontTag , first->m_title , backTag // print triangle ,0xe2 ,0x96 ,0xbc ); return true; } static bool replaceParm ( const char *cgi , SafeBuf *newUrl , HttpRequest *hr ) { if ( ! cgi[0] ) return true; // get original request url. this is not \0 terminated const char *src = hr->getOrigUrlRequest(); int32_t srcLen = hr->getOrigUrlRequestLen(); return replaceParm2 ( cgi ,newUrl, src, srcLen ); } static bool replaceParm2 ( const char *cgi , SafeBuf *newUrl , const char *oldUrl , int32_t oldUrlLen ) { const char *src = oldUrl; int32_t srcLen = oldUrlLen; const char *srcEnd = src + srcLen; const char *equal = strstr(cgi,"="); if ( ! equal ) { log(LOG_WARN, "results: %s has no equal sign", cgi); return false; } int32_t cgiLen = equal - cgi; const char *p = src; tryagain: const char *found = strncasestr ( p , cgi , srcEnd - p , cgiLen ); // if no ? or & before it it is bogus! if ( found && found[-1] != '&' && found[-1] != '?' ) { // try again p = found + 1; goto tryagain; } // fix &s= replaceing &sb= if ( found && found[cgiLen] != '=' ) { // try again p = found + 1; goto tryagain; } // if no collision, just append it if ( ! found ) { if ( ! newUrl->safeMemcpy ( src , srcLen ) ) return false; if ( ! newUrl->pushChar('&') ) return false; if ( ! newUrl->safeStrcpy ( cgi ) ) return false; if ( ! newUrl->nullTerm() ) return false; return true; } // . otherwise we have to replace it // . copy up to where it starts if ( ! newUrl->safeMemcpy ( src , found-src ) ) return false; // then insert our new cgi there if ( ! newUrl->safeStrcpy ( cgi ) ) return false; // then resume it const char *foundEnd = strncasestr ( found , "&" , srcEnd - found ); // if nothing came after... if ( ! foundEnd ) { if ( ! newUrl->nullTerm() ) return false; return true; } // copy over what came after if ( ! newUrl->safeMemcpy ( foundEnd, srcEnd-foundEnd ) ) return false; if ( ! newUrl->nullTerm() ) return false; return true; } static bool printMetaContent ( Msg40 *msg40 , int32_t i , State0 *st, SafeBuf *sb ) { // store the user-requested meta tags content SearchInput *si = &st->m_si; const char *pp = si->m_displayMetas; const char *ppend = pp + strlen(si->m_displayMetas); Msg20 *m = msg40->m_msg20[i];//getMsg20(i); Msg20Reply *mr = m->m_r; char *dbuf = mr->ptr_dbuf;//msg40->getDisplayBuf(i); int32_t dbufLen = mr->size_dbuf-1;//msg40->getDisplayBufLen(i); char *dbufEnd = dbuf + (dbufLen-1); char *dptr = dbuf; //bool printedSomething = false; // loop over the names of the requested meta tags while ( pp < ppend && dptr < dbufEnd ) { // . assure last byte of dbuf is \0 // provided dbufLen > 0 // . this insures sprintf and strlen won't // crash on dbuf/dptr if ( dbuf [ dbufLen ] != '\0' ) { log(LOG_LOGIC,"query: Meta tag buffer has no \\0."); break; } // skip initial spaces while ( pp < ppend && is_wspace_a(*pp) ) pp++; // break if done if ( ! *pp ) break; // that's the start of the meta tag name const char *ss = pp; // . find end of that meta tag name // . can end in :<integer> -- specifies max len while ( pp < ppend && ! is_wspace_a(*pp) && *pp != ':' ) pp++; size_t sslen = (size_t)(pp-ss); // if ':' was specified, skip the rest if ( *pp == ':' ) while ( pp < ppend && ! is_wspace_a(*pp)) pp++; // print the name int32_t ddlen = dbufLen; // newspaperarchive wants tags printed even if no value // make sure the meta tag isn't fucked up for ( int32_t ti = 0; ti < ddlen; ti++ ) { if ( dptr[ti] == '"' || dptr[ti] == '>' || dptr[ti] == '<' || dptr[ti] == '\r' || dptr[ti] == '\n' || dptr[ti] == '\0' ) { ddlen = ti; break; } } if ( ddlen > 0 ) { // ship it out if ( si->m_format == FORMAT_XML ) { sb->safePrintf ( "\t\t<display name=\"%.*s\">" "<![CDATA[", (int)sslen, ss); cdataEncode(sb, dptr, ddlen); sb->safePrintf ( "]]></display>\n" ); } else if ( si->m_format == FORMAT_JSON ) { sb->safePrintf ( "\t\t\"display.%.*s\":\"", (int)sslen, ss); sb->jsonEncode ( dptr, ddlen ); sb->safePrintf ( "\",\n"); } // otherwise, print in light gray else { sb->safePrintf("<font color=#c62939>" "<b>%.*s</b>: ", (int)sslen, ss); sb->safeMemcpy ( dptr, ddlen ); sb->safePrintf ( "</font><br>" ); } } // point to next content of tag to display dptr += ddlen + 1; } return true; }