updating help table. still more work on query.cpp to do

This commit is contained in:
mwells
2014-08-07 22:02:55 -07:00
parent fb0c8f0c4d
commit 50cc24e2be
4 changed files with 512 additions and 73 deletions

@ -2675,16 +2675,15 @@ bool sendPageHelp ( TcpSocket *sock , HttpRequest *hr ) {
sb.safePrintf(
"<br>"
" <table width=650px cellpadding=5 cellspacing=0 border=0>"
"<table width=650px cellpadding=5 cellspacing=0 border=0>"
""
"<tr bgcolor=#0340fd>"
""
" <th><font color=33dcff>Search</font></th>"
" <th><font color=33dcff>Description</font></th>"
" </tr>"
" <tr> "
" <td><a href=/search?q=cat+dog>cat "
" dog</a></td>"
"<th><font color=33dcff>Search</font></th>"
"<th><font color=33dcff>Description</font></th>"
"</tr>"
"<tr> "
"<td><a href=/search?q=cat+dog>cat dog</a></td>"
" <td>Search results have the word <em>cat</em> and the word <em>dog</em> "
" in them. They could also have <i>cats</i> and <i>dogs</i>.</td>"
" </tr>"
@ -2783,11 +2782,54 @@ bool sendPageHelp ( TcpSocket *sock , HttpRequest *hr ) {
" </tr>"
""
""
);
sb.safePrintf(
// spacer
//"<tr><td><br></td><td></td></tr>"
//"<tr bgcolor=#0340fd>"
// "<td><font color=33dcff><b>Special Query</b>"
// "</font></td>"
//"<td><font color=33dcff><b>Description</b></font></td>"
// "</tr>"
"<tr><td>document<b>.</b>title<b>:</b>paper</td><td>"
"That query will match a JSON document like "
"<i>"
"{ \"document\":{\"title\":\"This is a good paper.\" "
"}}</i> or, alternatively, an XML document like <i>"
);
sb.htmlEncode("<document><title>This is a good paper"
"</title></document>" );
sb.safePrintf("</i></td></tr>\n");
// table of the query keywords
long n = getNumFieldCodes();
for ( long i = 0 ; i < n ; i++ ) {
// get field #i
QueryField *f = &g_fields[i];
// print it out
char *d = f->desc;
// fix table internal cell bordering
if ( d[0] == '\0' ) d = "&nbsp;";
sb.safePrintf("<tr bgcolor=#%s>"
"<td><b>%s</b></td><td>%s</td></tr>\n",
LIGHT_BLUE,f->example,d);
}
sb.safePrintf(
" <tr> "
" <td style=padding-bottom:12px;>&nbsp;</td>"
" <td style=padding-bottom:12px;>&nbsp;</td>"
" </tr>"
""
"<tr bgcolor=#0340fd>"
""
" <th><font color=33dcff>Boolean Search</font></th>"
@ -2878,16 +2920,26 @@ bool sendPageHelp ( TcpSocket *sock , HttpRequest *hr ) {
" operator can optionally preceed the left or the right operand.</td>"
" </tr>"
""
" </table>"
//" </table>"
""
""
""
"</td></tr>"
"</table>"
"<br>"
//"</td></tr>"
//"</table>"
//"<br>"
);
//sb.safePrintf("<tr><td></td><td></td></tr>\n");
//sb.safePrintf("<tr><td></td><td></td></tr>\n");
//sb.safePrintf("<tr><td></td><td></td></tr>\n");
//sb.safePrintf("<tr><td></td><td></td></tr>\n");
sb.safePrintf("</table>");
//sb.safePrintf("</form>\n");
sb.safePrintf("<br>\n");
sb.safePrintf("\n");

493
Query.cpp

@ -3162,96 +3162,457 @@ static bool s_isInitialized = false;
// 3rd field = m_hasColon
struct QueryField g_fields[] = {
{"url", FIELD_URL, true,"Match the exact url. Example: url:www.gigablast.com/addurl.htm"},
{"ext", FIELD_EXT, true,"Match the url extension. Example: ext:htm or ext:mpeg to find urls ending in .htm or .mpeg respectively."},
{"url2", FIELD_URL, true,"Match the exact url. Example: url:www.gigablast.com/addurl.htm"},
{"ext2", FIELD_EXT, true,"Match the url extension. Example: ext:htm or ext:mpeg to find urls ending in .htm or .mpeg respectively."},
{"url",
FIELD_URL,
true,
"url:www.abc.com/page.html",
"Matches the page with that exact url. Uses the first url, not "
"the url it redirects to, if any." ,
0 },
{"link", FIELD_LINK, true,"Match pages that link to the given url. Example: link:www.gigablast.com will return all pages linking to the www.gigablast.com page."},
{"links", FIELD_LINKS, true,"Same as link:."},
{"ilink", FIELD_ILINK, true,"Similar to above."},
{"sitelink", FIELD_SITELINK, true,"Matches all pages that link to the given site. Example:sitelink:www.gigablast.com matches all pages that link to some page on the www.gigablast.com site."},
{"ext",
FIELD_EXT,
true,
"ext:doc",
"Match documents whose url ends in the <i>.doc</i> file extension.",
0 },
{"url2",
FIELD_URL,
true,
"url2:www.abc.com/page.html",
"Matches the <i>Spider Status</i> documents for the specified url. "
"These special documents "
"let you know exactly when the url was attempted to be "
"spidered and the outcome.",
0 },
{"link",
FIELD_LINK,
true,
"link:http://www.gigablast.com/foobar.html",
"Matches all the documents that have a link to "
"http://www.gigablast.com/foobar.html",
0 },
//{"links", FIELD_LINKS, true,"Same as link:."},
//{"ilink", FIELD_ILINK, true,"Similar to above."},
{"sitelink",
FIELD_SITELINK,
true,
"sitelink:abc.foobar.com",
"Matches all documents that link to any page on the "
"<i>abc.foobar.com</i> site.",
0 },
{"site",
FIELD_SITE,
true,
"site:mysite.com",
"Matches all documents on the mysite.com domain.",
0 },
{"site",
FIELD_SITE,
true,
"site:www.mysite.com/dir1/dir2/",
"Matches all documents whose url starts with "
"www.mysite.com/dir1/dir2/",
QTF_DUP },
{"site", FIELD_SITE, true,"Matches all pages from the given site. Example: site:www.gigablast.com will return all the pages on the gigablast site"},
//{"coll", FIELD_COLL, true,"Not sure if this works."},
{"ip", FIELD_IP, true,"Matches all pages with the given ip. Example:1.2.3.4 will match all pages whose urls have that IP address."},
{"inurl", FIELD_SUBURL, true,"Matches all pages that have the given terms in the url. Example inurl:water will match all pages whose url has the word water in it, but the word must be delineated by punctuation."},
{"ip",
FIELD_IP,
true,
"ip:1.2.3.4",
"Matches all documents whose IP is 1.2.3.4.",
0 },
{"site2", FIELD_SITE, true,"Matches all pages from the given site. Example: site:www.gigablast.com will return all the pages on the gigablast site"},
//{"coll", FIELD_COLL, true,"Not sure if this works."},
{"ip2", FIELD_IP, true,"Matches all pages with the given ip. Example:1.2.3.4 will match all pages whose urls have that IP address."},
{"inurl2", FIELD_SUBURL, true,"Matches all pages that have the given terms in the url. Example inurl:water will match all pages whose url has the word water in it, but the word must be delineated by punctuation."},
{"ip",
FIELD_IP,
true,
"ip:1.2.3",
"Matches all documents whose IP STARTS with 1.2.3.",
QTF_DUP },
{"suburl", FIELD_SUBURL, true,"Same as inurl."},
{"intitle", FIELD_TITLE, false,"Matches all pages that have pages that have the given term in their title. Example: title:web returns all pages that have the word web in their title."},
{"title", FIELD_TITLE, false,"Same as intitle:"},
{"isclean", FIELD_ISCLEAN, true,"Matches all pages that are deemed non-offensive and safe for children."},
{"gbrss", FIELD_GBRSS, true,"Matches all pages that are rss feeds."},
//{"gbruleset",FIELD_GBRULESET, true,"Obsolete."},
{"type", FIELD_TYPE, false,"Matches all pages of the specified file type. Example: type:pdf will match pdf documents, regardless of their file extension. Examples: type:doc type:status type:json type:xls"},
{"filetype", FIELD_TYPE, false,"Same as type:"},
{"gbisadult",FIELD_GENERIC,false,"use gbisadult:0 and gbisadult:1 to restrict results to non-adult and adult documents respectively."},
{"gbimage",FIELD_URL,false,"use gbimage:<url> to return all documents containing that image url."},
{"inurl",
FIELD_SUBURL,
true,
"inurl:dog",
"Matches all documents that have the word dog in their url, like "
"http://www.mysite.com/dog/food.html. However will not match "
"http://www.mysite.com/dogfood.html because it is not an "
"individual word. It must be delineated by punctuation.",
0 },
{"gbstatus",FIELD_GENERIC,false,"If document is a spider reply, then search the spider status as a number using this. 0 means success, so gbstatus:0 would return all successful statuses."},
{"gbstatusmsg",FIELD_GENERIC,false,"If document is a spider reply, then search the spider status description, which might be something like 'TCP Timed out' or 'Robots.txt disallows' or 'Success', if no error."},
{"gbhasthumbnail",FIELD_GENERIC,false,"use gbhasthumbnail:0 and gbhasthumbnail:1 to restrict results to those that do not have or have thumbnails respectively."},
{"gbtag*", FIELD_TAG, false,"Matches all pages whose tag named * have the specified value. Example: gbtagingoogle:1 matches all pages that have a value of 1 for their ingoogle tag in tagdb."},
{"zip", FIELD_ZIP, false,"Matches all pages that have the specified zip code in their meta zip code tag. Not to be used with events."},
{"zipcode", FIELD_ZIP, false,"Same as zip:"},
{"suburl",
FIELD_SUBURL,
true,
"suburl:dog",
"Same as inurl.",
0},
// intitle: keyword. The help table prints only the example string, so the
// examples must use the intitle: spelling; the separate "title" entry
// describes itself as "Same as intitle:".
{"intitle",
FIELD_TITLE,
false,
"intitle:cat",
"Matches all the documents that have the word cat in their "
"title.",
0 },
// second help-page example for intitle: (QTF_DUP entries are skipped when
// the keyword hash table is built)
{"intitle",
FIELD_TITLE,
false,
"intitle:\"cat food\"",
"Matches all the documents that have the phrase \"cat food\" "
"in their title.",
QTF_DUP },
{"title",
FIELD_TITLE,
false,
"title:cat",
"Same as intitle:",
0},
//{"isclean", FIELD_ISCLEAN, true,"Matches all pages that are deemed non-offensive and safe for children."},
{"gbinrss",
FIELD_GBRSS,
true,
"gbinrss:1",
"Matches all documents that are in RSS feeds. Likewise, use "
"<i>gbinrss:0</i> to match all documents that are NOT in RSS feeds.",
0},
{"type",
FIELD_TYPE,
false,
"type:pdf",
"Matches all documents that are PDFs. Other possible types include "
"<i>html, text, xml, pdf, doc, xls, ppt, ps, css, json, status.</i> "
"<i>status</i> matches special documents that are stored every time "
"a url is spidered so you can see all the spider attempts and when "
"they occurred as well as the outcome.",
0},
{"filetype",
FIELD_TYPE,
false,
"filetype:pdf",
"Same as type: above.",
0},
{"gbisadult",
FIELD_GENERIC,
false,
"gbisadult:1",
"Matches all documents that have been detected as adult documents "
"and may be unsuitable for children. Likewise, use "
"<i>gbisadult:0</i> to match all documents that were NOT detected "
"as adult documents.",
0},
{"gbimage",
FIELD_URL,
false,
"gbimage:site.com/image.jpg",
"Matches all documents that contain the specified image.",
0},
{"gbhasthumbnail",
FIELD_GENERIC,
false,
"gbhasthumbnail:1",
"Matches all documents for which Gigablast detected a thumbnail. "
"Likewise use <i>gbhasthumbnail:0</i> to match all documents that "
"do not have thumbnails.",
0},
{"gbtag*",
FIELD_TAG,
false,
"gbtag*",
"Matches all documents whose tag named * have the specified value "
"in the tagdb entry for the url. Example: gbtagsitenuminlinks:2 "
"matches all documents that have 2 qualified "
"inlinks pointing to their site "
"based on the tagdb record. You can also provide your own "
"tags in addition to the tags already present. See the <i>tagdb</i> "
"menu for more information.",
0},
// zip code keyword. The example must use the same keyword as the entry
// text ("gbzipcode"), which is also the prefix the meta zip hashing uses;
// the previous example showed a "gbzip:" keyword that is not in this table.
{"gbzipcode",
FIELD_ZIP,
false,
"gbzipcode:90210",
"Matches all documents that have the specified zip code "
"in their meta zip code tag.",
0},
//{"range", FIELD_RANGE, false,""}, // obsolete, datedb replaced
{"charset", FIELD_CHARSET, false,"Matches all pages in the given character set."},
{"urlhash",FIELD_URLHASH, false,""},
{"urlhashdiv10",FIELD_URLHASHDIV10, false,""},
{"urlhashdiv100",FIELD_URLHASHDIV100, false,""},
{"gblang",FIELD_GBLANG,false,"Matches all pages in the given language. Examples: gblang:en gblang:fr gblang:de"},
{"gbquality",FIELD_GBQUALITY,true,""},
{"gblinktextin",FIELD_LINKTEXTIN,true,""},
{"gblinktextout",FIELD_LINKTEXTOUT,true,""},
{"gbkeyword",FIELD_KEYWORD,true,""},
{"gbcharset", FIELD_CHARSET, false,""},
{"gbpathdepth", FIELD_GBOTHER, false,"the path depth of the url's path."},
{"gbhopcount", FIELD_GBOTHER, false,"the hop count of the url. roots are 0."},
{"gbhasfilename", FIELD_GBOTHER, false,""},
{"gbiscgi", FIELD_GBOTHER, false,""},
{"gbhasext", FIELD_GBOTHER, false,""},
{"gbsubmiturl", FIELD_GBOTHER, false,""},
// charset keyword. The description is emitted verbatim inside a <td> on the
// help page, so every <i> opened here must also be closed here.
{"gbcharset",
FIELD_CHARSET,
false,
"gbcharset:utf-8",
"Matches all documents originally in the Utf-8 charset. "
"Available character sets are listed in the <i>iana_charset.cpp</i> "
"file in the open source distribution. There are a lot. Some "
"more popular ones are: <i>us, latin1, iso-8859-1, csascii, ascii, "
"latin2, latin3, latin4, greek, shift_jis</i>.",
0},
// this just complicates things for now, so comment out
//{"urlhash",FIELD_URLHASH, false,""},
//{"urlhashdiv10",FIELD_URLHASHDIV10, false,""},
//{"urlhashdiv100",FIELD_URLHASHDIV100, false,""},
{"gblang",
FIELD_GBLANG,
false,
"gblang:de",
"Matches all documents in german. "
"The supported language abbreviations "
"are at the bottom of the <i>url filters</i> page. Some more "
"common ones are <i>en, es, fr, zh_cn</i>.",
0},
//{"gbquality",FIELD_GBQUALITY,true,""},
//{"gblinktextin",FIELD_LINKTEXTIN,true,""},
//{"gblinktextout",FIELD_LINKTEXTOUT,true,""},
//{"gbkeyword",FIELD_KEYWORD,true,""},
//{"gbcharset", FIELD_CHARSET, false,""},
{"gbpathdepth",
FIELD_GBOTHER,
false,
"gbpathdepth:3",
"Matches all documents whose url has 3 path components to it like "
"http://somedomain.com/dir1/dir2/dir3/foo.html",
0},
{"gbhopcount",
FIELD_GBOTHER,
false,
"gbhopcount:2",
"Matches all documents that are a minimum of two link hops away "
"from a root url.",
0},
{"gbhasfilename",
FIELD_GBOTHER,
false,
"gbhasfilename:1",
"Matches all documents whose url ends in a filename like "
"<i>http://somedomain.com/dir1/myfile</i> and not "
"<i>http://somedomain.com/dir1/dir2/</i>. Likewise, use "
"<i>gbhasfilename:0</i> to match all the documents that do not "
"have a filename in their url.",
0},
{"gbiscgi",
FIELD_GBOTHER,
false,
"gbiscgi:1",
"Matches all documents that have a question mark in their url. "
"Likewise gbiscgi:0 matches all documents that do not.",
0},
{"gbhasext",
FIELD_GBOTHER,
false,
"gbhasext:1",
"Matches all documents that have a file extension in their url. "
"Likewise, <i>gbhasext:0</i> matches all documents that do not have "
"a file extension in their url.",
0},
{"gbsubmiturl",
FIELD_GBOTHER,
false,
"gbsubmiturl:domain.com/process.php",
"Matches all documents that have a form that submits to the "
"specified url.",
0},
//
// for content type CT_STATUS documents (Spider status docs)
{"gbpathdepth2", FIELD_GBOTHER, false,"the path depth of the url's path."},
{"gbhopcount2", FIELD_GBOTHER, false,"the hop count of the url. roots are 0."},
{"gbhasfilename2", FIELD_GBOTHER, false,""},
{"gbiscgi2", FIELD_GBOTHER, false,""},
{"gbhasext2", FIELD_GBOTHER, false,""},
//
{"gbstatus",
FIELD_GENERIC,
false,
"gbstatus:0",
"Matches all special spider status documents that spidered "
"their url successfully. Replace <i>0</i> with other numeric error "
"codes to get the other outcomes.",
0},
// textual spider status keyword. The follow-up examples must use
// gbstatusmsg: as well; gbstatus: is the separate numeric status keyword.
{"gbstatusmsg",
FIELD_GENERIC,
false,
"gbstatusmsg:tcp",
"Matches all special spider status documents that had a status "
"message containing the word <i>tcp</i> like in "
"<i>TCP Timed Out</i>. Similarly, gbstatusmsg:success, "
"gbstatusmsg:\"robots.txt\" are other possibilities.",
0},
{"site2",
FIELD_SITE,
true,
"site2:mysite.com",
"Matches all the special spider status documents on the "
"mysite.com domain.",
0 },
{"ip2",
FIELD_IP,
true,
"ip2:1.2.3.4",
"Matches all the special spider status "
"documents whose IP is 1.2.3.4.",
0 },
{"inurl2",
FIELD_SUBURL,
true,
"inurl2:dog",
"Matches all the special spider status "
"documents that have the word dog in their url, like "
"http://www.mysite.com/dog/food.html. However will not match "
"http://www.mysite.com/dogfood.html because it is not an "
"individual word. It must be delineated by punctuation.",
0 },
{"gbpathdepth2",
FIELD_GBOTHER,
false,
"gbpathdepth2:2",
"Similar to gbpathdepth: described above but for special "
"spider status documents.",
0},
{"gbhopcount2",
FIELD_GBOTHER,
false,
"gbhopcount2:3",
"Similar to gbhopcount: described above but for special "
"spider status documents.",
0},
{"gbhasfilename2",
FIELD_GBOTHER,
false,
"gbhasfilename2:1",
"Similar to gbhasfilename: described above but for special "
"spider status documents.",
0},
{"gbiscgi2",
FIELD_GBOTHER,
false,
"gbiscgi2:1",
"Similar to gbiscgi: described above but for special "
"spider status documents.",
0},
{"gbhasext2",
FIELD_GBOTHER,
false,
"gbhasext2:1",
"Similar to gbhasext: described above but for special "
"spider status documents.",
0}
/*
left off here
//{"qdom", FIELD_QUOTA, false,""},
//{"qhost", FIELD_QUOTA, false,""},
{"gbtagvector", FIELD_GBTAGVECTOR, false,""},
{"gbgigabitvector", FIELD_GBGIGABITVECTOR, false,""},
{"gbsamplevector", FIELD_GBSAMPLEVECTOR, false,""},
{"gbcontenthash", FIELD_GBCONTENTHASH, false,""},
{"gbsortby", FIELD_GBSORTBY, false,
"Example: gbsortby:price. Fields can be "
"in JSON or in meta tag."},
"dog gbsortbyint:gbspiderdate",
"Sort the search results that contain 'dog' by "
"the date they were last spidered, with the newest "
"on top."},
{"gbrevsortby", FIELD_GBREVSORTBY, false,
"Example: gbrevsortby:item.price . "
"Fields can be in JSON or in meta tag."},
"dog gbrevsortbyint:gbspiderdate",
"Sort the search results that contain 'dog' by "
"the date they were last spidered, but with the "
"oldest on top."},
{"gbsortbyfloat", FIELD_GBSORTBY, false,
"cameras gbsortbyfloat:price","Sort all search results that "
"contain 'camera' by price. <i>price</i> can be a JSON field or "
"in a meta tag, or in an xml &lt;price&gt; tag.", 0 },
{"gbsortbyfloat", FIELD_GBSORTBY, false,
"cameras gbsortbyfloat:product.price","Sort all search results that "
"contain 'camera' by price. <i>price</i> can be in a JSON document "
"like "
"<i>{ \"product\":{\"price\":1500.00}} "
"</i> or, alternatively, an XML document like <i>"
"&lt;product&gt;&lt;price&gt;1500.00&lt;/price&gt;&lt;/product&gt;"
"</i>", QTF_DUP },
{"gbrevsortbyfloat",
FIELD_GBREVSORTBY,
false,
"Example: gbrevsortbyfloat:item.price . "
"Fields can be in JSON or in meta tag.",
0},
// gbmin:price:1.23
{"gbmin", FIELD_GBNUMBERMIN, false,"Usage: gbmin:price:1.99 . Numeric "
"fields can be in JSON or in meta tag."},
{"gbmax", FIELD_GBNUMBERMAX, false,"Usage: gbmax:price:1.99"},
{"gbminfloat", FIELD_GBNUMBERMIN, false,"Usage: gbminfloat:price:1.99 . Numeric "
"fields can be in JSON or in meta tag."},
{"gbmaxfloat", FIELD_GBNUMBERMAX, false,"Usage: gbmaxfloat:price:1.99"},
{"gbdocspiderdate",FIELD_GENERIC,false,
"Example: gbspiderdate:1400081479 will return results that have "
@ -3344,6 +3705,13 @@ struct QueryField g_fields[] = {
//{"gbsectionhash" ,FIELD_GBSECTIONHASH,false,"Internal use only."},
// they don't need to know about this
{"gbtagvector", FIELD_GBTAGVECTOR, false,"","",QTF_INTERNAL},
{"gbgigabitvector", FIELD_GBGIGABITVECTOR, false,"","",QTF_INTERNAL},
{"gbsamplevector", FIELD_GBSAMPLEVECTOR, false,"","",QTF_INTERNAL},
{"gbcontenthash", FIELD_GBCONTENTHASH, false,"","",QTF_INTERNAL},
{"gbduphash" ,FIELD_GBOTHER,false,"Internal use only."},
{"gbsitetemplate" ,FIELD_GBOTHER,false,"Internal use only."},
{"gboutlinkedtitle" ,FIELD_GBOTHER,false,"gboutlinkedtitle:0 and gboutlinkedtitle:1 matches events whose title is not in and in a hyperlink, respectively."},
@ -3361,6 +3729,7 @@ struct QueryField g_fields[] = {
"were extract from this parent url. Example: "
"gbparenturl:www.gigablast.com/addurl.htm"},
{"gbdocid",FIELD_GBDOCID,false,"restrict results to this docid"}
*/
};
@ -3384,6 +3753,8 @@ static bool initFieldTable(){
// now add in all the stop words
long n = getNumFieldCodes();
for ( long i = 0 ; i < n ; i++ ) {
// skip if dup
if ( g_fields[i].m_flag & QTF_DUP ) continue;
long long h = hash64b ( g_fields[i].text );
// store the entity index in the hash table as score
if ( ! s_table.addTerm ( &h, i+1 ) ) return false;

15
Query.h

@ -136,11 +136,18 @@ char getFieldCode3 ( long long h64 ) ;
long getNumFieldCodes ( );
// . values for QueryField::m_flag
// . QTF_DUP means it is just for the help page in PageRoot.cpp to
// illustrate a second or third example
#define QTF_DUP 0x01
// . one entry of the g_fields[] query keyword table
// . entries in g_fields[] are written as positional initializers, so the
//   order and types of the members below must not change
struct QueryField {
// the keyword itself, like "url"; initFieldTable() hashes this string
// with hash64b() to key the keyword lookup table
char *text;
// the FIELD_* code this keyword maps to
char field;
// third initializer of each g_fields[] entry
// NOTE(review): exact effect on query parsing is not visible here -- confirm
bool hasColon;
// sample query printed in the first column of the help page table
char *example;
// HTML description printed in the second column of the help page table;
// the help page prints "&nbsp;" instead when this is the empty string
char *desc;
// bitmask of QTF_* values, like QTF_DUP. initFieldTable() skips entries
// that have QTF_DUP set so they only show up on the help page.
char m_flag;
};
extern struct QueryField g_fields[];
@ -421,6 +428,14 @@ class QueryTerm {
long m_termLen;
// point to the posdblist that represents us
class RdbList *m_posdbListPtr;
// the ()'s following an int/float facet term dictate the
// ranges for clustering the numeric values. like
// gbfacetfloat:price:(0-10,10-20,...)
// values outside the ranges will be ignored
char *m_parenList;
long m_parenListLen;
// . our representative bits
// . the bits in this bit vector is 1-1 with the QueryTerms
// . if a doc has query term #i then bit #i will be set

@ -26404,7 +26404,8 @@ bool XmlDoc::hashMetaZip ( HashTableX *tt ) {
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_tt = tt;
hi.m_prefix = "zipcode";
//hi.m_prefix = "zipcode";
hi.m_prefix = "gbzipcode";
nextZip:
// . parse out the zip codes, may be multiple ones