Fixed a bug in product title extraction.
Fixed the titledb-saved.dat tree loop corruption bug. Fixed the bug that occurred when no "main" collection exists. Put the ajax widget on the spider status page so you can watch the spider going in real time; this will give customers a good idea of the spider moving along. More widget fixes, to use the new base64 thumbnails, etc.
This commit is contained in:
@ -1252,6 +1252,33 @@ CollectionRec *Collectiondb::getRec ( HttpRequest *r , bool useDefaultRec ) {
|
||||
return g_collectiondb.getRec ( coll );
|
||||
}
|
||||
|
||||
// . pick the collection name to use for the given http request
// . a non-empty "c" string in the request always wins
// . otherwise fall back to the "main" collection, then to the first
//   collection rec we have
// . returns NULL if none of those are available
char *Collectiondb::getDefaultColl ( HttpRequest *r ) {
	// explicit, non-empty "c" value? then use it as-is
	char *requested = r->getString ( "c" );
	if ( requested && requested[0] ) return requested;
	// default to main first
	// CAUTION: the rec could be deleted so don't trust this ptr
	// if you give up control of the cpu
	CollectionRec *rec = g_collectiondb.getRec("main");
	// no "main"? try next in line
	if ( ! rec ) rec = getFirstRec ();
	// give up?
	if ( ! rec ) return NULL;
	return rec->m_coll;
}
|
||||
|
||||
|
||||
//CollectionRec *Collectiondb::getRec2 ( HttpRequest *r , bool useDefaultRec) {
|
||||
// char *coll = getDefaultColl();
|
||||
// return g_collectiondb.getRec(coll);
|
||||
//}
|
||||
|
||||
// . get collectionRec from name
|
||||
// . returns NULL if not available
|
||||
CollectionRec *Collectiondb::getRec ( char *coll ) {
|
||||
|
@ -75,6 +75,12 @@ class Collectiondb {
|
||||
class CollectionRec *getRec ( class HttpRequest *r ,
|
||||
bool useDefaultRec = true );
|
||||
|
||||
// do not support diffbot style token/name style for this one:
|
||||
char *getDefaultColl ( HttpRequest *r ) ;
|
||||
|
||||
//class CollectionRec *getRec2 ( class HttpRequest *r ,
|
||||
// bool useDefaultRec = true );
|
||||
|
||||
// . get collectionRec from name
|
||||
// returns NULL if not available
|
||||
class CollectionRec *getRec ( char *coll );
|
||||
|
1
Images.h
1
Images.h
@ -24,6 +24,7 @@ class ThumbnailInfo {
|
||||
char m_buf[];
|
||||
char *getUrl() { return m_buf; };
|
||||
char *getData() { return m_buf + m_urlSize; };
|
||||
long getDataSize() { return m_dataSize; };
|
||||
long getSize () { return sizeof(ThumbnailInfo)+m_urlSize+m_dataSize;};
|
||||
|
||||
bool printThumbnailInHtml ( SafeBuf *sb ) {
|
||||
|
@ -771,7 +771,6 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
|
||||
// this prints the <form tag as well
|
||||
g_pages.printAdminTop ( &sb , socket , hr );
|
||||
|
||||
|
||||
//
|
||||
// show stats
|
||||
//
|
||||
@ -863,8 +862,100 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
|
||||
, cr->m_globalCrawlInfo.m_pageDownloadSuccesses
|
||||
);
|
||||
|
||||
sb.safePrintf("</table>\n\n");
|
||||
|
||||
}
|
||||
|
||||
|
||||
// put the widget in here, just sort results by spidered date
|
||||
if ( fmt == FORMAT_HTML ) {
|
||||
|
||||
sb.safePrintf("<br>"
|
||||
"<script type=\"text/javascript\">"
|
||||
"function diffbot_handler() {"
|
||||
"if(this.readyState != 4 )return;"
|
||||
"if(!this.responseText)return;"
|
||||
"document.getElementById(\"diffbot_widget\")."
|
||||
"innerHTML=this.responseText;"
|
||||
"diffbot_scroll();}"
|
||||
"</script>"
|
||||
|
||||
"<script type=text/javascript>function "
|
||||
"diffbot_scroll() {var hd = document."
|
||||
"getElementById('diffbot_invisible');"
|
||||
"if ( ! hd ) {setTimeout('diffbot_scroll()',"
|
||||
"3);return;} var b=parseInt(hd.style.top);"
|
||||
"var step=4;b=b+step;hd.style.top=b+\"px\";"
|
||||
"var vd=document.getElementById"
|
||||
"('diffbot_visible');"
|
||||
"var c=parseInt(vd.style.top);"
|
||||
"c=c+step;"
|
||||
"vd.style.top=c+\"px\";"
|
||||
"if(b>=0)return;"
|
||||
"setTimeout('diffbot_scroll()',3);}"
|
||||
"</script>"
|
||||
);
|
||||
|
||||
long widgetWidth = 300;
|
||||
long widgetHeight = 500;
|
||||
|
||||
// make the ajax url that gets the search results
|
||||
SafeBuf ub;
|
||||
ub.safePrintf("/search"
|
||||
"?format=ajax"
|
||||
"&c=%s"
|
||||
"&q=gbrevsortbyint%%3Agbspiderdate"
|
||||
"&widgetheight=%li"
|
||||
"&widgetwidth=%li"
|
||||
"&topdocid="
|
||||
, cr->m_coll
|
||||
, widgetHeight
|
||||
, widgetWidth
|
||||
);
|
||||
|
||||
|
||||
// then the containing div. set the "id" so that the
|
||||
// style tag the user sets can control its appearance.
|
||||
// when the browser loads this the ajax sets the contents
|
||||
// to the reply from neo.
|
||||
|
||||
sb.safePrintf("<div id=diffbot_widget "
|
||||
"style=\"border:2px solid black;"
|
||||
"position:relative;border-radius:10px;"
|
||||
"width:%lipx;height:%lipx;\">"
|
||||
, widgetWidth
|
||||
, widgetHeight
|
||||
);
|
||||
|
||||
//sb.safePrintf("<style>"
|
||||
// "a{color:white;}"
|
||||
// "</style>");
|
||||
|
||||
// get the search results from neo as soon as this div is
|
||||
// being rendered, and set its contents to them
|
||||
sb.safePrintf("<script type=text/javascript>function "
|
||||
"diffbot_reload() {var client="
|
||||
"new XMLHttpRequest();"
|
||||
"client.onreadystatechange=diffbot_handler;"
|
||||
"var u='%s';"
|
||||
"var td=document.getElementById('topdocid');"
|
||||
"if ( td ) u=u+td.value;"
|
||||
"client.open('GET',u);"
|
||||
"client.send();"
|
||||
"setTimeout('diffbot_reload()',15000);}"
|
||||
"diffbot_reload();</script>"
|
||||
, ub.getBufStart()
|
||||
);
|
||||
|
||||
|
||||
sb.safePrintf("Waiting for Server...");
|
||||
|
||||
|
||||
// end the containing div
|
||||
sb.safePrintf("</div>");
|
||||
}
|
||||
|
||||
|
||||
//if ( fmt != FORMAT_JSON )
|
||||
// // wrap up the form, print a submit button
|
||||
// g_pages.printAdminBottom ( &sb );
|
||||
|
@ -2609,22 +2609,18 @@ bool printResult ( State0 *st, long ix ) {
|
||||
// , widgetwidth - 2*8 // padding is 8px
|
||||
// , mr->ptr_imgUrl);
|
||||
if ( mr->ptr_imgData ) {
|
||||
char *p = mr->ptr_imgData; // orig img url
|
||||
p += gbstrlen(p) + 1; // dx of thumb
|
||||
//long tdx = *(long *)p;
|
||||
p += 4;
|
||||
//long tdy = *(long *)p;
|
||||
p += 4;
|
||||
char *imgData = p;
|
||||
char *pend = mr->ptr_imgData + mr->size_imgData;
|
||||
long thumbBytes = pend - p;
|
||||
ThumbnailArray *ta = (ThumbnailArray *)mr->ptr_imgData;
|
||||
ThumbnailInfo *ti = ta->getThumbnailInfo(0);
|
||||
sb->safePrintf("background-repeat:no-repeat;"
|
||||
"background-size:%lipx 140px;"
|
||||
"background-image:url('data:image/"
|
||||
"jpg;base64,"
|
||||
, widgetwidth - 2*8); // padding is 8px
|
||||
// encode image in base 64
|
||||
sb->base64Encode (imgData,thumbBytes,0); // 0 niceness
|
||||
if ( ti )
|
||||
sb->base64Encode (ti->getData(),
|
||||
ti->getDataSize(),
|
||||
0); // niceness
|
||||
sb->safePrintf("');");
|
||||
}
|
||||
|
||||
@ -2633,14 +2629,14 @@ bool printResult ( State0 *st, long ix ) {
|
||||
sb->safePrintf("\">");
|
||||
sb->safePrintf ( "<a "
|
||||
"target=_blank "
|
||||
"style=text-decoration:none; href=" );
|
||||
"style=text-decoration:none; href=\"" );
|
||||
// truncate off -diffbotxyz%li
|
||||
long newLen = urlLen;
|
||||
if ( diffbotSuffix ) newLen = diffbotSuffix - url;
|
||||
// print the url in the href tag
|
||||
sb->safeMemcpy ( url , newLen );
|
||||
// then finish the a href tag and start a bold for title
|
||||
sb->safePrintf ( ">");//<font size=+0>" );
|
||||
sb->safePrintf ( "\">");//<font size=+0>" );
|
||||
|
||||
sb->safePrintf("<b style=\""
|
||||
"text-decoration:none;"
|
||||
@ -2669,6 +2665,22 @@ bool printResult ( State0 *st, long ix ) {
|
||||
// then title over image
|
||||
}
|
||||
|
||||
// only do link here if we have no thumbnail so no bg image
|
||||
if ( (si->m_format == FORMAT_WIDGET_IFRAME ||
|
||||
si->m_format == FORMAT_WIDGET_AJAX ) &&
|
||||
! mr->ptr_imgData ) {
|
||||
sb->safePrintf ( "<a style=text-decoration:none;"
|
||||
"color:white; "
|
||||
"href=" );
|
||||
// truncate off -diffbotxyz%li
|
||||
long newLen = urlLen;
|
||||
if ( diffbotSuffix ) newLen = diffbotSuffix - url;
|
||||
// print the url in the href tag
|
||||
sb->safeMemcpy ( url , newLen );
|
||||
// then finish the a href tag and start a bold for title
|
||||
sb->safePrintf ( ">");//<font size=+0>" );
|
||||
}
|
||||
|
||||
|
||||
// the a href tag
|
||||
if ( si->m_format == FORMAT_HTML ) sb->safePrintf ( "\n\n" );
|
||||
@ -2698,20 +2710,6 @@ bool printResult ( State0 *st, long ix ) {
|
||||
}
|
||||
|
||||
|
||||
// only do link here
|
||||
if ( (si->m_format == FORMAT_WIDGET_IFRAME ||
|
||||
si->m_format == FORMAT_WIDGET_AJAX ) &&
|
||||
! mr->ptr_imgUrl ) {
|
||||
sb->safePrintf ( "<a href=" );
|
||||
// truncate off -diffbotxyz%li
|
||||
long newLen = urlLen;
|
||||
if ( diffbotSuffix ) newLen = diffbotSuffix - url;
|
||||
// print the url in the href tag
|
||||
sb->safeMemcpy ( url , newLen );
|
||||
// then finish the a href tag and start a bold for title
|
||||
sb->safePrintf ( ">");//<font size=+0>" );
|
||||
}
|
||||
|
||||
// . then the title (should be NULL terminated)
|
||||
// . the title can be NULL
|
||||
// . highlight it first
|
||||
|
@ -914,8 +914,9 @@ bool Pages::printAdminTop (SafeBuf *sb ,
|
||||
//long user = getUserType ( s , r );
|
||||
//char *username = g_users.getUsername ( r );
|
||||
char *username = NULL;
|
||||
char *coll = r->getString ( "c" );
|
||||
if ( ! coll ) coll = "main";
|
||||
//char *coll = r->getString ( "c" );
|
||||
//if ( ! coll ) coll = "main";
|
||||
char *coll = g_collectiondb.getDefaultColl(r);
|
||||
|
||||
//char *pwd = r->getString ( "pwd" );
|
||||
// get username
|
||||
|
10
Parms.cpp
10
Parms.cpp
@ -1386,9 +1386,13 @@ bool Parms::printParms (SafeBuf* sb, TcpSocket *s , HttpRequest *r) {
|
||||
long page = g_pages.getDynamicPageNumber ( r );
|
||||
long nc = r->getLong("nc",1);
|
||||
long pd = r->getLong("pd",1);
|
||||
char *coll = r->getString ( "c" );
|
||||
if ( ! coll || ! coll[0] ) coll = "main";
|
||||
CollectionRec *cr = g_collectiondb.getRec ( coll );
|
||||
char *coll = g_collectiondb.getDefaultColl(r);
|
||||
CollectionRec *cr = g_collectiondb.getRec(coll);//2(r,true);
|
||||
//char *coll = r->getString ( "c" );
|
||||
//if ( ! coll || ! coll[0] ) coll = "main";
|
||||
//CollectionRec *cr = g_collectiondb.getRec ( coll );
|
||||
// if "main" collection does not exist, try another
|
||||
//if ( ! cr ) cr = getCollRecFromHttpRequest ( r );
|
||||
printParms2 ( sb, page, cr, nc, pd,0,0 , s);
|
||||
return true;
|
||||
}
|
||||
|
@ -3144,7 +3144,7 @@ struct QueryField g_fields[] = {
|
||||
},
|
||||
|
||||
{"gbminint", FIELD_GBNUMBERMININT, false,
|
||||
"Example: 'gbminint:spiderdate:1391749680' "
|
||||
"Example: 'gbminint:gbspiderdate:1391749680' "
|
||||
"'gbminint:count:99'. Numeric "
|
||||
"fields can be in JSON or in meta tag. "
|
||||
"Use 'gbspiderdate' field for the last time the page was "
|
||||
@ -3152,7 +3152,7 @@ struct QueryField g_fields[] = {
|
||||
},
|
||||
|
||||
{"gbmaxint", FIELD_GBNUMBERMAXINT, false,
|
||||
"Example: 'gbmaxint:spiderdate:1391749680' "
|
||||
"Example: 'gbmaxint:gbspiderdate:1391749680' "
|
||||
"'gbmaxint:count:99'. Numeric "
|
||||
"fields can be in JSON or in meta tag. "
|
||||
"Use 'gbspiderdate' field for the last time the page was "
|
||||
|
@ -1305,8 +1305,12 @@ bool RdbTree::checkTree2 ( bool printMsgs , bool doChainTest ) {
|
||||
if ( ! doChainTest ) continue;
|
||||
// ensure i goes back to head node
|
||||
long j = i;
|
||||
long loopCount = 0;
|
||||
while ( j >= 0 ) {
|
||||
if ( j == m_headNode ) break;
|
||||
// sanity -- loop check
|
||||
if ( ++loopCount > 10000 )
|
||||
return log("db: tree had loop");
|
||||
j = m_parents[j];
|
||||
}
|
||||
if ( j != m_headNode )
|
||||
|
@ -45764,7 +45764,8 @@ char *getJSONFieldValue ( char *json , char *field , long *valueLen ) {
|
||||
! gotOne &&
|
||||
p[1] == ':' &&
|
||||
// {"title":"whatever",...}
|
||||
depth == 1 &&
|
||||
// could be product:{title:... depth=2
|
||||
(depth == 1 ||depth==2) &&
|
||||
stringStart &&
|
||||
(p - stringStart) == fieldLen &&
|
||||
strncmp(field,stringStart,fieldLen)==0 ) {
|
||||
|
Reference in New Issue
Block a user