fixed bug for product title extraction.

titledb-saved.dat tree loop corruption bug.
no main coll bug.
put the ajax widget on the spider status page so you can
see the spider going in real time. this will give customers
a good idea of the spider moving along.
more widget fixes, to use new base64 thumbs, etc.
This commit is contained in:
Matt Wells
2014-04-28 13:30:24 -07:00
parent de4a0a13a8
commit e21e0a404c
10 changed files with 168 additions and 35 deletions

@ -1252,6 +1252,33 @@ CollectionRec *Collectiondb::getRec ( HttpRequest *r , bool useDefaultRec ) {
return g_collectiondb.getRec ( coll );
}
// . pick the collection name to use for this http request
// . order of preference:
//   1. the non-empty "c" string supplied in the request
//   2. the "main" collection, if it exists
//   3. whatever getFirstRec() hands back
// . returns NULL if none of those pan out
char *Collectiondb::getDefaultColl ( HttpRequest *r ) {
	// an explicitly supplied, non-empty name wins outright
	char *requested = r->getString ( "c" );
	if ( requested && requested[0] ) return requested;
	// nothing usable was supplied, so default to main first
	// CAUTION: the rec could be deleted so don't trust its m_coll
	// ptr if you give up control of the cpu
	CollectionRec *mainRec = g_collectiondb.getRec("main");
	if ( mainRec ) return mainRec->m_coll;
	// no main collection, try next in line
	CollectionRec *firstRec = getFirstRec ();
	if ( firstRec ) return firstRec->m_coll;
	// give up
	return NULL;
}
//CollectionRec *Collectiondb::getRec2 ( HttpRequest *r , bool useDefaultRec) {
// char *coll = getDefaultColl();
// return g_collectiondb.getRec(coll);
//}
// . get collectionRec from name
// . returns NULL if not available
CollectionRec *Collectiondb::getRec ( char *coll ) {

@ -75,6 +75,12 @@ class Collectiondb {
class CollectionRec *getRec ( class HttpRequest *r ,
bool useDefaultRec = true );
// do not support diffbot style token/name style for this one:
char *getDefaultColl ( HttpRequest *r ) ;
//class CollectionRec *getRec2 ( class HttpRequest *r ,
// bool useDefaultRec = true );
// . get collectionRec from name
// returns NULL if not available
class CollectionRec *getRec ( char *coll );

@ -24,6 +24,7 @@ class ThumbnailInfo {
char m_buf[];
char *getUrl() { return m_buf; };
char *getData() { return m_buf + m_urlSize; };
long getDataSize() { return m_dataSize; };
long getSize () { return sizeof(ThumbnailInfo)+m_urlSize+m_dataSize;};
bool printThumbnailInHtml ( SafeBuf *sb ) {

@ -771,7 +771,6 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
// this prints the <form tag as well
g_pages.printAdminTop ( &sb , socket , hr );
//
// show stats
//
@ -863,8 +862,100 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
, cr->m_globalCrawlInfo.m_pageDownloadSuccesses
);
sb.safePrintf("</table>\n\n");
}
// put the widget in here, just sort results by spidered date
if ( fmt == FORMAT_HTML ) {
sb.safePrintf("<br>"
"<script type=\"text/javascript\">"
"function diffbot_handler() {"
"if(this.readyState != 4 )return;"
"if(!this.responseText)return;"
"document.getElementById(\"diffbot_widget\")."
"innerHTML=this.responseText;"
"diffbot_scroll();}"
"</script>"
"<script type=text/javascript>function "
"diffbot_scroll() {var hd = document."
"getElementById('diffbot_invisible');"
"if ( ! hd ) {setTimeout('diffbot_scroll()',"
"3);return;} var b=parseInt(hd.style.top);"
"var step=4;b=b+step;hd.style.top=b+\"px\";"
"var vd=document.getElementById"
"('diffbot_visible');"
"var c=parseInt(vd.style.top);"
"c=c+step;"
"vd.style.top=c+\"px\";"
"if(b>=0)return;"
"setTimeout('diffbot_scroll()',3);}"
"</script>"
);
long widgetWidth = 300;
long widgetHeight = 500;
// make the ajax url that gets the search results
SafeBuf ub;
ub.safePrintf("/search"
"?format=ajax"
"&c=%s"
"&q=gbrevsortbyint%%3Agbspiderdate"
"&widgetheight=%li"
"&widgetwidth=%li"
"&topdocid="
, cr->m_coll
, widgetHeight
, widgetWidth
);
// then the containing div. set the "id" so that the
// style tag the user sets can control its appearance.
// when the browser loads this the ajax sets the contents
// to the reply from neo.
sb.safePrintf("<div id=diffbot_widget "
"style=\"border:2px solid black;"
"position:relative;border-radius:10px;"
"width:%lipx;height:%lipx;\">"
, widgetWidth
, widgetHeight
);
//sb.safePrintf("<style>"
// "a{color:white;}"
// "</style>");
// get the search results from neo as soon as this div is
// being rendered, and set its contents to them
sb.safePrintf("<script type=text/javascript>function "
"diffbot_reload() {var client="
"new XMLHttpRequest();"
"client.onreadystatechange=diffbot_handler;"
"var u='%s';"
"var td=document.getElementById('topdocid');"
"if ( td ) u=u+td.value;"
"client.open('GET',u);"
"client.send();"
"setTimeout('diffbot_reload()',15000);}"
"diffbot_reload();</script>"
, ub.getBufStart()
);
sb.safePrintf("Waiting for Server...");
// end the containing div
sb.safePrintf("</div>");
}
//if ( fmt != FORMAT_JSON )
// // wrap up the form, print a submit button
// g_pages.printAdminBottom ( &sb );

@ -2609,22 +2609,18 @@ bool printResult ( State0 *st, long ix ) {
// , widgetwidth - 2*8 // padding is 8px
// , mr->ptr_imgUrl);
if ( mr->ptr_imgData ) {
char *p = mr->ptr_imgData; // orig img url
p += gbstrlen(p) + 1; // dx of thumb
//long tdx = *(long *)p;
p += 4;
//long tdy = *(long *)p;
p += 4;
char *imgData = p;
char *pend = mr->ptr_imgData + mr->size_imgData;
long thumbBytes = pend - p;
ThumbnailArray *ta = (ThumbnailArray *)mr->ptr_imgData;
ThumbnailInfo *ti = ta->getThumbnailInfo(0);
sb->safePrintf("background-repeat:no-repeat;"
"background-size:%lipx 140px;"
"background-image:url('data:image/"
"jpg;base64,"
, widgetwidth - 2*8); // padding is 8px
// encode image in base 64
sb->base64Encode (imgData,thumbBytes,0); // 0 niceness
if ( ti )
sb->base64Encode (ti->getData(),
ti->getDataSize(),
0); // niceness
sb->safePrintf("');");
}
@ -2633,14 +2629,14 @@ bool printResult ( State0 *st, long ix ) {
sb->safePrintf("\">");
sb->safePrintf ( "<a "
"target=_blank "
"style=text-decoration:none; href=" );
"style=text-decoration:none; href=\"" );
// truncate off -diffbotxyz%li
long newLen = urlLen;
if ( diffbotSuffix ) newLen = diffbotSuffix - url;
// print the url in the href tag
sb->safeMemcpy ( url , newLen );
// then finish the a href tag and start a bold for title
sb->safePrintf ( ">");//<font size=+0>" );
sb->safePrintf ( "\">");//<font size=+0>" );
sb->safePrintf("<b style=\""
"text-decoration:none;"
@ -2669,6 +2665,22 @@ bool printResult ( State0 *st, long ix ) {
// then title over image
}
// only do link here if we have no thumbnail so no bg image
if ( (si->m_format == FORMAT_WIDGET_IFRAME ||
si->m_format == FORMAT_WIDGET_AJAX ) &&
! mr->ptr_imgData ) {
sb->safePrintf ( "<a style=text-decoration:none;"
"color:white; "
"href=" );
// truncate off -diffbotxyz%li
long newLen = urlLen;
if ( diffbotSuffix ) newLen = diffbotSuffix - url;
// print the url in the href tag
sb->safeMemcpy ( url , newLen );
// then finish the a href tag and start a bold for title
sb->safePrintf ( ">");//<font size=+0>" );
}
// the a href tag
if ( si->m_format == FORMAT_HTML ) sb->safePrintf ( "\n\n" );
@ -2698,20 +2710,6 @@ bool printResult ( State0 *st, long ix ) {
}
// only do link here
if ( (si->m_format == FORMAT_WIDGET_IFRAME ||
si->m_format == FORMAT_WIDGET_AJAX ) &&
! mr->ptr_imgUrl ) {
sb->safePrintf ( "<a href=" );
// truncate off -diffbotxyz%li
long newLen = urlLen;
if ( diffbotSuffix ) newLen = diffbotSuffix - url;
// print the url in the href tag
sb->safeMemcpy ( url , newLen );
// then finish the a href tag and start a bold for title
sb->safePrintf ( ">");//<font size=+0>" );
}
// . then the title (should be NULL terminated)
// . the title can be NULL
// . highlight it first

@ -914,8 +914,9 @@ bool Pages::printAdminTop (SafeBuf *sb ,
//long user = getUserType ( s , r );
//char *username = g_users.getUsername ( r );
char *username = NULL;
char *coll = r->getString ( "c" );
if ( ! coll ) coll = "main";
//char *coll = r->getString ( "c" );
//if ( ! coll ) coll = "main";
char *coll = g_collectiondb.getDefaultColl(r);
//char *pwd = r->getString ( "pwd" );
// get username

@ -1386,9 +1386,13 @@ bool Parms::printParms (SafeBuf* sb, TcpSocket *s , HttpRequest *r) {
long page = g_pages.getDynamicPageNumber ( r );
long nc = r->getLong("nc",1);
long pd = r->getLong("pd",1);
char *coll = r->getString ( "c" );
if ( ! coll || ! coll[0] ) coll = "main";
CollectionRec *cr = g_collectiondb.getRec ( coll );
char *coll = g_collectiondb.getDefaultColl(r);
CollectionRec *cr = g_collectiondb.getRec(coll);//2(r,true);
//char *coll = r->getString ( "c" );
//if ( ! coll || ! coll[0] ) coll = "main";
//CollectionRec *cr = g_collectiondb.getRec ( coll );
// if "main" collection does not exist, try another
//if ( ! cr ) cr = getCollRecFromHttpRequest ( r );
printParms2 ( sb, page, cr, nc, pd,0,0 , s);
return true;
}

@ -3144,7 +3144,7 @@ struct QueryField g_fields[] = {
},
{"gbminint", FIELD_GBNUMBERMININT, false,
"Example: 'gbminint:spiderdate:1391749680' "
"Example: 'gbminint:gbspiderdate:1391749680' "
"'gbminint:count:99'. Numeric "
"fields can be in JSON or in meta tag. "
"Use 'gbspiderdate' field for the last time the page was "
@ -3152,7 +3152,7 @@ struct QueryField g_fields[] = {
},
{"gbmaxint", FIELD_GBNUMBERMAXINT, false,
"Example: 'gbmaxint:spiderdate:1391749680' "
"Example: 'gbmaxint:gbspiderdate:1391749680' "
"'gbmaxint:count:99'. Numeric "
"fields can be in JSON or in meta tag. "
"Use 'gbspiderdate' field for the last time the page was "

@ -1305,8 +1305,12 @@ bool RdbTree::checkTree2 ( bool printMsgs , bool doChainTest ) {
if ( ! doChainTest ) continue;
// ensure i goes back to head node
long j = i;
long loopCount = 0;
while ( j >= 0 ) {
if ( j == m_headNode ) break;
// sanity -- loop check
if ( ++loopCount > 10000 )
return log("db: tree had loop");
j = m_parents[j];
}
if ( j != m_headNode )

@ -45764,7 +45764,8 @@ char *getJSONFieldValue ( char *json , char *field , long *valueLen ) {
! gotOne &&
p[1] == ':' &&
// {"title":"whatever",...}
depth == 1 &&
// could be product:{title:... depth=2
(depth == 1 ||depth==2) &&
stringStart &&
(p - stringStart) == fieldLen &&
strncmp(field,stringStart,fieldLen)==0 ) {