
194 lines
4.6 KiB

// Matt Wells, copyright Jan 2002
// normalizes urls from stdin
#include "gb-include.h"
#include "Url.h"
#include "Mem.h"
#include "Titledb.h"
#include "HttpMime.h"
#include "SiteGetter.h"
//#include "Tfndb.h"
//#include "Msg50.h"
//#include "Msg16.h"
// Stub implementation of mainShutdown(): the `urgent` flag is ignored and
// the function unconditionally reports success.
// NOTE(review): presumably defined here so this standalone tool can link
// against library code that expects the symbol -- confirm.
bool mainShutdown ( bool urgent ) {
	(void)urgent; // intentionally unused
	return true;
}
// Stub implementation of closeAll(): neither `state` nor `callback` is
// used, the callback is never invoked, and the function always returns
// true.
bool closeAll ( void *state , void (* callback)(void *state) ) {
	(void)state;    // intentionally unused
	(void)callback; // intentionally unused, never called
	return true;
}
// Stub implementation of allExit(): takes no arguments and always
// returns true.
bool allExit ( ) {
	return true;
}
//int32_t g_qbufNeedSave = false;
//SafeBuf g_qbuf;
// Stub implementation of sendPageSEO(): the socket and request pointers
// are never dereferenced and the function always returns true.
// The parameter types are named with elaborated `class` specifiers, so
// this definition does not need the full TcpSocket / HttpRequest
// declarations to be in scope.
bool sendPageSEO(class TcpSocket *s, class HttpRequest *hr) {
	(void)s;  // intentionally unused
	(void)hr; // intentionally unused
	return true;
}
// Definition of the g_recoveryMode global. Nothing in the visible part of
// this file assigns or reads it.
// NOTE(review): presumably referenced via extern by linked-in library
// code, like the stub functions above -- confirm.
char g_recoveryMode;
int main ( int argc , char *argv[] ) {
bool addWWW = true;
bool stripSession = true;
// check for arguments
for (int32_t i = 1; i < argc; i++) {
if (strcmp(argv[i], "-w") == 0)
addWWW = false;
else if (strcmp(argv[i], "-s") == 0)
stripSession = false;
// initialize
//g_conf.m_tfndbExtBits = 23;
// read a url from stdin
char sbuf[1024];
if ( ! fgets ( sbuf , 1024 , stdin ) ) exit(1);
char *s = sbuf;
char fbuf[1024];
// decode if we should
if ( strncmp(s,"http%3A%2F%2F",13) == 0 ||
strncmp(s,"https%3A%2F%2F",13) == 0 ) {
s = fbuf;
// old url
printf("old: %s",s);
int32_t slen = gbstrlen(s);
// remove any www. if !addWWW
if (!addWWW) {
if (slen >= 4 &&
strncasecmp(s, "www.", 4) == 0) {
slen -= 4;
memmove(s, &s[4], slen);
else {
// get past a ://
int32_t si = 0;
while (si < slen &&
( s[si] != ':' ||
s[si+1] != '/' ||
s[si+2] != '/' ) )
// remove the www.
if (si + 7 < slen) {
si += 3;
if (strncasecmp(&s[si], "www.", 4) == 0) {
slen -= 4;
memmove(&s[si], &s[si+4], slen-si);
// set it
Url u;
u.set ( s , slen ,
addWWW , /*add www?*/
stripSession ); /*strip session ids?*/
// print it
char out[1024*4];
char *p = out;
p += sprintf(p,"tld: ");
gbmemcpy ( p, u.getTLD(),u.getTLDLen());
p += u.getTLDLen();
char c = *p;
*p = '\0';
*p = c;
// dom
p = out;
sprintf ( p , "dom: ");
p += gbstrlen ( p );
gbmemcpy ( p , u.getDomain() , u.getDomainLen() );
p += u.getDomainLen();
c = *p;
*p = '\0';
*p = c;
// host
p = out;
sprintf ( p , "host: ");
p += gbstrlen ( p );
gbmemcpy ( p , u.getHost() , u.getHostLen() );
p += u.getHostLen();
c = *p;
*p = '\0';
*p = c;
// then the whole url
printf("url: %s\n", u.getUrl() );
int32_t siteLen;
char *site = u.getSite ( &siteLen , NULL , false );
if ( site ) {
c = site[siteLen];
site[siteLen] = '\0';
printf("site: %s\n", site );
if ( site ) site[siteLen] = c;
SiteGetter sg;
sg.getSite ( u.getUrl() ,
NULL , // tagrec
0 , // timestamp
NULL, // coll
0 , // niceness
//false , // addtags
NULL , // state
NULL ); // callback
if ( sg.m_siteLen )
printf("site: %s\n",sg.m_site);
printf("isRoot: %"INT32"\n",(int32_t)u.isRoot());
bool perm = ::isPermalink ( NULL , // coll
NULL , // Links ptr
&u , // the url
CT_HTML , // contentType
NULL , // LinkInfo ptr
false );// isRSS?
printf ("isPermalink: %"INT32"\n",(int32_t)perm);
// print the path too
p = out;
p += sprintf ( p , "path: " );
gbmemcpy ( p , u.getPath(), u.getPathLen() );
p += u.getPathLen();
if ( u.getFilename() ) {
p += sprintf ( p , "\nfilename: " );
gbmemcpy ( p , u.getFilename(), u.getFilenameLen() );
p += u.getFilenameLen();
*p = '\0';
printf("%s\n", out );
// encoded
char dst[MAX_URL_LEN+200];
urlEncode ( dst,MAX_URL_LEN+100,
u.getUrl(), u.getUrlLen(),
false ); // are we encoding a request path?
printf("encoded: %s\n",dst);
// the probable docid
int64_t pd = g_titledb.getProbableDocId(&u);
printf("pdocid: %"UINT64"\n", pd );
printf("dom8: 0x%"XINT32"\n", (int32_t)g_titledb.getDomHash8FromDocId(pd) );
//printf("ext23: 0x%"XINT32"\n",g_tfndb.makeExt(&u));
if ( u.isLinkLoop() ) printf("islinkloop: yes\n");
else printf("islinkloop: no\n");
int64_t hh64 = u.getHostHash64();
printf("hosthash64: 0x%016"XINT64"\n",hh64);
uint32_t hh32 = u.getHostHash32();
printf("hosthash32: 0x%08"XINT32" (%"UINT32")\n",hh32,hh32);
int64_t dh64 = u.getDomainHash64();
printf("domhash64: 0x%016"XINT64"\n",dh64);
int64_t uh64 = u.getUrlHash64();
printf("urlhash64: 0x%016"XINT64"\n",uh64);
//if(isUrlUnregulated(NULL ,0,&u)) printf("unregulated: yes\n");
//else printf("unregulated: no\n");
goto loop;