mirror of
https://github.com/privacore/open-source-search-engine.git
synced 2025-07-13 02:36:06 -04:00
Remove old misc/urlinfo.cpp. Use tools/print_urlinfo.cpp instead
This commit is contained in:
184
misc/urlinfo.cpp
184
misc/urlinfo.cpp
@ -1,184 +0,0 @@
|
||||
// Matt Wells, copyright Jan 2002
|
||||
|
||||
// normalizes urls from stdin
|
||||
|
||||
#include "gb-include.h"
|
||||
|
||||
#include "Url.h"
|
||||
#include "Mem.h"
|
||||
#include "Titledb.h"
|
||||
#include "HttpMime.h"
|
||||
#include "SiteGetter.h"
|
||||
|
||||
// File-scope recovery flags, both initialised to their "off" values.
// NOTE(review): nothing in this file reads them; presumably they are
// defined here only to satisfy extern references in the project objects
// this tool links against -- confirm before removing.
bool g_recoveryMode = false;
|
||||
int32_t g_recoveryLevel = 0;
|
||||
|
||||
int main ( int argc , char *argv[] ) {
|
||||
bool addWWW = true;
|
||||
bool stripSession = true;
|
||||
// check for arguments
|
||||
for (int32_t i = 1; i < argc; i++) {
|
||||
if (strcmp(argv[i], "-w") == 0)
|
||||
addWWW = false;
|
||||
else if (strcmp(argv[i], "-s") == 0)
|
||||
stripSession = false;
|
||||
}
|
||||
// initialize
|
||||
//g_mem.init(100*1024);
|
||||
hashinit();
|
||||
//g_conf.m_tfndbExtBits = 23;
|
||||
loop:
|
||||
// read a url from stddin
|
||||
char sbuf[1024];
|
||||
if ( ! fgets ( sbuf , 1024 , stdin ) ) exit(1);
|
||||
char *s = sbuf;
|
||||
char fbuf[1024];
|
||||
// decode if we should
|
||||
if ( strncmp(s,"http%3A%2F%2F",13) == 0 ||
|
||||
strncmp(s,"https%3A%2F%2F",13) == 0 ) {
|
||||
urlDecode(fbuf,s,gbstrlen(s));
|
||||
s = fbuf;
|
||||
}
|
||||
// old url
|
||||
printf("###############\n");
|
||||
printf("old: %s",s);
|
||||
int32_t slen = gbstrlen(s);
|
||||
// remove any www. if !addWWW
|
||||
if (!addWWW) {
|
||||
if (slen >= 4 &&
|
||||
strncasecmp(s, "www.", 4) == 0) {
|
||||
slen -= 4;
|
||||
memmove(s, &s[4], slen);
|
||||
}
|
||||
else {
|
||||
// get past a ://
|
||||
int32_t si = 0;
|
||||
while (si < slen &&
|
||||
( s[si] != ':' ||
|
||||
s[si+1] != '/' ||
|
||||
s[si+2] != '/' ) )
|
||||
si++;
|
||||
// remove the www.
|
||||
if (si + 7 < slen) {
|
||||
si += 3;
|
||||
if (strncasecmp(&s[si], "www.", 4) == 0) {
|
||||
slen -= 4;
|
||||
memmove(&s[si], &s[si+4], slen-si);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// set it
|
||||
Url u;
|
||||
u.set ( s , slen ,
|
||||
addWWW , /*add www?*/
|
||||
stripSession ); /*strip session ids?*/
|
||||
// print it
|
||||
char out[1024*4];
|
||||
char *p = out;
|
||||
p += sprintf(p,"tld: ");
|
||||
gbmemcpy ( p, u.getTLD(),u.getTLDLen());
|
||||
p += u.getTLDLen();
|
||||
char c = *p;
|
||||
*p = '\0';
|
||||
printf("%s\n",out);
|
||||
*p = c;
|
||||
|
||||
|
||||
// dom
|
||||
p = out;
|
||||
sprintf ( p , "dom: ");
|
||||
p += gbstrlen ( p );
|
||||
gbmemcpy ( p , u.getDomain() , u.getDomainLen() );
|
||||
p += u.getDomainLen();
|
||||
c = *p;
|
||||
*p = '\0';
|
||||
printf("%s\n",out);
|
||||
*p = c;
|
||||
// host
|
||||
p = out;
|
||||
sprintf ( p , "host: ");
|
||||
p += gbstrlen ( p );
|
||||
gbmemcpy ( p , u.getHost() , u.getHostLen() );
|
||||
p += u.getHostLen();
|
||||
c = *p;
|
||||
*p = '\0';
|
||||
printf("%s\n",out);
|
||||
*p = c;
|
||||
// then the whole url
|
||||
printf("url: %s\n", u.getUrl() );
|
||||
|
||||
/*
|
||||
int32_t siteLen;
|
||||
char *site = u.getSite ( &siteLen , NULL , false );
|
||||
if ( site ) {
|
||||
c = site[siteLen];
|
||||
site[siteLen] = '\0';
|
||||
}
|
||||
printf("site: %s\n", site );
|
||||
if ( site ) site[siteLen] = c;
|
||||
*/
|
||||
SiteGetter sg;
|
||||
sg.getSite ( u.getUrl() ,
|
||||
NULL , // tagrec
|
||||
0 , // timestamp
|
||||
0, // coll
|
||||
0 , // niceness
|
||||
NULL , // state
|
||||
NULL ); // callback
|
||||
|
||||
if ( sg.m_siteLen )
|
||||
printf("site: %s\n",sg.m_site);
|
||||
|
||||
printf("isRoot: %"INT32"\n",(int32_t)u.isRoot());
|
||||
|
||||
/*
|
||||
bool perm = ::isPermalink ( NULL , // coll
|
||||
NULL , // Links ptr
|
||||
&u , // the url
|
||||
CT_HTML , // contentType
|
||||
NULL , // LinkInfo ptr
|
||||
false );// isRSS?
|
||||
printf ("isPermalink: %"INT32"\n",(int32_t)perm);
|
||||
*/
|
||||
|
||||
// print the path too
|
||||
p = out;
|
||||
|
||||
p += sprintf ( p , "path: " );
|
||||
gbmemcpy ( p , u.getPath(), u.getPathLen() );
|
||||
p += u.getPathLen();
|
||||
|
||||
if ( u.getFilename() ) {
|
||||
p += sprintf ( p , "\nfilename: " );
|
||||
gbmemcpy ( p , u.getFilename(), u.getFilenameLen() );
|
||||
p += u.getFilenameLen();
|
||||
*p = '\0';
|
||||
printf("%s\n", out );
|
||||
}
|
||||
|
||||
// encoded
|
||||
char dst[MAX_URL_LEN+200];
|
||||
urlEncode ( dst,MAX_URL_LEN+100,
|
||||
u.getUrl(), u.getUrlLen(),
|
||||
false ); // are we encoding a request path?
|
||||
printf("encoded: %s\n",dst);
|
||||
|
||||
// the probable docid
|
||||
int64_t pd = Titledb::getProbableDocId(&u);
|
||||
printf("pdocid: %"UINT64"\n", pd );
|
||||
printf("dom8: 0x%"XINT32"\n", (int32_t)Titledb::getDomHash8FromDocId(pd) );
|
||||
if ( u.isLinkLoop() ) printf("islinkloop: yes\n");
|
||||
else printf("islinkloop: no\n");
|
||||
int64_t hh64 = u.getHostHash64();
|
||||
printf("hosthash64: 0x%016"XINT64"\n",hh64);
|
||||
uint32_t hh32 = u.getHostHash32();
|
||||
printf("hosthash32: 0x%08"XINT32" (%"UINT32")\n",hh32,hh32);
|
||||
int64_t dh64 = u.getDomainHash64();
|
||||
printf("domhash64: 0x%016"XINT64"\n",dh64);
|
||||
int64_t uh64 = u.getUrlHash64();
|
||||
printf("urlhash64: 0x%016"XINT64"\n",uh64);
|
||||
//if(isUrlUnregulated(NULL ,0,&u)) printf("unregulated: yes\n");
|
||||
//else printf("unregulated: no\n");
|
||||
goto loop;
|
||||
}
|
Reference in New Issue
Block a user