forked from Mirrors/privacore-open-source-search-engine
Fix bug in Url::getDisplayUrl where an "xn--" sequence in the URL path was not handled correctly when the domain is not an IDN (internationalized domain name)
This commit is contained in:
74
Url.cpp
74
Url.cpp
@ -2214,47 +2214,51 @@ int32_t getPathDepth ( char *s , bool hasHttp ) {
|
||||
return depth;
|
||||
}
|
||||
|
||||
char* Url::getDisplayUrl(char* url, SafeBuf* sb) {
|
||||
char* found;
|
||||
char* labelCursor = url;
|
||||
if((found = strstr(labelCursor, "xn--"))) {
|
||||
sb->safeMemcpy(url, found - url);
|
||||
char* Url::getDisplayUrl( const char* url, SafeBuf* sb ) {
|
||||
const char *urlEnd = url + strlen(url);
|
||||
const char *p = url;
|
||||
if ( strncmp( p, "http://", 7 ) == 0 )
|
||||
p += 7;
|
||||
else if ( strncmp(p, "https://", 8 ) == 0 )
|
||||
p += 8;
|
||||
|
||||
char* p = url;
|
||||
char* pend = url + gbstrlen(url);
|
||||
if(strncmp(p, "http://", 7) == 0) p += 7;
|
||||
else if(strncmp(p, "https://", 8) == 0) p += 8;
|
||||
const char *domEnd = static_cast<const char*>( memchr( p, '/', urlEnd - p ) ) ?: urlEnd;
|
||||
|
||||
while(p < pend && *p != '/') p++;
|
||||
char* domEnd = p;
|
||||
bool firstRun = true;
|
||||
const char *found = NULL;
|
||||
const char *labelCursor = url;
|
||||
|
||||
do {
|
||||
if(found > domEnd) {
|
||||
// Dont even look if it is past the domain
|
||||
break;
|
||||
}
|
||||
while( ( found = strstr( labelCursor, "xn--" ) ) && ( found < domEnd ) ) {
|
||||
if ( firstRun ) {
|
||||
sb->safeMemcpy( url, found - url );
|
||||
firstRun = false;
|
||||
}
|
||||
|
||||
char* encodedStart = found + 4;
|
||||
uint32_t decoded [ MAX_URL_LEN];
|
||||
size_t decodedLen = MAX_URL_LEN - 1 ;
|
||||
char* labelEnd = encodedStart;
|
||||
while( labelEnd < domEnd && *labelEnd != '/' && *labelEnd != '.' )
|
||||
labelEnd++;
|
||||
const char* encodedStart = found + 4;
|
||||
uint32_t decoded [ MAX_URL_LEN];
|
||||
size_t decodedLen = MAX_URL_LEN - 1 ;
|
||||
const char* labelEnd = encodedStart;
|
||||
while( labelEnd < domEnd && *labelEnd != '/' && *labelEnd != '.' ) {
|
||||
labelEnd++;
|
||||
}
|
||||
|
||||
punycode_status status = punycode_decode(labelEnd - encodedStart,
|
||||
encodedStart,
|
||||
&decodedLen,
|
||||
decoded, NULL);
|
||||
if(status != 0) {
|
||||
log("build: Bad Engineer, failed to depunycode international url %s", url);
|
||||
sb->safePrintf("%s", url);
|
||||
return url;
|
||||
}
|
||||
sb->utf32Encode(decoded, decodedLen);
|
||||
if(*labelEnd == '.') sb->pushChar(*labelEnd++);
|
||||
labelCursor = labelEnd;
|
||||
} while((found = strstr(labelCursor, "xn--")));
|
||||
punycode_status status = punycode_decode(labelEnd - encodedStart, encodedStart, &decodedLen, decoded, NULL);
|
||||
if ( status != 0 ) {
|
||||
log( "build: Bad Engineer, failed to depunycode international url %s", url );
|
||||
sb->safePrintf("%s", labelCursor);
|
||||
sb->nullTerm();
|
||||
return sb->getBufStart();
|
||||
}
|
||||
|
||||
sb->utf32Encode( decoded, decodedLen );
|
||||
|
||||
if ( *labelEnd == '.' ) {
|
||||
sb->pushChar( *labelEnd++ );
|
||||
}
|
||||
|
||||
labelCursor = labelEnd;
|
||||
}
|
||||
|
||||
// Copy in the rest
|
||||
sb->safePrintf("%s", labelCursor);
|
||||
sb->nullTerm();
|
||||
|
2
Url.h
2
Url.h
@ -200,7 +200,7 @@ public:
|
||||
// is probably more accurate than this function.
|
||||
bool isLinkLoop();
|
||||
|
||||
static char* getDisplayUrl(char* url, SafeBuf* sb);
|
||||
static char* getDisplayUrl( const char* url, SafeBuf* sb );
|
||||
|
||||
private:
|
||||
void set( const char *s, int32_t len, bool addWWW, bool stripSessionIds, bool stripPound, bool stripCommonFile,
|
||||
|
@ -1,9 +1,10 @@
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
#include "Url.h"
|
||||
#include "SafeBuf.h"
|
||||
|
||||
TEST(UrlTest, SetNonAsciiValid) {
|
||||
char* input_urls[] = {
|
||||
TEST( UrlTest, SetNonAsciiValid ) {
|
||||
const char* input_urls[] = {
|
||||
"http://topbeskæring.dk/velkommen",
|
||||
"www.Alliancefrançaise.nu",
|
||||
"française.Alliance.nu",
|
||||
@ -23,63 +24,203 @@ TEST(UrlTest, SetNonAsciiValid) {
|
||||
"http://genocidearchiverwanda.org.rw/index.php/Category:Official_Communiqués",
|
||||
"http://www.example.com/xn--fooled-you-into-trying-to-decode-this",
|
||||
"http://www.example.сайт/xn--fooled-you-into-trying-to-decode-this",
|
||||
"http://腕時計通販.jp/"
|
||||
"http://腕時計通販.jp/",
|
||||
"http://сацминэнерго.рф/robots.txt",
|
||||
"http://faß.de/",
|
||||
"http://βόλος.com/",
|
||||
"http://ශ්රී.com/",
|
||||
"http://نامهای.com/"
|
||||
};
|
||||
|
||||
const char* expected_normalized[] = {
|
||||
"http://xn--topbeskring-g9a.dk/velkommen",
|
||||
"http://www.xn--alliancefranaise-npb.nu/",
|
||||
"http://xn--franaise-v0a.alliance.nu/",
|
||||
"http://xn--franaise-v0a.alliance.nu/asdf",
|
||||
"http://xn--franaise-v0a.alliance.nu/asdf",
|
||||
"http://xn--franaise-v0a.alliance.nu/",
|
||||
"http://xn--lwt711i.xn--mi7a.com/",
|
||||
"http://xn--lwt711i.xn--mi7a.com/asdf/%E8%BF%90/abc",
|
||||
"http://xn--lwt711i.xn--mi7a.com/asdf",
|
||||
"http://xn--lwt711i.xn--mi7a.com/asdf",
|
||||
"http://xn--d0a6das0ae0bir7j.org/%D0%90%D0%BA%D0%B0%D0%B4%D1%8D%D0%BC%D1%96%D1%87%D0%BD%D0%B0%D1%8F",
|
||||
"https://hi.xn--d0a6divjd1bi0f.com/",
|
||||
"https://fakedomain.xn--fiq228c.org/asdf",
|
||||
"https://gigablast.com/abc/%E6%96%87/efg",
|
||||
"https://gigablast.com/?q=%E6%96%87",
|
||||
"http://www.example.xn--80aswg/",
|
||||
"http://genocidearchiverwanda.org.rw/index.php/Category:Official_Communiqu%C3%A9s",
|
||||
"http://www.example.com/xn--fooled-you-into-trying-to-decode-this",
|
||||
"http://www.example.xn--80aswg/xn--fooled-you-into-trying-to-decode-this",
|
||||
"http://xn--kjvp61d69f6wc3zf.jp/"
|
||||
const char *expected_normalized[] = {
|
||||
"http://xn--topbeskring-g9a.dk/velkommen",
|
||||
"http://www.xn--alliancefranaise-npb.nu/",
|
||||
"http://xn--franaise-v0a.alliance.nu/",
|
||||
"http://xn--franaise-v0a.alliance.nu/asdf",
|
||||
"http://xn--franaise-v0a.alliance.nu/asdf",
|
||||
"http://xn--franaise-v0a.alliance.nu/",
|
||||
"http://xn--lwt711i.xn--mi7a.com/",
|
||||
"http://xn--lwt711i.xn--mi7a.com/asdf/%E8%BF%90/abc",
|
||||
"http://xn--lwt711i.xn--mi7a.com/asdf",
|
||||
"http://xn--lwt711i.xn--mi7a.com/asdf",
|
||||
"http://xn--d0a6das0ae0bir7j.org/%D0%90%D0%BA%D0%B0%D0%B4%D1%8D%D0%BC%D1%96%D1%87%D0%BD%D0%B0%D1%8F",
|
||||
"https://hi.xn--d0a6divjd1bi0f.com/",
|
||||
"https://fakedomain.xn--fiq228c.org/asdf",
|
||||
"https://gigablast.com/abc/%E6%96%87/efg",
|
||||
"https://gigablast.com/?q=%E6%96%87",
|
||||
"http://www.example.xn--80aswg/",
|
||||
"http://genocidearchiverwanda.org.rw/index.php/Category:Official_Communiqu%C3%A9s",
|
||||
"http://www.example.com/xn--fooled-you-into-trying-to-decode-this",
|
||||
"http://www.example.xn--80aswg/xn--fooled-you-into-trying-to-decode-this",
|
||||
"http://xn--kjvp61d69f6wc3zf.jp/",
|
||||
"http://xn--80agflthakqd0d1e.xn--p1ai/robots.txt",
|
||||
"http://xn--fa-hia.de/",
|
||||
"http://xn--nxasmm1c.com/",
|
||||
"http://xn--10cl1a0b660p.com/",
|
||||
"http://xn--mgba3gch31f060k.com/"
|
||||
};
|
||||
|
||||
ASSERT_EQ(sizeof(input_urls)/sizeof(input_urls[0]), sizeof(expected_normalized)/sizeof(expected_normalized[0]));
|
||||
ASSERT_EQ( sizeof( input_urls ) / sizeof( input_urls[0] ),
|
||||
sizeof( expected_normalized ) / sizeof( expected_normalized[0] ) );
|
||||
|
||||
size_t len = sizeof(input_urls) / sizeof(input_urls[0]);
|
||||
for (size_t i = 0; i < len; i++) {
|
||||
size_t len = sizeof( input_urls ) / sizeof( input_urls[0] );
|
||||
for ( size_t i = 0; i < len; i++ ) {
|
||||
Url url;
|
||||
url.set(input_urls[i]);
|
||||
url.set( input_urls[i] );
|
||||
|
||||
EXPECT_STREQ(expected_normalized[i], (const char*)url.getUrl());
|
||||
}
|
||||
}
|
||||
|
||||
TEST(UrlTest, SetNonAsciiInValid) {
|
||||
char* input_urls[] = {
|
||||
"http://www.fas.org/blog/ssp/2009/08/securing-venezuela\032s-arsenals.php",
|
||||
"https://pypi.python\n\n\t\t\t\t.org/packages/source/p/pyramid/pyramid-1.5.tar.gz",
|
||||
"http://undocs.org/ru/A/C.3/68/\vSR.48"
|
||||
TEST( UrlTest, SetNonAsciiInValid ) {
|
||||
const char* input_urls[] = {
|
||||
"http://www.fas.org/blog/ssp/2009/08/securing-venezuela\032s-arsenals.php",
|
||||
"https://pypi.python\n\n\t\t\t\t.org/packages/source/p/pyramid/pyramid-1.5.tar.gz",
|
||||
"http://undocs.org/ru/A/C.3/68/\vSR.48"
|
||||
};
|
||||
|
||||
const char* expected_normalized[] = {
|
||||
"http://www.fas.org/blog/ssp/2009/08/securing-venezuela%1As-arsenals.php",
|
||||
"https://pypi.python/",
|
||||
const char *expected_normalized[] = {
|
||||
"http://www.fas.org/blog/ssp/2009/08/securing-venezuela%1As-arsenals.php",
|
||||
"https://pypi.python/",
|
||||
"http://undocs.org/ru/A/C.3/68/%0BSR.48"
|
||||
};
|
||||
|
||||
ASSERT_EQ(sizeof(input_urls)/sizeof(input_urls[0]), sizeof(expected_normalized)/sizeof(expected_normalized[0]));
|
||||
ASSERT_EQ( sizeof( input_urls ) / sizeof( input_urls[0] ),
|
||||
sizeof( expected_normalized ) / sizeof( expected_normalized[0] ) );
|
||||
|
||||
size_t len = sizeof(input_urls) / sizeof(input_urls[0]);
|
||||
for (size_t i = 0; i < len; i++) {
|
||||
size_t len = sizeof( input_urls ) / sizeof( input_urls[0] );
|
||||
for ( size_t i = 0; i < len; i++ ) {
|
||||
Url url;
|
||||
url.set(input_urls[i]);
|
||||
url.set( input_urls[i] );
|
||||
|
||||
EXPECT_STREQ(expected_normalized[i], (const char*)url.getUrl());
|
||||
}
|
||||
}
|
||||
|
||||
TEST( UrlTest, GetDisplayUrlFromCharArray ) {
|
||||
const char* input_urls[] = {
|
||||
"http://xn--topbeskring-g9a.dk/velkommen",
|
||||
"www.xn--Alliancefranaise-npb.nu",
|
||||
"xn--franaise-v0a.Alliance.nu",
|
||||
"xn--franaise-v0a.Alliance.nu/asdf",
|
||||
"http://xn--franaise-v0a.Alliance.nu/asdf",
|
||||
"http://xn--franaise-v0a.Alliance.nu/",
|
||||
"xn--lwt711i.xn--mi7a.com",
|
||||
"xn--lwt711i.xn--mi7a.com/asdf/运/abc",
|
||||
"xn--lwt711i.xn--mi7a.com/asdf",
|
||||
"http://xn--lwt711i.xn--mi7a.com/asdf",
|
||||
"http://xn--d0a6das0ae0bir7j.org/Акадэмічная",
|
||||
"https://hi.xn--d0a6divjd1bi0f.com",
|
||||
"https://fakedomain.xn--fiq228c.org/asdf",
|
||||
"http://www.example.xn--80aswg",
|
||||
"http://www.example.com/xn--fooled-you-into-trying-to-decode-this",
|
||||
"http://www.example.xn--80aswg/xn--fooled-you-into-trying-to-decode-this",
|
||||
"http://www.example.сайт/xn--fooled-you-into-trying-to-decode-this",
|
||||
"http://xn--kjvp61d69f6wc3zf.jp/",
|
||||
"http://xn--80agflthakqd0d1e.xn--p1ai/robots.txt",
|
||||
"http://xn--80agflthakqd0d1e.xn--p1ai",
|
||||
"http://сацминэнерго.рф",
|
||||
"http://mct.verisign-grs.com/convertServlet?input=r7d.xn--g1a8ac.xn--p1ai"
|
||||
};
|
||||
|
||||
const char *expected_display[] = {
|
||||
"http://topbeskæring.dk/velkommen",
|
||||
"www.Alliancefrançaise.nu",
|
||||
"française.Alliance.nu",
|
||||
"française.Alliance.nu/asdf",
|
||||
"http://française.Alliance.nu/asdf",
|
||||
"http://française.Alliance.nu/",
|
||||
"幸运.龍.com",
|
||||
"幸运.龍.com/asdf/运/abc",
|
||||
"幸运.龍.com/asdf",
|
||||
"http://幸运.龍.com/asdf",
|
||||
"http://Беларуская.org/Акадэмічная",
|
||||
"https://hi.Български.com",
|
||||
"https://fakedomain.中文.org/asdf",
|
||||
"http://www.example.сайт",
|
||||
"http://www.example.com/xn--fooled-you-into-trying-to-decode-this",
|
||||
"http://www.example.сайт/xn--fooled-you-into-trying-to-decode-this",
|
||||
"http://www.example.сайт/xn--fooled-you-into-trying-to-decode-this",
|
||||
"http://腕時計通販.jp/",
|
||||
"http://сацминэнерго.рф/robots.txt",
|
||||
"http://сацминэнерго.рф",
|
||||
"http://сацминэнерго.рф",
|
||||
"http://mct.verisign-grs.com/convertServlet?input=r7d.xn--g1a8ac.xn--p1ai"
|
||||
};
|
||||
|
||||
ASSERT_EQ( sizeof( input_urls ) / sizeof( input_urls[0] ),
|
||||
sizeof( expected_display ) / sizeof( expected_display[0] ) );
|
||||
|
||||
size_t len = sizeof( input_urls ) / sizeof( input_urls[0] );
|
||||
for ( size_t i = 0; i < len; i++ ) {
|
||||
StackBuf( tmpBuf );
|
||||
EXPECT_STREQ( expected_display[i], (const char *) Url::getDisplayUrl( input_urls[i], &tmpBuf ));
|
||||
}
|
||||
}
|
||||
|
||||
TEST( UrlTest, GetDisplayUrlFromUrl ) {
|
||||
const char* input_urls[] = {
|
||||
"http://topbeskæring.dk/velkommen",
|
||||
"www.Alliancefrançaise.nu",
|
||||
"française.Alliance.nu",
|
||||
"française.Alliance.nu/asdf",
|
||||
"http://française.Alliance.nu/asdf",
|
||||
"http://française.Alliance.nu/",
|
||||
"幸运.龍.com",
|
||||
"幸运.龍.com/asdf/运/abc",
|
||||
"幸运.龍.com/asdf",
|
||||
"http://幸运.龍.com/asdf",
|
||||
"http://Беларуская.org/Акадэмічная",
|
||||
"https://hi.Български.com",
|
||||
"https://fakedomain.中文.org/asdf",
|
||||
"https://gigablast.com/abc/文/efg",
|
||||
"https://gigablast.com/?q=文",
|
||||
"http://www.example.сайт",
|
||||
"http://genocidearchiverwanda.org.rw/index.php/Category:Official_Communiqués",
|
||||
"http://www.example.com/xn--fooled-you-into-trying-to-decode-this",
|
||||
"http://www.example.сайт/xn--fooled-you-into-trying-to-decode-this",
|
||||
"http://腕時計通販.jp/",
|
||||
"http://сацминэнерго.рф/robots.txt",
|
||||
"http://сацминэнерго.рф",
|
||||
"http://mct.verisign-grs.com/convertServlet?input=r7d.xn--g1a8ac.xn--p1ai"
|
||||
};
|
||||
|
||||
const char *expected_display[] = {
|
||||
"http://topbeskæring.dk/velkommen",
|
||||
"http://www.alliancefrançaise.nu/",
|
||||
"http://française.alliance.nu/",
|
||||
"http://française.alliance.nu/asdf",
|
||||
"http://française.alliance.nu/asdf",
|
||||
"http://française.alliance.nu/",
|
||||
"http://幸运.龍.com/",
|
||||
"http://幸运.龍.com/asdf/%E8%BF%90/abc",
|
||||
"http://幸运.龍.com/asdf",
|
||||
"http://幸运.龍.com/asdf",
|
||||
"http://Беларуская.org/%D0%90%D0%BA%D0%B0%D0%B4%D1%8D%D0%BC%D1%96%D1%87%D0%BD%D0%B0%D1%8F",
|
||||
"https://hi.Български.com/",
|
||||
"https://fakedomain.中文.org/asdf",
|
||||
"https://gigablast.com/abc/%E6%96%87/efg",
|
||||
"https://gigablast.com/?q=%E6%96%87",
|
||||
"http://www.example.сайт/",
|
||||
"http://genocidearchiverwanda.org.rw/index.php/Category:Official_Communiqu%C3%A9s",
|
||||
"http://www.example.com/xn--fooled-you-into-trying-to-decode-this",
|
||||
"http://www.example.сайт/xn--fooled-you-into-trying-to-decode-this",
|
||||
"http://腕時計通販.jp/",
|
||||
"http://сацминэнерго.рф/robots.txt",
|
||||
"http://сацминэнерго.рф/",
|
||||
"http://mct.verisign-grs.com/convertServlet?input=r7d.xn--g1a8ac.xn--p1ai"
|
||||
};
|
||||
|
||||
ASSERT_EQ( sizeof( input_urls ) / sizeof( input_urls[0] ),
|
||||
sizeof( expected_display ) / sizeof( expected_display[0] ) );
|
||||
|
||||
size_t len = sizeof( input_urls ) / sizeof( input_urls[0] );
|
||||
for ( size_t i = 0; i < len; i++ ) {
|
||||
Url url;
|
||||
url.set( input_urls[i] );
|
||||
|
||||
StackBuf( tmpBuf );
|
||||
EXPECT_STREQ( expected_display[i], (const char*)Url::getDisplayUrl( url.getUrl(), &tmpBuf ) );
|
||||
}
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user