Fix bug in Url::getDisplayUrl where an "xn--" sequence in the URL path was handled incorrectly when the domain is not an IDN

This commit is contained in:
Ai Lin Chia
2016-04-06 15:50:09 +02:00
parent 4c3d6b4619
commit a1df6baa1d
3 changed files with 221 additions and 76 deletions

74
Url.cpp

@ -2214,47 +2214,51 @@ int32_t getPathDepth ( char *s , bool hasHttp ) {
return depth;
}
char* Url::getDisplayUrl(char* url, SafeBuf* sb) {
char* found;
char* labelCursor = url;
if((found = strstr(labelCursor, "xn--"))) {
sb->safeMemcpy(url, found - url);
char* Url::getDisplayUrl( const char* url, SafeBuf* sb ) {
const char *urlEnd = url + strlen(url);
const char *p = url;
if ( strncmp( p, "http://", 7 ) == 0 )
p += 7;
else if ( strncmp(p, "https://", 8 ) == 0 )
p += 8;
char* p = url;
char* pend = url + gbstrlen(url);
if(strncmp(p, "http://", 7) == 0) p += 7;
else if(strncmp(p, "https://", 8) == 0) p += 8;
const char *domEnd = static_cast<const char*>( memchr( p, '/', urlEnd - p ) ) ?: urlEnd;
while(p < pend && *p != '/') p++;
char* domEnd = p;
bool firstRun = true;
const char *found = NULL;
const char *labelCursor = url;
do {
if(found > domEnd) {
// Dont even look if it is past the domain
break;
}
while( ( found = strstr( labelCursor, "xn--" ) ) && ( found < domEnd ) ) {
if ( firstRun ) {
sb->safeMemcpy( url, found - url );
firstRun = false;
}
char* encodedStart = found + 4;
uint32_t decoded [ MAX_URL_LEN];
size_t decodedLen = MAX_URL_LEN - 1 ;
char* labelEnd = encodedStart;
while( labelEnd < domEnd && *labelEnd != '/' && *labelEnd != '.' )
labelEnd++;
const char* encodedStart = found + 4;
uint32_t decoded [ MAX_URL_LEN];
size_t decodedLen = MAX_URL_LEN - 1 ;
const char* labelEnd = encodedStart;
while( labelEnd < domEnd && *labelEnd != '/' && *labelEnd != '.' ) {
labelEnd++;
}
punycode_status status = punycode_decode(labelEnd - encodedStart,
encodedStart,
&decodedLen,
decoded, NULL);
if(status != 0) {
log("build: Bad Engineer, failed to depunycode international url %s", url);
sb->safePrintf("%s", url);
return url;
}
sb->utf32Encode(decoded, decodedLen);
if(*labelEnd == '.') sb->pushChar(*labelEnd++);
labelCursor = labelEnd;
} while((found = strstr(labelCursor, "xn--")));
punycode_status status = punycode_decode(labelEnd - encodedStart, encodedStart, &decodedLen, decoded, NULL);
if ( status != 0 ) {
log( "build: Bad Engineer, failed to depunycode international url %s", url );
sb->safePrintf("%s", labelCursor);
sb->nullTerm();
return sb->getBufStart();
}
sb->utf32Encode( decoded, decodedLen );
if ( *labelEnd == '.' ) {
sb->pushChar( *labelEnd++ );
}
labelCursor = labelEnd;
}
// Copy in the rest
sb->safePrintf("%s", labelCursor);
sb->nullTerm();

2
Url.h

@ -200,7 +200,7 @@ public:
// is probably more accurate than this function.
bool isLinkLoop();
static char* getDisplayUrl(char* url, SafeBuf* sb);
static char* getDisplayUrl( const char* url, SafeBuf* sb );
private:
void set( const char *s, int32_t len, bool addWWW, bool stripSessionIds, bool stripPound, bool stripCommonFile,

@ -1,9 +1,10 @@
#include "gtest/gtest.h"
#include "Url.h"
#include "SafeBuf.h"
TEST(UrlTest, SetNonAsciiValid) {
char* input_urls[] = {
TEST( UrlTest, SetNonAsciiValid ) {
const char* input_urls[] = {
"http://topbeskæring.dk/velkommen",
"www.Alliancefrançaise.nu",
"française.Alliance.nu",
@ -23,63 +24,203 @@ TEST(UrlTest, SetNonAsciiValid) {
"http://genocidearchiverwanda.org.rw/index.php/Category:Official_Communiqués",
"http://www.example.com/xn--fooled-you-into-trying-to-decode-this",
"http://www.example.сайт/xn--fooled-you-into-trying-to-decode-this",
"http://腕時計通販.jp/"
"http://腕時計通販.jp/",
"http://сацминэнерго.рф/robots.txt",
"http://faß.de/",
"http://βόλος.com/",
"http://ශ්‍රී.com/",
"http://نامه‌ای.com/"
};
const char* expected_normalized[] = {
"http://xn--topbeskring-g9a.dk/velkommen",
"http://www.xn--alliancefranaise-npb.nu/",
"http://xn--franaise-v0a.alliance.nu/",
"http://xn--franaise-v0a.alliance.nu/asdf",
"http://xn--franaise-v0a.alliance.nu/asdf",
"http://xn--franaise-v0a.alliance.nu/",
"http://xn--lwt711i.xn--mi7a.com/",
"http://xn--lwt711i.xn--mi7a.com/asdf/%E8%BF%90/abc",
"http://xn--lwt711i.xn--mi7a.com/asdf",
"http://xn--lwt711i.xn--mi7a.com/asdf",
"http://xn--d0a6das0ae0bir7j.org/%D0%90%D0%BA%D0%B0%D0%B4%D1%8D%D0%BC%D1%96%D1%87%D0%BD%D0%B0%D1%8F",
"https://hi.xn--d0a6divjd1bi0f.com/",
"https://fakedomain.xn--fiq228c.org/asdf",
"https://gigablast.com/abc/%E6%96%87/efg",
"https://gigablast.com/?q=%E6%96%87",
"http://www.example.xn--80aswg/",
"http://genocidearchiverwanda.org.rw/index.php/Category:Official_Communiqu%C3%A9s",
"http://www.example.com/xn--fooled-you-into-trying-to-decode-this",
"http://www.example.xn--80aswg/xn--fooled-you-into-trying-to-decode-this",
"http://xn--kjvp61d69f6wc3zf.jp/"
const char *expected_normalized[] = {
"http://xn--topbeskring-g9a.dk/velkommen",
"http://www.xn--alliancefranaise-npb.nu/",
"http://xn--franaise-v0a.alliance.nu/",
"http://xn--franaise-v0a.alliance.nu/asdf",
"http://xn--franaise-v0a.alliance.nu/asdf",
"http://xn--franaise-v0a.alliance.nu/",
"http://xn--lwt711i.xn--mi7a.com/",
"http://xn--lwt711i.xn--mi7a.com/asdf/%E8%BF%90/abc",
"http://xn--lwt711i.xn--mi7a.com/asdf",
"http://xn--lwt711i.xn--mi7a.com/asdf",
"http://xn--d0a6das0ae0bir7j.org/%D0%90%D0%BA%D0%B0%D0%B4%D1%8D%D0%BC%D1%96%D1%87%D0%BD%D0%B0%D1%8F",
"https://hi.xn--d0a6divjd1bi0f.com/",
"https://fakedomain.xn--fiq228c.org/asdf",
"https://gigablast.com/abc/%E6%96%87/efg",
"https://gigablast.com/?q=%E6%96%87",
"http://www.example.xn--80aswg/",
"http://genocidearchiverwanda.org.rw/index.php/Category:Official_Communiqu%C3%A9s",
"http://www.example.com/xn--fooled-you-into-trying-to-decode-this",
"http://www.example.xn--80aswg/xn--fooled-you-into-trying-to-decode-this",
"http://xn--kjvp61d69f6wc3zf.jp/",
"http://xn--80agflthakqd0d1e.xn--p1ai/robots.txt",
"http://xn--fa-hia.de/",
"http://xn--nxasmm1c.com/",
"http://xn--10cl1a0b660p.com/",
"http://xn--mgba3gch31f060k.com/"
};
ASSERT_EQ(sizeof(input_urls)/sizeof(input_urls[0]), sizeof(expected_normalized)/sizeof(expected_normalized[0]));
ASSERT_EQ( sizeof( input_urls ) / sizeof( input_urls[0] ),
sizeof( expected_normalized ) / sizeof( expected_normalized[0] ) );
size_t len = sizeof(input_urls) / sizeof(input_urls[0]);
for (size_t i = 0; i < len; i++) {
size_t len = sizeof( input_urls ) / sizeof( input_urls[0] );
for ( size_t i = 0; i < len; i++ ) {
Url url;
url.set(input_urls[i]);
url.set( input_urls[i] );
EXPECT_STREQ(expected_normalized[i], (const char*)url.getUrl());
}
}
TEST(UrlTest, SetNonAsciiInValid) {
char* input_urls[] = {
"http://www.fas.org/blog/ssp/2009/08/securing-venezuela\032s-arsenals.php",
"https://pypi.python\n\n\t\t\t\t.org/packages/source/p/pyramid/pyramid-1.5.tar.gz",
"http://undocs.org/ru/A/C.3/68/\vSR.48"
TEST( UrlTest, SetNonAsciiInValid ) {
const char* input_urls[] = {
"http://www.fas.org/blog/ssp/2009/08/securing-venezuela\032s-arsenals.php",
"https://pypi.python\n\n\t\t\t\t.org/packages/source/p/pyramid/pyramid-1.5.tar.gz",
"http://undocs.org/ru/A/C.3/68/\vSR.48"
};
const char* expected_normalized[] = {
"http://www.fas.org/blog/ssp/2009/08/securing-venezuela%1As-arsenals.php",
"https://pypi.python/",
const char *expected_normalized[] = {
"http://www.fas.org/blog/ssp/2009/08/securing-venezuela%1As-arsenals.php",
"https://pypi.python/",
"http://undocs.org/ru/A/C.3/68/%0BSR.48"
};
ASSERT_EQ(sizeof(input_urls)/sizeof(input_urls[0]), sizeof(expected_normalized)/sizeof(expected_normalized[0]));
ASSERT_EQ( sizeof( input_urls ) / sizeof( input_urls[0] ),
sizeof( expected_normalized ) / sizeof( expected_normalized[0] ) );
size_t len = sizeof(input_urls) / sizeof(input_urls[0]);
for (size_t i = 0; i < len; i++) {
size_t len = sizeof( input_urls ) / sizeof( input_urls[0] );
for ( size_t i = 0; i < len; i++ ) {
Url url;
url.set(input_urls[i]);
url.set( input_urls[i] );
EXPECT_STREQ(expected_normalized[i], (const char*)url.getUrl());
}
}
// Exercises Url::getDisplayUrl() directly on raw C strings (no Url::set()
// normalization beforehand). Each row pairs the string handed to
// getDisplayUrl() with the display form the test expects back. The rows show
// that "xn--" labels in the host part are expected to be decoded, while
// "xn--" occurring in the path or query string is expected to come back
// unchanged.
TEST( UrlTest, GetDisplayUrlFromCharArray ) {
	struct DisplayCase {
		const char *raw;      // string passed to Url::getDisplayUrl()
		const char *display;  // expected content of the returned string
	};

	const DisplayCase cases[] = {
		{ "http://xn--topbeskring-g9a.dk/velkommen",
		  "http://topbeskæring.dk/velkommen" },
		{ "www.xn--Alliancefranaise-npb.nu",
		  "www.Alliancefrançaise.nu" },
		{ "xn--franaise-v0a.Alliance.nu",
		  "française.Alliance.nu" },
		{ "xn--franaise-v0a.Alliance.nu/asdf",
		  "française.Alliance.nu/asdf" },
		{ "http://xn--franaise-v0a.Alliance.nu/asdf",
		  "http://française.Alliance.nu/asdf" },
		{ "http://xn--franaise-v0a.Alliance.nu/",
		  "http://française.Alliance.nu/" },
		{ "xn--lwt711i.xn--mi7a.com",
		  "幸运.龍.com" },
		{ "xn--lwt711i.xn--mi7a.com/asdf/运/abc",
		  "幸运.龍.com/asdf/运/abc" },
		{ "xn--lwt711i.xn--mi7a.com/asdf",
		  "幸运.龍.com/asdf" },
		{ "http://xn--lwt711i.xn--mi7a.com/asdf",
		  "http://幸运.龍.com/asdf" },
		{ "http://xn--d0a6das0ae0bir7j.org/Акадэмічная",
		  "http://Беларуская.org/Акадэмічная" },
		{ "https://hi.xn--d0a6divjd1bi0f.com",
		  "https://hi.Български.com" },
		{ "https://fakedomain.xn--fiq228c.org/asdf",
		  "https://fakedomain.中文.org/asdf" },
		{ "http://www.example.xn--80aswg",
		  "http://www.example.сайт" },
		// "xn--" in the path with a plain ASCII domain: nothing to decode
		{ "http://www.example.com/xn--fooled-you-into-trying-to-decode-this",
		  "http://www.example.com/xn--fooled-you-into-trying-to-decode-this" },
		// encoded TLD is decoded, the path is left alone
		{ "http://www.example.xn--80aswg/xn--fooled-you-into-trying-to-decode-this",
		  "http://www.example.сайт/xn--fooled-you-into-trying-to-decode-this" },
		{ "http://www.example.сайт/xn--fooled-you-into-trying-to-decode-this",
		  "http://www.example.сайт/xn--fooled-you-into-trying-to-decode-this" },
		{ "http://xn--kjvp61d69f6wc3zf.jp/",
		  "http://腕時計通販.jp/" },
		{ "http://xn--80agflthakqd0d1e.xn--p1ai/robots.txt",
		  "http://сацминэнерго.рф/robots.txt" },
		{ "http://xn--80agflthakqd0d1e.xn--p1ai",
		  "http://сацминэнерго.рф" },
		{ "http://сацминэнерго.рф",
		  "http://сацминэнерго.рф" },
		// "xn--" inside the query string only
		{ "http://mct.verisign-grs.com/convertServlet?input=r7d.xn--g1a8ac.xn--p1ai",
		  "http://mct.verisign-grs.com/convertServlet?input=r7d.xn--g1a8ac.xn--p1ai" }
	};

	const size_t caseCount = sizeof( cases ) / sizeof( cases[0] );

	for ( size_t idx = 0; idx < caseCount; ++idx ) {
		// fresh output buffer for every case
		StackBuf( tmpBuf );

		const char *shown = static_cast<const char *>( Url::getDisplayUrl( cases[idx].raw, &tmpBuf ) );
		EXPECT_STREQ( cases[idx].display, shown );
	}
}
// Round-trip check: every input is first run through Url::set(), and the
// string returned by Url::getUrl() is then handed to Url::getDisplayUrl().
// Each row pairs the original input with the display string the test
// expects. The expected values keep the host in its non-ASCII form, while
// non-ASCII characters in the path/query appear percent-encoded and a scheme
// and trailing '/' are present where the input lacked them.
TEST( UrlTest, GetDisplayUrlFromUrl ) {
	struct RoundTripCase {
		const char *original;  // string passed to Url::set()
		const char *display;   // expected result of getDisplayUrl( getUrl() )
	};

	const RoundTripCase cases[] = {
		{ "http://topbeskæring.dk/velkommen",
		  "http://topbeskæring.dk/velkommen" },
		{ "www.Alliancefrançaise.nu",
		  "http://www.alliancefrançaise.nu/" },
		{ "française.Alliance.nu",
		  "http://française.alliance.nu/" },
		{ "française.Alliance.nu/asdf",
		  "http://française.alliance.nu/asdf" },
		{ "http://française.Alliance.nu/asdf",
		  "http://française.alliance.nu/asdf" },
		{ "http://française.Alliance.nu/",
		  "http://française.alliance.nu/" },
		{ "幸运.龍.com",
		  "http://幸运.龍.com/" },
		{ "幸运.龍.com/asdf/运/abc",
		  "http://幸运.龍.com/asdf/%E8%BF%90/abc" },
		{ "幸运.龍.com/asdf",
		  "http://幸运.龍.com/asdf" },
		{ "http://幸运.龍.com/asdf",
		  "http://幸运.龍.com/asdf" },
		{ "http://Беларуская.org/Акадэмічная",
		  "http://Беларуская.org/%D0%90%D0%BA%D0%B0%D0%B4%D1%8D%D0%BC%D1%96%D1%87%D0%BD%D0%B0%D1%8F" },
		{ "https://hi.Български.com",
		  "https://hi.Български.com/" },
		{ "https://fakedomain.中文.org/asdf",
		  "https://fakedomain.中文.org/asdf" },
		{ "https://gigablast.com/abc/文/efg",
		  "https://gigablast.com/abc/%E6%96%87/efg" },
		{ "https://gigablast.com/?q=文",
		  "https://gigablast.com/?q=%E6%96%87" },
		{ "http://www.example.сайт",
		  "http://www.example.сайт/" },
		{ "http://genocidearchiverwanda.org.rw/index.php/Category:Official_Communiqués",
		  "http://genocidearchiverwanda.org.rw/index.php/Category:Official_Communiqu%C3%A9s" },
		// "xn--" in the path must survive the round trip untouched
		{ "http://www.example.com/xn--fooled-you-into-trying-to-decode-this",
		  "http://www.example.com/xn--fooled-you-into-trying-to-decode-this" },
		{ "http://www.example.сайт/xn--fooled-you-into-trying-to-decode-this",
		  "http://www.example.сайт/xn--fooled-you-into-trying-to-decode-this" },
		{ "http://腕時計通販.jp/",
		  "http://腕時計通販.jp/" },
		{ "http://сацминэнерго.рф/robots.txt",
		  "http://сацминэнерго.рф/robots.txt" },
		{ "http://сацминэнерго.рф",
		  "http://сацминэнерго.рф/" },
		// "xn--" inside the query string only
		{ "http://mct.verisign-grs.com/convertServlet?input=r7d.xn--g1a8ac.xn--p1ai",
		  "http://mct.verisign-grs.com/convertServlet?input=r7d.xn--g1a8ac.xn--p1ai" }
	};

	const size_t caseCount = sizeof( cases ) / sizeof( cases[0] );

	for ( size_t idx = 0; idx < caseCount; ++idx ) {
		Url parsed;
		parsed.set( cases[idx].original );

		// fresh output buffer for every case
		StackBuf( tmpBuf );

		const char *shown = static_cast<const char *>( Url::getDisplayUrl( parsed.getUrl(), &tmpBuf ) );
		EXPECT_STREQ( cases[idx].display, shown );
	}
}