From 2a848fa7279002259a3e651223ce6f3230ca22d7 Mon Sep 17 00:00:00 2001 From: Jeremy Selier Date: Mon, 13 Jun 2011 14:43:16 +0200 Subject: [PATCH] Close #1149 IDNA and Punycode support in url.parse Using @bnoordhuis's punycode lib. Close #1174 also --- LICENSE | 2 + lib/punycode.js | 218 +++++++++++++++++++++++++++++++++++ lib/url.js | 54 +++++++-- test/simple/test-punycode.js | 38 ++++++ test/simple/test-url.js | 57 ++++++++- 5 files changed, 357 insertions(+), 12 deletions(-) create mode 100644 lib/punycode.js create mode 100644 test/simple/test-punycode.js diff --git a/LICENSE b/LICENSE index 0a4aa24162..84cdd438d6 100644 --- a/LICENSE +++ b/LICENSE @@ -69,3 +69,5 @@ The externally maintained libraries used by Node are: - lib/buffer_ieee754.js is copyright 2008 Fair Oaks Labs, Inc. and released under the New BSD license. + + - lib/punycode.js is copyright 2011 Ben Noordhuis and released under the MIT license. diff --git a/lib/punycode.js b/lib/punycode.js new file mode 100644 index 0000000000..a7c07a6634 --- /dev/null +++ b/lib/punycode.js @@ -0,0 +1,218 @@ +// Copyright (C) 2011 by Ben Noordhuis +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
// IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

// Punycode (RFC 3492) encoder/decoder.
//
// Public API:
//   encode(string) -> string   Unicode label -> Punycode (no 'xn--' prefix)
//   decode(string) -> string   Punycode (no 'xn--' prefix) -> Unicode label

// Publish the public API. `exports` is supplied by the CommonJS module
// wrapper; the guard only keeps the file loadable where that binding is absent.
if (typeof exports !== 'undefined') {
  exports.encode = encode;
  exports.decode = decode;
}

// Bootstring parameters for Punycode (RFC 3492, section 5).
var TMIN = 1;
var TMAX = 26;
var BASE = 36;
var SKEW = 38;
var DAMP = 700; // initial bias scaler
var INITIAL_N = 128;
var INITIAL_BIAS = 72;

var MAX_CODEPOINT = 0x10FFFF; // Unicode upper bound

// Convert a JavaScript (UTF-16) string into an array of Unicode code points.
// A valid surrogate pair becomes one code point; a lone surrogate is passed
// through unchanged.
function to_codepoints(str) {
  var codepoints = [];

  for (var i = 0, len = str.length; i < len; ++i) {
    var c = str.charCodeAt(i);
    if (c >= 0xD800 && c <= 0xDBFF && i + 1 < len) {
      var low = str.charCodeAt(i + 1);
      if (low >= 0xDC00 && low <= 0xDFFF) {
        c = 0x10000 + ((c - 0xD800) << 10) + (low - 0xDC00);
        ++i; // the low surrogate has been consumed
      }
    }
    codepoints.push(c);
  }

  return codepoints;
}

// Convert an array of Unicode code points into a JavaScript (UTF-16) string,
// emitting a surrogate pair for every code point above U+FFFF.
function from_codepoints(codepoints) {
  var units = [];

  for (var i = 0, len = codepoints.length; i < len; ++i) {
    var c = codepoints[i];
    if (c > 0xFFFF) {
      c -= 0x10000;
      units.push(0xD800 + (c >> 10), 0xDC00 + (c & 0x3FF));
    }
    else {
      units.push(c);
    }
  }

  return String.fromCharCode.apply(String, units);
}

// Bias adaptation (RFC 3492, section 6.1).
//
//   delta     the delta that was just encoded/decoded
//   n_points  number of code points handled so far, including this one
//   is_first  true for the very first delta of the string
//
// Returns the new bias. Every division is an integer division, as the RFC
// requires; carrying a fractional delta through the computation can produce
// a different bias and therefore different (invalid) output.
function adapt_bias(delta, n_points, is_first) {
  // scale back, then increase delta
  delta = Math.floor(delta / (is_first ? DAMP : 2));
  delta += Math.floor(delta / n_points);

  var s = (BASE - TMIN);
  var t = Math.floor((s * TMAX) / 2); // threshold=455
  var k = 0;

  while (delta > t) {
    delta = Math.floor(delta / s);
    k += BASE;
  }

  return k + Math.floor(((s + 1) * delta) / (delta + SKEW));
}

// Return the smallest value in `codepoints` that is >= n.
// Throws if there is none (the callers never ask in that situation).
function next_smallest_codepoint(codepoints, n) {
  var m = MAX_CODEPOINT + 1;

  for (var i = 0, len = codepoints.length; i < len; ++i) {
    var c = codepoints[i];
    if (c >= n && c < m) {
      m = c;
    }
  }

  // sanity check - should not happen
  if (m > MAX_CODEPOINT) {
    throw new Error('Next smallest code point not found.');
  }

  return m;
}

// Map a digit value 0..35 to the char code of 'a'..'z' (0..25) or
// '0'..'9' (26..35).
function encode_digit(d) {
  return d + (d < 26 ? 97 : 22);
}

// Map the char code of a Punycode digit to its value 0..35.
// Letters are accepted in either case. Throws on anything else, including
// `undefined`, which is what the decoder passes when the input is truncated.
function decode_digit(d) {
  if (d >= 48 && d <= 57) {
    return d - 22; // 0..9
  }
  if (d >= 65 && d <= 90) {
    return d - 65; // A..Z
  }
  if (d >= 97 && d <= 122) {
    return d - 97; // a..z
  }
  throw new Error('Illegal digit #' + d);
}

// Threshold for digit position k under the current bias, i.e. k - bias
// clamped to the range TMIN..TMAX.
function threshold(k, bias) {
  if (k <= bias + TMIN) {
    return TMIN;
  }
  if (k >= bias + TMAX) {
    return TMAX;
  }
  return k - bias;
}

// Encode `delta` as a generalized variable-length integer.
// Returns an array of char codes.
function encode_int(bias, delta) {
  var result = [];

  for (var k = BASE, q = delta;; k += BASE) {
    var t = threshold(k, bias);
    if (q < t) {
      result.push(encode_digit(q));
      break;
    }
    result.push(encode_digit(t + ((q - t) % (BASE - t))));
    q = Math.floor((q - t) / (BASE - t));
  }

  return result;
}

// Encode a Unicode string as Punycode.
//
// The result does not include the 'xn--' ACE prefix. Basic (ASCII) code
// points are copied first, followed by a '-' delimiter when there are any,
// followed by the encoded deltas of the non-basic code points.
//
// Throws an Error if `input` is not a string.
function encode(input) {
  if (typeof input != 'string') {
    throw new Error('Argument must be a string.');
  }

  // Work on code points, not UTF-16 units, so that characters outside the
  // BMP are encoded as a single code point.
  var codepoints = to_codepoints(input);
  var len = codepoints.length;

  var output = [];
  var non_basic = [];
  var i, c;

  for (i = 0; i < len; ++i) {
    c = codepoints[i];
    if (c < 128) {
      output.push(c);
    }
    else {
      non_basic.push(c);
    }
  }

  // b = number of basic code points, h = number of code points handled
  var b = output.length;
  var h = b;

  if (b) {
    output.push(45); // delimiter '-'
  }

  var n = INITIAL_N;
  var bias = INITIAL_BIAS;
  var delta = 0;

  for (; h < len; ++n, ++delta) {
    var m = next_smallest_codepoint(non_basic, n);
    delta += (m - n) * (h + 1);
    n = m;

    for (i = 0; i < len; ++i) {
      c = codepoints[i];
      if (c < n) {
        // deltas are plain (double) numbers here, no 32 bit wrap-around
        ++delta;
      }
      else if (c == n) {
        output.push.apply(output, encode_int(bias, delta));
        bias = adapt_bias(delta, h + 1, b == h);
        delta = 0;
        h++;
      }
    }
  }

  // output only holds ASCII char codes at this point
  return String.fromCharCode.apply(String, output);
}

// Decode a Punycode string (without the 'xn--' ACE prefix) to Unicode.
//
// Throws an Error if `input` is not a string, if it contains a character
// that is not a Punycode digit, if it ends in the middle of a delta, or if
// it decodes to a value above U+10FFFF.
function decode(input) {
  if (typeof input != 'string') {
    throw new Error('Argument must be a string.');
  }

  // find basic code points/delta separator
  var b = 1 + input.lastIndexOf('-');

  var codes = [];
  for (var j = 0; j < input.length; ++j) {
    codes.push(input.charCodeAt(j));
  }

  // start with a copy of the basic code points
  var output = codes.slice(0, b ? (b - 1) : 0);

  var n = INITIAL_N;
  var bias = INITIAL_BIAS;

  for (var i = 0, len = codes.length; b < len; ++i) {
    var org_i = i;

    // decode one generalized variable-length integer into i
    for (var k = BASE, w = 1;; k += BASE) {
      // reading past the end yields undefined, which decode_digit rejects
      var d = decode_digit(codes[b++]);

      i += d * w;

      var t = threshold(k, bias);
      if (d < t) {
        break;
      }

      w *= BASE - t;
    }

    var x = 1 + output.length;
    bias = adapt_bias(i - org_i, x, org_i == 0);

    n += Math.floor(i / x);
    if (n > MAX_CODEPOINT) {
      throw new Error('Illegal code point.');
    }
    i %= x;

    output.splice(i, 0, n);
  }

  return from_codepoints(output);
}

// The original span also carried the beginning of the lib/url.js diff.
// It is preserved here verbatim, as comments:
//
// diff --git a/lib/url.js b/lib/url.js
// index 8b01c8548f..ed90e5cad1 100644
// --- a/lib/url.js
// +++ b/lib/url.js
// @@ -19,6 +19,8 @@
//  // OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
//  // USE OR OTHER DEALINGS IN THE SOFTWARE.
+var punycode = require('punycode'); + exports.parse = urlParse; exports.resolve = urlResolve; exports.resolveObject = urlResolveObject; @@ -183,24 +185,56 @@ function urlParse(url, parseQueryString, slashesDenoteHost) { var part = hostparts[i]; if (!part) continue; if (!part.match(hostnamePartPattern)) { - var validParts = hostparts.slice(0, i); - var notHost = hostparts.slice(i + 1); - var bit = part.match(hostnamePartStart); - if (bit) { - validParts.push(bit[1]); - notHost.unshift(bit[2]); + var newpart = ''; + for (var j = 0, k = part.length; j < k; j++) { + if (part.charCodeAt(j) > 127) { + // we replace non-ASCII char with a temporary placeholder + // we need this to make sure size of hostname is not + // broken by replacing non-ASCII by nothing + newpart += 'x'; + } else { + newpart += part[j]; + } } - if (notHost.length) { - rest = '/' + notHost.join('.') + rest + // we test again with ASCII char only + if (!newpart.match(hostnamePartPattern)) { + var validParts = hostparts.slice(0, i); + var notHost = hostparts.slice(i + 1); + var bit = part.match(hostnamePartStart); + if (bit) { + validParts.push(bit[1]); + notHost.unshift(bit[2]); + } + if (notHost.length) { + rest = '/' + notHost.join('.') + rest; + } + out.hostname = validParts.join('.'); + break; } - out.hostname = validParts.join('.'); - break; } } } + // hostnames are always lower case. out.hostname = out.hostname.toLowerCase(); + // IDNA Support: Returns a puny coded representation of "domain". + // It only converts the part of the domain name that + // has non ASCII characters. I.e. it dosent matter if + // you call it with a domain that already is in ASCII. + try { + var domainArray = out.hostname.split('.'); + var newOut = []; + for (var i = 0; i < domainArray.length; ++i) { + var s = domainArray[i]; + newOut.push(s.match(/[^A-Za-z0-9-]/) ? 
+ 'xn--' + punycode.encode(s) : s); + } + out.hostname = newOut.join('.'); + } catch (e) { + // if encode fail for some reason, we just do the classic behavior. + } + out.host = ((out.auth) ? out.auth + '@' : '') + (out.hostname || '') + ((out.port) ? ':' + out.port : ''); diff --git a/test/simple/test-punycode.js b/test/simple/test-punycode.js new file mode 100644 index 0000000000..64f8100ee5 --- /dev/null +++ b/test/simple/test-punycode.js @@ -0,0 +1,38 @@ +// Copyright (C) 2011 by Ben Noordhuis +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +punycode = require('punycode'); +assert = require('assert'); + +assert.equal(punycode.encode('ü'), 'tda'); +assert.equal(punycode.encode('Goethe'), 'Goethe-'); +assert.equal(punycode.encode('Bücher'), 'Bcher-kva'); +assert.equal(punycode.encode( + 'Willst du die Blüthe des frühen, die Früchte des späteren Jahres'), + 'Willst du die Blthe des frhen, die Frchte des spteren Jahres-x9e96lkal'); +assert.equal(punycode.encode('日本語'), 'wgv71a119e'); + +assert.equal(punycode.decode('tda'), 'ü'); +assert.equal(punycode.decode('Goethe-'), 'Goethe'); +assert.equal(punycode.decode('Bcher-kva'), 'Bücher'); +assert.equal(punycode.decode( + 'Willst du die Blthe des frhen, die Frchte des spteren Jahres-x9e96lkal'), + 'Willst du die Blüthe des frühen, die Früchte des späteren Jahres'); +assert.equal(punycode.decode('wgv71a119e'), '日本語'); diff --git a/test/simple/test-url.js b/test/simple/test-url.js index ea85bc967f..f07edb5cd4 100644 --- a/test/simple/test-url.js +++ b/test/simple/test-url.js @@ -79,7 +79,7 @@ var parseTests = { 'protocol': 'http:', 'host': 'x.com', 'hostname': 'x.com', - 'pathname': '/Y', + 'pathname': '/Y' }, // an unexpected invalid char in the hostname. 
'HtTp://x.y.cOm*a/b/c?d=e#f gi' : { @@ -113,7 +113,7 @@ var parseTests = { }, 'http://x/p/"quoted"': { 'href': 'http://x/p/', - 'protocol':'http:', + 'protocol': 'http:', 'host': 'x', 'hostname': 'x', 'pathname': '/p/' @@ -274,6 +274,59 @@ var parseTests = { 'search' : '?search=foo', 'query' : 'search=foo', 'hash' : '#bar' + }, + // IDNA tests + 'http://www.日本語.com/' : { + 'href': 'http://www.xn--wgv71a119e.com/', + 'protocol': 'http:', + 'host': 'www.xn--wgv71a119e.com', + 'hostname': 'www.xn--wgv71a119e.com', + 'pathname': '/' + }, + 'http://example.Bücher.com/' : { + 'href': 'http://example.xn--bcher-kva.com/', + 'protocol': 'http:', + 'host': 'example.xn--bcher-kva.com', + 'hostname': 'example.xn--bcher-kva.com', + 'pathname': '/' + }, + 'http://www.Äffchen.com/' : { + 'href': 'http://www.xn--ffchen-9ta.com/', + 'protocol': 'http:', + 'host': 'www.xn--ffchen-9ta.com', + 'hostname': 'www.xn--ffchen-9ta.com', + 'pathname': '/' + }, + 'http://www.Äffchen.cOm*A/b/c?d=e#f gi' : { + 'href': 'http://www.xn--ffchen-9ta.com/*A/b/c?d=e#f', + 'protocol': 'http:', + 'host': 'www.xn--ffchen-9ta.com', + 'hostname': 'www.xn--ffchen-9ta.com', + 'pathname': '/*A/b/c', + 'search': '?d=e', + 'query': 'd=e', + 'hash': '#f' + }, + 'http://SÉLIER.COM/' : { + 'href': 'http://xn--slier-bsa.com/', + 'protocol': 'http:', + 'host': 'xn--slier-bsa.com', + 'hostname': 'xn--slier-bsa.com', + 'pathname': '/' + }, + 'http://ليهمابتكلموشعربي؟.ي؟/' : { + 'href': 'http://xn--egbpdaj6bu4bxfgehfvwxn.xn--egb9f/', + 'protocol': 'http:', + 'host': 'xn--egbpdaj6bu4bxfgehfvwxn.xn--egb9f', + 'hostname': 'xn--egbpdaj6bu4bxfgehfvwxn.xn--egb9f', + 'pathname': '/' + }, + 'http://➡.ws/➡' : { + 'href': 'http://xn--hgi.ws/➡', + 'protocol': 'http:', + 'host': 'xn--hgi.ws', + 'hostname': 'xn--hgi.ws', + 'pathname': '/➡' } }; for (var u in parseTests) {