From 7144be70db2e6ce337d0f6ec47a28a06802d1c7a Mon Sep 17 00:00:00 2001 From: isaacs Date: Thu, 13 Sep 2012 11:09:54 -0700 Subject: [PATCH] url: Go much faster by using Url class V8 loves it when JavaScript pretends to be a Classic inheritance type of language. Before: $ ./node benchmark/url.js benchmarking parse() ... 1.868 sec benchmarking format() ... 1.906 sec benchmarking resolve("../foo/bar?baz=boom") ... 7.800 sec benchmarking resolve("foo/bar") ... 7.099 sec benchmarking resolve("http://nodejs.org") ... 8.403 sec benchmarking resolve("./foo/bar?baz") ... 7.974 sec After: $ ./node benchmark/url.js benchmarking parse() ... 1.769 sec benchmarking format() ... 1.793 sec benchmarking resolve("../foo/bar?baz=boom") ... 4.254 sec benchmarking resolve("foo/bar") ... 3.932 sec benchmarking resolve("http://nodejs.org") ... 4.382 sec benchmarking resolve("./foo/bar?baz") ... 4.293 sec --- lib/url.js | 381 +++++++++++++++++++++++----------------- test/simple/test-url.js | 30 ++-- 2 files changed, 231 insertions(+), 180 deletions(-) diff --git a/lib/url.js b/lib/url.js index 50eb8b20f6..980a9bb84a 100644 --- a/lib/url.js +++ b/lib/url.js @@ -26,6 +26,22 @@ exports.resolve = urlResolve; exports.resolveObject = urlResolveObject; exports.format = urlFormat; +exports.Url = Url; + +function Url() { + this.protocol = null; + this.slashes = null; + this.auth = null; + this.host = null; + this.port = null; + this.hostname = null; + this.hash = null; + this.search = null; + this.query = null; + this.pathname = null; + this.path = null; +} + // Reference: RFC 3986, RFC 1808, RFC 2396 // define these here so at least they only have to be @@ -90,14 +106,19 @@ var protocolPattern = /^([a-z0-9.+-]+:)/i, querystring = require('querystring'); function urlParse(url, parseQueryString, slashesDenoteHost) { - if (url && typeof(url) === 'object' && url.href) return url; + if (url && typeof(url) === 'object' && url instanceof Url) return url; + + var u = new Url; + u.parse(url, parseQueryString, slashesDenoteHost); + return u; +} +Url.prototype.parse = function(url, parseQueryString, slashesDenoteHost) { if (typeof url !== 'string') { throw new TypeError("Parameter 'url' must be a string, not " + typeof url); } - var out = {}, - rest = url; + var rest = url; // trim before proceeding. // This is to support parse stuff like " http://foo.com \n" @@ -107,7 +128,7 @@ function urlParse(url, parseQueryString, slashesDenoteHost) { if (proto) { proto = proto[0]; var lowerProto = proto.toLowerCase(); - out.protocol = lowerProto; + this.protocol = lowerProto; rest = rest.substr(proto.length); } @@ -119,7 +140,7 @@ function urlParse(url, parseQueryString, slashesDenoteHost) { var slashes = rest.substr(0, 2) === '//'; if (slashes && !(proto && hostlessProtocol[proto])) { rest = rest.substr(2); - out.slashes = true; + this.slashes = true; } } @@ -149,7 +170,7 @@ function urlParse(url, parseQueryString, slashesDenoteHost) { if (hasAuth) { // pluck off the auth portion. - out.auth = decodeURIComponent(auth); + this.auth = decodeURIComponent(auth); rest = rest.substr(atSign + 1); } } @@ -162,35 +183,28 @@ function urlParse(url, parseQueryString, slashesDenoteHost) { } if (firstNonHost !== -1) { - out.host = rest.substr(0, firstNonHost); + this.host = rest.substr(0, firstNonHost); rest = rest.substr(firstNonHost); } else { - out.host = rest; + this.host = rest; rest = ''; } // pull out port. - var p = parseHost(out.host); - var keys = Object.keys(p); - for (var i = 0, l = keys.length; i < l; i++) { - var key = keys[i]; - out[key] = p[key]; - } + this.parseHost(); // we've indicated that there is a hostname, // so even if it's empty, it has to be present. - out.hostname = out.hostname || ''; + this.hostname = this.hostname || ''; // if hostname begins with [ and ends with ] // assume that it's an IPv6 address. - var ipv6Hostname = out.hostname[0] === '[' && - out.hostname[out.hostname.length - 1] === ']'; + var ipv6Hostname = this.hostname[0] === '[' && + this.hostname[this.hostname.length - 1] === ']'; // validate a little. - if (out.hostname.length > hostnameMaxLen) { - out.hostname = ''; - } else if (!ipv6Hostname) { - var hostparts = out.hostname.split(/\./); + if (!ipv6Hostname) { + var hostparts = this.hostname.split(/\./); for (var i = 0, l = hostparts.length; i < l; i++) { var part = hostparts[i]; if (!part) continue; @@ -218,38 +232,44 @@ function urlParse(url, parseQueryString, slashesDenoteHost) { if (notHost.length) { rest = '/' + notHost.join('.') + rest; } - out.hostname = validParts.join('.'); + this.hostname = validParts.join('.'); break; } } } } - // hostnames are always lower case. - out.hostname = out.hostname.toLowerCase(); + if (this.hostname.length > hostnameMaxLen) { + this.hostname = ''; + } else { + // hostnames are always lower case. + this.hostname = this.hostname.toLowerCase(); + } if (!ipv6Hostname) { // IDNA Support: Returns a puny coded representation of "domain". // It only converts the part of the domain name that // has non ASCII characters. I.e. it dosent matter if // you call it with a domain that already is in ASCII. - var domainArray = out.hostname.split('.'); + var domainArray = this.hostname.split('.'); var newOut = []; for (var i = 0; i < domainArray.length; ++i) { var s = domainArray[i]; newOut.push(s.match(/[^A-Za-z0-9_-]/) ? 'xn--' + punycode.encode(s) : s); } - out.hostname = newOut.join('.'); + this.hostname = newOut.join('.'); } - out.host = (out.hostname || '') + - ((out.port) ? ':' + out.port : ''); - out.href += out.host; + var p = this.port ? ':' + this.port : ''; + var h = this.hostname || ''; + this.host = h + p; + this.href += this.host; // strip [ and ] from the hostname + // the host field still retains them, though if (ipv6Hostname) { - out.hostname = out.hostname.substr(1, out.hostname.length - 2); + this.hostname = this.hostname.substr(1, this.hostname.length - 2); if (rest[0] !== '/') { rest = '/' + rest; } @@ -278,38 +298,39 @@ function urlParse(url, parseQueryString, slashesDenoteHost) { var hash = rest.indexOf('#'); if (hash !== -1) { // got a fragment string. - out.hash = rest.substr(hash); + this.hash = rest.substr(hash); rest = rest.slice(0, hash); } var qm = rest.indexOf('?'); if (qm !== -1) { - out.search = rest.substr(qm); - out.query = rest.substr(qm + 1); + this.search = rest.substr(qm); + this.query = rest.substr(qm + 1); if (parseQueryString) { - out.query = querystring.parse(out.query); + this.query = querystring.parse(this.query); } rest = rest.slice(0, qm); } else if (parseQueryString) { // no query string, but parseQueryString still requested - out.search = ''; - out.query = {}; + this.search = ''; + this.query = {}; } - if (rest) out.pathname = rest; + if (rest) this.pathname = rest; if (slashedProtocol[proto] && - out.hostname && !out.pathname) { - out.pathname = '/'; + this.hostname && !this.pathname) { + this.pathname = '/'; } //to support http.request - if (out.pathname || out.search) { - out.path = (out.pathname ? out.pathname : '') + - (out.search ? out.search : ''); + if (this.pathname || this.search) { + var p = this.pathname || ''; + var s = this.search || ''; + this.path = p + s; } // finally, reconstruct the href based on what has been validated. - out.href = urlFormat(out); - return out; -} + this.href = this.format(); + return this; +}; // format a parsed object into a url string function urlFormat(obj) { @@ -318,43 +339,47 @@ function urlFormat(obj) { // this way, you can call url_format() on strings // to clean up potentially wonky urls. if (typeof(obj) === 'string') obj = urlParse(obj); + if (!(obj instanceof Url)) return Url.prototype.format.call(obj); + return obj.format(); +} - var auth = obj.auth || ''; +Url.prototype.format = function() { + var auth = this.auth || ''; if (auth) { auth = encodeURIComponent(auth); auth = auth.replace(/%3A/i, ':'); auth += '@'; } - var protocol = obj.protocol || '', - pathname = obj.pathname || '', - hash = obj.hash || '', + var protocol = this.protocol || '', + pathname = this.pathname || '', + hash = this.hash || '', host = false, query = ''; - if (obj.host !== undefined) { - host = auth + obj.host; - } else if (obj.hostname !== undefined) { - host = auth + (obj.hostname.indexOf(':') === -1 ? - obj.hostname : - '[' + obj.hostname + ']'); - if (obj.port) { - host += ':' + obj.port; + if (this.host) { + host = auth + this.host; + } else if (this.hostname) { + host = auth + (this.hostname.indexOf(':') === -1 ? + this.hostname : + '[' + this.hostname + ']'); + if (this.port) { + host += ':' + this.port; } } - if (obj.query && typeof obj.query === 'object' && - Object.keys(obj.query).length) { - query = querystring.stringify(obj.query); + if (this.query && typeof this.query === 'object' && + Object.keys(this.query).length) { + query = querystring.stringify(this.query); } - var search = obj.search || (query && ('?' + query)) || ''; + var search = this.search || (query && ('?' + query)) || ''; if (protocol && protocol.substr(-1) !== ':') protocol += ':'; // only the slashedProtocols get the //. Not mailto:, xmpp:, etc. // unless they had them to begin with. - if (obj.slashes || + if (this.slashes || (!protocol || slashedProtocol[protocol]) && host !== false) { host = '//' + (host || ''); if (pathname && pathname.charAt(0) !== '/') pathname = '/' + pathname; @@ -366,39 +391,62 @@ function urlFormat(obj) { if (search && search.charAt(0) !== '?') search = '?' + search; return protocol + host + pathname + search + hash; -} +}; function urlResolve(source, relative) { - return urlFormat(urlResolveObject(source, relative)); + return urlParse(source, false, true).resolve(relative); } +Url.prototype.resolve = function(relative) { + return this.resolveObject(urlParse(relative, false, true)).format(); +}; + function urlResolveObject(source, relative) { if (!source) return relative; + return urlParse(source, false, true).resolveObject(relative); +} + +Url.prototype.resolveObject = function(relative) { + if (typeof relative === 'string') { + var rel = new Url(); + rel.parse(relative, false, true); + relative = rel; + } - source = urlParse(urlFormat(source), false, true); - relative = urlParse(urlFormat(relative), false, true); + var result = new Url(); + Object.keys(this).forEach(function(k) { + result[k] = this[k]; + }, this); // hash is always overridden, no matter what. - source.hash = relative.hash; + // even href="" will remove it. + result.hash = relative.hash; + // if the relative url is empty, then there's nothing left to do here. if (relative.href === '') { - source.href = urlFormat(source); - return source; + result.href = result.format(); + return result; } // hrefs like //foo/bar always cut to the protocol. if (relative.slashes && !relative.protocol) { - relative.protocol = source.protocol; + // take everything except the protocol from relative + Object.keys(relative).forEach(function(k) { + if (k !== 'protocol') + result[k] = relative[k]; + }); + //urlParse appends trailing / to urls like http://www.example.com - if (slashedProtocol[relative.protocol] && - relative.hostname && !relative.pathname) { - relative.path = relative.pathname = '/'; + if (slashedProtocol[result.protocol] && + result.hostname && !result.pathname) { + result.path = result.pathname = '/'; } - relative.href = urlFormat(relative); - return relative; + + result.href = result.format(); + return result; } - if (relative.protocol && relative.protocol !== source.protocol) { + if (relative.protocol && relative.protocol !== result.protocol) { // if it's a known url protocol, then changing // the protocol does weird things // first, if it's not file:, then we MUST have a host, @@ -408,10 +456,14 @@ function urlResolveObject(source, relative) { // because that's known to be hostless. // anything else is assumed to be absolute. if (!slashedProtocol[relative.protocol]) { - relative.href = urlFormat(relative); - return relative; + Object.keys(relative).forEach(function(k) { + result[k] = relative[k]; + }); + result.href = result.format(); + return result; } - source.protocol = relative.protocol; + + result.protocol = relative.protocol; if (!relative.host && !hostlessProtocol[relative.protocol]) { var relPath = (relative.pathname || '').split('/'); while (relPath.length && !(relative.host = relPath.shift())); @@ -419,72 +471,72 @@ function urlResolveObject(source, relative) { if (!relative.hostname) relative.hostname = ''; if (relPath[0] !== '') relPath.unshift(''); if (relPath.length < 2) relPath.unshift(''); - relative.pathname = relPath.join('/'); + result.pathname = relPath.join('/'); + } else { + result.pathname = relative.pathname; } - source.pathname = relative.pathname; - source.search = relative.search; - source.query = relative.query; - source.host = relative.host || ''; - source.auth = relative.auth; - source.hostname = relative.hostname || relative.host; - source.port = relative.port; - //to support http.request - if (source.pathname !== undefined || source.search !== undefined) { - source.path = (source.pathname ? source.pathname : '') + - (source.search ? source.search : ''); + result.search = relative.search; + result.query = relative.query; + result.host = relative.host || ''; + result.auth = relative.auth; + result.hostname = relative.hostname || relative.host; + result.port = relative.port; + // to support http.request + if (result.pathname || result.search) { + var p = result.pathname || ''; + var s = result.search || ''; + result.path = p + s; } - source.slashes = source.slashes || relative.slashes; - source.href = urlFormat(source); - return source; + result.slashes = result.slashes || relative.slashes; + result.href = result.format(); + return result; } - var isSourceAbs = (source.pathname && source.pathname.charAt(0) === '/'), + var isSourceAbs = (result.pathname && result.pathname.charAt(0) === '/'), isRelAbs = ( - relative.host !== undefined || + relative.host || relative.pathname && relative.pathname.charAt(0) === '/' ), mustEndAbs = (isRelAbs || isSourceAbs || - (source.host && relative.pathname)), + (result.host && relative.pathname)), removeAllDots = mustEndAbs, - srcPath = source.pathname && source.pathname.split('/') || [], + srcPath = result.pathname && result.pathname.split('/') || [], relPath = relative.pathname && relative.pathname.split('/') || [], - psychotic = source.protocol && - !slashedProtocol[source.protocol]; + psychotic = result.protocol && !slashedProtocol[result.protocol]; // if the url is a non-slashed url, then relative // links like ../.. should be able // to crawl up to the hostname, as well. This is strange. - // source.protocol has already been set by now. + // result.protocol has already been set by now. // Later on, put the first path part into the host field. if (psychotic) { - - delete source.hostname; - delete source.port; - if (source.host) { - if (srcPath[0] === '') srcPath[0] = source.host; - else srcPath.unshift(source.host); + result.hostname = ''; + result.port = null; + if (result.host) { + if (srcPath[0] === '') srcPath[0] = result.host; + else srcPath.unshift(result.host); } - delete source.host; + result.host = ''; if (relative.protocol) { - delete relative.hostname; - delete relative.port; + relative.hostname = null; + relative.port = null; if (relative.host) { if (relPath[0] === '') relPath[0] = relative.host; else relPath.unshift(relative.host); } - delete relative.host; + relative.host = null; } mustEndAbs = mustEndAbs && (relPath[0] === '' || srcPath[0] === ''); } if (isRelAbs) { // it's absolute. - source.host = (relative.host || relative.host === '') ? - relative.host : source.host; - source.hostname = (relative.hostname || relative.hostname === '') ? - relative.hostname : source.hostname; - source.search = relative.search; - source.query = relative.query; + result.host = (relative.host || relative.host === '') ? + relative.host : result.host; + result.hostname = (relative.hostname || relative.hostname === '') ? + relative.hostname : result.hostname; + result.search = relative.search; + result.query = relative.query; srcPath = relPath; // fall through to the dot-handling below. } else if (relPath.length) { @@ -493,53 +545,55 @@ function urlResolveObject(source, relative) { if (!srcPath) srcPath = []; srcPath.pop(); srcPath = srcPath.concat(relPath); - source.search = relative.search; - source.query = relative.query; - } else if ('search' in relative) { + result.search = relative.search; + result.query = relative.query; + } else if (relative.search !== null && relative.search !== undefined) { // just pull out the search. // like href='?foo'. // Put this after the other two cases because it simplifies the booleans if (psychotic) { - source.hostname = source.host = srcPath.shift(); + result.hostname = result.host = srcPath.shift(); //occationaly the auth can get stuck only in host //this especialy happens in cases like //url.resolveObject('mailto:local1@domain1', 'local2@domain2') - var authInHost = source.host && source.host.indexOf('@') > 0 ? - source.host.split('@') : false; + var authInHost = result.host && result.host.indexOf('@') > 0 ? + result.host.split('@') : false; if (authInHost) { - source.auth = authInHost.shift(); - source.host = source.hostname = authInHost.shift(); + result.auth = authInHost.shift(); + result.host = result.hostname = authInHost.shift(); } } - source.search = relative.search; - source.query = relative.query; + result.search = relative.search; + result.query = relative.query; //to support http.request - if (source.pathname !== undefined || source.search !== undefined) { - source.path = (source.pathname ? source.pathname : '') + - (source.search ? source.search : ''); + if (result.pathname !== null || result.search !== null) { + result.path = (result.pathname ? result.pathname : '') + + (result.search ? result.search : ''); } - source.href = urlFormat(source); - return source; + result.href = result.format(); + return result; } + if (!srcPath.length) { // no path at all. easy. // we've already handled the other stuff above. - delete source.pathname; + result.pathname = null; //to support http.request - if (!source.search) { - source.path = '/' + source.search; + if (result.search) { + result.path = '/' + result.search; } else { - delete source.path; + result.path = null; } - source.href = urlFormat(source); - return source; + result.href = result.format(); + return result; } + // if a url ENDs in . or .., then it must get a trailing slash. // however, if it ends in anything else non-slashy, // then it must NOT get a trailing slash. var last = srcPath.slice(-1)[0]; var hasTrailingSlash = ( - (source.host || relative.host) && (last === '.' || last === '..') || + (result.host || relative.host) && (last === '.' || last === '..') || last === ''); // strip single dots, resolve double dots to parent dir @@ -579,47 +633,52 @@ function urlResolveObject(source, relative) { // put the host back if (psychotic) { - source.hostname = source.host = isAbsolute ? '' : + result.hostname = result.host = isAbsolute ? '' : srcPath.length ? srcPath.shift() : ''; //occationaly the auth can get stuck only in host //this especialy happens in cases like //url.resolveObject('mailto:local1@domain1', 'local2@domain2') - var authInHost = source.host && source.host.indexOf('@') > 0 ? - source.host.split('@') : false; + var authInHost = result.host && result.host.indexOf('@') > 0 ? + result.host.split('@') : false; if (authInHost) { - source.auth = authInHost.shift(); - source.host = source.hostname = authInHost.shift(); + result.auth = authInHost.shift(); + result.host = result.hostname = authInHost.shift(); } } - mustEndAbs = mustEndAbs || (source.host && srcPath.length); + mustEndAbs = mustEndAbs || (result.host && srcPath.length); if (mustEndAbs && !isAbsolute) { srcPath.unshift(''); } - source.pathname = srcPath.join('/'); - //to support request.http - if (source.pathname !== undefined || source.search !== undefined) { - source.path = (source.pathname ? source.pathname : '') + - (source.search ? source.search : ''); + if (!srcPath.length) { + result.pathname = null; + result.path = null; + } else { + result.pathname = srcPath.join('/'); } - source.auth = relative.auth || source.auth; - source.slashes = source.slashes || relative.slashes; - source.href = urlFormat(source); - return source; -} -function parseHost(host) { - var out = {}; + //to support request.http + if (result.pathname !== null || result.search !== null) { + result.path = (result.pathname ? result.pathname : '') + + (result.search ? result.search : ''); + } + result.auth = relative.auth || result.auth; + result.slashes = result.slashes || relative.slashes; + result.href = result.format(); + return result; +}; + +Url.prototype.parseHost = function() { + var host = this.host; var port = portPattern.exec(host); if (port) { port = port[0]; if (port !== ':') { - out.port = port.substr(1); + this.port = port.substr(1); } host = host.substr(0, host.length - port.length); } - if (host) out.hostname = host; - return out; -} + if (host) this.hostname = host; +}; diff --git a/test/simple/test-url.js b/test/simple/test-url.js index ff4a8449eb..e8229e0591 100644 --- a/test/simple/test-url.js +++ b/test/simple/test-url.js @@ -659,6 +659,12 @@ for (var u in parseTests) { spaced = url.parse(' \t ' + u + '\n\t'); expected = parseTests[u]; + Object.keys(actual).forEach(function (i) { + if (expected[i] === undefined && actual[i] === null) { + expected[i] = null; + } + }); + assert.deepEqual(actual, expected); assert.deepEqual(spaced, expected); @@ -695,6 +701,11 @@ var parseTestsWithQueryString = { for (var u in parseTestsWithQueryString) { var actual = url.parse(u, true); var expected = parseTestsWithQueryString[u]; + for (var i in actual) { + if (actual[i] === null && expected[i] === undefined) { + expected[i] = null; + } + } assert.deepEqual(actual, expected); } @@ -1227,15 +1238,6 @@ relativeTests.forEach(function(relativeTest) { var actual = url.resolveObject(url.parse(relativeTest[0]), relativeTest[1]), expected = url.parse(relativeTest[2]); - //because of evaluation order - //resolveObject(parse(x), y) == parse(resolve(x, y)) will differ by - //false-ish values. remove all except host and hostname - for (var i in actual) { - if (actual[i] === undefined || - (!emptyIsImportant.hasOwnProperty(i) && !actual[i])) { - delete actual[i]; - } - } assert.deepEqual(actual, expected); @@ -1264,16 +1266,6 @@ relativeTests2.forEach(function(relativeTest) { var actual = url.resolveObject(url.parse(relativeTest[1]), relativeTest[0]), expected = url.parse(relativeTest[2]); - //because of evaluation order - //resolveObject(parse(x), y) == parse(resolve(x, y)) will differ by - //false-ish values. remove all except host and hostname - for (var i in actual) { - if (actual[i] === undefined || - (!emptyIsImportant.hasOwnProperty(i) && !actual[i])) { - delete actual[i]; - } - } - assert.deepEqual(actual, expected); var expected = relativeTest[2],