Browse Source

url: Go much faster by using Url class

V8 loves it when JavaScript pretends to be a Classic inheritance
type of language.

Before:

$ ./node benchmark/url.js
benchmarking parse() ... 1.868 sec
benchmarking format() ... 1.906 sec
benchmarking resolve("../foo/bar?baz=boom") ... 7.800 sec
benchmarking resolve("foo/bar") ... 7.099 sec
benchmarking resolve("http://nodejs.org") ... 8.403 sec
benchmarking resolve("./foo/bar?baz") ... 7.974 sec

After:

$ ./node benchmark/url.js
benchmarking parse() ... 1.769 sec
benchmarking format() ... 1.793 sec
benchmarking resolve("../foo/bar?baz=boom") ... 4.254 sec
benchmarking resolve("foo/bar") ... 3.932 sec
benchmarking resolve("http://nodejs.org") ... 4.382 sec
benchmarking resolve("./foo/bar?baz") ... 4.293 sec
v0.9.2-release
isaacs 13 years ago
parent
commit
7144be70db
  1. 389
      lib/url.js
  2. 30
      test/simple/test-url.js

389
lib/url.js

@ -26,6 +26,22 @@ exports.resolve = urlResolve;
exports.resolveObject = urlResolveObject;
exports.format = urlFormat;
exports.Url = Url;
function Url() {
this.protocol = null;
this.slashes = null;
this.auth = null;
this.host = null;
this.port = null;
this.hostname = null;
this.hash = null;
this.search = null;
this.query = null;
this.pathname = null;
this.path = null;
}
// Reference: RFC 3986, RFC 1808, RFC 2396
// define these here so at least they only have to be
@ -90,14 +106,19 @@ var protocolPattern = /^([a-z0-9.+-]+:)/i,
querystring = require('querystring');
function urlParse(url, parseQueryString, slashesDenoteHost) {
if (url && typeof(url) === 'object' && url.href) return url;
if (url && typeof(url) === 'object' && url instanceof Url) return url;
var u = new Url;
u.parse(url, parseQueryString, slashesDenoteHost);
return u;
}
Url.prototype.parse = function(url, parseQueryString, slashesDenoteHost) {
if (typeof url !== 'string') {
throw new TypeError("Parameter 'url' must be a string, not " + typeof url);
}
var out = {},
rest = url;
var rest = url;
// trim before proceeding.
// This is to support parse stuff like " http://foo.com \n"
@ -107,7 +128,7 @@ function urlParse(url, parseQueryString, slashesDenoteHost) {
if (proto) {
proto = proto[0];
var lowerProto = proto.toLowerCase();
out.protocol = lowerProto;
this.protocol = lowerProto;
rest = rest.substr(proto.length);
}
@ -119,7 +140,7 @@ function urlParse(url, parseQueryString, slashesDenoteHost) {
var slashes = rest.substr(0, 2) === '//';
if (slashes && !(proto && hostlessProtocol[proto])) {
rest = rest.substr(2);
out.slashes = true;
this.slashes = true;
}
}
@ -149,7 +170,7 @@ function urlParse(url, parseQueryString, slashesDenoteHost) {
if (hasAuth) {
// pluck off the auth portion.
out.auth = decodeURIComponent(auth);
this.auth = decodeURIComponent(auth);
rest = rest.substr(atSign + 1);
}
}
@ -162,35 +183,28 @@ function urlParse(url, parseQueryString, slashesDenoteHost) {
}
if (firstNonHost !== -1) {
out.host = rest.substr(0, firstNonHost);
this.host = rest.substr(0, firstNonHost);
rest = rest.substr(firstNonHost);
} else {
out.host = rest;
this.host = rest;
rest = '';
}
// pull out port.
var p = parseHost(out.host);
var keys = Object.keys(p);
for (var i = 0, l = keys.length; i < l; i++) {
var key = keys[i];
out[key] = p[key];
}
this.parseHost();
// we've indicated that there is a hostname,
// so even if it's empty, it has to be present.
out.hostname = out.hostname || '';
this.hostname = this.hostname || '';
// if hostname begins with [ and ends with ]
// assume that it's an IPv6 address.
var ipv6Hostname = out.hostname[0] === '[' &&
out.hostname[out.hostname.length - 1] === ']';
var ipv6Hostname = this.hostname[0] === '[' &&
this.hostname[this.hostname.length - 1] === ']';
// validate a little.
if (out.hostname.length > hostnameMaxLen) {
out.hostname = '';
} else if (!ipv6Hostname) {
var hostparts = out.hostname.split(/\./);
if (!ipv6Hostname) {
var hostparts = this.hostname.split(/\./);
for (var i = 0, l = hostparts.length; i < l; i++) {
var part = hostparts[i];
if (!part) continue;
@ -218,38 +232,44 @@ function urlParse(url, parseQueryString, slashesDenoteHost) {
if (notHost.length) {
rest = '/' + notHost.join('.') + rest;
}
out.hostname = validParts.join('.');
this.hostname = validParts.join('.');
break;
}
}
}
}
if (this.hostname.length > hostnameMaxLen) {
this.hostname = '';
} else {
// hostnames are always lower case.
out.hostname = out.hostname.toLowerCase();
this.hostname = this.hostname.toLowerCase();
}
if (!ipv6Hostname) {
// IDNA Support: Returns a puny coded representation of "domain".
// It only converts the part of the domain name that
// has non ASCII characters. I.e. it dosent matter if
// you call it with a domain that already is in ASCII.
var domainArray = out.hostname.split('.');
var domainArray = this.hostname.split('.');
var newOut = [];
for (var i = 0; i < domainArray.length; ++i) {
var s = domainArray[i];
newOut.push(s.match(/[^A-Za-z0-9_-]/) ?
'xn--' + punycode.encode(s) : s);
}
out.hostname = newOut.join('.');
this.hostname = newOut.join('.');
}
out.host = (out.hostname || '') +
((out.port) ? ':' + out.port : '');
out.href += out.host;
var p = this.port ? ':' + this.port : '';
var h = this.hostname || '';
this.host = h + p;
this.href += this.host;
// strip [ and ] from the hostname
// the host field still retains them, though
if (ipv6Hostname) {
out.hostname = out.hostname.substr(1, out.hostname.length - 2);
this.hostname = this.hostname.substr(1, this.hostname.length - 2);
if (rest[0] !== '/') {
rest = '/' + rest;
}
@ -278,38 +298,39 @@ function urlParse(url, parseQueryString, slashesDenoteHost) {
var hash = rest.indexOf('#');
if (hash !== -1) {
// got a fragment string.
out.hash = rest.substr(hash);
this.hash = rest.substr(hash);
rest = rest.slice(0, hash);
}
var qm = rest.indexOf('?');
if (qm !== -1) {
out.search = rest.substr(qm);
out.query = rest.substr(qm + 1);
this.search = rest.substr(qm);
this.query = rest.substr(qm + 1);
if (parseQueryString) {
out.query = querystring.parse(out.query);
this.query = querystring.parse(this.query);
}
rest = rest.slice(0, qm);
} else if (parseQueryString) {
// no query string, but parseQueryString still requested
out.search = '';
out.query = {};
this.search = '';
this.query = {};
}
if (rest) out.pathname = rest;
if (rest) this.pathname = rest;
if (slashedProtocol[proto] &&
out.hostname && !out.pathname) {
out.pathname = '/';
this.hostname && !this.pathname) {
this.pathname = '/';
}
//to support http.request
if (out.pathname || out.search) {
out.path = (out.pathname ? out.pathname : '') +
(out.search ? out.search : '');
if (this.pathname || this.search) {
var p = this.pathname || '';
var s = this.search || '';
this.path = p + s;
}
// finally, reconstruct the href based on what has been validated.
out.href = urlFormat(out);
return out;
}
this.href = this.format();
return this;
};
// format a parsed object into a url string
function urlFormat(obj) {
@ -318,43 +339,47 @@ function urlFormat(obj) {
// this way, you can call url_format() on strings
// to clean up potentially wonky urls.
if (typeof(obj) === 'string') obj = urlParse(obj);
if (!(obj instanceof Url)) return Url.prototype.format.call(obj);
return obj.format();
}
var auth = obj.auth || '';
Url.prototype.format = function() {
var auth = this.auth || '';
if (auth) {
auth = encodeURIComponent(auth);
auth = auth.replace(/%3A/i, ':');
auth += '@';
}
var protocol = obj.protocol || '',
pathname = obj.pathname || '',
hash = obj.hash || '',
var protocol = this.protocol || '',
pathname = this.pathname || '',
hash = this.hash || '',
host = false,
query = '';
if (obj.host !== undefined) {
host = auth + obj.host;
} else if (obj.hostname !== undefined) {
host = auth + (obj.hostname.indexOf(':') === -1 ?
obj.hostname :
'[' + obj.hostname + ']');
if (obj.port) {
host += ':' + obj.port;
if (this.host) {
host = auth + this.host;
} else if (this.hostname) {
host = auth + (this.hostname.indexOf(':') === -1 ?
this.hostname :
'[' + this.hostname + ']');
if (this.port) {
host += ':' + this.port;
}
}
if (obj.query && typeof obj.query === 'object' &&
Object.keys(obj.query).length) {
query = querystring.stringify(obj.query);
if (this.query && typeof this.query === 'object' &&
Object.keys(this.query).length) {
query = querystring.stringify(this.query);
}
var search = obj.search || (query && ('?' + query)) || '';
var search = this.search || (query && ('?' + query)) || '';
if (protocol && protocol.substr(-1) !== ':') protocol += ':';
// only the slashedProtocols get the //. Not mailto:, xmpp:, etc.
// unless they had them to begin with.
if (obj.slashes ||
if (this.slashes ||
(!protocol || slashedProtocol[protocol]) && host !== false) {
host = '//' + (host || '');
if (pathname && pathname.charAt(0) !== '/') pathname = '/' + pathname;
@ -366,39 +391,62 @@ function urlFormat(obj) {
if (search && search.charAt(0) !== '?') search = '?' + search;
return protocol + host + pathname + search + hash;
}
};
function urlResolve(source, relative) {
return urlFormat(urlResolveObject(source, relative));
return urlParse(source, false, true).resolve(relative);
}
Url.prototype.resolve = function(relative) {
return this.resolveObject(urlParse(relative, false, true)).format();
};
function urlResolveObject(source, relative) {
if (!source) return relative;
return urlParse(source, false, true).resolveObject(relative);
}
source = urlParse(urlFormat(source), false, true);
relative = urlParse(urlFormat(relative), false, true);
Url.prototype.resolveObject = function(relative) {
if (typeof relative === 'string') {
var rel = new Url();
rel.parse(relative, false, true);
relative = rel;
}
var result = new Url();
Object.keys(this).forEach(function(k) {
result[k] = this[k];
}, this);
// hash is always overridden, no matter what.
source.hash = relative.hash;
// even href="" will remove it.
result.hash = relative.hash;
// if the relative url is empty, then there's nothing left to do here.
if (relative.href === '') {
source.href = urlFormat(source);
return source;
result.href = result.format();
return result;
}
// hrefs like //foo/bar always cut to the protocol.
if (relative.slashes && !relative.protocol) {
relative.protocol = source.protocol;
// take everything except the protocol from relative
Object.keys(relative).forEach(function(k) {
if (k !== 'protocol')
result[k] = relative[k];
});
//urlParse appends trailing / to urls like http://www.example.com
if (slashedProtocol[relative.protocol] &&
relative.hostname && !relative.pathname) {
relative.path = relative.pathname = '/';
if (slashedProtocol[result.protocol] &&
result.hostname && !result.pathname) {
result.path = result.pathname = '/';
}
relative.href = urlFormat(relative);
return relative;
result.href = result.format();
return result;
}
if (relative.protocol && relative.protocol !== source.protocol) {
if (relative.protocol && relative.protocol !== result.protocol) {
// if it's a known url protocol, then changing
// the protocol does weird things
// first, if it's not file:, then we MUST have a host,
@ -408,10 +456,14 @@ function urlResolveObject(source, relative) {
// because that's known to be hostless.
// anything else is assumed to be absolute.
if (!slashedProtocol[relative.protocol]) {
relative.href = urlFormat(relative);
return relative;
Object.keys(relative).forEach(function(k) {
result[k] = relative[k];
});
result.href = result.format();
return result;
}
source.protocol = relative.protocol;
result.protocol = relative.protocol;
if (!relative.host && !hostlessProtocol[relative.protocol]) {
var relPath = (relative.pathname || '').split('/');
while (relPath.length && !(relative.host = relPath.shift()));
@ -419,72 +471,72 @@ function urlResolveObject(source, relative) {
if (!relative.hostname) relative.hostname = '';
if (relPath[0] !== '') relPath.unshift('');
if (relPath.length < 2) relPath.unshift('');
relative.pathname = relPath.join('/');
}
source.pathname = relative.pathname;
source.search = relative.search;
source.query = relative.query;
source.host = relative.host || '';
source.auth = relative.auth;
source.hostname = relative.hostname || relative.host;
source.port = relative.port;
//to support http.request
if (source.pathname !== undefined || source.search !== undefined) {
source.path = (source.pathname ? source.pathname : '') +
(source.search ? source.search : '');
}
source.slashes = source.slashes || relative.slashes;
source.href = urlFormat(source);
return source;
}
var isSourceAbs = (source.pathname && source.pathname.charAt(0) === '/'),
result.pathname = relPath.join('/');
} else {
result.pathname = relative.pathname;
}
result.search = relative.search;
result.query = relative.query;
result.host = relative.host || '';
result.auth = relative.auth;
result.hostname = relative.hostname || relative.host;
result.port = relative.port;
// to support http.request
if (result.pathname || result.search) {
var p = result.pathname || '';
var s = result.search || '';
result.path = p + s;
}
result.slashes = result.slashes || relative.slashes;
result.href = result.format();
return result;
}
var isSourceAbs = (result.pathname && result.pathname.charAt(0) === '/'),
isRelAbs = (
relative.host !== undefined ||
relative.host ||
relative.pathname && relative.pathname.charAt(0) === '/'
),
mustEndAbs = (isRelAbs || isSourceAbs ||
(source.host && relative.pathname)),
(result.host && relative.pathname)),
removeAllDots = mustEndAbs,
srcPath = source.pathname && source.pathname.split('/') || [],
srcPath = result.pathname && result.pathname.split('/') || [],
relPath = relative.pathname && relative.pathname.split('/') || [],
psychotic = source.protocol &&
!slashedProtocol[source.protocol];
psychotic = result.protocol && !slashedProtocol[result.protocol];
// if the url is a non-slashed url, then relative
// links like ../.. should be able
// to crawl up to the hostname, as well. This is strange.
// source.protocol has already been set by now.
// result.protocol has already been set by now.
// Later on, put the first path part into the host field.
if (psychotic) {
delete source.hostname;
delete source.port;
if (source.host) {
if (srcPath[0] === '') srcPath[0] = source.host;
else srcPath.unshift(source.host);
result.hostname = '';
result.port = null;
if (result.host) {
if (srcPath[0] === '') srcPath[0] = result.host;
else srcPath.unshift(result.host);
}
delete source.host;
result.host = '';
if (relative.protocol) {
delete relative.hostname;
delete relative.port;
relative.hostname = null;
relative.port = null;
if (relative.host) {
if (relPath[0] === '') relPath[0] = relative.host;
else relPath.unshift(relative.host);
}
delete relative.host;
relative.host = null;
}
mustEndAbs = mustEndAbs && (relPath[0] === '' || srcPath[0] === '');
}
if (isRelAbs) {
// it's absolute.
source.host = (relative.host || relative.host === '') ?
relative.host : source.host;
source.hostname = (relative.hostname || relative.hostname === '') ?
relative.hostname : source.hostname;
source.search = relative.search;
source.query = relative.query;
result.host = (relative.host || relative.host === '') ?
relative.host : result.host;
result.hostname = (relative.hostname || relative.hostname === '') ?
relative.hostname : result.hostname;
result.search = relative.search;
result.query = relative.query;
srcPath = relPath;
// fall through to the dot-handling below.
} else if (relPath.length) {
@ -493,53 +545,55 @@ function urlResolveObject(source, relative) {
if (!srcPath) srcPath = [];
srcPath.pop();
srcPath = srcPath.concat(relPath);
source.search = relative.search;
source.query = relative.query;
} else if ('search' in relative) {
result.search = relative.search;
result.query = relative.query;
} else if (relative.search !== null && relative.search !== undefined) {
// just pull out the search.
// like href='?foo'.
// Put this after the other two cases because it simplifies the booleans
if (psychotic) {
source.hostname = source.host = srcPath.shift();
result.hostname = result.host = srcPath.shift();
//occationaly the auth can get stuck only in host
//this especialy happens in cases like
//url.resolveObject('mailto:local1@domain1', 'local2@domain2')
var authInHost = source.host && source.host.indexOf('@') > 0 ?
source.host.split('@') : false;
var authInHost = result.host && result.host.indexOf('@') > 0 ?
result.host.split('@') : false;
if (authInHost) {
source.auth = authInHost.shift();
source.host = source.hostname = authInHost.shift();
result.auth = authInHost.shift();
result.host = result.hostname = authInHost.shift();
}
}
source.search = relative.search;
source.query = relative.query;
result.search = relative.search;
result.query = relative.query;
//to support http.request
if (source.pathname !== undefined || source.search !== undefined) {
source.path = (source.pathname ? source.pathname : '') +
(source.search ? source.search : '');
if (result.pathname !== null || result.search !== null) {
result.path = (result.pathname ? result.pathname : '') +
(result.search ? result.search : '');
}
source.href = urlFormat(source);
return source;
result.href = result.format();
return result;
}
if (!srcPath.length) {
// no path at all. easy.
// we've already handled the other stuff above.
delete source.pathname;
result.pathname = null;
//to support http.request
if (!source.search) {
source.path = '/' + source.search;
if (result.search) {
result.path = '/' + result.search;
} else {
delete source.path;
result.path = null;
}
source.href = urlFormat(source);
return source;
result.href = result.format();
return result;
}
// if a url ENDs in . or .., then it must get a trailing slash.
// however, if it ends in anything else non-slashy,
// then it must NOT get a trailing slash.
var last = srcPath.slice(-1)[0];
var hasTrailingSlash = (
(source.host || relative.host) && (last === '.' || last === '..') ||
(result.host || relative.host) && (last === '.' || last === '..') ||
last === '');
// strip single dots, resolve double dots to parent dir
@ -579,47 +633,52 @@ function urlResolveObject(source, relative) {
// put the host back
if (psychotic) {
source.hostname = source.host = isAbsolute ? '' :
result.hostname = result.host = isAbsolute ? '' :
srcPath.length ? srcPath.shift() : '';
//occationaly the auth can get stuck only in host
//this especialy happens in cases like
//url.resolveObject('mailto:local1@domain1', 'local2@domain2')
var authInHost = source.host && source.host.indexOf('@') > 0 ?
source.host.split('@') : false;
var authInHost = result.host && result.host.indexOf('@') > 0 ?
result.host.split('@') : false;
if (authInHost) {
source.auth = authInHost.shift();
source.host = source.hostname = authInHost.shift();
result.auth = authInHost.shift();
result.host = result.hostname = authInHost.shift();
}
}
mustEndAbs = mustEndAbs || (source.host && srcPath.length);
mustEndAbs = mustEndAbs || (result.host && srcPath.length);
if (mustEndAbs && !isAbsolute) {
srcPath.unshift('');
}
source.pathname = srcPath.join('/');
//to support request.http
if (source.pathname !== undefined || source.search !== undefined) {
source.path = (source.pathname ? source.pathname : '') +
(source.search ? source.search : '');
}
source.auth = relative.auth || source.auth;
source.slashes = source.slashes || relative.slashes;
source.href = urlFormat(source);
return source;
}
if (!srcPath.length) {
result.pathname = null;
result.path = null;
} else {
result.pathname = srcPath.join('/');
}
function parseHost(host) {
var out = {};
//to support request.http
if (result.pathname !== null || result.search !== null) {
result.path = (result.pathname ? result.pathname : '') +
(result.search ? result.search : '');
}
result.auth = relative.auth || result.auth;
result.slashes = result.slashes || relative.slashes;
result.href = result.format();
return result;
};
Url.prototype.parseHost = function() {
var host = this.host;
var port = portPattern.exec(host);
if (port) {
port = port[0];
if (port !== ':') {
out.port = port.substr(1);
this.port = port.substr(1);
}
host = host.substr(0, host.length - port.length);
}
if (host) out.hostname = host;
return out;
}
if (host) this.hostname = host;
};

30
test/simple/test-url.js

@ -659,6 +659,12 @@ for (var u in parseTests) {
spaced = url.parse(' \t ' + u + '\n\t');
expected = parseTests[u];
Object.keys(actual).forEach(function (i) {
if (expected[i] === undefined && actual[i] === null) {
expected[i] = null;
}
});
assert.deepEqual(actual, expected);
assert.deepEqual(spaced, expected);
@ -695,6 +701,11 @@ var parseTestsWithQueryString = {
for (var u in parseTestsWithQueryString) {
var actual = url.parse(u, true);
var expected = parseTestsWithQueryString[u];
for (var i in actual) {
if (actual[i] === null && expected[i] === undefined) {
expected[i] = null;
}
}
assert.deepEqual(actual, expected);
}
@ -1227,15 +1238,6 @@ relativeTests.forEach(function(relativeTest) {
var actual = url.resolveObject(url.parse(relativeTest[0]), relativeTest[1]),
expected = url.parse(relativeTest[2]);
//because of evaluation order
//resolveObject(parse(x), y) == parse(resolve(x, y)) will differ by
//false-ish values. remove all except host and hostname
for (var i in actual) {
if (actual[i] === undefined ||
(!emptyIsImportant.hasOwnProperty(i) && !actual[i])) {
delete actual[i];
}
}
assert.deepEqual(actual, expected);
@ -1264,16 +1266,6 @@ relativeTests2.forEach(function(relativeTest) {
var actual = url.resolveObject(url.parse(relativeTest[1]), relativeTest[0]),
expected = url.parse(relativeTest[2]);
//because of evaluation order
//resolveObject(parse(x), y) == parse(resolve(x, y)) will differ by
//false-ish values. remove all except host and hostname
for (var i in actual) {
if (actual[i] === undefined ||
(!emptyIsImportant.hasOwnProperty(i) && !actual[i])) {
delete actual[i];
}
}
assert.deepEqual(actual, expected);
var expected = relativeTest[2],

Loading…
Cancel
Save