mirror of https://github.com/lukechilds/node.git
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
473 lines
12 KiB
473 lines
12 KiB
'use strict';
|
|
|
|
/* Dependencies. */
|
|
var characterEntities = require('character-entities');
|
|
var legacy = require('character-entities-legacy');
|
|
var invalid = require('character-reference-invalid');
|
|
var decimal = require('is-decimal');
|
|
var hexadecimal = require('is-hexadecimal');
|
|
var alphanumerical = require('is-alphanumerical');
|
|
|
|
/* Expose. */
|
|
module.exports = wrapper;
|
|
|
|
/* Method aliases. */

/* Used via `own.call(object, key)` so that lookups in the entity maps
 * only match own keys. */
var own = Object.prototype.hasOwnProperty;

/* Turns UTF-16 code units into a string. */
var fromCharCode = String.fromCharCode;

/* Callable that accepts anything and returns `undefined`; stands in for
 * the warning function when no warning handler is supplied. */
var noop = Function.prototype;
|
|
|
|
/* Characters. */

/* Pieces of character reference syntax. */
var AMPERSAND = '&';
var OCTOTHORP = '#';
var X_LOWER = 'x';
var X_UPPER = 'X';
var SEMICOLON = ';';

/* Characters which, directly after an ampersand, mean that the
 * ampersand does not start a character reference (`AMPERSAND` above
 * belongs to this set as well). `EMPTY` doubles as the end-of-input
 * marker and as the initial value of string accumulators. */
var TAB = '\t';
var NEWLINE = '\n';
var FORM_FEED = '\f';
var SPACE = ' ';
var LESS_THAN = '<';
var EMPTY = '';

/* Inspected after a non-terminated named reference in attribute mode. */
var EQUAL = '=';

/* Substituted for prohibited numeric references. */
var REPLACEMENT = '\uFFFD';
|
|
|
|
/* Default settings. Every key listed here is copied onto the settings
 * object handed to `parse`; an option that is `null` or `undefined`
 * falls back to the value below. */
var defaults = {
  /* Handler invoked as `(message, position, code)` for each warning. */
  warning: null,
  /* Handler invoked as `(reference, location, source)` for each decoded
   * reference. */
  reference: null,
  /* Handler invoked as `(text, location)` for each run of plain text. */
  text: null,
  /* `this` values used when invoking the three handlers above. */
  warningContext: null,
  referenceContext: null,
  textContext: null,
  /* Either a point (`line`, `column`, `offset`) or an object carrying
   * `start` and/or `indent`. */
  position: {},
  /* Extra character which, directly after an ampersand, prevents a
   * reference from being parsed. */
  additional: null,
  /* Switches the handling of non-terminated named references to the
   * attribute rules. */
  attribute: false,
  /* Whether references without a closing semicolon are decoded. */
  nonTerminated: true
};
|
|
|
|
/* Reference types. */
var NAMED = 'named';
var HEXADECIMAL = 'hexadecimal';
var DECIMAL = 'decimal';

/* Map of numeric reference types to the radix handed to `parseInt`.
 * The keys are the values of `HEXADECIMAL` and `DECIMAL`. */
var BASE = {
  hexadecimal: 16,
  decimal: 10
};
|
|
|
|
/* Map of reference types to character tests. Each type of character
 * reference accepts different characters; the test for the current
 * type decides where the body of a reference stops, which matters
 * because the closing semicolon is not strictly needed. The keys are
 * the values of `NAMED`, `DECIMAL`, and `HEXADECIMAL`. */
var TESTS = {
  named: alphanumerical,
  decimal: decimal,
  hexadecimal: hexadecimal
};
|
|
|
|
/* Warning messages. */

/* Fragments the messages are assembled from. */
var NAMED_REFERENCE = 'Named character references';
var NUMERIC_REFERENCE = 'Numeric character references';
var TERMINATED = ' must be terminated by a semicolon';
var VOID = ' cannot be empty';

/* Warning codes; also passed to the warning handler. */
var NAMED_NOT_TERMINATED = 1;
var NUMERIC_NOT_TERMINATED = 2;
var NAMED_EMPTY = 3;
var NUMERIC_EMPTY = 4;
var NAMED_UNKNOWN = 5;
var NUMERIC_DISALLOWED = 6;
var NUMERIC_PROHIBITED = 7;

/* Map of warning codes to human readable messages. */
var MESSAGES = {};

MESSAGES[NAMED_NOT_TERMINATED] = NAMED_REFERENCE + TERMINATED;
MESSAGES[NUMERIC_NOT_TERMINATED] = NUMERIC_REFERENCE + TERMINATED;
MESSAGES[NAMED_EMPTY] = NAMED_REFERENCE + VOID;
MESSAGES[NUMERIC_EMPTY] = NUMERIC_REFERENCE + VOID;
MESSAGES[NAMED_UNKNOWN] = NAMED_REFERENCE + ' must be known';
MESSAGES[NUMERIC_DISALLOWED] = NUMERIC_REFERENCE + ' cannot be disallowed';
MESSAGES[NUMERIC_PROHIBITED] = NUMERIC_REFERENCE + ' cannot be outside the ' +
  'permissible Unicode range';
|
|
|
|
/* Normalise `options` into a complete settings object and hand it,
 * together with `value`, to `parse`.
 *
 * Every key of `defaults` is copied over; an option that is `null` or
 * `undefined` takes the default. When `position` carries `start` or
 * `indent`, it is split up: `settings.indent` receives the indent list
 * (or an empty list) and `settings.position` becomes the start point.
 *
 * Returns whatever `parse` returns. */
function wrapper(value, options) {
  var given = options || {};
  var settings = {};
  var supplied;
  var location;
  var name;

  for (name in defaults) {
    supplied = given[name];

    if (supplied === null || supplied === undefined) {
      settings[name] = defaults[name];
    } else {
      settings[name] = supplied;
    }
  }

  location = settings.position;

  if (location.indent || location.start) {
    settings.indent = location.indent || [];
    settings.position = location.start;
  }

  return parse(value, settings);
}
|
|
|
|
/* Parse entities.
 *
 * Walks over `value` one character at a time. Plain text is collected
 * in a queue; each ampersand is examined to see whether it starts a
 * named (`&amp;`), decimal (`&#38;`), or hexadecimal (`&#x26;`)
 * character reference. Decoded references and flushed text are
 * gathered in order and joined into the returned string.
 *
 * `settings` is the normalised object built by `wrapper`:
 *   - `text`, `reference`, `warning`: optional handlers, invoked with
 *     `textContext`, `referenceContext`, `warningContext` as `this`.
 *     `text` receives `(text, {start, end})`, `reference` receives
 *     `(reference, {start, end}, source)`, and `warning` receives
 *     `(message, position, code)`.
 *   - `position`: point (`line`, `column`, `offset`) at which `value`
 *     starts; missing line and column default to `1`, offset to `0`.
 *   - `indent`: optional list of columns at which each next line
 *     starts.
 *   - `additional`: extra character which, directly after an
 *     ampersand, prevents a reference from being parsed.
 *   - `attribute`: apply the attribute rules to non-terminated named
 *     references.
 *   - `nonTerminated`: when falsy, references without a closing
 *     semicolon are left as text.
 *
 * Returns the decoded string. */
function parse(value, settings) {
  var additional = settings.additional;
  var nonTerminated = settings.nonTerminated;
  var handleText = settings.text;
  var handleReference = settings.reference;
  var handleWarning = settings.warning;
  var textContext = settings.textContext;
  var referenceContext = settings.referenceContext;
  var warningContext = settings.warningContext;
  var pos = settings.position;
  var indent = settings.indent || [];
  var length = value.length;
  var index = 0;
  var lines = -1;
  var column = pos.column || 1;
  var line = pos.line || 1;
  var queue = EMPTY;
  var result = [];
  var entityCharacters;
  var terminated;
  var characters;
  var character;
  var reference;
  var following;
  var warning;
  var reason;
  var output;
  var entity;
  var begin;
  var start;
  var type;
  var test;
  var prev;
  var next;
  var diff;
  var end;

  /* Cache the current point. */
  prev = now();

  /* Wrap `handleWarning`. */
  warning = handleWarning ? parseError : noop;

  /* Ensure the algorithm walks over the first character
   * and the end (inclusive). */
  index--;
  length++;

  while (++index < length) {
    /* If the previous character was a newline. */
    if (character === NEWLINE) {
      column = indent[lines] || 1;
    }

    character = at(index);

    /* Handle anything other than an ampersand,
     * including newlines and EOF. */
    if (character !== AMPERSAND) {
      if (character === NEWLINE) {
        line++;
        lines++;
        column = 0;
      }

      if (character) {
        queue += character;
        column++;
      } else {
        /* `at` yields an empty string past the end of `value`. */
        flush();
      }
    } else {
      following = at(index + 1);

      /* The behaviour depends on the identity of the next
       * character. */
      if (
        following === TAB ||
        following === NEWLINE ||
        following === FORM_FEED ||
        following === SPACE ||
        following === LESS_THAN ||
        following === AMPERSAND ||
        following === EMPTY ||
        (additional && following === additional)
      ) {
        /* Not a character reference. No characters
         * are consumed, and nothing is returned.
         * This is not an error, either. */
        queue += character;
        column++;

        continue;
      }

      /* `start` is the index right after the ampersand; `begin` is
       * where the body of the reference starts (after `#` or `#x`
       * for numeric references). */
      start = index + 1;
      begin = start;
      end = start;

      if (following !== OCTOTHORP) {
        /* Named entity. */
        type = NAMED;
      } else {
        /* Numerical entity. */
        end = ++begin;

        /* The behaviour further depends on the
         * character after the U+0023 NUMBER SIGN. */
        following = at(end);

        if (following === X_LOWER || following === X_UPPER) {
          /* ASCII hex digits. */
          type = HEXADECIMAL;
          end = ++begin;
        } else {
          /* ASCII digits. */
          type = DECIMAL;
        }
      }

      entityCharacters = EMPTY;
      entity = EMPTY;
      characters = EMPTY;

      /* Forget the reference decoded for a previous ampersand.
       * Two of the branches below assign nothing to `reference`;
       * without this reset they would emit the earlier replacement
       * again and drop the source text of this candidate. */
      reference = EMPTY;

      test = TESTS[type];
      end--;

      while (++end < length) {
        following = at(end);

        if (!test(following)) {
          break;
        }

        characters += following;

        /* Check if we can match a legacy named
         * reference. If so, we cache that as the
         * last viable named reference. This
         * ensures we do not need to walk backwards
         * later. */
        if (type === NAMED && own.call(legacy, characters)) {
          entityCharacters = characters;
          entity = legacy[characters];
        }
      }

      terminated = at(end) === SEMICOLON;

      if (terminated) {
        end++;

        if (type === NAMED && own.call(characterEntities, characters)) {
          entityCharacters = characters;
          entity = characterEntities[characters];
        }
      }

      diff = 1 + end - start;

      if (!terminated && !nonTerminated) {
        /* Empty: non-terminated references are not decoded when
         * `nonTerminated` is off, so the text is kept as is. */
      } else if (!characters) {
        /* An empty (possible) entity is valid, unless
         * it's numeric (thus an ampersand followed by
         * an octothorp). */
        if (type !== NAMED) {
          warning(NUMERIC_EMPTY, diff);
        }
      } else if (type === NAMED) {
        /* A terminated reference whose name is not known
         * triggers a warning. */
        if (terminated && !entity) {
          warning(NAMED_UNKNOWN, 1);
        } else {
          /* If there's something after an entity
           * name which is not known, cap the
           * reference. */
          if (entityCharacters !== characters) {
            end = begin + entityCharacters.length;
            diff = 1 + end - begin;
            terminated = false;
          }

          /* If the reference is not terminated,
           * warn. */
          if (!terminated) {
            reason = entityCharacters ?
              NAMED_NOT_TERMINATED :
              NAMED_EMPTY;

            if (!settings.attribute) {
              warning(reason, diff);
            } else {
              following = at(end);

              /* In attribute mode, a following `=` or
               * alphanumerical character means the
               * reference is not decoded; only the `=`
               * case warns. */
              if (following === EQUAL) {
                warning(reason, diff);
                entity = null;
              } else if (alphanumerical(following)) {
                entity = null;
              } else {
                warning(reason, diff);
              }
            }
          }
        }

        reference = entity;
      } else {
        if (!terminated) {
          /* Non-terminated numeric references trigger a
           * warning. */
          warning(NUMERIC_NOT_TERMINATED, diff);
        }

        /* Parse the digits as either hexadecimal or
         * decimal. */
        reference = parseInt(characters, BASE[type]);

        /* Trigger a warning when the parsed number
         * is prohibited, and replace with
         * replacement character. */
        if (isProhibited(reference)) {
          warning(NUMERIC_PROHIBITED, diff);

          reference = REPLACEMENT;
        } else if (reference in invalid) {
          /* Trigger a warning when the parsed number
           * is disallowed, and replace by an
           * alternative. */
          warning(NUMERIC_DISALLOWED, diff);

          reference = invalid[reference];
        } else {
          /* Parse the number. */
          output = EMPTY;

          /* Trigger a warning when the parsed
           * number should not be used. */
          if (isWarning(reference)) {
            warning(NUMERIC_DISALLOWED, diff);
          }

          /* Stringify the number. Code points above
           * U+FFFF are emitted as a surrogate pair: the
           * high ten bits form the lead surrogate, the
           * low ten bits the trail surrogate. */
          if (reference > 0xFFFF) {
            reference -= 0x10000;
            output += fromCharCode(((reference >>> 10) & 0x3FF) | 0xD800);
            reference = 0xDC00 | (reference & 0x3FF);
          }

          reference = output + fromCharCode(reference);
        }
      }

      /* If we could not find a reference, queue the
       * checked characters (as normal characters),
       * and move the pointer to their end. This is
       * possible because we can be certain neither
       * newlines nor ampersands are included. */
      if (!reference) {
        characters = value.slice(start - 1, end);
        queue += characters;
        column += characters.length;
        index = end - 1;
      } else {
        /* Found it! First eat the queued
         * characters as normal text, then eat
         * an entity. */
        flush();

        prev = now();
        index = end - 1;
        column += end - start + 1;
        result.push(reference);
        next = now();

        /* `index` points at the last consumed character;
         * the end point lies one past it. */
        next.offset++;

        if (handleReference) {
          handleReference.call(referenceContext, reference, {
            start: prev,
            end: next
          }, value.slice(start - 1, end));
        }

        prev = next;
      }
    }
  }

  /* Return the reduced nodes, and any possible warnings. */
  return result.join(EMPTY);

  /* Get current position. */
  function now() {
    return {
      line: line,
      column: column,
      offset: index + (pos.offset || 0)
    };
  }

  /* “Throw” a parse-error: a warning. The reported position is the
   * current point moved forward by `offset` characters. */
  function parseError(code, offset) {
    var position = now();

    position.column += offset;
    position.offset += offset;

    handleWarning.call(warningContext, MESSAGES[code], position, code);
  }

  /* Get character at position. */
  function at(position) {
    return value.charAt(position);
  }

  /* Flush `queue` (normal text). Macro invoked before
   * each entity and at the end of `value`.
   * Does nothing when `queue` is empty. */
  function flush() {
    if (queue) {
      result.push(queue);

      if (handleText) {
        handleText.call(textContext, queue, {
          start: prev,
          end: now()
        });
      }

      queue = EMPTY;
    }
  }
}
|
|
|
|
/* Check if `code` is outside the permissible unicode range: either
 * in the surrogate block (U+D800 to U+DFFF) or above U+10FFFF. */
function isProhibited(code) {
  var surrogate = code >= 0xD800 && code <= 0xDFFF;
  var beyondLast = code > 0x10FFFF;

  return surrogate || beyondLast;
}
|
|
|
|
/* Check if `code` is disallowed: it falls in one of the listed
 * control or U+FDD0 to U+FDEF ranges, or its low sixteen bits are
 * `FFFE` or `FFFF`. */
function isWarning(code) {
  var low = code & 0xFFFF;

  if (low === 0xFFFF || low === 0xFFFE) {
    return true;
  }

  if (code === 0x000B) {
    return true;
  }

  return (
    (code >= 0x0001 && code <= 0x0008) ||
    (code >= 0x000D && code <= 0x001F) ||
    (code >= 0x007F && code <= 0x009F) ||
    (code >= 0xFDD0 && code <= 0xFDEF)
  );
}
|
|
|