You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

473 lines
12 KiB

'use strict';
/* Dependencies. */
var characterEntities = require('character-entities');
var legacy = require('character-entities-legacy');
var invalid = require('character-reference-invalid');
var decimal = require('is-decimal');
var hexadecimal = require('is-hexadecimal');
var alphanumerical = require('is-alphanumerical');
/* Expose. */
module.exports = wrapper;
/* Methods. */
var own = {}.hasOwnProperty;
var fromCharCode = String.fromCharCode;
var noop = Function.prototype;
/* Characters. */
var REPLACEMENT = '\uFFFD';
var FORM_FEED = '\f';
var AMPERSAND = '&';
var OCTOTHORP = '#';
var SEMICOLON = ';';
var NEWLINE = '\n';
var X_LOWER = 'x';
var X_UPPER = 'X';
var SPACE = ' ';
var LESS_THAN = '<';
var EQUAL = '=';
var EMPTY = '';
var TAB = '\t';
/* Default settings. */
var defaults = {
warning: null,
reference: null,
text: null,
warningContext: null,
referenceContext: null,
textContext: null,
position: {},
additional: null,
attribute: false,
nonTerminated: true
};
/* Reference types. */
var NAMED = 'named';
var HEXADECIMAL = 'hexadecimal';
var DECIMAL = 'decimal';
/* Map of bases. */
var BASE = {};
BASE[HEXADECIMAL] = 16;
BASE[DECIMAL] = 10;
/* Map of types to tests. Each type of character reference
* accepts different characters. This test is used to
* detect whether a reference has ended (as the semicolon
* is not strictly needed). */
var TESTS = {};
TESTS[NAMED] = alphanumerical;
TESTS[DECIMAL] = decimal;
TESTS[HEXADECIMAL] = hexadecimal;
/* Warning messages. */
var NAMED_NOT_TERMINATED = 1;
var NUMERIC_NOT_TERMINATED = 2;
var NAMED_EMPTY = 3;
var NUMERIC_EMPTY = 4;
var NAMED_UNKNOWN = 5;
var NUMERIC_DISALLOWED = 6;
var NUMERIC_PROHIBITED = 7;
var NUMERIC_REFERENCE = 'Numeric character references';
var NAMED_REFERENCE = 'Named character references';
var TERMINATED = ' must be terminated by a semicolon';
var VOID = ' cannot be empty';
var MESSAGES = {};
MESSAGES[NAMED_NOT_TERMINATED] = NAMED_REFERENCE + TERMINATED;
MESSAGES[NUMERIC_NOT_TERMINATED] = NUMERIC_REFERENCE + TERMINATED;
MESSAGES[NAMED_EMPTY] = NAMED_REFERENCE + VOID;
MESSAGES[NUMERIC_EMPTY] = NUMERIC_REFERENCE + VOID;
MESSAGES[NAMED_UNKNOWN] = NAMED_REFERENCE + ' must be known';
MESSAGES[NUMERIC_DISALLOWED] = NUMERIC_REFERENCE + ' cannot be disallowed';
MESSAGES[NUMERIC_PROHIBITED] = NUMERIC_REFERENCE + ' cannot be outside the ' +
'permissible Unicode range';
/* Wrap to ensure clean parameters are given to `parse`. */
function wrapper(value, options) {
var settings = {};
var option;
var key;
if (!options) {
options = {};
}
for (key in defaults) {
option = options[key];
settings[key] = option === null || option === undefined ? defaults[key] : option;
}
if (settings.position.indent || settings.position.start) {
settings.indent = settings.position.indent || [];
settings.position = settings.position.start;
}
return parse(value, settings);
}
/* Parse entities. */
function parse(value, settings) {
var additional = settings.additional;
var nonTerminated = settings.nonTerminated;
var handleText = settings.text;
var handleReference = settings.reference;
var handleWarning = settings.warning;
var textContext = settings.textContext;
var referenceContext = settings.referenceContext;
var warningContext = settings.warningContext;
var pos = settings.position;
var indent = settings.indent || [];
var length = value.length;
var index = 0;
var lines = -1;
var column = pos.column || 1;
var line = pos.line || 1;
var queue = EMPTY;
var result = [];
var entityCharacters;
var terminated;
var characters;
var character;
var reference;
var following;
var warning;
var reason;
var output;
var entity;
var begin;
var start;
var type;
var test;
var prev;
var next;
var diff;
var end;
/* Cache the current point. */
prev = now();
/* Wrap `handleWarning`. */
warning = handleWarning ? parseError : noop;
/* Ensure the algorithm walks over the first character
* and the end (inclusive). */
index--;
length++;
while (++index < length) {
/* If the previous character was a newline. */
if (character === NEWLINE) {
column = indent[lines] || 1;
}
character = at(index);
/* Handle anything other than an ampersand,
* including newlines and EOF. */
if (character !== AMPERSAND) {
if (character === NEWLINE) {
line++;
lines++;
column = 0;
}
if (character) {
queue += character;
column++;
} else {
flush();
}
} else {
following = at(index + 1);
/* The behaviour depends on the identity of the next
* character. */
if (
following === TAB ||
following === NEWLINE ||
following === FORM_FEED ||
following === SPACE ||
following === LESS_THAN ||
following === AMPERSAND ||
following === EMPTY ||
(additional && following === additional)
) {
/* Not a character reference. No characters
* are consumed, and nothing is returned.
* This is not an error, either. */
queue += character;
column++;
continue;
}
start = index + 1;
begin = start;
end = start;
/* Numerical entity. */
if (following !== OCTOTHORP) {
type = NAMED;
} else {
end = ++begin;
/* The behaviour further depends on the
* character after the U+0023 NUMBER SIGN. */
following = at(end);
if (following === X_LOWER || following === X_UPPER) {
/* ASCII hex digits. */
type = HEXADECIMAL;
end = ++begin;
} else {
/* ASCII digits. */
type = DECIMAL;
}
}
entityCharacters = EMPTY;
entity = EMPTY;
characters = EMPTY;
test = TESTS[type];
end--;
while (++end < length) {
following = at(end);
if (!test(following)) {
break;
}
characters += following;
/* Check if we can match a legacy named
* reference. If so, we cache that as the
* last viable named reference. This
* ensures we do not need to walk backwards
* later. */
if (type === NAMED && own.call(legacy, characters)) {
entityCharacters = characters;
entity = legacy[characters];
}
}
terminated = at(end) === SEMICOLON;
if (terminated) {
end++;
if (type === NAMED && own.call(characterEntities, characters)) {
entityCharacters = characters;
entity = characterEntities[characters];
}
}
diff = 1 + end - start;
if (!terminated && !nonTerminated) {
/* Empty. */
} else if (!characters) {
/* An empty (possible) entity is valid, unless
* its numeric (thus an ampersand followed by
* an octothorp). */
if (type !== NAMED) {
warning(NUMERIC_EMPTY, diff);
}
} else if (type === NAMED) {
/* An ampersand followed by anything
* unknown, and not terminated, is invalid. */
if (terminated && !entity) {
warning(NAMED_UNKNOWN, 1);
} else {
/* If theres something after an entity
* name which is not known, cap the
* reference. */
if (entityCharacters !== characters) {
end = begin + entityCharacters.length;
diff = 1 + end - begin;
terminated = false;
}
/* If the reference is not terminated,
* warn. */
if (!terminated) {
reason = entityCharacters ?
NAMED_NOT_TERMINATED :
NAMED_EMPTY;
if (!settings.attribute) {
warning(reason, diff);
} else {
following = at(end);
if (following === EQUAL) {
warning(reason, diff);
entity = null;
} else if (alphanumerical(following)) {
entity = null;
} else {
warning(reason, diff);
}
}
}
}
reference = entity;
} else {
if (!terminated) {
/* All non-terminated numeric entities are
* not rendered, and trigger a warning. */
warning(NUMERIC_NOT_TERMINATED, diff);
}
/* When terminated and number, parse as
* either hexadecimal or decimal. */
reference = parseInt(characters, BASE[type]);
/* Trigger a warning when the parsed number
* is prohibited, and replace with
* replacement character. */
if (isProhibited(reference)) {
warning(NUMERIC_PROHIBITED, diff);
reference = REPLACEMENT;
} else if (reference in invalid) {
/* Trigger a warning when the parsed number
* is disallowed, and replace by an
* alternative. */
warning(NUMERIC_DISALLOWED, diff);
reference = invalid[reference];
} else {
/* Parse the number. */
output = EMPTY;
/* Trigger a warning when the parsed
* number should not be used. */
if (isWarning(reference)) {
warning(NUMERIC_DISALLOWED, diff);
}
/* Stringify the number. */
if (reference > 0xFFFF) {
reference -= 0x10000;
output += fromCharCode((reference >>> (10 & 0x3FF)) | 0xD800);
reference = 0xDC00 | (reference & 0x3FF);
}
reference = output + fromCharCode(reference);
}
}
/* If we could not find a reference, queue the
* checked characters (as normal characters),
* and move the pointer to their end. This is
* possible because we can be certain neither
* newlines nor ampersands are included. */
if (!reference) {
characters = value.slice(start - 1, end);
queue += characters;
column += characters.length;
index = end - 1;
} else {
/* Found it! First eat the queued
* characters as normal text, then eat
* an entity. */
flush();
prev = now();
index = end - 1;
column += end - start + 1;
result.push(reference);
next = now();
next.offset++;
if (handleReference) {
handleReference.call(referenceContext, reference, {
start: prev,
end: next
}, value.slice(start - 1, end));
}
prev = next;
}
}
}
/* Return the reduced nodes, and any possible warnings. */
return result.join(EMPTY);
/* Get current position. */
function now() {
return {
line: line,
column: column,
offset: index + (pos.offset || 0)
};
}
/* “Throw” a parse-error: a warning. */
function parseError(code, offset) {
var position = now();
position.column += offset;
position.offset += offset;
handleWarning.call(warningContext, MESSAGES[code], position, code);
}
/* Get character at position. */
function at(position) {
return value.charAt(position);
}
/* Flush `queue` (normal text). Macro invoked before
* each entity and at the end of `value`.
* Does nothing when `queue` is empty. */
function flush() {
if (queue) {
result.push(queue);
if (handleText) {
handleText.call(textContext, queue, {
start: prev,
end: now()
});
}
queue = EMPTY;
}
}
}
/* Check if `character` is outside the permissible
* unicode range. */
function isProhibited(code) {
return (code >= 0xD800 && code <= 0xDFFF) || (code > 0x10FFFF);
}
/* Check if `character` is disallowed. */
function isWarning(code) {
if (
(code >= 0x0001 && code <= 0x0008) ||
code === 0x000B ||
(code >= 0x000D && code <= 0x001F) ||
(code >= 0x007F && code <= 0x009F) ||
(code >= 0xFDD0 && code <= 0xFDEF) ||
(code & 0xFFFF) === 0xFFFF ||
(code & 0xFFFF) === 0xFFFE
) {
return true;
}
return false;
}