Browse Source

readline: use icu based string width calculation

Rather than the pseudo-wcwidth impl used currently, use the ICU
character properties database to calculate string width and
determine if a character is full width or not. This allows the
algorithm to correctly identify emojis as full width, ensures
the algorithm will continue to function properly as new unicode
codepoints are added, and it's faster.

This was originally part of a proposal to add a new unicode module,
but has been split out.

Refs: https://github.com/nodejs/node/pull/8075
PR-URL: https://github.com/nodejs/node/pull/9040
Reviewed-By: Ben Noordhuis <info@bnoordhuis.nl>
Reviewed-By: Steven R Loomis <srloomis@us.ibm.com>
v7.x
James M Snell 8 years ago
committed by Evan Lucas
parent
commit
a5c62cb4f2
  1. 160
      lib/internal/readline.js
  2. 8
      lib/readline.js
  3. 90
      src/node_i18n.cc
  4. 43
      test/parallel/test-icu-stringwidth.js

160
lib/internal/readline.js

@ -1,103 +1,117 @@
'use strict'; 'use strict';
// Regexes used for ansi escape code splitting // Regex used for ansi escape code splitting
// eslint-disable-next-line no-control-regex // eslint-disable-next-line no-control-regex
const metaKeyCodeReAnywhere = /(?:\x1b)([a-zA-Z0-9])/; // Adopted from https://github.com/chalk/ansi-regex/blob/master/index.js
const functionKeyCodeReAnywhere = new RegExp('(?:\x1b+)(O|N|\\[|\\[\\[)(?:' + [ // License: MIT, authors: @sindresorhus, Qix-, and arjunmehta
'(\\d+)(?:;(\\d+))?([~^$])', // Matches all ansi escape code sequences in a string
'(?:M([@ #!a`])(.)(.))', // mouse const ansi =
'(?:1;)?(\\d+)?([a-zA-Z])' /[\u001b\u009b][[()#;?]*(?:[0-9]{1,4}(?:;[0-9]{0,4})*)?[0-9A-ORZcf-nqry=><]/g;
].join('|') + ')');
module.exports = { module.exports = {
emitKeys, emitKeys,
getStringWidth,
isFullWidthCodePoint,
stripVTControlCharacters stripVTControlCharacters
}; };
if (process.binding('config').hasIntl) {
const icu = process.binding('icu');
// Returns the number of columns required to display `str`, as computed by
// the ICU binding.
//   str     - An integer is handed to the binding unchanged (the native side
//             measures a number as a single code point); any other value is
//             stringified and stripped of ANSI escape sequences first.
//   options - Optional. `ambiguousAsFullWidth` and `expandEmojiSequence` are
//             coerced to booleans and forwarded to the binding.
module.exports.getStringWidth = function getStringWidth(str, options) {
  const opts = options || {};
  const input = Number.isInteger(str) ?
    str : stripVTControlCharacters(String(str));
  return icu.getStringWidth(
    input,
    Boolean(opts.ambiguousAsFullWidth),
    Boolean(opts.expandEmojiSequence)
  );
};
// Returns true if the ICU binding reports a column width of 2 for the given
// code point; returns false for any non-number input.
//   code    - The code point to test.
//   options - Optional. When `options.ambiguousAsFullWidth` is truthy,
//             East Asian "ambiguous" code points are counted as full width.
module.exports.isFullWidthCodePoint =
  function isFullWidthCodePoint(code, options) {
    if (typeof code !== 'number')
      return false;
    // The binding coerces its second argument to a boolean flag. Passing the
    // options object itself would always coerce to true (even for `{}`), so
    // extract the actual option value instead.
    const ambiguousAsFullWidth =
      Boolean(options && options.ambiguousAsFullWidth);
    return icu.getStringWidth(code, ambiguousAsFullWidth) === 2;
  };
} else {
/**
* Returns the number of columns required to display the given string.
*/
module.exports.getStringWidth = function getStringWidth(str) {
if (Number.isInteger(str))
return module.exports.isFullWidthCodePoint(str) ? 2 : 1;
/** let width = 0;
* Returns the number of columns required to display the given string.
*/
function getStringWidth(str) {
let width = 0;
str = stripVTControlCharacters(str); str = stripVTControlCharacters(String(str));
for (var i = 0; i < str.length; i++) { for (var i = 0; i < str.length; i++) {
const code = str.codePointAt(i); const code = str.codePointAt(i);
if (code >= 0x10000) { // surrogates if (code >= 0x10000) { // surrogates
i++; i++;
} }
if (isFullWidthCodePoint(code)) { if (module.exports.isFullWidthCodePoint(code)) {
width += 2; width += 2;
} else { } else {
width++; width++;
}
} }
}
return width;
}
return width;
};
/** /**
* Returns true if the character represented by a given * Returns true if the character represented by a given
* Unicode code point is full-width. Otherwise returns false. * Unicode code point is full-width. Otherwise returns false.
*/ */
function isFullWidthCodePoint(code) { module.exports.isFullWidthCodePoint = function isFullWidthCodePoint(code) {
if (isNaN(code)) { if (!Number.isInteger(code)) {
return false; return false;
} }
// Code points are derived from: // Code points are derived from:
// http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt // http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
if (code >= 0x1100 && ( if (code >= 0x1100 && (
code <= 0x115f || // Hangul Jamo code <= 0x115f || // Hangul Jamo
0x2329 === code || // LEFT-POINTING ANGLE BRACKET 0x2329 === code || // LEFT-POINTING ANGLE BRACKET
0x232a === code || // RIGHT-POINTING ANGLE BRACKET 0x232a === code || // RIGHT-POINTING ANGLE BRACKET
// CJK Radicals Supplement .. Enclosed CJK Letters and Months // CJK Radicals Supplement .. Enclosed CJK Letters and Months
(0x2e80 <= code && code <= 0x3247 && code !== 0x303f) || (0x2e80 <= code && code <= 0x3247 && code !== 0x303f) ||
// Enclosed CJK Letters and Months .. CJK Unified Ideographs Extension A // Enclosed CJK Letters and Months .. CJK Unified Ideographs Extension A
0x3250 <= code && code <= 0x4dbf || 0x3250 <= code && code <= 0x4dbf ||
// CJK Unified Ideographs .. Yi Radicals // CJK Unified Ideographs .. Yi Radicals
0x4e00 <= code && code <= 0xa4c6 || 0x4e00 <= code && code <= 0xa4c6 ||
// Hangul Jamo Extended-A // Hangul Jamo Extended-A
0xa960 <= code && code <= 0xa97c || 0xa960 <= code && code <= 0xa97c ||
// Hangul Syllables // Hangul Syllables
0xac00 <= code && code <= 0xd7a3 || 0xac00 <= code && code <= 0xd7a3 ||
// CJK Compatibility Ideographs // CJK Compatibility Ideographs
0xf900 <= code && code <= 0xfaff || 0xf900 <= code && code <= 0xfaff ||
// Vertical Forms // Vertical Forms
0xfe10 <= code && code <= 0xfe19 || 0xfe10 <= code && code <= 0xfe19 ||
// CJK Compatibility Forms .. Small Form Variants // CJK Compatibility Forms .. Small Form Variants
0xfe30 <= code && code <= 0xfe6b || 0xfe30 <= code && code <= 0xfe6b ||
// Halfwidth and Fullwidth Forms // Halfwidth and Fullwidth Forms
0xff01 <= code && code <= 0xff60 || 0xff01 <= code && code <= 0xff60 ||
0xffe0 <= code && code <= 0xffe6 || 0xffe0 <= code && code <= 0xffe6 ||
// Kana Supplement // Kana Supplement
0x1b000 <= code && code <= 0x1b001 || 0x1b000 <= code && code <= 0x1b001 ||
// Enclosed Ideographic Supplement // Enclosed Ideographic Supplement
0x1f200 <= code && code <= 0x1f251 || 0x1f200 <= code && code <= 0x1f251 ||
// CJK Unified Ideographs Extension B .. Tertiary Ideographic Plane // CJK Unified Ideographs Extension B .. Tertiary Ideographic Plane
0x20000 <= code && code <= 0x3fffd)) { 0x20000 <= code && code <= 0x3fffd)) {
return true; return true;
} }
return false; return false;
};
} }
/** /**
* Tries to remove all VT control characters. Use to estimate displayed * Tries to remove all VT control characters. Use to estimate displayed
* string width. May be buggy due to not running a real state machine * string width. May be buggy due to not running a real state machine
*/ */
function stripVTControlCharacters(str) { function stripVTControlCharacters(str) {
str = str.replace(new RegExp(functionKeyCodeReAnywhere.source, 'g'), ''); return str.replace(ansi, '');
return str.replace(new RegExp(metaKeyCodeReAnywhere.source, 'g'), '');
} }

8
lib/readline.js

@ -124,6 +124,14 @@ function Interface(input, output, completer, terminal) {
function onkeypress(s, key) { function onkeypress(s, key) {
self._ttyWrite(s, key); self._ttyWrite(s, key);
if (key && key.sequence) {
// if the key.sequence is half of a surrogate pair
// (>= 0xd800 and <= 0xdfff), refresh the line so
// the character is displayed appropriately.
const ch = key.sequence.codePointAt(0);
if (ch >= 0xd800 && ch <= 0xdfff)
self._refreshLine();
}
} }
function onresize() { function onresize() {

90
src/node_i18n.cc

@ -31,6 +31,7 @@
#include "v8.h" #include "v8.h"
#include <unicode/putil.h> #include <unicode/putil.h>
#include <unicode/uchar.h>
#include <unicode/udata.h> #include <unicode/udata.h>
#include <unicode/uidna.h> #include <unicode/uidna.h>
@ -185,6 +186,94 @@ static void ToASCII(const FunctionCallbackInfo<Value>& args) {
len).ToLocalChecked()); len).ToLocalChecked());
} }
// This is similar to wcwidth except that it takes the current unicode
// character properties database into consideration, allowing it to
// correctly calculate the column widths of things like emojis and
// newer wide characters. wcwidth, on the other hand, uses a fixed
// algorithm that does not take things like emoji into proper
// consideration.
//
// Returns 0, 1 or 2: the number of columns the codepoint is counted as.
// When ambiguous_as_full_width is true, codepoints whose East Asian Width
// is "ambiguous" are counted as 2 columns.
static int GetColumnWidth(UChar32 codepoint,
                          bool ambiguous_as_full_width = false) {
  // Undefined codepoints, control characters, combining marks and emoji
  // modifiers take up no columns of their own.
  const bool zero_width =
      !u_isdefined(codepoint) ||
      u_iscntrl(codepoint) ||
      u_getCombiningClass(codepoint) > 0 ||
      u_hasBinaryProperty(codepoint, UCHAR_EMOJI_MODIFIER);
  if (zero_width)
    return 0;

  // UCHAR_EAST_ASIAN_WIDTH is the Unicode property that identifies a
  // codepoint as being full width, wide, ambiguous, neutral, narrow,
  // or halfwidth.
  const int eaw = u_getIntPropertyValue(codepoint, UCHAR_EAST_ASIAN_WIDTH);

  if (eaw == U_EA_FULLWIDTH || eaw == U_EA_WIDE)
    return 2;

  // See: http://www.unicode.org/reports/tr11/#Ambiguous for details
  if (eaw == U_EA_AMBIGUOUS && ambiguous_as_full_width)
    return 2;

  // Ambiguous (when not forced to full width) and neutral codepoints are
  // still two columns wide if they have emoji presentation.
  if ((eaw == U_EA_AMBIGUOUS || eaw == U_EA_NEUTRAL) &&
      u_hasBinaryProperty(codepoint, UCHAR_EMOJI_PRESENTATION)) {
    return 2;
  }

  // Halfwidth, narrow, and everything else.
  return 1;
}
// Returns the column width for the given String.
//
//   args[0] - The value to measure. A Number is measured as a single
//             codepoint; any other value is read through TwoByteValue and
//             walked as UTF-16.
//   args[1] - Coerced to bool: count East Asian "ambiguous" codepoints as
//             two columns.
//   args[2] - Coerced to bool: count every codepoint of an emoji ZWJ
//             sequence separately instead of collapsing the sequence.
//
// If called with no arguments, no return value is set.
static void GetStringWidth(const FunctionCallbackInfo<Value>& args) {
  Environment* env = Environment::GetCurrent(args);
  if (args.Length() < 1)
    return;

  bool ambiguous_as_full_width = args[1]->BooleanValue();
  bool expand_emoji_sequence = args[2]->BooleanValue();

  if (args[0]->IsNumber()) {
    args.GetReturnValue().Set(
        GetColumnWidth(args[0]->Uint32Value(),
                       ambiguous_as_full_width));
    return;
  }

  TwoByteValue value(env->isolate(), args[0]);
  // reinterpret_cast is required by windows to compile
  UChar* str = reinterpret_cast<UChar*>(*value);
  // `c` must start out initialized: the loop copies it into `p` (the
  // previous codepoint) before the first codepoint has been decoded, and
  // reading an indeterminate value there could spuriously look like a ZWJ.
  UChar32 c = 0;
  UChar32 p;
  size_t n = 0;
  uint32_t width = 0;

  while (n < value.length()) {
    p = c;
    U16_NEXT(str, n, value.length(), c);
    // Don't count individual emoji codepoints that occur within an
    // emoji sequence. This is not necessarily foolproof. Some
    // environments display emoji sequences in the appropriate
    // condensed form (as a single emoji glyph), other environments
    // may not understand an emoji sequence and will display each
    // individual emoji separately. When this happens, the width
    // calculated will be off, and there's no reliable way of knowing
    // in advance if a particular sequence is going to be supported.
    // The expand_emoji_sequence option allows the caller to skip this
    // check and count each code within an emoji sequence separately.
    if (!expand_emoji_sequence &&
        n > 0 && p == 0x200d &&  // 0x200d == ZWJ (zero width joiner)
        (u_hasBinaryProperty(c, UCHAR_EMOJI_PRESENTATION) ||
         u_hasBinaryProperty(c, UCHAR_EMOJI_MODIFIER))) {
      continue;
    }
    width += GetColumnWidth(c, ambiguous_as_full_width);
  }
  args.GetReturnValue().Set(width);
}
void Init(Local<Object> target, void Init(Local<Object> target,
Local<Value> unused, Local<Value> unused,
Local<Context> context, Local<Context> context,
@ -192,6 +281,7 @@ void Init(Local<Object> target,
Environment* env = Environment::GetCurrent(context); Environment* env = Environment::GetCurrent(context);
env->SetMethod(target, "toUnicode", ToUnicode); env->SetMethod(target, "toUnicode", ToUnicode);
env->SetMethod(target, "toASCII", ToASCII); env->SetMethod(target, "toASCII", ToASCII);
env->SetMethod(target, "getStringWidth", GetStringWidth);
} }
} // namespace i18n } // namespace i18n

43
test/parallel/test-icu-stringwidth.js

@ -0,0 +1,43 @@
// Flags: --expose_internals
'use strict';
const common = require('../common');
const assert = require('assert');
const readline = require('internal/readline');
if (!process.binding('config').hasIntl) {
common.skip('missing intl... skipping test');
return;
}
// Test column width
assert.strictEqual(readline.getStringWidth('a'), 1);
assert.strictEqual(readline.getStringWidth('丁'), 2);
assert.strictEqual(readline.getStringWidth('\ud83d\udc78\ud83c\udfff'), 2);
assert.strictEqual(readline.getStringWidth('👅'), 2);
assert.strictEqual(readline.getStringWidth('\n'), 0);
assert.strictEqual(readline.getStringWidth('\u200Ef\u200F'), 1);
assert.strictEqual(readline.getStringWidth(97), 1);
// The following is an emoji sequence. In some implementations, it is
// represented as a single glyph, in other implementations as a sequence
// of individual glyphs. By default, the algorithm will assume the single
// glyph interpretation and return a value of 2. By passing the
// expandEmojiSequence: true option, each component will be counted
// individually.
assert.strictEqual(readline.getStringWidth('👩‍👩‍👧‍👧'), 2);
assert.strictEqual(
readline.getStringWidth('👩‍👩‍👧‍👧', {expandEmojiSequence: true}), 8);
// By default, unicode characters whose width is considered ambiguous are
// treated as half-width, so getStringWidth returns 1 for them. In some
// contexts, however, it is more appropriate to treat them as full width.
// Passing the ambiguousAsFullWidth: true option causes ambiguous characters
// to be counted as 2 columns.
assert.strictEqual(readline.getStringWidth('\u01d4'), 1);
assert.strictEqual(
readline.getStringWidth('\u01d4', {ambiguousAsFullWidth: true}), 2);
// Control chars and combining chars are zero
assert.strictEqual(readline.getStringWidth('\u200E\n\u220A\u20D2'), 1);
Loading…
Cancel
Save