// Copyright Joyent, Inc. and other Node contributors. // // Permission is hereby granted, free of charge, to any person obtaining a // copy of this software and associated documentation files (the // "Software"), to deal in the Software without restriction, including // without limitation the rights to use, copy, modify, merge, publish, // distribute, sublicense, and/or sell copies of the Software, and to permit // persons to whom the Software is furnished to do so, subject to the // following conditions: // // The above copyright notice and this permission notice shall be included // in all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN // NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, // DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR // OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE // USE OR OTHER DEALINGS IN THE SOFTWARE. /* * notes: by srl295 * - When in NODE_HAVE_SMALL_ICU mode, ICU is linked against "stub" (null) data * ( stubdata/libicudata.a ) containing nothing, no data, and it's also * linked against a "small" data file which the SMALL_ICUDATA_ENTRY_POINT * macro names. That's the "english+root" data. * * If icu_data_path is non-null, the user has provided a path and we assume * it goes somewhere useful. We set that path in ICU, and exit. * If icu_data_path is null, they haven't set a path and we want the * "english+root" data. We call * udata_setCommonData(SMALL_ICUDATA_ENTRY_POINT,...) * to load up the english+root data. * * - when NOT in NODE_HAVE_SMALL_ICU mode, ICU is linked directly with its full * data. All of the variables and command line options for changing data at * runtime are disabled, as they wouldn't fully override the internal data. * See: http://bugs.icu-project.org/trac/ticket/10924 */ #include "node_i18n.h" #if defined(NODE_HAVE_I18N_SUPPORT) #include "node.h" #include "node_buffer.h" #include "env.h" #include "env-inl.h" #include "util.h" #include "util-inl.h" #include "v8.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef NODE_HAVE_SMALL_ICU /* if this is defined, we have a 'secondary' entry point. compare following to utypes.h defs for U_ICUDATA_ENTRY_POINT */ #define SMALL_ICUDATA_ENTRY_POINT \ SMALL_DEF2(U_ICU_VERSION_MAJOR_NUM, U_LIB_SUFFIX_C_NAME) #define SMALL_DEF2(major, suff) SMALL_DEF(major, suff) #ifndef U_LIB_SUFFIX_C_NAME #define SMALL_DEF(major, suff) icusmdt##major##_dat #else #define SMALL_DEF(major, suff) icusmdt##suff##major##_dat #endif extern "C" const char U_DATA_API SMALL_ICUDATA_ENTRY_POINT[]; #endif namespace node { using v8::Context; using v8::FunctionCallbackInfo; using v8::Isolate; using v8::Local; using v8::MaybeLocal; using v8::Object; using v8::String; using v8::Value; namespace i18n { namespace { template MaybeLocal ToBufferEndian(Environment* env, MaybeStackBuffer* buf) { MaybeLocal ret = Buffer::New(env, buf); if (ret.IsEmpty()) return ret; static_assert(sizeof(T) == 1 || sizeof(T) == 2, "Currently only one- or two-byte buffers are supported"); if (sizeof(T) > 1 && IsBigEndian()) { SPREAD_BUFFER_ARG(ret.ToLocalChecked(), retbuf); SwapBytes16(retbuf_data, retbuf_length); } return ret; } struct Converter { explicit Converter(const char* name, const char* sub = NULL) : conv(nullptr) { UErrorCode status = U_ZERO_ERROR; conv = ucnv_open(name, &status); CHECK(U_SUCCESS(status)); if (sub != NULL) { ucnv_setSubstChars(conv, sub, strlen(sub), &status); } } ~Converter() { ucnv_close(conv); } UConverter* conv; }; // One-Shot Converters void CopySourceBuffer(MaybeStackBuffer* dest, const char* data, const size_t length, const size_t length_in_chars) { dest->AllocateSufficientStorage(length_in_chars); char* dst = reinterpret_cast(**dest); memcpy(dst, data, length); if (IsBigEndian()) { SwapBytes16(dst, length); } } typedef MaybeLocal (*TranscodeFunc)(Environment* env, const char* fromEncoding, const char* toEncoding, const char* source, const size_t source_length, UErrorCode* status); MaybeLocal Transcode(Environment* env, const char* fromEncoding, const char* toEncoding, const char* source, const size_t source_length, UErrorCode* status) { *status = U_ZERO_ERROR; MaybeLocal ret; MaybeStackBuffer result; Converter to(toEncoding, "?"); Converter from(fromEncoding); const uint32_t limit = source_length * ucnv_getMaxCharSize(to.conv); result.AllocateSufficientStorage(limit); char* target = *result; ucnv_convertEx(to.conv, from.conv, &target, target + limit, &source, source + source_length, nullptr, nullptr, nullptr, nullptr, true, true, status); if (U_SUCCESS(*status)) { result.SetLength(target - &result[0]); ret = ToBufferEndian(env, &result); } return ret; } MaybeLocal TranscodeToUcs2(Environment* env, const char* fromEncoding, const char* toEncoding, const char* source, const size_t source_length, UErrorCode* status) { *status = U_ZERO_ERROR; MaybeLocal ret; MaybeStackBuffer destbuf(source_length); Converter from(fromEncoding); const size_t length_in_chars = source_length * sizeof(UChar); ucnv_toUChars(from.conv, *destbuf, length_in_chars, source, source_length, status); if (U_SUCCESS(*status)) ret = ToBufferEndian(env, &destbuf); return ret; } MaybeLocal TranscodeFromUcs2(Environment* env, const char* fromEncoding, const char* toEncoding, const char* source, const size_t source_length, UErrorCode* status) { *status = U_ZERO_ERROR; MaybeStackBuffer sourcebuf; MaybeLocal ret; Converter to(toEncoding, "?"); const size_t length_in_chars = source_length / sizeof(UChar); CopySourceBuffer(&sourcebuf, source, source_length, length_in_chars); MaybeStackBuffer destbuf(length_in_chars); const uint32_t len = ucnv_fromUChars(to.conv, *destbuf, length_in_chars, *sourcebuf, length_in_chars, status); if (U_SUCCESS(*status)) { destbuf.SetLength(len); ret = ToBufferEndian(env, &destbuf); } return ret; } MaybeLocal TranscodeUcs2FromUtf8(Environment* env, const char* fromEncoding, const char* toEncoding, const char* source, const size_t source_length, UErrorCode* status) { *status = U_ZERO_ERROR; MaybeStackBuffer destbuf; int32_t result_length; u_strFromUTF8(*destbuf, destbuf.capacity(), &result_length, source, source_length, status); MaybeLocal ret; if (U_SUCCESS(*status)) { destbuf.SetLength(result_length); ret = ToBufferEndian(env, &destbuf); } else if (*status == U_BUFFER_OVERFLOW_ERROR) { *status = U_ZERO_ERROR; destbuf.AllocateSufficientStorage(result_length); u_strFromUTF8(*destbuf, result_length, &result_length, source, source_length, status); if (U_SUCCESS(*status)) { destbuf.SetLength(result_length); ret = ToBufferEndian(env, &destbuf); } } return ret; } MaybeLocal TranscodeUtf8FromUcs2(Environment* env, const char* fromEncoding, const char* toEncoding, const char* source, const size_t source_length, UErrorCode* status) { *status = U_ZERO_ERROR; MaybeLocal ret; const size_t length_in_chars = source_length / sizeof(UChar); int32_t result_length; MaybeStackBuffer sourcebuf; MaybeStackBuffer destbuf; CopySourceBuffer(&sourcebuf, source, source_length, length_in_chars); u_strToUTF8(*destbuf, destbuf.capacity(), &result_length, *sourcebuf, length_in_chars, status); if (U_SUCCESS(*status)) { destbuf.SetLength(result_length); ret = ToBufferEndian(env, &destbuf); } else if (*status == U_BUFFER_OVERFLOW_ERROR) { *status = U_ZERO_ERROR; destbuf.AllocateSufficientStorage(result_length); u_strToUTF8(*destbuf, result_length, &result_length, *sourcebuf, length_in_chars, status); if (U_SUCCESS(*status)) { destbuf.SetLength(result_length); ret = ToBufferEndian(env, &destbuf); } } return ret; } const char* EncodingName(const enum encoding encoding) { switch (encoding) { case ASCII: return "us-ascii"; case LATIN1: return "iso8859-1"; case UCS2: return "utf16le"; case UTF8: return "utf-8"; default: return NULL; } } bool SupportedEncoding(const enum encoding encoding) { switch (encoding) { case ASCII: case LATIN1: case UCS2: case UTF8: return true; default: return false; } } void Transcode(const FunctionCallbackInfo&args) { Environment* env = Environment::GetCurrent(args); Isolate* isolate = env->isolate(); UErrorCode status = U_ZERO_ERROR; MaybeLocal result; THROW_AND_RETURN_UNLESS_BUFFER(env, args[0]); SPREAD_BUFFER_ARG(args[0], ts_obj); const enum encoding fromEncoding = ParseEncoding(isolate, args[1], BUFFER); const enum encoding toEncoding = ParseEncoding(isolate, args[2], BUFFER); if (SupportedEncoding(fromEncoding) && SupportedEncoding(toEncoding)) { TranscodeFunc tfn = &Transcode; switch (fromEncoding) { case ASCII: case LATIN1: if (toEncoding == UCS2) tfn = &TranscodeToUcs2; break; case UTF8: if (toEncoding == UCS2) tfn = &TranscodeUcs2FromUtf8; break; case UCS2: switch (toEncoding) { case UCS2: tfn = &Transcode; break; case UTF8: tfn = &TranscodeUtf8FromUcs2; break; default: tfn = &TranscodeFromUcs2; } break; default: // This should not happen because of the SupportedEncoding checks ABORT(); } result = tfn(env, EncodingName(fromEncoding), EncodingName(toEncoding), ts_obj_data, ts_obj_length, &status); } else { status = U_ILLEGAL_ARGUMENT_ERROR; } if (result.IsEmpty()) return args.GetReturnValue().Set(status); return args.GetReturnValue().Set(result.ToLocalChecked()); } void ICUErrorName(const FunctionCallbackInfo& args) { Environment* env = Environment::GetCurrent(args); UErrorCode status = static_cast(args[0]->Int32Value()); args.GetReturnValue().Set( String::NewFromUtf8(env->isolate(), u_errorName(status), v8::NewStringType::kNormal).ToLocalChecked()); } #define TYPE_ICU "icu" #define TYPE_UNICODE "unicode" #define TYPE_CLDR "cldr" #define TYPE_TZ "tz" /** * This is the workhorse function that deals with the actual version info. * Get an ICU version. * @param type the type of version to get. One of VERSION_TYPES * @param buf optional buffer for result * @param status ICU error status. If failure, assume result is undefined. * @return version number, or NULL. May or may not be buf. */ const char* GetVersion(const char* type, char buf[U_MAX_VERSION_STRING_LENGTH], UErrorCode* status) { if (!strcmp(type, TYPE_ICU)) { return U_ICU_VERSION; } else if (!strcmp(type, TYPE_UNICODE)) { return U_UNICODE_VERSION; } else if (!strcmp(type, TYPE_TZ)) { return TimeZone::getTZDataVersion(*status); } else if (!strcmp(type, TYPE_CLDR)) { UVersionInfo versionArray; ulocdata_getCLDRVersion(versionArray, status); if (U_SUCCESS(*status)) { u_versionToString(versionArray, buf); return buf; } } // Fall through - unknown type or error case return nullptr; } void GetVersion(const FunctionCallbackInfo& args) { Environment* env = Environment::GetCurrent(args); if ( args.Length() == 0 ) { // With no args - return a comma-separated list of allowed values args.GetReturnValue().Set( String::NewFromUtf8(env->isolate(), TYPE_ICU "," TYPE_UNICODE "," TYPE_CLDR "," TYPE_TZ)); } else { CHECK_GE(args.Length(), 1); CHECK(args[0]->IsString()); Utf8Value val(env->isolate(), args[0]); UErrorCode status = U_ZERO_ERROR; char buf[U_MAX_VERSION_STRING_LENGTH] = ""; // Possible output buffer. const char* versionString = GetVersion(*val, buf, &status); if (U_SUCCESS(status) && versionString) { // Success. args.GetReturnValue().Set( String::NewFromUtf8(env->isolate(), versionString)); } } } } // anonymous namespace bool InitializeICUDirectory(const std::string& path) { UErrorCode status = U_ZERO_ERROR; if (path.empty()) { #ifdef NODE_HAVE_SMALL_ICU // install the 'small' data. udata_setCommonData(&SMALL_ICUDATA_ENTRY_POINT, &status); #else // !NODE_HAVE_SMALL_ICU // no small data, so nothing to do. #endif // !NODE_HAVE_SMALL_ICU } else { u_setDataDirectory(path.c_str()); u_init(&status); } return status == U_ZERO_ERROR; } int32_t ToUnicode(MaybeStackBuffer* buf, const char* input, size_t length) { UErrorCode status = U_ZERO_ERROR; uint32_t options = UIDNA_NONTRANSITIONAL_TO_UNICODE; UIDNA* uidna = uidna_openUTS46(options, &status); if (U_FAILURE(status)) return -1; UIDNAInfo info = UIDNA_INFO_INITIALIZER; int32_t len = uidna_nameToUnicodeUTF8(uidna, input, length, **buf, buf->capacity(), &info, &status); // Do not check info.errors like we do with ToASCII since ToUnicode always // returns a string, despite any possible errors that may have occurred. if (status == U_BUFFER_OVERFLOW_ERROR) { status = U_ZERO_ERROR; buf->AllocateSufficientStorage(len); len = uidna_nameToUnicodeUTF8(uidna, input, length, **buf, buf->capacity(), &info, &status); } // info.errors is ignored as UTS #46 ToUnicode always produces a Unicode // string, regardless of whether an error occurred. if (U_FAILURE(status)) { len = -1; buf->SetLength(0); } else { buf->SetLength(len); } uidna_close(uidna); return len; } int32_t ToASCII(MaybeStackBuffer* buf, const char* input, size_t length, enum idna_mode mode) { UErrorCode status = U_ZERO_ERROR; uint32_t options = // CheckHyphens = false; handled later UIDNA_CHECK_BIDI | // CheckBidi = true UIDNA_CHECK_CONTEXTJ | // CheckJoiners = true UIDNA_NONTRANSITIONAL_TO_ASCII; // Nontransitional_Processing if (mode == IDNA_STRICT) { options |= UIDNA_USE_STD3_RULES; // UseSTD3ASCIIRules = beStrict // VerifyDnsLength = beStrict; // handled later } UIDNA* uidna = uidna_openUTS46(options, &status); if (U_FAILURE(status)) return -1; UIDNAInfo info = UIDNA_INFO_INITIALIZER; int32_t len = uidna_nameToASCII_UTF8(uidna, input, length, **buf, buf->capacity(), &info, &status); if (status == U_BUFFER_OVERFLOW_ERROR) { status = U_ZERO_ERROR; buf->AllocateSufficientStorage(len); len = uidna_nameToASCII_UTF8(uidna, input, length, **buf, buf->capacity(), &info, &status); } // In UTS #46 which specifies ToASCII, certain error conditions are // configurable through options, and the WHATWG URL Standard promptly elects // to disable some of them to accommodate for real-world use cases. // Unfortunately, ICU4C's IDNA module does not support disabling some of // these options through `options` above, and thus continues throwing // unnecessary errors. To counter this situation, we just filter out the // errors that may have happened afterwards, before deciding whether to // return an error from this function. // CheckHyphens = false // (Specified in the current UTS #46 draft rev. 18.) // Refs: // - https://github.com/whatwg/url/issues/53 // - https://github.com/whatwg/url/pull/309 // - http://www.unicode.org/review/pri317/ // - http://www.unicode.org/reports/tr46/tr46-18.html // - https://www.icann.org/news/announcement-2000-01-07-en info.errors &= ~UIDNA_ERROR_HYPHEN_3_4; info.errors &= ~UIDNA_ERROR_LEADING_HYPHEN; info.errors &= ~UIDNA_ERROR_TRAILING_HYPHEN; if (mode != IDNA_STRICT) { // VerifyDnsLength = beStrict info.errors &= ~UIDNA_ERROR_EMPTY_LABEL; info.errors &= ~UIDNA_ERROR_LABEL_TOO_LONG; info.errors &= ~UIDNA_ERROR_DOMAIN_NAME_TOO_LONG; } if (U_FAILURE(status) || (mode != IDNA_LENIENT && info.errors != 0)) { len = -1; buf->SetLength(0); } else { buf->SetLength(len); } uidna_close(uidna); return len; } static void ToUnicode(const FunctionCallbackInfo& args) { Environment* env = Environment::GetCurrent(args); CHECK_GE(args.Length(), 1); CHECK(args[0]->IsString()); Utf8Value val(env->isolate(), args[0]); MaybeStackBuffer buf; int32_t len = ToUnicode(&buf, *val, val.length()); if (len < 0) { return env->ThrowError("Cannot convert name to Unicode"); } args.GetReturnValue().Set( String::NewFromUtf8(env->isolate(), *buf, v8::NewStringType::kNormal, len).ToLocalChecked()); } static void ToASCII(const FunctionCallbackInfo& args) { Environment* env = Environment::GetCurrent(args); CHECK_GE(args.Length(), 1); CHECK(args[0]->IsString()); Utf8Value val(env->isolate(), args[0]); // optional arg bool lenient = args[1]->BooleanValue(env->context()).FromJust(); enum idna_mode mode = lenient ? IDNA_LENIENT : IDNA_DEFAULT; MaybeStackBuffer buf; int32_t len = ToASCII(&buf, *val, val.length(), mode); if (len < 0) { return env->ThrowError("Cannot convert name to ASCII"); } args.GetReturnValue().Set( String::NewFromUtf8(env->isolate(), *buf, v8::NewStringType::kNormal, len).ToLocalChecked()); } // This is similar to wcwidth except that it takes the current unicode // character properties database into consideration, allowing it to // correctly calculate the column widths of things like emoji's and // newer wide characters. wcwidth, on the other hand, uses a fixed // algorithm that does not take things like emoji into proper // consideration. // // TODO(TimothyGu): Investigate Cc (C0/C1 control codes). Both VTE (used by // GNOME Terminal) and Konsole don't consider them to be zero-width (see refs // below), and when printed in VTE it is Narrow. However GNOME Terminal doesn't // allow it to be input. Linux's PTY terminal prints control characters as // Narrow rhombi. // // TODO(TimothyGu): Investigate Hangul jamo characters. Medial vowels and final // consonants are 0-width when combined with initial consonants; otherwise they // are technically Wide. But many terminals (including Konsole and // VTE/GLib-based) implement all medials and finals as 0-width. // // Refs: https://eev.ee/blog/2015/09/12/dark-corners-of-unicode/#combining-characters-and-character-width // Refs: https://github.com/GNOME/glib/blob/79e4d4c6be/glib/guniprop.c#L388-L420 // Refs: https://github.com/KDE/konsole/blob/8c6a5d13c0/src/konsole_wcwidth.cpp#L101-L223 static int GetColumnWidth(UChar32 codepoint, bool ambiguous_as_full_width = false) { const auto zero_width_mask = U_GC_CC_MASK | // C0/C1 control code U_GC_CF_MASK | // Format control character U_GC_ME_MASK | // Enclosing mark U_GC_MN_MASK; // Nonspacing mark if (codepoint != 0x00AD && // SOFT HYPHEN is Cf but not zero-width ((U_MASK(u_charType(codepoint)) & zero_width_mask) || u_hasBinaryProperty(codepoint, UCHAR_EMOJI_MODIFIER))) { return 0; } // UCHAR_EAST_ASIAN_WIDTH is the Unicode property that identifies a // codepoint as being full width, wide, ambiguous, neutral, narrow, // or halfwidth. const int eaw = u_getIntPropertyValue(codepoint, UCHAR_EAST_ASIAN_WIDTH); switch (eaw) { case U_EA_FULLWIDTH: case U_EA_WIDE: return 2; case U_EA_AMBIGUOUS: // See: http://www.unicode.org/reports/tr11/#Ambiguous for details if (ambiguous_as_full_width) { return 2; } // Fall through if ambiguous_as_full_width if false. case U_EA_NEUTRAL: if (u_hasBinaryProperty(codepoint, UCHAR_EMOJI_PRESENTATION)) { return 2; } // Fall through case U_EA_HALFWIDTH: case U_EA_NARROW: default: return 1; } } // Returns the column width for the given String. static void GetStringWidth(const FunctionCallbackInfo& args) { Environment* env = Environment::GetCurrent(args); if (args.Length() < 1) return; bool ambiguous_as_full_width = args[1]->BooleanValue(); bool expand_emoji_sequence = args[2]->BooleanValue(); if (args[0]->IsNumber()) { args.GetReturnValue().Set( GetColumnWidth(args[0]->Uint32Value(), ambiguous_as_full_width)); return; } TwoByteValue value(env->isolate(), args[0]); // reinterpret_cast is required by windows to compile UChar* str = reinterpret_cast(*value); static_assert(sizeof(*str) == sizeof(**value), "sizeof(*str) == sizeof(**value)"); UChar32 c = 0; UChar32 p; size_t n = 0; uint32_t width = 0; while (n < value.length()) { p = c; U16_NEXT(str, n, value.length(), c); // Don't count individual emoji codepoints that occur within an // emoji sequence. This is not necessarily foolproof. Some // environments display emoji sequences in the appropriate // condensed form (as a single emoji glyph), other environments // may not understand an emoji sequence and will display each // individual emoji separately. When this happens, the width // calculated will be off, and there's no reliable way of knowing // in advance if a particular sequence is going to be supported. // The expand_emoji_sequence option allows the caller to skip this // check and count each code within an emoji sequence separately. if (!expand_emoji_sequence && n > 0 && p == 0x200d && // 0x200d == ZWJ (zero width joiner) (u_hasBinaryProperty(c, UCHAR_EMOJI_PRESENTATION) || u_hasBinaryProperty(c, UCHAR_EMOJI_MODIFIER))) { continue; } width += GetColumnWidth(c, ambiguous_as_full_width); } args.GetReturnValue().Set(width); } void Init(Local target, Local unused, Local context, void* priv) { Environment* env = Environment::GetCurrent(context); env->SetMethod(target, "toUnicode", ToUnicode); env->SetMethod(target, "toASCII", ToASCII); env->SetMethod(target, "getStringWidth", GetStringWidth); env->SetMethod(target, "getVersion", GetVersion); // One-shot converters env->SetMethod(target, "icuErrName", ICUErrorName); env->SetMethod(target, "transcode", Transcode); } } // namespace i18n } // namespace node NODE_MODULE_CONTEXT_AWARE_BUILTIN(icu, node::i18n::Init) #endif // NODE_HAVE_I18N_SUPPORT