|
|
|
// © 2016 and later: Unicode, Inc. and others.
|
|
|
|
// License & terms of use: http://www.unicode.org/copyright.html
|
|
|
|
/*
|
|
|
|
*******************************************************************************
|
|
|
|
* Copyright (C) 2013-2015, International Business Machines
|
|
|
|
* Corporation and others. All Rights Reserved.
|
|
|
|
*******************************************************************************
|
|
|
|
* collationruleparser.cpp
|
|
|
|
*
|
|
|
|
* (replaced the former ucol_tok.cpp)
|
|
|
|
*
|
|
|
|
* created on: 2013apr10
|
|
|
|
* created by: Markus W. Scherer
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "unicode/utypes.h"
|
|
|
|
|
|
|
|
#if !UCONFIG_NO_COLLATION
|
|
|
|
|
|
|
|
#include "unicode/normalizer2.h"
|
|
|
|
#include "unicode/parseerr.h"
|
|
|
|
#include "unicode/uchar.h"
|
|
|
|
#include "unicode/ucol.h"
|
|
|
|
#include "unicode/uloc.h"
|
|
|
|
#include "unicode/unistr.h"
|
|
|
|
#include "unicode/utf16.h"
|
|
|
|
#include "charstr.h"
|
|
|
|
#include "cmemory.h"
|
|
|
|
#include "collation.h"
|
|
|
|
#include "collationdata.h"
|
|
|
|
#include "collationruleparser.h"
|
|
|
|
#include "collationsettings.h"
|
|
|
|
#include "collationtailoring.h"
|
|
|
|
#include "cstring.h"
|
|
|
|
#include "patternprops.h"
|
|
|
|
#include "uassert.h"
|
|
|
|
#include "uvectr32.h"
|
|
|
|
|
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
static const UChar BEFORE[] = { 0x5b, 0x62, 0x65, 0x66, 0x6f, 0x72, 0x65, 0 }; // "[before"
|
|
|
|
const int32_t BEFORE_LENGTH = 7;
|
|
|
|
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
// Out-of-line destructor definition for the Sink interface; the body is empty.
CollationRuleParser::Sink::~Sink() {
}
|
|
|
|
|
|
|
|
void
|
|
|
|
CollationRuleParser::Sink::suppressContractions(const UnicodeSet &, const char *&, UErrorCode &) {}
|
|
|
|
|
|
|
|
void
|
|
|
|
CollationRuleParser::Sink::optimize(const UnicodeSet &, const char *&, UErrorCode &) {}
|
|
|
|
|
|
|
|
// Out-of-line destructor definition for the Importer interface; the body is empty.
CollationRuleParser::Importer::~Importer() {
}
|
|
|
|
|
|
|
|
// Creates a parser that tailors the given base data.
// Binds the NFD and NFC normalizer singletons and clears all per-parse state;
// settings, parse-error output, sink and importer start out as NULL.
// NOTE(review): the Normalizer2 pointers are dereferenced without looking at
// errorCode first -- confirm that callers check errorCode before using the parser.
CollationRuleParser::CollationRuleParser(const CollationData *base, UErrorCode &errorCode)
        : nfd(*Normalizer2::getNFDInstance(errorCode)),
          nfc(*Normalizer2::getNFCInstance(errorCode)),
          rules(NULL),
          baseData(base),
          settings(NULL),
          parseError(NULL),
          errorReason(NULL),
          sink(NULL),
          importer(NULL),
          ruleIndex(0) {
}
|
|
|
|
|
|
|
|
// The destructor body is empty; nothing is released here.
CollationRuleParser::~CollationRuleParser() {}
|
|
|
|
|
|
|
|
void
|
|
|
|
CollationRuleParser::parse(const UnicodeString &ruleString,
|
|
|
|
CollationSettings &outSettings,
|
|
|
|
UParseError *outParseError,
|
|
|
|
UErrorCode &errorCode) {
|
|
|
|
if(U_FAILURE(errorCode)) { return; }
|
|
|
|
settings = &outSettings;
|
|
|
|
parseError = outParseError;
|
|
|
|
if(parseError != NULL) {
|
|
|
|
parseError->line = 0;
|
|
|
|
parseError->offset = -1;
|
|
|
|
parseError->preContext[0] = 0;
|
|
|
|
parseError->postContext[0] = 0;
|
|
|
|
}
|
|
|
|
errorReason = NULL;
|
|
|
|
parse(ruleString, errorCode);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
CollationRuleParser::parse(const UnicodeString &ruleString, UErrorCode &errorCode) {
|
|
|
|
if(U_FAILURE(errorCode)) { return; }
|
|
|
|
rules = &ruleString;
|
|
|
|
ruleIndex = 0;
|
|
|
|
|
|
|
|
while(ruleIndex < rules->length()) {
|
|
|
|
UChar c = rules->charAt(ruleIndex);
|
|
|
|
if(PatternProps::isWhiteSpace(c)) {
|
|
|
|
++ruleIndex;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
switch(c) {
|
|
|
|
case 0x26: // '&'
|
|
|
|
parseRuleChain(errorCode);
|
|
|
|
break;
|
|
|
|
case 0x5b: // '['
|
|
|
|
parseSetting(errorCode);
|
|
|
|
break;
|
|
|
|
case 0x23: // '#' starts a comment, until the end of the line
|
|
|
|
ruleIndex = skipComment(ruleIndex + 1);
|
|
|
|
break;
|
|
|
|
case 0x40: // '@' is equivalent to [backwards 2]
|
|
|
|
settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
|
|
|
|
UCOL_ON, 0, errorCode);
|
|
|
|
++ruleIndex;
|
|
|
|
break;
|
|
|
|
case 0x21: // '!' used to turn on Thai/Lao character reversal
|
|
|
|
// Accept but ignore. The root collator has contractions
|
|
|
|
// that are equivalent to the character reversal, where appropriate.
|
|
|
|
++ruleIndex;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
setParseError("expected a reset or setting or comment", errorCode);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if(U_FAILURE(errorCode)) { return; }
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
CollationRuleParser::parseRuleChain(UErrorCode &errorCode) {
|
|
|
|
int32_t resetStrength = parseResetAndPosition(errorCode);
|
|
|
|
UBool isFirstRelation = TRUE;
|
|
|
|
for(;;) {
|
|
|
|
int32_t result = parseRelationOperator(errorCode);
|
|
|
|
if(U_FAILURE(errorCode)) { return; }
|
|
|
|
if(result < 0) {
|
|
|
|
if(ruleIndex < rules->length() && rules->charAt(ruleIndex) == 0x23) {
|
|
|
|
// '#' starts a comment, until the end of the line
|
|
|
|
ruleIndex = skipComment(ruleIndex + 1);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if(isFirstRelation) {
|
|
|
|
setParseError("reset not followed by a relation", errorCode);
|
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
int32_t strength = result & STRENGTH_MASK;
|
|
|
|
if(resetStrength < UCOL_IDENTICAL) {
|
|
|
|
// reset-before rule chain
|
|
|
|
if(isFirstRelation) {
|
|
|
|
if(strength != resetStrength) {
|
|
|
|
setParseError("reset-before strength differs from its first relation", errorCode);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
if(strength < resetStrength) {
|
|
|
|
setParseError("reset-before strength followed by a stronger relation", errorCode);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
int32_t i = ruleIndex + (result >> OFFSET_SHIFT); // skip over the relation operator
|
|
|
|
if((result & STARRED_FLAG) == 0) {
|
|
|
|
parseRelationStrings(strength, i, errorCode);
|
|
|
|
} else {
|
|
|
|
parseStarredCharacters(strength, i, errorCode);
|
|
|
|
}
|
|
|
|
if(U_FAILURE(errorCode)) { return; }
|
|
|
|
isFirstRelation = FALSE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Parses the reset at ruleIndex (which points at the '&'), including an optional
// "[before n]" with n=1..3, and the reset position (special position or string).
// Passes the reset to the sink and advances ruleIndex past it.
// Returns the reset strength: UCOL_IDENTICAL for a plain reset,
// the [before n] strength otherwise, or UCOL_DEFAULT on an early error.
int32_t
CollationRuleParser::parseResetAndPosition(UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }

    int32_t i = skipWhiteSpace(ruleIndex + 1);
    int32_t resetStrength = UCOL_IDENTICAL;

    // Look for &[before n]; anything that does not match completely
    // is left for the position parsing below.
    if(rules->compare(i, BEFORE_LENGTH, BEFORE, 0, BEFORE_LENGTH) == 0) {
        int32_t j = i + BEFORE_LENGTH;
        if(j < rules->length() && PatternProps::isWhiteSpace(rules->charAt(j))) {
            j = skipWhiteSpace(j + 1);
            if((j + 1) < rules->length()) {
                const UChar digit = rules->charAt(j);
                if(0x31 <= digit && digit <= 0x33 && rules->charAt(j + 1) == 0x5d) {
                    // &[before n] with n=1 or 2 or 3
                    resetStrength = UCOL_PRIMARY + (digit - 0x31);
                    i = skipWhiteSpace(j + 2);
                }
            }
        }
    }

    if(i >= rules->length()) {
        setParseError("reset without position", errorCode);
        return UCOL_DEFAULT;
    }

    UnicodeString position;
    if(rules->charAt(i) == 0x5b) {  // '['
        i = parseSpecialPosition(i, position, errorCode);
    } else {
        i = parseTailoringString(i, position, errorCode);
    }

    sink->addReset(resetStrength, position, errorReason, errorCode);
    if(U_FAILURE(errorCode)) {
        // Record the context while ruleIndex still points at the reset.
        setErrorContext();
    }
    ruleIndex = i;
    return resetStrength;
}
|
|
|
|
|
|
|
|
// Recognizes a relation operator after skipping white space (ruleIndex is
// advanced past the white space but not past the operator).
// Returns UCOL_DEFAULT if there is no operator. Otherwise returns the strength,
// optionally combined with STARRED_FLAG, and with the operator length
// stored in the bits above OFFSET_SHIFT.
int32_t
CollationRuleParser::parseRelationOperator(UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
    ruleIndex = skipWhiteSpace(ruleIndex);
    if(ruleIndex >= rules->length()) { return UCOL_DEFAULT; }

    int32_t i = ruleIndex + 1;  // index after the first operator character
    int32_t strength;
    UBool starAllowed = FALSE;  // only the '<' family and '=' have starred forms

    switch(rules->charAt(ruleIndex)) {
    case 0x3c: {  // '<'
        // Strength by number of additional '<': <, <<, <<<, <<<<
        static const int32_t strengthByExtra[4] = {
            UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY, UCOL_QUATERNARY
        };
        int32_t extra = 0;
        while(extra < 3 && i < rules->length() && rules->charAt(i) == 0x3c) {
            ++i;
            ++extra;
        }
        strength = strengthByExtra[extra];
        starAllowed = TRUE;
        break;
    }
    case 0x3b:  // ';' same as <<
        strength = UCOL_SECONDARY;
        break;
    case 0x2c:  // ',' same as <<<
        strength = UCOL_TERTIARY;
        break;
    case 0x3d:  // '='
        strength = UCOL_IDENTICAL;
        starAllowed = TRUE;
        break;
    default:
        return UCOL_DEFAULT;
    }

    if(starAllowed && i < rules->length() && rules->charAt(i) == 0x2a) {  // '*'
        ++i;
        strength |= STARRED_FLAG;
    }
    return ((i - ruleIndex) << OFFSET_SHIFT) | strength;
}
|
|
|
|
|
|
|
|
void
|
|
|
|
CollationRuleParser::parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode) {
|
|
|
|
// Parse
|
|
|
|
// prefix | str / extension
|
|
|
|
// where prefix and extension are optional.
|
|
|
|
UnicodeString prefix, str, extension;
|
|
|
|
i = parseTailoringString(i, str, errorCode);
|
|
|
|
if(U_FAILURE(errorCode)) { return; }
|
|
|
|
UChar next = (i < rules->length()) ? rules->charAt(i) : 0;
|
|
|
|
if(next == 0x7c) { // '|' separates the context prefix from the string.
|
|
|
|
prefix = str;
|
|
|
|
i = parseTailoringString(i + 1, str, errorCode);
|
|
|
|
if(U_FAILURE(errorCode)) { return; }
|
|
|
|
next = (i < rules->length()) ? rules->charAt(i) : 0;
|
|
|
|
}
|
|
|
|
if(next == 0x2f) { // '/' separates the string from the extension.
|
|
|
|
i = parseTailoringString(i + 1, extension, errorCode);
|
|
|
|
}
|
|
|
|
if(!prefix.isEmpty()) {
|
|
|
|
UChar32 prefix0 = prefix.char32At(0);
|
|
|
|
UChar32 c = str.char32At(0);
|
|
|
|
if(!nfc.hasBoundaryBefore(prefix0) || !nfc.hasBoundaryBefore(c)) {
|
|
|
|
setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary",
|
|
|
|
errorCode);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
sink->addRelation(strength, prefix, str, extension, errorReason, errorCode);
|
|
|
|
if(U_FAILURE(errorCode)) { setErrorContext(); }
|
|
|
|
ruleIndex = i;
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
CollationRuleParser::parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode) {
|
|
|
|
UnicodeString empty, raw;
|
|
|
|
i = parseString(skipWhiteSpace(i), raw, errorCode);
|
|
|
|
if(U_FAILURE(errorCode)) { return; }
|
|
|
|
if(raw.isEmpty()) {
|
|
|
|
setParseError("missing starred-relation string", errorCode);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
UChar32 prev = -1;
|
|
|
|
int32_t j = 0;
|
|
|
|
for(;;) {
|
|
|
|
while(j < raw.length()) {
|
|
|
|
UChar32 c = raw.char32At(j);
|
|
|
|
if(!nfd.isInert(c)) {
|
|
|
|
setParseError("starred-relation string is not all NFD-inert", errorCode);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
sink->addRelation(strength, empty, UnicodeString(c), empty, errorReason, errorCode);
|
|
|
|
if(U_FAILURE(errorCode)) {
|
|
|
|
setErrorContext();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
j += U16_LENGTH(c);
|
|
|
|
prev = c;
|
|
|
|
}
|
|
|
|
if(i >= rules->length() || rules->charAt(i) != 0x2d) { // '-'
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if(prev < 0) {
|
|
|
|
setParseError("range without start in starred-relation string", errorCode);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
i = parseString(i + 1, raw, errorCode);
|
|
|
|
if(U_FAILURE(errorCode)) { return; }
|
|
|
|
if(raw.isEmpty()) {
|
|
|
|
setParseError("range without end in starred-relation string", errorCode);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
UChar32 c = raw.char32At(0);
|
|
|
|
if(c < prev) {
|
|
|
|
setParseError("range start greater than end in starred-relation string", errorCode);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
// range prev-c
|
|
|
|
UnicodeString s;
|
|
|
|
while(++prev <= c) {
|
|
|
|
if(!nfd.isInert(prev)) {
|
|
|
|
setParseError("starred-relation string range is not all NFD-inert", errorCode);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if(U_IS_SURROGATE(prev)) {
|
|
|
|
setParseError("starred-relation string range contains a surrogate", errorCode);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if(0xfffd <= prev && prev <= 0xffff) {
|
|
|
|
setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF", errorCode);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
s.setTo(prev);
|
|
|
|
sink->addRelation(strength, empty, s, empty, errorReason, errorCode);
|
|
|
|
if(U_FAILURE(errorCode)) {
|
|
|
|
setErrorContext();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
prev = -1;
|
|
|
|
j = U16_LENGTH(c);
|
|
|
|
}
|
|
|
|
ruleIndex = skipWhiteSpace(i);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Skips white space, parses a non-empty string into raw, and skips trailing white space.
// Sets a parse error if the string is empty.
// Returns the index after the string and the white space that follows it.
int32_t
CollationRuleParser::parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
    const int32_t start = skipWhiteSpace(i);
    const int32_t limit = parseString(start, raw, errorCode);
    if(U_SUCCESS(errorCode)) {
        if(raw.isEmpty()) {
            setParseError("missing relation string", errorCode);
        }
    }
    return skipWhiteSpace(limit);
}
|
|
|
|
|
|
|
|
// Parses a string starting at index i into raw (which is cleared first).
// Handles apostrophe quoting ('' is a literal apostrophe, '...' quotes literal text)
// and backslash escapes (the following code point is taken literally).
// The string ends at unquoted white space, at any other syntax character,
// or at the end of the rules.
// Rejects unpaired surrogates and U+FFFD..U+FFFF in the result.
// Returns the index after the parsed string.
int32_t
CollationRuleParser::parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return i; }
    raw.remove();

    while(i < rules->length()) {
        UChar32 c = rules->charAt(i++);

        if(!isSyntaxChar(c)) {
            if(PatternProps::isWhiteSpace(c)) {
                // Unquoted white space terminates a string.
                --i;
                break;
            }
            raw.append((UChar)c);
            continue;
        }

        if(c == 0x27) {  // apostrophe
            if(i < rules->length() && rules->charAt(i) == 0x27) {
                // Double apostrophe, encodes a single one.
                raw.append((UChar)0x27);
                ++i;
                continue;
            }
            // Quote literal text until the next single apostrophe.
            for(;;) {
                if(i == rules->length()) {
                    setParseError("quoted literal text missing terminating apostrophe", errorCode);
                    return i;
                }
                c = rules->charAt(i++);
                if(c == 0x27) {
                    const UBool doubled = i < rules->length() && rules->charAt(i) == 0x27;
                    if(!doubled) {
                        break;  // closing apostrophe
                    }
                    // Double apostrophe inside quoted literal text,
                    // still encodes a single apostrophe.
                    ++i;
                }
                raw.append((UChar)c);
            }
            continue;
        }

        if(c == 0x5c) {  // backslash
            if(i == rules->length()) {
                setParseError("backslash escape at the end of the rule string", errorCode);
                return i;
            }
            c = rules->char32At(i);
            raw.append(c);
            i += U16_LENGTH(c);
            continue;
        }

        // Any other syntax character terminates a string.
        --i;
        break;
    }

    // Validate the collected code points.
    int32_t j = 0;
    while(j < raw.length()) {
        const UChar32 cp = raw.char32At(j);
        if(U_IS_SURROGATE(cp)) {
            setParseError("string contains an unpaired surrogate", errorCode);
            return i;
        }
        if(0xfffd <= cp && cp <= 0xffff) {
            setParseError("string contains U+FFFD, U+FFFE or U+FFFF", errorCode);
            return i;
        }
        j += U16_LENGTH(cp);
    }
    return i;
}
|
|
|
|
|
|
|
|
namespace {

// Names of the special reset positions, as written inside [brackets] in rules.
// The index of a name in this table is the offset that is added to POS_BASE
// when the position is encoded, so the order must not change.
static const char *const positions[] = {
    // ignorables
    "first tertiary ignorable",  "last tertiary ignorable",
    "first secondary ignorable", "last secondary ignorable",
    "first primary ignorable",   "last primary ignorable",
    // variable and regular
    "first variable",            "last variable",
    "first regular",             "last regular",
    // implicit and trailing
    "first implicit",            "last implicit",
    "first trailing",            "last trailing"
};

}  // namespace
|
|
|
|
|
|
|
|
// Parses a special reset position such as [first variable] or [top],
// where i is the index of the opening '['.
// On success, sets str to POS_LEAD followed by POS_BASE plus the position index,
// and returns the index after the closing ']'.
// Otherwise sets a parse error and returns i (or 0 if errorCode was already a failure).
int32_t
CollationRuleParser::parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return 0; }

    UnicodeString words;
    const int32_t wordsLimit = readWords(i + 1, words);

    // The words must be non-empty and end with ]
    if(wordsLimit > i && rules->charAt(wordsLimit) == 0x5d && !words.isEmpty()) {
        UBool found = FALSE;
        int32_t position = 0;
        for(int32_t pos = 0; pos < UPRV_LENGTHOF(positions); ++pos) {
            if(words == UnicodeString(positions[pos], -1, US_INV)) {
                position = pos;
                found = TRUE;
                break;
            }
        }
        if(!found) {
            // Additional names that map to positions from the table.
            if(words == UNICODE_STRING_SIMPLE("top")) {
                position = LAST_REGULAR;
                found = TRUE;
            } else if(words == UNICODE_STRING_SIMPLE("variable top")) {
                position = LAST_VARIABLE;
                found = TRUE;
            }
        }
        if(found) {
            str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + position));
            return wordsLimit + 1;  // after the ']'
        }
    }

    setParseError("not a valid special reset position", errorCode);
    return i;
}
|
|
|
|
|
|
|
|
void
|
|
|
|
CollationRuleParser::parseSetting(UErrorCode &errorCode) {
|
|
|
|
if(U_FAILURE(errorCode)) { return; }
|
|
|
|
UnicodeString raw;
|
|
|
|
int32_t i = ruleIndex + 1;
|
|
|
|
int32_t j = readWords(i, raw);
|
|
|
|
if(j <= i || raw.isEmpty()) {
|
|
|
|
setParseError("expected a setting/option at '['", errorCode);
|
|
|
|
}
|
|
|
|
if(rules->charAt(j) == 0x5d) { // words end with ]
|
|
|
|
++j;
|
|
|
|
if(raw.startsWith(UNICODE_STRING_SIMPLE("reorder")) &&
|
|
|
|
(raw.length() == 7 || raw.charAt(7) == 0x20)) {
|
|
|
|
parseReordering(raw, errorCode);
|
|
|
|
ruleIndex = j;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if(raw == UNICODE_STRING_SIMPLE("backwards 2")) {
|
|
|
|
settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
|
|
|
|
UCOL_ON, 0, errorCode);
|
|
|
|
ruleIndex = j;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
UnicodeString v;
|
|
|
|
int32_t valueIndex = raw.lastIndexOf((UChar)0x20);
|
|
|
|
if(valueIndex >= 0) {
|
|
|
|
v.setTo(raw, valueIndex + 1);
|
|
|
|
raw.truncate(valueIndex);
|
|
|
|
}
|
|
|
|
if(raw == UNICODE_STRING_SIMPLE("strength") && v.length() == 1) {
|
|
|
|
int32_t value = UCOL_DEFAULT;
|
|
|
|
UChar c = v.charAt(0);
|
|
|
|
if(0x31 <= c && c <= 0x34) { // 1..4
|
|
|
|
value = UCOL_PRIMARY + (c - 0x31);
|
|
|
|
} else if(c == 0x49) { // 'I'
|
|
|
|
value = UCOL_IDENTICAL;
|
|
|
|
}
|
|
|
|
if(value != UCOL_DEFAULT) {
|
|
|
|
settings->setStrength(value, 0, errorCode);
|
|
|
|
ruleIndex = j;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
} else if(raw == UNICODE_STRING_SIMPLE("alternate")) {
|
|
|
|
UColAttributeValue value = UCOL_DEFAULT;
|
|
|
|
if(v == UNICODE_STRING_SIMPLE("non-ignorable")) {
|
|
|
|
value = UCOL_NON_IGNORABLE;
|
|
|
|
} else if(v == UNICODE_STRING_SIMPLE("shifted")) {
|
|
|
|
value = UCOL_SHIFTED;
|
|
|
|
}
|
|
|
|
if(value != UCOL_DEFAULT) {
|
|
|
|
settings->setAlternateHandling(value, 0, errorCode);
|
|
|
|
ruleIndex = j;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
} else if(raw == UNICODE_STRING_SIMPLE("maxVariable")) {
|
|
|
|
int32_t value = UCOL_DEFAULT;
|
|
|
|
if(v == UNICODE_STRING_SIMPLE("space")) {
|
|
|
|
value = CollationSettings::MAX_VAR_SPACE;
|
|
|
|
} else if(v == UNICODE_STRING_SIMPLE("punct")) {
|
|
|
|
value = CollationSettings::MAX_VAR_PUNCT;
|
|
|
|
} else if(v == UNICODE_STRING_SIMPLE("symbol")) {
|
|
|
|
value = CollationSettings::MAX_VAR_SYMBOL;
|
|
|
|
} else if(v == UNICODE_STRING_SIMPLE("currency")) {
|
|
|
|
value = CollationSettings::MAX_VAR_CURRENCY;
|
|
|
|
}
|
|
|
|
if(value != UCOL_DEFAULT) {
|
|
|
|
settings->setMaxVariable(value, 0, errorCode);
|
|
|
|
settings->variableTop = baseData->getLastPrimaryForGroup(
|
|
|
|
UCOL_REORDER_CODE_FIRST + value);
|
|
|
|
U_ASSERT(settings->variableTop != 0);
|
|
|
|
ruleIndex = j;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
} else if(raw == UNICODE_STRING_SIMPLE("caseFirst")) {
|
|
|
|
UColAttributeValue value = UCOL_DEFAULT;
|
|
|
|
if(v == UNICODE_STRING_SIMPLE("off")) {
|
|
|
|
value = UCOL_OFF;
|
|
|
|
} else if(v == UNICODE_STRING_SIMPLE("lower")) {
|
|
|
|
value = UCOL_LOWER_FIRST;
|
|
|
|
} else if(v == UNICODE_STRING_SIMPLE("upper")) {
|
|
|
|
value = UCOL_UPPER_FIRST;
|
|
|
|
}
|
|
|
|
if(value != UCOL_DEFAULT) {
|
|
|
|
settings->setCaseFirst(value, 0, errorCode);
|
|
|
|
ruleIndex = j;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
} else if(raw == UNICODE_STRING_SIMPLE("caseLevel")) {
|
|
|
|
UColAttributeValue value = getOnOffValue(v);
|
|
|
|
if(value != UCOL_DEFAULT) {
|
|
|
|
settings->setFlag(CollationSettings::CASE_LEVEL, value, 0, errorCode);
|
|
|
|
ruleIndex = j;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
} else if(raw == UNICODE_STRING_SIMPLE("normalization")) {
|
|
|
|
UColAttributeValue value = getOnOffValue(v);
|
|
|
|
if(value != UCOL_DEFAULT) {
|
|
|
|
settings->setFlag(CollationSettings::CHECK_FCD, value, 0, errorCode);
|
|
|
|
ruleIndex = j;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
} else if(raw == UNICODE_STRING_SIMPLE("numericOrdering")) {
|
|
|
|
UColAttributeValue value = getOnOffValue(v);
|
|
|
|
if(value != UCOL_DEFAULT) {
|
|
|
|
settings->setFlag(CollationSettings::NUMERIC, value, 0, errorCode);
|
|
|
|
ruleIndex = j;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
} else if(raw == UNICODE_STRING_SIMPLE("hiraganaQ")) {
|
|
|
|
UColAttributeValue value = getOnOffValue(v);
|
|
|
|
if(value != UCOL_DEFAULT) {
|
|
|
|
if(value == UCOL_ON) {
|
|
|
|
setParseError("[hiraganaQ on] is not supported", errorCode);
|
|
|
|
}
|
|
|
|
ruleIndex = j;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
} else if(raw == UNICODE_STRING_SIMPLE("import")) {
|
|
|
|
CharString lang;
|
|
|
|
lang.appendInvariantChars(v, errorCode);
|
|
|
|
if(errorCode == U_MEMORY_ALLOCATION_ERROR) { return; }
|
|
|
|
// BCP 47 language tag -> ICU locale ID
|
|
|
|
char localeID[ULOC_FULLNAME_CAPACITY];
|
|
|
|
int32_t parsedLength;
|
|
|
|
int32_t length = uloc_forLanguageTag(lang.data(), localeID, ULOC_FULLNAME_CAPACITY,
|
|
|
|
&parsedLength, &errorCode);
|
|
|
|
if(U_FAILURE(errorCode) ||
|
|
|
|
parsedLength != lang.length() || length >= ULOC_FULLNAME_CAPACITY) {
|
|
|
|
errorCode = U_ZERO_ERROR;
|
|
|
|
setParseError("expected language tag in [import langTag]", errorCode);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
// localeID minus all keywords
|
|
|
|
char baseID[ULOC_FULLNAME_CAPACITY];
|
|
|
|
length = uloc_getBaseName(localeID, baseID, ULOC_FULLNAME_CAPACITY, &errorCode);
|
|
|
|
if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
|
|
|
|
errorCode = U_ZERO_ERROR;
|
|
|
|
setParseError("expected language tag in [import langTag]", errorCode);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if(length == 3 && uprv_memcmp(baseID, "und", 3) == 0) {
|
|
|
|
uprv_strcpy(baseID, "root");
|
|
|
|
}
|
|
|
|
// @collation=type, or length=0 if not specified
|
|
|
|
char collationType[ULOC_KEYWORDS_CAPACITY];
|
|
|
|
length = uloc_getKeywordValue(localeID, "collation",
|
|
|
|
collationType, ULOC_KEYWORDS_CAPACITY,
|
|
|
|
&errorCode);
|
|
|
|
if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
|
|
|
|
errorCode = U_ZERO_ERROR;
|
|
|
|
setParseError("expected language tag in [import langTag]", errorCode);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if(importer == NULL) {
|
|
|
|
setParseError("[import langTag] is not supported", errorCode);
|
|
|
|
} else {
|
|
|
|
UnicodeString importedRules;
|
|
|
|
importer->getRules(baseID, length > 0 ? collationType : "standard",
|
|
|
|
importedRules, errorReason, errorCode);
|
|
|
|
if(U_FAILURE(errorCode)) {
|
|
|
|
if(errorReason == NULL) {
|
|
|
|
errorReason = "[import langTag] failed";
|
|
|
|
}
|
|
|
|
setErrorContext();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
const UnicodeString *outerRules = rules;
|
|
|
|
int32_t outerRuleIndex = ruleIndex;
|
|
|
|
parse(importedRules, errorCode);
|
|
|
|
if(U_FAILURE(errorCode)) {
|
|
|
|
if(parseError != NULL) {
|
|
|
|
parseError->offset = outerRuleIndex;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
rules = outerRules;
|
|
|
|
ruleIndex = j;
|
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
} else if(rules->charAt(j) == 0x5b) { // words end with [
|
|
|
|
UnicodeSet set;
|
|
|
|
j = parseUnicodeSet(j, set, errorCode);
|
|
|
|
if(U_FAILURE(errorCode)) { return; }
|
|
|
|
if(raw == UNICODE_STRING_SIMPLE("optimize")) {
|
|
|
|
sink->optimize(set, errorReason, errorCode);
|
|
|
|
if(U_FAILURE(errorCode)) { setErrorContext(); }
|
|
|
|
ruleIndex = j;
|
|
|
|
return;
|
|
|
|
} else if(raw == UNICODE_STRING_SIMPLE("suppressContractions")) {
|
|
|
|
sink->suppressContractions(set, errorReason, errorCode);
|
|
|
|
if(U_FAILURE(errorCode)) { setErrorContext(); }
|
|
|
|
ruleIndex = j;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
setParseError("not a valid setting/option", errorCode);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
CollationRuleParser::parseReordering(const UnicodeString &raw, UErrorCode &errorCode) {
|
|
|
|
if(U_FAILURE(errorCode)) { return; }
|
|
|
|
int32_t i = 7; // after "reorder"
|
|
|
|
if(i == raw.length()) {
|
|
|
|
// empty [reorder] with no codes
|
|
|
|
settings->resetReordering();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
// Parse the codes in [reorder aa bb cc].
|
|
|
|
UVector32 reorderCodes(errorCode);
|
|
|
|
if(U_FAILURE(errorCode)) { return; }
|
|
|
|
CharString word;
|
|
|
|
while(i < raw.length()) {
|
|
|
|
++i; // skip the word-separating space
|
|
|
|
int32_t limit = raw.indexOf((UChar)0x20, i);
|
|
|
|
if(limit < 0) { limit = raw.length(); }
|
|
|
|
word.clear().appendInvariantChars(raw.tempSubStringBetween(i, limit), errorCode);
|
|
|
|
if(U_FAILURE(errorCode)) { return; }
|
|
|
|
int32_t code = getReorderCode(word.data());
|
|
|
|
if(code < 0) {
|
|
|
|
setParseError("unknown script or reorder code", errorCode);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
reorderCodes.addElement(code, errorCode);
|
|
|
|
if(U_FAILURE(errorCode)) { return; }
|
|
|
|
i = limit;
|
|
|
|
}
|
|
|
|
settings->setReordering(*baseData, reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Names of the special (non-script) reorder groups.
// The index of a name is its offset from UCOL_REORDER_CODE_FIRST,
// so the order must not change.
static const char *const gSpecialReorderCodes[] = {
    "space",
    "punct",
    "symbol",
    "currency",
    "digit"
};
|
|
|
|
|
|
|
|
// Maps a name from a [reorder ...] setting to a reorder code.
// Tries, in this order: the special group names (case-insensitively),
// script property value names, and finally "others".
// Returns -1 if the name is not recognized.
int32_t
CollationRuleParser::getReorderCode(const char *word) {
    const int32_t specialCount = UPRV_LENGTHOF(gSpecialReorderCodes);
    int32_t index = 0;
    while(index < specialCount) {
        if(uprv_stricmp(word, gSpecialReorderCodes[index]) == 0) {
            return UCOL_REORDER_CODE_FIRST + index;
        }
        ++index;
    }

    const int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word);
    if(script >= 0) {
        return script;
    }

    if(uprv_stricmp(word, "others") != 0) {
        return -1;
    }
    return UCOL_REORDER_CODE_OTHERS;  // same as Zzzz = USCRIPT_UNKNOWN
}
|
|
|
|
|
|
|
|
// Maps "on" to UCOL_ON and "off" to UCOL_OFF.
// Any other string yields UCOL_DEFAULT, which callers treat as "not a valid value".
UColAttributeValue
CollationRuleParser::getOnOffValue(const UnicodeString &s) {
    if(s == UNICODE_STRING_SIMPLE("on")) {
        return UCOL_ON;
    }
    if(s == UNICODE_STRING_SIMPLE("off")) {
        return UCOL_OFF;
    }
    return UCOL_DEFAULT;
}
|
|
|
|
|
|
|
|
// Parses a UnicodeSet pattern that starts at index i, followed by optional
// white space and the ']' that terminates the enclosing option.
// The pattern is delimited by counting [brackets] until they balance.
// Returns the index after the option-terminating ']',
// or the index reached when a parse error is set.
int32_t
CollationRuleParser::parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode) {
    // Collect a UnicodeSet pattern between a balanced pair of [brackets].
    int32_t depth = 0;
    int32_t limit = i;
    UBool balanced = FALSE;
    while(!balanced) {
        if(limit == rules->length()) {
            setParseError("unbalanced UnicodeSet pattern brackets", errorCode);
            return limit;
        }
        const UChar c = rules->charAt(limit++);
        if(c == 0x5b) {  // '['
            ++depth;
        } else if(c == 0x5d) {  // ']'
            --depth;
            balanced = (depth == 0);
        }
    }

    set.applyPattern(rules->tempSubStringBetween(i, limit), errorCode);
    if(U_FAILURE(errorCode)) {
        // Replace the UnicodeSet error with a parse error.
        errorCode = U_ZERO_ERROR;
        setParseError("not a valid UnicodeSet pattern", errorCode);
        return limit;
    }

    limit = skipWhiteSpace(limit);
    const UBool terminated = limit != rules->length() && rules->charAt(limit) == 0x5d;
    if(!terminated) {
        setParseError("missing option-terminating ']' after UnicodeSet pattern", errorCode);
        return limit;
    }
    return limit + 1;
}
|
|
|
|
|
|
|
|
// Reads words starting at index i into raw, up to the next syntax character
// other than '-' and '_'. Leading white space is skipped, each inner run of
// white space becomes a single space, and a trailing space is removed.
// Returns the index of the terminating syntax character,
// or 0 if the end of the rules is reached first.
int32_t
CollationRuleParser::readWords(int32_t i, UnicodeString &raw) const {
    static const UChar sp = 0x20;
    raw.remove();
    for(i = skipWhiteSpace(i); i < rules->length();) {
        const UChar c = rules->charAt(i);
        if(isSyntaxChar(c) && c != 0x2d && c != 0x5f) {  // syntax except -_
            if(!raw.isEmpty() && raw.endsWith(&sp, 1)) {  // remove trailing space
                raw.truncate(raw.length() - 1);
            }
            return i;
        }
        if(PatternProps::isWhiteSpace(c)) {
            raw.append(sp);
            i = skipWhiteSpace(i + 1);
        } else {
            raw.append(c);
            ++i;
        }
    }
    return 0;
}
|
|
|
|
|
|
|
|
// Skips a comment: Returns the index just past the next line terminator
// at or after index i, or the length of the rules if there is none.
int32_t
CollationRuleParser::skipComment(int32_t i) const {
    // skip to past the newline
    const int32_t length = rules->length();
    while(i < length) {
        switch(rules->charAt(i++)) {
        // LF or FF or CR or NEL or LS or PS
        case 0xa:
        case 0xc:
        case 0xd:
        case 0x85:
        case 0x2028:
        case 0x2029:
            // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS."
            // NLF (new line function) = CR or LF or CR+LF or NEL.
            // No need to collect all of CR+LF because a following LF will be ignored anyway.
            return i;
        default:
            break;
        }
    }
    return i;
}
|
|
|
|
|
|
|
|
void
|
|
|
|
CollationRuleParser::setParseError(const char *reason, UErrorCode &errorCode) {
|
|
|
|
if(U_FAILURE(errorCode)) { return; }
|
|
|
|
// Error code consistent with the old parser (from ca. 2001),
|
|
|
|
// rather than U_PARSE_ERROR;
|
|
|
|
errorCode = U_INVALID_FORMAT_ERROR;
|
|
|
|
errorReason = reason;
|
|
|
|
if(parseError != NULL) { setErrorContext(); }
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
CollationRuleParser::setErrorContext() {
|
|
|
|
if(parseError == NULL) { return; }
|
|
|
|
|
|
|
|
// Note: This relies on the calling code maintaining the ruleIndex
|
|
|
|
// at a position that is useful for debugging.
|
|
|
|
// For example, at the beginning of a reset or relation etc.
|
|
|
|
parseError->offset = ruleIndex;
|
|
|
|
parseError->line = 0; // We are not counting line numbers.
|
|
|
|
|
|
|
|
// before ruleIndex
|
|
|
|
int32_t start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1);
|
|
|
|
if(start < 0) {
|
|
|
|
start = 0;
|
|
|
|
} else if(start > 0 && U16_IS_TRAIL(rules->charAt(start))) {
|
|
|
|
++start;
|
|
|
|
}
|
|
|
|
int32_t length = ruleIndex - start;
|
|
|
|
rules->extract(start, length, parseError->preContext);
|
|
|
|
parseError->preContext[length] = 0;
|
|
|
|
|
|
|
|
// starting from ruleIndex
|
|
|
|
length = rules->length() - ruleIndex;
|
|
|
|
if(length >= U_PARSE_CONTEXT_LEN) {
|
|
|
|
length = U_PARSE_CONTEXT_LEN - 1;
|
|
|
|
if(U16_IS_LEAD(rules->charAt(ruleIndex + length - 1))) {
|
|
|
|
--length;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
rules->extract(ruleIndex, length, parseError->postContext);
|
|
|
|
parseError->postContext[length] = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns whether c is a syntax character: a code point in 0x21..0x7e
// that is outside the ranges 0x30..0x39, 0x41..0x5a and 0x61..0x7a
// (the ASCII digits and letters).
UBool
CollationRuleParser::isSyntaxChar(UChar32 c) {
    if(c < 0x21 || 0x7e < c) {
        return FALSE;
    }
    return c <= 0x2f ||
        (0x3a <= c && c <= 0x40) ||
        (0x5b <= c && c <= 0x60) ||
        0x7b <= c;
}
|
|
|
|
|
|
|
|
// Returns the index of the first code unit at or after i
// that is not Pattern_White_Space, or the length of the rules if there is none.
int32_t
CollationRuleParser::skipWhiteSpace(int32_t i) const {
    for(const int32_t length = rules->length(); i < length; ++i) {
        if(!PatternProps::isWhiteSpace(rules->charAt(i))) {
            break;
        }
    }
    return i;
}
|
|
|
|
|
|
|
|
U_NAMESPACE_END
|
|
|
|
|
|
|
|
#endif // !UCONFIG_NO_COLLATION
|