You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

297 lines
8.5 KiB

// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (c) 2001-2012, International Business Machines Corporation
* and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 07/23/01 aliu Creation.
**********************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_TRANSLITERATION
#include "strmatch.h"
#include "rbt_data.h"
#include "util.h"
#include "unicode/uniset.h"
#include "unicode/utf16.h"
U_NAMESPACE_BEGIN
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher)
/**
 * Construct a matcher whose pattern is a sub-range of a string.
 * @param theString  string containing the pattern text
 * @param start      start offset of the pattern within theString
 * @param limit      limit offset of the pattern within theString
 * @param segmentNum stored as this object's segment number; values
 *                   greater than zero make toPattern() wrap its output
 *                   in parentheses
 * @param theData    rule data used to look up sub-matchers; only its
 *                   address is stored, the object is not copied
 */
StringMatcher::StringMatcher(const UnicodeString& theString,
                             int32_t start,
                             int32_t limit,
                             int32_t segmentNum,
                             const TransliterationRuleData& theData)
    : data(&theData),
      segmentNumber(segmentNum),
      matchStart(-1),   // -1/-1 is the "no match recorded" state
      matchLimit(-1)
{
    // pattern starts out default-constructed and is filled in here.
    theString.extractBetween(start, limit, pattern);
}
/**
 * Copy constructor. Copies the pattern, the segment number and the
 * currently recorded match range. The rule data pointer is copied as a
 * pointer, so both objects refer to the same TransliterationRuleData.
 * @param o the object to copy
 */
StringMatcher::StringMatcher(const StringMatcher& o)
    : UnicodeFunctor(o),
      UnicodeMatcher(o),
      UnicodeReplacer(o),
      pattern(o.pattern),
      data(o.data),
      segmentNumber(o.segmentNumber),
      matchStart(o.matchStart),
      matchLimit(o.matchLimit)
{
}
/**
 * Destructor. The body is empty: nothing is explicitly released here,
 * and in particular the rule data pointer is not deleted.
 */
StringMatcher::~StringMatcher() {
}
/**
 * Implement UnicodeFunctor.
 * @return a newly allocated copy of this object, produced with the
 *         copy constructor
 */
UnicodeFunctor* StringMatcher::clone() const {
    StringMatcher* duplicate = new StringMatcher(*this);
    return duplicate;
}
/**
 * UnicodeFunctor API. View this object through its UnicodeMatcher base.
 * Constness is cast away so that a non-const pointer can be handed out
 * from this const method.
 * @return 'this' as a UnicodeMatcher*
 */
UnicodeMatcher* StringMatcher::toMatcher() const {
    return static_cast<UnicodeMatcher*>(const_cast<StringMatcher*>(this));
}
/**
 * UnicodeFunctor API. View this object through its UnicodeReplacer base.
 * Constness is cast away so that a non-const pointer can be handed out
 * from this const method.
 * @return 'this' as a UnicodeReplacer*
 */
UnicodeReplacer* StringMatcher::toReplacer() const {
    return static_cast<UnicodeReplacer*>(const_cast<StringMatcher*>(this));
}
/**
 * Implement UnicodeMatcher.
 *
 * Walks the pattern one code unit at a time. A code unit for which the
 * rule data supplies a sub-matcher is delegated to that sub-matcher;
 * any other code unit must equal the text character at the cursor.
 *
 * @param text        the text being matched
 * @param offset      on input, the position at which matching starts.
 *                    Written only when U_MATCH is returned, in which
 *                    case it receives the cursor position after the
 *                    match.
 * @param limit       the boundary of the match. If limit < offset the
 *                    pattern is matched right-to-left, otherwise
 *                    left-to-right.
 * @param incremental forwarded to sub-matchers. In the forward
 *                    direction it also causes U_PARTIAL_MATCH to be
 *                    returned when the cursor reaches limit before the
 *                    whole pattern has been consumed.
 * @return U_MATCH on a complete match; U_MISMATCH or U_PARTIAL_MATCH
 *         otherwise (possibly as reported by a sub-matcher)
 */
UMatchDegree StringMatcher::matches(const Replaceable& text,
                                    int32_t& offset,
                                    int32_t limit,
                                    UBool incremental) {
    const int32_t patternLength = pattern.length();
    int32_t pos = offset;

    if (limit < pos) {
        // Reverse direction: consume the pattern from its last code
        // unit to its first while moving the cursor leftwards.
        for (int32_t idx = patternLength; idx-- > 0; ) {
            const UChar key = pattern.charAt(idx);
            UnicodeMatcher* sub = data->lookupMatcher(key);
            if (sub != 0) {
                const UMatchDegree degree =
                    sub->matches(text, pos, limit, incremental);
                if (degree != U_MATCH) {
                    return degree;
                }
                continue;
            }
            // Literal code unit: must still be inside the context and
            // equal to the text character under the cursor.
            if (pos <= limit || key != text.charAt(pos)) {
                return U_MISMATCH;
            }
            --pos;
        }
        // Record the match position converted to a forward-style
        // [start, limit) range, but only when no earlier match is
        // recorded -- the rightmost match is the one to keep.
        if (matchStart < 0) {
            matchStart = pos + 1;
            matchLimit = offset + 1;
        }
    } else {
        // Forward direction.
        for (int32_t idx = 0; idx < patternLength; ++idx) {
            if (incremental && pos == limit) {
                // Ran out of context with pattern still left over and
                // no mismatch so far.
                return U_PARTIAL_MATCH;
            }
            const UChar key = pattern.charAt(idx);
            UnicodeMatcher* sub = data->lookupMatcher(key);
            if (sub != 0) {
                const UMatchDegree degree =
                    sub->matches(text, pos, limit, incremental);
                if (degree != U_MATCH) {
                    return degree;
                }
                continue;
            }
            // The pos < limit test is redundant when incremental is
            // TRUE (checked above) but is required otherwise.
            if (pos >= limit || key != text.charAt(pos)) {
                return U_MISMATCH;
            }
            ++pos;
        }
        // Record the match position; a forward match always overwrites
        // any previously recorded range.
        matchStart = offset;
        matchLimit = pos;
    }

    offset = pos;
    return U_MATCH;
}
/**
 * Implement UnicodeMatcher.
 *
 * Builds the rule-syntax representation of this matcher in result.
 * Each pattern code unit that maps to a sub-matcher contributes that
 * sub-matcher's own pattern; every other code unit is appended through
 * ICU_Utility::appendToRule. When segmentNumber is greater than zero
 * the whole output is enclosed in parentheses.
 *
 * @param result            receives the pattern; previous contents are
 *                          discarded
 * @param escapeUnprintable forwarded to appendToRule and to the
 *                          sub-matchers' toPattern
 * @return result
 */
UnicodeString& StringMatcher::toPattern(UnicodeString& result,
                                        UBool escapeUnprintable) const
{
    result.truncate(0);
    UnicodeString str, quoteBuf;
    if (segmentNumber > 0) {
        result.append((UChar)40); /*(*/
    }
    for (int32_t i=0; i<pattern.length(); ++i) {
        UChar keyChar = pattern.charAt(i);
        const UnicodeMatcher* m = data->lookupMatcher(keyChar);
        if (m == 0) {
            ICU_Utility::appendToRule(result, keyChar, FALSE, escapeUnprintable, quoteBuf);
        } else {
            ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable),
                                      TRUE, escapeUnprintable, quoteBuf);
        }
    }
    // Flush quoteBuf out to result BEFORE closing the segment.
    // appendToRule may hold characters back in quoteBuf; if the ')' is
    // written first, that pending quoted text ends up after the closing
    // parenthesis, i.e. outside the segment it belongs to.
    ICU_Utility::appendToRule(result, -1,
                              TRUE, escapeUnprintable, quoteBuf);
    if (segmentNumber > 0) {
        result.append((UChar)41); /*)*/
    }
    return result;
}
/**
 * Implement UnicodeMatcher.
 *
 * Only the first code point of the pattern is examined. An empty
 * pattern matches every index value.
 *
 * @param v the index value to test
 * @return TRUE if the pattern is empty; otherwise the answer of the
 *         first code point's sub-matcher if it has one, or else
 *         whether the low 8 bits of that code point equal v
 */
UBool StringMatcher::matchesIndexValue(uint8_t v) const {
    if (pattern.length() == 0) {
        return TRUE;
    }
    const UChar32 first = pattern.char32At(0);
    const UnicodeMatcher* sub = data->lookupMatcher(first);
    if (sub != 0) {
        return sub->matchesIndexValue(v);
    }
    return (first & 0xFF) == v;
}
/**
 * Implement UnicodeMatcher.
 *
 * Iterates over the pattern by code point. A code point with a
 * sub-matcher in the rule data has that sub-matcher add its own match
 * set; any other code point is added to the set directly.
 *
 * @param toUnionTo the set that is added to
 */
void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const {
    int32_t pos = 0;
    while (pos < pattern.length()) {
        const UChar32 cp = pattern.char32At(pos);
        const UnicodeMatcher* sub = data->lookupMatcher(cp);
        if (sub != NULL) {
            sub->addMatchSetTo(toUnionTo);
        } else {
            toUnionTo.add(cp);
        }
        // Step over one or two code units, depending on the code point.
        pos += U16_LENGTH(cp);
    }
}
/**
 * UnicodeReplacer API.
 *
 * Replaces text[start, limit) with the text of the recorded match
 * range [matchStart, matchLimit). The matched text is first copied to
 * position limit with Replaceable::copy (so out-of-band data travels
 * with it), and the original range is then deleted.
 *
 * @param text  the text to modify
 * @param start start of the range to replace
 * @param limit limit of the range to replace
 * @param cursor unused
 * @return the length of the copied text; 0 when no match is recorded
 *         or the recorded match is empty
 */
int32_t StringMatcher::replace(Replaceable& text,
                               int32_t start,
                               int32_t limit,
                               int32_t& /*cursor*/) {
    int32_t copiedLength = 0;

    // matchStart < 0 means no match was recorded, i.e. a quantifier
    // matched zero-length. E.g., x (a)* y matched "xy".
    const UBool haveNonEmptyMatch =
        (matchStart >= 0) && (matchStart != matchLimit);
    if (haveNonEmptyMatch) {
        // Copy the segment, with out-of-band data, to just after the
        // range that is about to be removed.
        text.copy(matchStart, matchLimit, limit);
        copiedLength = matchLimit - matchStart;
    }

    // Delete the original text.
    text.handleReplaceBetween(start, limit, UnicodeString());
    return copiedLength;
}
/**
 * UnicodeReplacer API.
 *
 * Writes a segment reference into rule: a dollar sign followed by the
 * segment number in decimal.
 *
 * @param rule receives the pattern; previous contents are discarded
 * @param escapeUnprintable unused
 * @return rule
 */
UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule,
                                                UBool /*escapeUnprintable*/) const {
    // assert(segmentNumber > 0);
    const UChar dollarSign = 0x0024; /*$*/
    rule.truncate(0);
    rule.append(dollarSign);
    // Radix 10, at least one digit.
    ICU_Utility::appendNumber(rule, segmentNumber, 10, 1);
    return rule;
}
/**
 * Discard any recorded match range by resetting both ends to -1.
 * This must be called before performing a set of matches with this
 * segment.
 */
void StringMatcher::resetMatch() {
    matchLimit = -1;
    matchStart = -1;
}
/**
 * Union the set of all characters that may be output by this object
 * into the given set. This implementation deliberately adds nothing.
 * @param toUnionTo the set into which to union the output characters
 * (unused here; it is left unmodified)
 */
void StringMatcher::addReplacementSetTo(UnicodeSet& /*toUnionTo*/) const {
// The output of this replacer varies; it is the source text between
// matchStart and matchLimit. Since this varies depending on the
// input text, we can't compute it here. We can either do nothing
// or we can add ALL characters to the set. It's probably more useful
// to do nothing.
}
/**
* Implement UnicodeFunctor
*/
void StringMatcher::setData(const TransliterationRuleData* d) {
data = d;
int32_t i = 0;
while (i<pattern.length()) {
UChar32 c = pattern.char32At(i);
UnicodeFunctor* f = data->lookup(c);
if (f != NULL) {
f->setData(data);
}
i += U16_LENGTH(c);
}
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_TRANSLITERATION */
//eof