node/deps/icu-small/source/i18n/strrepl.cpp

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
*   Copyright (c) 2002-2012, International Business Machines Corporation
*   and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   01/21/2002  aliu        Creation.
**********************************************************************
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "unicode/uniset.h"
#include "unicode/utf16.h"
#include "strrepl.h"
#include "rbt_data.h"
#include "util.h"

U_NAMESPACE_BEGIN

// Out-of-line definition of the UnicodeReplacer destructor; it has no work to do.
UnicodeReplacer::~UnicodeReplacer() {}
// NOTE(review): presumably expands to the RTTI (class ID) definitions for
// StringReplacer -- confirm against the macro's definition.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer)

/**
 * Build a StringReplacer that emits the given output text and also
 * reports a cursor position.
 * @param theOutput the replacement text written by replace().  It may
 * contain stand-in characters that represent nested replacers.
 * @param theCursorPos the cursor position that replace() reports
 * @param theData transliterator context object used to translate
 * stand-in characters into UnicodeReplacer objects
 */
StringReplacer::StringReplacer(const UnicodeString& theOutput,
                               int32_t theCursorPos,
                               const TransliterationRuleData* theData) {
    // Context used to resolve stand-in characters found in the output.
    this->data = theData;
    this->output = theOutput;

    // A cursor position was supplied, so replace() will report it.
    this->hasCursor = TRUE;
    this->cursorPos = theCursorPos;

    // Assume nested replacers until replace() has scanned the output
    // once and found none.
    this->isComplex = TRUE;
}

/**
 * Build a StringReplacer that emits the given output text and leaves
 * the cursor alone.
 * @param theOutput the replacement text written by replace().  It may
 * contain stand-in characters that represent nested replacers.
 * @param theData transliterator context object used to translate
 * stand-in characters into UnicodeReplacer objects
 */
StringReplacer::StringReplacer(const UnicodeString& theOutput,
                               const TransliterationRuleData* theData) {
    // Context used to resolve stand-in characters found in the output.
    this->data = theData;
    this->output = theOutput;

    // No cursor was supplied: cursorPos is only a placeholder here.
    this->hasCursor = FALSE;
    this->cursorPos = 0;

    // Assume nested replacers until replace() has scanned the output
    // once and found none.
    this->isComplex = TRUE;
}

/**
 * Copy constructor.  Every field is copied member by member; the
 * 'data' pointer is copied as a pointer, so both objects refer to the
 * same TransliterationRuleData.
 */
StringReplacer::StringReplacer(const StringReplacer& other) :
    UnicodeFunctor(other),
    UnicodeReplacer(other)
{
    // Shared (not duplicated) rule data.
    this->data = other.data;

    // Output text and the cached "has nested replacers" flag.
    this->output = other.output;
    this->isComplex = other.isComplex;

    // Cursor configuration.
    this->hasCursor = other.hasCursor;
    this->cursorPos = other.cursorPos;
}

/**
 * Destructor.  Nothing is released explicitly here; in particular the
 * 'data' pointer is not deleted.
 */
StringReplacer::~StringReplacer() {}

/**
 * Implement UnicodeFunctor
 *
 * @return a newly allocated copy of this object, produced with the
 * copy constructor.
 */
UnicodeFunctor* StringReplacer::clone() const {
    StringReplacer* duplicate = new StringReplacer(*this);
    return duplicate;
}

/**
 * Implement UnicodeFunctor
 *
 * @return this object, viewed through its UnicodeReplacer interface.
 * Constness is cast away because the method is const while the
 * returned pointer is not.
 */
UnicodeReplacer* StringReplacer::toReplacer() const {
    StringReplacer* self = const_cast<StringReplacer *>(this);
    return self;
}

/**
 * UnicodeReplacer API
 *
 * Replace the range [start, limit) of 'text' with the text described by
 * 'output'.  Characters of 'output' for which data->lookupReplacer()
 * returns a replacer are expanded by delegating to that replacer; all
 * other characters are emitted literally.
 *
 * The first call always takes the "complex" path, which scans 'output'
 * for nested replacers.  If none is found, isComplex stays FALSE and
 * later calls take the "simple" path, a single handleReplaceBetween().
 *
 * @param text the text being modified in place
 * @param start start index of the range to replace
 * @param limit limit index of the range to replace
 * @param cursor receives the new cursor position when hasCursor is
 * true.  It is also handed to every nested replacer, which may set it
 * itself.
 * @return the length of the text that replaced [start, limit)
 */
int32_t StringReplacer::replace(Replaceable& text,
                                int32_t start,
                                int32_t limit,
                                int32_t& cursor) {
    int32_t outLen;
    // Cursor position; relative to 'start' until the hasCursor block
    // at the end converts it to an absolute index.
    int32_t newStart = 0;

    // NOTE: It should be possible to _always_ run the complex
    // processing code; just slower.  If not, then there is a bug
    // in the complex processing code.

    // Simple (no nested replacers) Processing Code :
    if (!isComplex) {
        text.handleReplaceBetween(start, limit, output);
        outLen = output.length();

        // Setup default cursor position (for cursorPos within output)
        newStart = cursorPos;
    }

    // Complex (nested replacers) Processing Code :
    else {
        /* When there are segments to be copied, use the Replaceable.copy()
         * API in order to retain out-of-band data.  Copy everything to the
         * end of the string, then copy them back over the key.  This preserves
         * the integrity of indices into the key and surrounding context while
         * generating the output text.
         */
        UnicodeString buf;
        int32_t oOutput; // offset into 'output'
        // Assume no nested replacers; set back to TRUE below as soon as
        // one is found.
        // NOTE(review): this writes member state from replace(); confirm
        // that callers never run replace() concurrently on one instance.
        isComplex = FALSE;

        // The temporary buffer starts at tempStart, and extends
        // to destLimit.  The start of the buffer has a single
        // character from before the key.  This provides style
        // data when additional characters are filled into the
        // temporary buffer.  If there is nothing to the left, use
        // the non-character U+FFFF, which Replaceable subclasses
        // should treat specially as a "no-style character."
        // destStart points to the point after the style context
        // character, so it is tempStart+1 or tempStart+2.
        int32_t tempStart = text.length(); // start of temp buffer
        int32_t destStart = tempStart; // copy new text to here
        if (start > 0) {
            // Copy the whole code point preceding the key (1 or 2 units).
            int32_t len = U16_LENGTH(text.char32At(start-1));
            text.copy(start-len, start, tempStart);
            destStart += len;
        } else {
            UnicodeString str((UChar) 0xFFFF);
            text.handleReplaceBetween(tempStart, tempStart, str);
            destStart++;
        }
        // End of the generated text so far; grows as output is appended.
        int32_t destLimit = destStart;

        for (oOutput=0; oOutput<output.length(); ) {
            if (oOutput == cursorPos) {
                // Record the position of the cursor
                newStart = destLimit - destStart; // relative to start
            }
            UChar32 c = output.char32At(oOutput);
            UnicodeReplacer* r = data->lookupReplacer(c);
            if (r == NULL) {
                // Accumulate straight (non-segment) text.
                buf.append(c);
            } else {
                isComplex = TRUE;

                // Insert any accumulated straight text.
                if (buf.length() > 0) {
                    text.handleReplaceBetween(destLimit, destLimit, buf);
                    destLimit += buf.length();
                    buf.truncate(0);
                }

                // Delegate output generation to replacer object.  The
                // empty range [destLimit, destLimit) makes this an insertion.
                int32_t len = r->replace(text, destLimit, destLimit, cursor);
                destLimit += len;
            }
            // Advance by one code point (1 or 2 code units).
            oOutput += U16_LENGTH(c);
        }
        // Insert any accumulated straight text.
        if (buf.length() > 0) {
            text.handleReplaceBetween(destLimit, destLimit, buf);
            destLimit += buf.length();
        }
        // Here oOutput is at the end of 'output'; handle a cursor that
        // sits exactly at the end.
        if (oOutput == cursorPos) {
            // Record the position of the cursor
            newStart = destLimit - destStart; // relative to start
        }

        outLen = destLimit - destStart;

        // Copy new text to start, and delete it.  The indices used from
        // here on are shifted by outLen, which matches copy() inserting
        // outLen units at 'start' rather than overwriting.
        text.copy(destStart, destLimit, start);
        text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, UnicodeString());

        // Delete the old text (the key)
        text.handleReplaceBetween(start + outLen, limit + outLen, UnicodeString());
    }

    if (hasCursor) {
        // Adjust the cursor for positions outside the key.  These
        // refer to code points rather than code units.  If cursorPos
        // is within the output string, then use newStart, which has
        // already been set above.
        if (cursorPos < 0) {
            newStart = start;
            int32_t n = cursorPos;
            // Outside the output string, cursorPos counts code points
            while (n < 0 && newStart > 0) {
                newStart -= U16_LENGTH(text.char32At(newStart-1));
                ++n;
            }
            // If the start of the text was reached first, n is still
            // negative and the remainder is applied as-is, so newStart
            // can end up below zero.
            newStart += n;
        } else if (cursorPos > output.length()) {
            newStart = start + outLen;
            int32_t n = cursorPos - output.length();
            // Outside the output string, cursorPos counts code points
            while (n > 0 && newStart < text.length()) {
                newStart += U16_LENGTH(text.char32At(newStart));
                --n;
            }
            // If the end of the text was reached first, the remainder
            // is applied as-is, so newStart can exceed text.length().
            newStart += n;
        } else {
            // Cursor is within output string.  It has been set up above
            // to be relative to start.
            newStart += start;
        }

        cursor = newStart;
    }

    return outLen;
}

/**
 * UnicodeReplacer API
 *
 * Regenerate rule-syntax source text for this replacer.
 *
 * The cursor, when present, is rendered as follows:
 *   - before the output (cursorPos < 0):  '|' followed by one '@' per
 *     position of offset, then the output, e.g. "|@@abc";
 *   - inside the output:                  '|' at that offset, e.g. "a|bc";
 *   - exactly at the end of the output:   nothing, since that is the
 *     default position;
 *   - after the output:                   the output, one '@' per
 *     position of offset, then '|', e.g. "abc@@|".
 *
 * Earlier revisions consumed the leading offset with
 * "while (cursor++ < 0)", which left the counter at 1 instead of 0.
 * The '|' was then written after the first output code unit (or not at
 * all for outputs shorter than two code units), and the '@' fillers
 * came out on the wrong side of the '|'.
 *
 * @param rule receives the pattern; any previous contents are discarded
 * @param escapeUnprintable passed through to ICU_Utility::appendToRule()
 * and to nested replacers
 * @return a reference to 'rule'
 */
UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule,
                                                 UBool escapeUnprintable) const {
    rule.truncate(0);
    UnicodeString quoteBuf;

    // Handle a cursor preceding the output.  The '|' is the outermost
    // element, mirroring the trailing case below where it comes last.
    if (hasCursor && cursorPos < 0) {
        ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
        for (int32_t n = cursorPos; n < 0; ++n) {
            ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
        }
    }

    for (int32_t i=0; i<output.length(); ++i) {
        // Cursor inside the output.  A negative cursorPos never equals
        // i, so the '|' emitted above is not repeated here.
        if (hasCursor && i == cursorPos) {
            ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
        }
        UChar c = output.charAt(i); // Ok to use 16-bits here

        UnicodeReplacer* r = data->lookupReplacer(c);
        if (r == NULL) {
            // Ordinary character: let appendToRule() quote/escape it.
            ICU_Utility::appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf);
        } else {
            // Stand-in: splice in the nested replacer's own pattern,
            // padded with a space on each side.
            UnicodeString buf;
            r->toReplacerPattern(buf, escapeUnprintable);
            buf.insert(0, (UChar)0x20);
            buf.append((UChar)0x20);
            ICU_Utility::appendToRule(rule, buf,
                                      TRUE, escapeUnprintable, quoteBuf);
        }
    }

    // Handle a cursor after the output.  Use > rather than >= because
    // if cursorPos == output.length() it is at the end of the output,
    // which is the default position, so we need not emit it.
    if (hasCursor && cursorPos > output.length()) {
        for (int32_t n = cursorPos - output.length(); n > 0; --n) {
            ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
        }
        ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
    }
    // Flush quoteBuf out to result
    ICU_Utility::appendToRule(rule, -1,
                              TRUE, escapeUnprintable, quoteBuf);

    return rule;
}

/**
 * Implement UnicodeReplacer
 *
 * Walk the output one code point at a time.  A code point that maps to
 * a nested replacer contributes that replacer's own replacement set;
 * any other code point is added to the set directly.
 *
 * @param toUnionTo the set that receives the code points
 */
void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const {
    int32_t pos = 0;
    while (pos < output.length()) {
        const UChar32 cp = output.char32At(pos);
        pos += U16_LENGTH(cp);

        UnicodeReplacer* nested = data->lookupReplacer(cp);
        if (nested != NULL) {
            nested->addReplacementSetTo(toUnionTo);
        } else {
            toUnionTo.add(cp);
        }
    }
}

/**
 * UnicodeFunctor API
 */
void StringReplacer::setData(const TransliterationRuleData* d) {
    data = d;
    int32_t i = 0;
    while (i<output.length()) {
        UChar32 c = output.char32At(i);
        UnicodeFunctor* f = data->lookup(c);
        if (f != NULL) {
            f->setData(data);
        }
        i += U16_LENGTH(c);
    }
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION */

//eof
deps: ICU 59.1 bump * No feature changes. * Bug fixes. * Details: http://site.icu-project.org/download/59 Fixes: https://github.com/nodejs/node/issues/12077 PR-URL: https://github.com/nodejs/node/pull/12486 Refs: https://github.com/nodejs/node/issues/7844 Reviewed-By: James M Snell <jasnell@gmail.com> Reviewed-By: Michael Dawson <michael_dawson@ca.ibm.com> Reviewed-By: Refael Ackermann <refack@gmail.com> 8 years ago			`// © 2016 and later: Unicode, Inc. and others.`
deps: Intl: ICU 58 bump - small icu (BIG COMMIT) This commit contains the ICU 58.1 delta. It is especially large because of the ICU license change, and, because the line endings were off previously. * bump to ICU 58.1 - check in small ICU source * from 58.1 final http://site.icu-project.org/download/58 Fixes: https://github.com/nodejs/node/issues/7844 PR-URL: https://github.com/nodejs/node/pull/9234 Reviewed-By: James M Snell <jasnell@gmail.com> 8 years ago			`// License & terms of use: http://www.unicode.org/copyright.html`
deps: Intl: Check in "small-icu" 57.1 * this commit has "small" ICU 57.1. See other related commit for tools to generate this commit. Fixes: https://github.com/nodejs/node/issues/3476 PR-URL: https://github.com/nodejs/node/pull/6088 Reviewed-By: James M Snell <jasnell@gmail.com> 9 years ago			`/*`
			`**********************************************************************`
			`* Copyright (c) 2002-2012, International Business Machines Corporation`
			`* and others. All Rights Reserved.`
			`**********************************************************************`
			`* Date Name Description`
			`* 01/21/2002 aliu Creation.`
			`**********************************************************************`
			`*/`

			`#include "unicode/utypes.h"`

			`#if !UCONFIG_NO_TRANSLITERATION`

			`#include "unicode/uniset.h"`
			`#include "unicode/utf16.h"`
			`#include "strrepl.h"`
			`#include "rbt_data.h"`
			`#include "util.h"`

			`U_NAMESPACE_BEGIN`

			`UnicodeReplacer::~UnicodeReplacer() {}`
			`UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer)`

			`/**`
			`* Construct a StringReplacer that sets the emits the given output`
			`* text and sets the cursor to the given position.`
			`* @param theOutput text that will replace input text when the`
			`* replace() method is called. May contain stand-in characters`
			`* that represent nested replacers.`
			`* @param theCursorPos cursor position that will be returned by`
			`* the replace() method`
			`* @param theData transliterator context object that translates`
			`* stand-in characters to UnicodeReplacer objects`
			`*/`
			`StringReplacer::StringReplacer(const UnicodeString& theOutput,`
			`int32_t theCursorPos,`
			`const TransliterationRuleData* theData) {`
			`output = theOutput;`
			`cursorPos = theCursorPos;`
			`hasCursor = TRUE;`
			`data = theData;`
			`isComplex = TRUE;`
			`}`

			`/**`
			`* Construct a StringReplacer that sets the emits the given output`
			`* text and does not modify the cursor.`
			`* @param theOutput text that will replace input text when the`
			`* replace() method is called. May contain stand-in characters`
			`* that represent nested replacers.`
			`* @param theData transliterator context object that translates`
			`* stand-in characters to UnicodeReplacer objects`
			`*/`
			`StringReplacer::StringReplacer(const UnicodeString& theOutput,`
			`const TransliterationRuleData* theData) {`
			`output = theOutput;`
			`cursorPos = 0;`
			`hasCursor = FALSE;`
			`data = theData;`
			`isComplex = TRUE;`
			`}`

			`/**`
			`* Copy constructor.`
			`*/`
			`StringReplacer::StringReplacer(const StringReplacer& other) :`
			`UnicodeFunctor(other),`
			`UnicodeReplacer(other)`
			`{`
			`output = other.output;`
			`cursorPos = other.cursorPos;`
			`hasCursor = other.hasCursor;`
			`data = other.data;`
			`isComplex = other.isComplex;`
			`}`

			`/**`
			`* Destructor`
			`*/`
			`StringReplacer::~StringReplacer() {`
			`}`

			`/**`
			`* Implement UnicodeFunctor`
			`*/`
			`UnicodeFunctor* StringReplacer::clone() const {`
			`return new StringReplacer(*this);`
			`}`

			`/**`
			`* Implement UnicodeFunctor`
			`*/`
			`UnicodeReplacer* StringReplacer::toReplacer() const {`
			`return const_cast<StringReplacer *>(this);`
			`}`

			`/**`
			`* UnicodeReplacer API`
			`*/`
			`int32_t StringReplacer::replace(Replaceable& text,`
			`int32_t start,`
			`int32_t limit,`
			`int32_t& cursor) {`
			`int32_t outLen;`
			`int32_t newStart = 0;`

			`// NOTE: It should be possible to _always_ run the complex`
			`// processing code; just slower. If not, then there is a bug`
			`// in the complex processing code.`

			`// Simple (no nested replacers) Processing Code :`
			`if (!isComplex) {`
			`text.handleReplaceBetween(start, limit, output);`
			`outLen = output.length();`

			`// Setup default cursor position (for cursorPos within output)`
			`newStart = cursorPos;`
			`}`

			`// Complex (nested replacers) Processing Code :`
			`else {`
			`/* When there are segments to be copied, use the Replaceable.copy()`
			`* API in order to retain out-of-band data. Copy everything to the`
			`* end of the string, then copy them back over the key. This preserves`
			`* the integrity of indices into the key and surrounding context while`
			`* generating the output text.`
			`*/`
			`UnicodeString buf;`
			`int32_t oOutput; // offset into 'output'`
			`isComplex = FALSE;`

			`// The temporary buffer starts at tempStart, and extends`
			`// to destLimit. The start of the buffer has a single`
			`// character from before the key. This provides style`
			`// data when addition characters are filled into the`
			`// temporary buffer. If there is nothing to the left, use`
			`// the non-character U+FFFF, which Replaceable subclasses`
			`// should treat specially as a "no-style character."`
			`// destStart points to the point after the style context`
			`// character, so it is tempStart+1 or tempStart+2.`
			`int32_t tempStart = text.length(); // start of temp buffer`
			`int32_t destStart = tempStart; // copy new text to here`
			`if (start > 0) {`
			`int32_t len = U16_LENGTH(text.char32At(start-1));`
			`text.copy(start-len, start, tempStart);`
			`destStart += len;`
			`} else {`
			`UnicodeString str((UChar) 0xFFFF);`
			`text.handleReplaceBetween(tempStart, tempStart, str);`
			`destStart++;`
			`}`
			`int32_t destLimit = destStart;`

			`for (oOutput=0; oOutput<output.length(); ) {`
			`if (oOutput == cursorPos) {`
			`// Record the position of the cursor`
			`newStart = destLimit - destStart; // relative to start`
			`}`
			`UChar32 c = output.char32At(oOutput);`
			`UnicodeReplacer* r = data->lookupReplacer(c);`
			`if (r == NULL) {`
			`// Accumulate straight (non-segment) text.`
			`buf.append(c);`
			`} else {`
			`isComplex = TRUE;`

			`// Insert any accumulated straight text.`
			`if (buf.length() > 0) {`
			`text.handleReplaceBetween(destLimit, destLimit, buf);`
			`destLimit += buf.length();`
			`buf.truncate(0);`
			`}`

			`// Delegate output generation to replacer object`
			`int32_t len = r->replace(text, destLimit, destLimit, cursor);`
			`destLimit += len;`
			`}`
			`oOutput += U16_LENGTH(c);`
			`}`
			`// Insert any accumulated straight text.`
			`if (buf.length() > 0) {`
			`text.handleReplaceBetween(destLimit, destLimit, buf);`
			`destLimit += buf.length();`
			`}`
			`if (oOutput == cursorPos) {`
			`// Record the position of the cursor`
			`newStart = destLimit - destStart; // relative to start`
			`}`

			`outLen = destLimit - destStart;`

			`// Copy new text to start, and delete it`
			`text.copy(destStart, destLimit, start);`
			`text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, UnicodeString());`

			`// Delete the old text (the key)`
			`text.handleReplaceBetween(start + outLen, limit + outLen, UnicodeString());`
			`}`

			`if (hasCursor) {`
			`// Adjust the cursor for positions outside the key. These`
			`// refer to code points rather than code units. If cursorPos`
			`// is within the output string, then use newStart, which has`
			`// already been set above.`
			`if (cursorPos < 0) {`
			`newStart = start;`
			`int32_t n = cursorPos;`
			`// Outside the output string, cursorPos counts code points`
			`while (n < 0 && newStart > 0) {`
			`newStart -= U16_LENGTH(text.char32At(newStart-1));`
			`++n;`
			`}`
			`newStart += n;`
			`} else if (cursorPos > output.length()) {`
			`newStart = start + outLen;`
			`int32_t n = cursorPos - output.length();`
			`// Outside the output string, cursorPos counts code points`
			`while (n > 0 && newStart < text.length()) {`
			`newStart += U16_LENGTH(text.char32At(newStart));`
			`--n;`
			`}`
			`newStart += n;`
			`} else {`
			`// Cursor is within output string. It has been set up above`
			`// to be relative to start.`
			`newStart += start;`
			`}`

			`cursor = newStart;`
			`}`

			`return outLen;`
			`}`

			`/**`
			`* UnicodeReplacer API`
			`*/`
			`UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule,`
			`UBool escapeUnprintable) const {`
			`rule.truncate(0);`
			`UnicodeString quoteBuf;`

			`int32_t cursor = cursorPos;`

			`// Handle a cursor preceding the output`
			`if (hasCursor && cursor < 0) {`
			`while (cursor++ < 0) {`
			`ICU_Utility::appendToRule(rule, (UChar)0x0040 /@/, TRUE, escapeUnprintable, quoteBuf);`
			`}`
			`// Fall through and append '\|' below`
			`}`

			`for (int32_t i=0; i<output.length(); ++i) {`
			`if (hasCursor && i == cursor) {`
			`ICU_Utility::appendToRule(rule, (UChar)0x007C /\|/, TRUE, escapeUnprintable, quoteBuf);`
			`}`
			`UChar c = output.charAt(i); // Ok to use 16-bits here`

			`UnicodeReplacer* r = data->lookupReplacer(c);`
			`if (r == NULL) {`
			`ICU_Utility::appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf);`
			`} else {`
			`UnicodeString buf;`
			`r->toReplacerPattern(buf, escapeUnprintable);`
			`buf.insert(0, (UChar)0x20);`
			`buf.append((UChar)0x20);`
			`ICU_Utility::appendToRule(rule, buf,`
			`TRUE, escapeUnprintable, quoteBuf);`
			`}`
			`}`

			`// Handle a cursor after the output. Use > rather than >= because`
			`// if cursor == output.length() it is at the end of the output,`
			`// which is the default position, so we need not emit it.`
			`if (hasCursor && cursor > output.length()) {`
			`cursor -= output.length();`
			`while (cursor-- > 0) {`
			`ICU_Utility::appendToRule(rule, (UChar)0x0040 /@/, TRUE, escapeUnprintable, quoteBuf);`
			`}`
			`ICU_Utility::appendToRule(rule, (UChar)0x007C /\|/, TRUE, escapeUnprintable, quoteBuf);`
			`}`
			`// Flush quoteBuf out to result`
			`ICU_Utility::appendToRule(rule, -1,`
			`TRUE, escapeUnprintable, quoteBuf);`

			`return rule;`
			`}`

			`/**`
			`* Implement UnicodeReplacer`
			`*/`
			`void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const {`
			`UChar32 ch;`
			`for (int32_t i=0; i<output.length(); i+=U16_LENGTH(ch)) {`
			`ch = output.char32At(i);`
			`UnicodeReplacer* r = data->lookupReplacer(ch);`
			`if (r == NULL) {`
			`toUnionTo.add(ch);`
			`} else {`
			`r->addReplacementSetTo(toUnionTo);`
			`}`
			`}`
			`}`

			`/**`
			`* UnicodeFunctor API`
			`*/`
			`void StringReplacer::setData(const TransliterationRuleData* d) {`
			`data = d;`
			`int32_t i = 0;`
			`while (i<output.length()) {`
			`UChar32 c = output.char32At(i);`
			`UnicodeFunctor* f = data->lookup(c);`
			`if (f != NULL) {`
			`f->setData(data);`
			`}`
			`i += U16_LENGTH(c);`
			`}`
			`}`

			`U_NAMESPACE_END`

			`#endif /* #if !UCONFIG_NO_TRANSLITERATION */`

			`//eof`