node/deps/icu-small/source/i18n/brktrans.cpp

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
*   Copyright (C) 2008-2015, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   05/11/2008  Andy Heninger  Port from Java
**********************************************************************
*/

#include "unicode/utypes.h"

#if  !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION

#include "unicode/brkiter.h"
#include "unicode/localpointer.h"
#include "unicode/uchar.h"
#include "unicode/unifilt.h"
#include "unicode/uniset.h"

#include "brktrans.h"
#include "cmemory.h"
#include "mutex.h"
#include "uprops.h"
#include "uinvchar.h"
#include "util.h"
#include "uvectr32.h"

U_NAMESPACE_BEGIN

// NOTE(review): this macro is defined outside this file; presumably it emits
// the class-ID (RTTI) member definitions for BreakTransliterator -- confirm.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator)

// Code unit used to initialize fInsertion in the filter-taking constructor:
// U+0020, the ASCII space character.
static const UChar SPACE       = 32;  // ' '


/**
 * Constructs a break transliterator registered under the ID
 * "Any-BreakInternal". The insertion string starts out as a single
 * space, and both cache slots (break iterator and boundaries vector)
 * start out empty; they are filled lazily by handleTransliterate().
 *
 * @param adoptedFilter  forwarded unchanged to the Transliterator base
 *                       class constructor.
 */
BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter)
    : Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter),
      cachedBI(),           // no cached break iterator yet
      cachedBoundaries(),   // no cached boundaries vector yet
      fInsertion(SPACE)
{
}


/**
 * Destructor. The body is intentionally empty: this class performs no
 * explicit cleanup here.
 */
BreakTransliterator::~BreakTransliterator()
{
}

/**
 * Copy constructor. Copies the Transliterator base state and the
 * insertion string from the source object. The cached break iterator
 * and boundaries vector are deliberately NOT copied; the new object
 * starts with empty cache slots.
 *
 * @param o  the transliterator to copy from.
 */
BreakTransliterator::BreakTransliterator(const BreakTransliterator& o)
    : Transliterator(o),
      cachedBI(),
      cachedBoundaries(),
      fInsertion(o.fInsertion)
{
}


/**
 * Transliterator API: produces a heap-allocated copy of this object
 * via the copy constructor.
 *
 * @return a pointer to the newly allocated copy.
 */
Transliterator* BreakTransliterator::clone() const {
    BreakTransliterator *duplicate = new BreakTransliterator(*this);
    return duplicate;
}

/**
 * Implements {@link Transliterator#handleTransliterate}.
 *
 * Walks the word boundaries reported by an English word BreakIterator,
 * beginning from the boundary that precedes offsets.start and stopping at
 * UBRK_DONE or at the first boundary that is not below offsets.limit.
 * Every boundary whose neighbouring code points (the one just before and
 * the one at the boundary) are both letters or marks receives a copy of
 * fInsertion.
 *
 * On return, offsets.contextLimit and offsets.limit have grown by the
 * total inserted length, and offsets.start has been moved (see below).
 * If the break iterator or the boundaries vector cannot be obtained, the
 * method returns without touching text or offsets.
 *
 * @param text           the text to modify in place.
 * @param offsets        the region to process; updated on return.
 * @param isIncremental  selects how offsets.start is updated.
 */
void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
                                                    UBool isIncremental ) const {
    UErrorCode errorCode = U_ZERO_ERROR;
    LocalPointer<BreakIterator> wordIter;
    LocalPointer<UVector32> pending;

    // This method is const, but the cache members have to be updated.
    BreakTransliterator *self = const_cast<BreakTransliterator *>(this);

    // Take the cached helpers (if any) out of the object while holding a Mutex.
    {
        Mutex lock;
        pending.moveFrom(self->cachedBoundaries);
        wordIter.moveFrom(self->cachedBI);
    }

    // Create whatever the cache did not supply.
    if (wordIter.isNull()) {
        wordIter.adoptInstead(BreakIterator::createWordInstance(Locale::getEnglish(), errorCode));
    }
    if (pending.isNull()) {
        pending.adoptInstead(new UVector32(errorCode));
    }

    if (U_FAILURE(errorCode) || wordIter.isNull() || pending.isNull()) {
        return;
    }

    pending->removeAllElements();
    UnicodeString snapshot = replaceableAsString(text);
    wordIter->setText(snapshot);
    wordIter->preceding(offsets.start);

    // To keep things simple, the accepted boundaries are stacked first and
    // the insertions are performed afterwards. Generally there will not be
    // many of them, since the input is filtered.
    const uint32_t letterOrMark = U_GC_L_MASK | U_GC_M_MASK;

    int32_t pos = wordIter->next();
    while (pos != UBRK_DONE && pos < offsets.limit) {
        // HACK: accept the boundary only when the preceding item and the
        // following item are both letters or marks. Position 0 has no
        // preceding item and is skipped.
        if (pos != 0 &&
            (U_MASK(u_charType(snapshot.char32At(pos - 1))) & letterOrMark) != 0 &&
            (U_MASK(u_charType(snapshot.char32At(pos))) & letterOrMark) != 0) {
            pending->addElement(pos, errorCode);
        }
        pos = wordIter->next();
    }

    int32_t growth = 0;     // total number of code units inserted
    int32_t resumeAt = 0;   // last accepted boundary (index in the original text)

    const int32_t found = pending->size();
    if (found != 0) {   // something was found, so adjust
        growth = found * fInsertion.length();
        resumeAt = pending->lastElementi();

        // Insert from the end backwards, so that the boundaries still
        // waiting on the stack do not need to be updated.
        for (int32_t remaining = found; remaining > 0; --remaining) {
            const int32_t at = pending->popi();
            text.handleReplaceBetween(at, at, fInsertion);
        }
    }

    // Now fix up the return values.
    offsets.contextLimit += growth;
    offsets.limit += growth;
    if (isIncremental) {
        // NOTE(review): when nothing was inserted, resumeAt and growth are
        // both 0, so this sets offsets.start to 0 -- confirm that is intended.
        offsets.start = resumeAt + growth;
    } else {
        offsets.start = offsets.limit;
    }

    // Hand the break iterator and the boundaries vector back to the cache,
    // but only into slots that are still empty.
    {
        Mutex lock;
        if (self->cachedBI.isNull()) {
            self->cachedBI.moveFrom(wordIter);
        }
        if (self->cachedBoundaries.isNull()) {
            self->cachedBoundaries.moveFrom(pending);
        }
    }

    // TODO:  do something with U_FAILURE(errorCode);
    //        (need to look at transliterators overall, not just here.)
}

//
//  getInsertion()   Returns a reference to the string that
//                   handleTransliterate() inserts at each accepted boundary.
//
const UnicodeString &BreakTransliterator::getInsertion() const {
    return this->fInsertion;
}

//
//  setInsertion()
//
void BreakTransliterator::setInsertion(const UnicodeString &insertion) {
    this->fInsertion = insertion;
}

//
//   replaceableAsString   Hack that lets break iterators operate on the
//                         Replaceable text handed to transliterators.
//                         When the Replaceable is actually a UnicodeString
//                         it is copied directly; otherwise its full
//                         contents [0, length()) are extracted into a new
//                         string. In practice UnicodeString is the only
//                         real Replaceable type expected here, so the
//                         direct path is the normal one.
//
UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) {
    UnicodeString *asString = dynamic_cast<UnicodeString *>(&r);
    if (asString == NULL) {
        // Generic Replaceable: pull out the whole contents.
        UnicodeString extracted;
        r.extractBetween(0, r.length(), extracted);
        return extracted;
    }
    UnicodeString copy;
    copy = *asString;
    return copy;
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION */
deps: ICU 59.1 bump * No feature changes. * Bug fixes. * Details: http://site.icu-project.org/download/59 Fixes: https://github.com/nodejs/node/issues/12077 PR-URL: https://github.com/nodejs/node/pull/12486 Refs: https://github.com/nodejs/node/issues/7844 Reviewed-By: James M Snell <jasnell@gmail.com> Reviewed-By: Michael Dawson <michael_dawson@ca.ibm.com> Reviewed-By: Refael Ackermann <refack@gmail.com> 8 years ago			`// © 2016 and later: Unicode, Inc. and others.`
deps: Intl: ICU 58 bump - small icu (BIG COMMIT) This commit contains the ICU 58.1 delta. It is especially large because of the ICU license change, and, because the line endings were off previously. * bump to ICU 58.1 - check in small ICU source * from 58.1 final http://site.icu-project.org/download/58 Fixes: https://github.com/nodejs/node/issues/7844 PR-URL: https://github.com/nodejs/node/pull/9234 Reviewed-By: James M Snell <jasnell@gmail.com> 8 years ago			`// License & terms of use: http://www.unicode.org/copyright.html`
deps: Intl: Check in "small-icu" 57.1 * this commit has "small" ICU 57.1. See other related commit for tools to generate this commit. Fixes: https://github.com/nodejs/node/issues/3476 PR-URL: https://github.com/nodejs/node/pull/6088 Reviewed-By: James M Snell <jasnell@gmail.com> 9 years ago			`/*`
			`**********************************************************************`
			`* Copyright (C) 2008-2015, International Business Machines`
			`* Corporation and others. All Rights Reserved.`
			`**********************************************************************`
			`* Date Name Description`
			`* 05/11/2008 Andy Heninger Port from Java`
			`**********************************************************************`
			`*/`

			`#include "unicode/utypes.h"`

			`#if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION`

			`#include "unicode/brkiter.h"`
			`#include "unicode/localpointer.h"`
			`#include "unicode/uchar.h"`
			`#include "unicode/unifilt.h"`
			`#include "unicode/uniset.h"`

			`#include "brktrans.h"`
			`#include "cmemory.h"`
			`#include "mutex.h"`
			`#include "uprops.h"`
			`#include "uinvchar.h"`
			`#include "util.h"`
			`#include "uvectr32.h"`

			`U_NAMESPACE_BEGIN`

			`UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator)`

			`static const UChar SPACE = 32; // ' '`


			`/**`
			`* Constructs a transliterator with the default delimiters '{' and`
			`* '}'.`
			`*/`
			`BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) :`
			`Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter),`
			`cachedBI(NULL), cachedBoundaries(NULL), fInsertion(SPACE) {`
			`}`


			`/**`
			`* Destructor.`
			`*/`
			`BreakTransliterator::~BreakTransliterator() {`
			`}`

			`/**`
			`* Copy constructor.`
			`*/`
			`BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) :`
			`Transliterator(o), cachedBI(NULL), cachedBoundaries(NULL), fInsertion(o.fInsertion) {`
			`}`


			`/**`
			`* Transliterator API.`
			`*/`
			`Transliterator* BreakTransliterator::clone(void) const {`
			`return new BreakTransliterator(*this);`
			`}`

			`/**`
			`* Implements {@link Transliterator#handleTransliterate}.`
			`*/`
			`void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,`
			`UBool isIncremental ) const {`

			`UErrorCode status = U_ZERO_ERROR;`
			`LocalPointer<BreakIterator> bi;`
			`LocalPointer<UVector32> boundaries;`

			`{`
			`Mutex m;`
			`BreakTransliterator nonConstThis = const_cast<BreakTransliterator >(this);`
			`boundaries.moveFrom(nonConstThis->cachedBoundaries);`
			`bi.moveFrom(nonConstThis->cachedBI);`
			`}`
			`if (bi.isNull()) {`
			`bi.adoptInstead(BreakIterator::createWordInstance(Locale::getEnglish(), status));`
			`}`
			`if (boundaries.isNull()) {`
			`boundaries.adoptInstead(new UVector32(status));`
			`}`

			`if (bi.isNull() \|\| boundaries.isNull() \|\| U_FAILURE(status)) {`
			`return;`
			`}`

			`boundaries->removeAllElements();`
			`UnicodeString sText = replaceableAsString(text);`
			`bi->setText(sText);`
			`bi->preceding(offsets.start);`

			`// To make things much easier, we will stack the boundaries, and then insert at the end.`
			`// generally, we won't need too many, since we will be filtered.`

			`int32_t boundary;`
			`for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) {`
			`if (boundary == 0) continue;`
			`// HACK: Check to see that preceeding item was a letter`

			`UChar32 cp = sText.char32At(boundary-1);`
			`int type = u_charType(cp);`
			`//System.out.println(Integer.toString(cp,16) + " (before): " + type);`
			`if ((U_MASK(type) & (U_GC_L_MASK \| U_GC_M_MASK)) == 0) continue;`

			`cp = sText.char32At(boundary);`
			`type = u_charType(cp);`
			`//System.out.println(Integer.toString(cp,16) + " (after): " + type);`
			`if ((U_MASK(type) & (U_GC_L_MASK \| U_GC_M_MASK)) == 0) continue;`

			`boundaries->addElement(boundary, status);`
			`// printf("Boundary at %d\n", boundary);`
			`}`

			`int delta = 0;`
			`int lastBoundary = 0;`

			`if (boundaries->size() != 0) { // if we found something, adjust`
			`delta = boundaries->size() * fInsertion.length();`
			`lastBoundary = boundaries->lastElementi();`

			`// we do this from the end backwards, so that we don't have to keep updating.`

			`while (boundaries->size() > 0) {`
			`boundary = boundaries->popi();`
			`text.handleReplaceBetween(boundary, boundary, fInsertion);`
			`}`
			`}`

			`// Now fix up the return values`
			`offsets.contextLimit += delta;`
			`offsets.limit += delta;`
			`offsets.start = isIncremental ? lastBoundary + delta : offsets.limit;`

			`// Return break iterator & boundaries vector to the cache.`
			`{`
			`Mutex m;`
			`BreakTransliterator nonConstThis = const_cast<BreakTransliterator >(this);`
			`if (nonConstThis->cachedBI.isNull()) {`
			`nonConstThis->cachedBI.moveFrom(bi);`
			`}`
			`if (nonConstThis->cachedBoundaries.isNull()) {`
			`nonConstThis->cachedBoundaries.moveFrom(boundaries);`
			`}`
			`}`

			`// TODO: do something with U_FAILURE(status);`
			`// (need to look at transliterators overall, not just here.)`
			`}`

			`//`
			`// getInsertion()`
			`//`
			`const UnicodeString &BreakTransliterator::getInsertion() const {`
			`return fInsertion;`
			`}`

			`//`
			`// setInsertion()`
			`//`
			`void BreakTransliterator::setInsertion(const UnicodeString &insertion) {`
			`this->fInsertion = insertion;`
			`}`

			`//`
			`// replaceableAsString Hack to let break iterators work`
			`// on the replaceable text from transliterators.`
			`// In practice, the only real Replaceable type that we`
			`// will be seeing is UnicodeString, so this function`
			`// will normally be efficient.`
			`//`
			`UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) {`
			`UnicodeString s;`
			`UnicodeString rs = dynamic_cast<UnicodeString >(&r);`
			`if (rs != NULL) {`
			`s = *rs;`
			`} else {`
			`r.extractBetween(0, r.length(), s);`
			`}`
			`return s;`
			`}`

			`U_NAMESPACE_END`

			`#endif /* #if !UCONFIG_NO_TRANSLITERATION */`