node/deps/icu-small/source/i18n/utf8collationiterator.cpp


								/*

								*******************************************************************************

								* Copyright (C) 2012-2014, International Business Machines

								* Corporation and others.  All Rights Reserved.

								*******************************************************************************

								* utf8collationiterator.cpp

								*

								* created on: 2012nov12 (from utf16collationiterator.cpp & uitercollationiterator.cpp)

								* created by: Markus W. Scherer

								*/


								#include "unicode/utypes.h"


								#if !UCONFIG_NO_COLLATION


								#include "unicode/utf8.h"

								#include "charstr.h"

								#include "cmemory.h"

								#include "collation.h"

								#include "collationdata.h"

								#include "collationfcd.h"

								#include "collationiterator.h"

								#include "normalizer2impl.h"

								#include "uassert.h"

								#include "utf8collationiterator.h"


								U_NAMESPACE_BEGIN


								UTF8CollationIterator::~UTF8CollationIterator() {}


								void

								UTF8CollationIterator::resetToOffset(int32_t newOffset) {

								    reset();

								    pos = newOffset;

								}


								int32_t

								UTF8CollationIterator::getOffset() const {

								    return pos;

								}


								uint32_t

								UTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) {

								    if(pos == length) {

								        c = U_SENTINEL;

								        return Collation::FALLBACK_CE32;

								    }

								    // Optimized combination of U8_NEXT_OR_FFFD() and UTRIE2_U8_NEXT32().

								    c = u8[pos++];

								    if(c < 0xc0) {

								        // ASCII 00..7F; trail bytes 80..BF map to error values.

								        return trie->data32[c];

								    }

								    uint8_t t1, t2;

								    if(c < 0xe0 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) {

								        // U+0080..U+07FF; 00..7F map to error values.

								        uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1];

								        c = ((c & 0x1f) << 6) | t1;

								        ++pos;

								        return ce32;

								    } else if(c <= 0xef &&

								              ((pos + 1) < length || length < 0) &&

								              (t1 = (u8[pos] - 0x80)) <= 0x3f && (c != 0xe0 || t1 >= 0x20) &&

								              (t2 = (u8[pos + 1] - 0x80)) <= 0x3f

								    ) {

								        // U+0800..U+FFFF; caller maps surrogates to error values.

								        c = (UChar)((c << 12) | (t1 << 6) | t2);

								        pos += 2;

								        return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);

								    } else {

								        // Function call for supplementary code points and error cases.

								        // Illegal byte sequences yield U+FFFD.

								        c = utf8_nextCharSafeBody(u8, &pos, length, c, -3);

								        return data->getCE32(c);

								    }

								}


								UBool

								UTF8CollationIterator::foundNULTerminator() {

								    if(length < 0) {

								        length = --pos;

								        return TRUE;

								    } else {

								        return FALSE;

								    }

								}


								UBool

								UTF8CollationIterator::forbidSurrogateCodePoints() const {

								    return TRUE;

								}


								UChar32

								UTF8CollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) {

								    if(pos == length) {

								        return U_SENTINEL;

								    }

								    if(u8[pos] == 0 && length < 0) {

								        length = pos;

								        return U_SENTINEL;

								    }

								    UChar32 c;

								    U8_NEXT_OR_FFFD(u8, pos, length, c);

								    return c;

								}


								UChar32

								UTF8CollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) {

								    if(pos == 0) {

								        return U_SENTINEL;

								    }

								    UChar32 c;

								    U8_PREV_OR_FFFD(u8, 0, pos, c);

								    return c;

								}


								void

								UTF8CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {

								    U8_FWD_N(u8, pos, length, num);

								}


								void

								UTF8CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {

								    U8_BACK_N(u8, 0, pos, num);

								}


								// FCDUTF8CollationIterator ------------------------------------------------ ***


								FCDUTF8CollationIterator::~FCDUTF8CollationIterator() {}


								void

								FCDUTF8CollationIterator::resetToOffset(int32_t newOffset) {

								    reset();

								    start = pos = newOffset;

								    state = CHECK_FWD;

								}


								int32_t

								FCDUTF8CollationIterator::getOffset() const {

								    if(state != IN_NORMALIZED) {

								        return pos;

								    } else if(pos == 0) {

								        return start;

								    } else {

								        return limit;

								    }

								}


								uint32_t

								FCDUTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {

								    for(;;) {

								        if(state == CHECK_FWD) {

								            // Combination of UTF8CollationIterator::handleNextCE32() with FCD check fastpath.

								            if(pos == length) {

								                c = U_SENTINEL;

								                return Collation::FALLBACK_CE32;

								            }

								            c = u8[pos++];

								            if(c < 0xc0) {

								                // ASCII 00..7F; trail bytes 80..BF map to error values.

								                return trie->data32[c];

								            }

								            uint8_t t1, t2;

								            if(c < 0xe0 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) {

								                // U+0080..U+07FF; 00..7F map to error values.

								                uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1];

								                c = ((c & 0x1f) << 6) | t1;

								                ++pos;

								                if(CollationFCD::hasTccc(c) && pos != length && nextHasLccc()) {

								                    pos -= 2;

								                } else {

								                    return ce32;

								                }

								            } else if(c <= 0xef &&

								                      ((pos + 1) < length || length < 0) &&

								                      (t1 = (u8[pos] - 0x80)) <= 0x3f && (c != 0xe0 || t1 >= 0x20) &&

								                      (t2 = (u8[pos + 1] - 0x80)) <= 0x3f

								            ) {

								                // U+0800..U+FFFF; caller maps surrogates to error values.

								                c = (UChar)((c << 12) | (t1 << 6) | t2);

								                pos += 2;

								                if(CollationFCD::hasTccc(c) &&

								                        (CollationFCD::maybeTibetanCompositeVowel(c) ||

								                            (pos != length && nextHasLccc()))) {

								                    pos -= 3;

								                } else {

								                    break;  // return CE32(BMP)

								                }

								            } else {

								                // Function call for supplementary code points and error cases.

								                // Illegal byte sequences yield U+FFFD.

								                c = utf8_nextCharSafeBody(u8, &pos, length, c, -3);

								                if(c == 0xfffd) {

								                    return Collation::FFFD_CE32;

								                } else {

								                    U_ASSERT(c > 0xffff);

								                    if(CollationFCD::hasTccc(U16_LEAD(c)) && pos != length && nextHasLccc()) {

								                        pos -= 4;

								                    } else {

								                        return data->getCE32FromSupplementary(c);

								                    }

								                }

								            }

								            if(!nextSegment(errorCode)) {

								                c = U_SENTINEL;

								                return Collation::FALLBACK_CE32;

								            }

								            continue;

								        } else if(state == IN_FCD_SEGMENT && pos != limit) {

								            return UTF8CollationIterator::handleNextCE32(c, errorCode);

								        } else if(state == IN_NORMALIZED && pos != normalized.length()) {

								            c = normalized[pos++];

								            break;

								        } else {

								            switchToForward();

								        }

								    }

								    return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);

								}


								UBool

								FCDUTF8CollationIterator::nextHasLccc() const {

								    U_ASSERT(state == CHECK_FWD && pos != length);

								    // The lowest code point with ccc!=0 is U+0300 which is CC 80 in UTF-8.

								    // CJK U+4000..U+DFFF except U+Axxx are also FCD-inert. (Lead bytes E4..ED except EA.)

								    UChar32 c = u8[pos];

								    if(c < 0xcc || (0xe4 <= c && c <= 0xed && c != 0xea)) { return FALSE; }

								    int32_t i = pos;

								    U8_NEXT_OR_FFFD(u8, i, length, c);

								    if(c > 0xffff) { c = U16_LEAD(c); }

								    return CollationFCD::hasLccc(c);

								}


								UBool

								FCDUTF8CollationIterator::previousHasTccc() const {

								    U_ASSERT(state == CHECK_BWD && pos != 0);

								    UChar32 c = u8[pos - 1];

								    if(c < 0x80) { return FALSE; }

								    int32_t i = pos;

								    U8_PREV_OR_FFFD(u8, 0, i, c);

								    if(c > 0xffff) { c = U16_LEAD(c); }

								    return CollationFCD::hasTccc(c);

								}


								UChar

								FCDUTF8CollationIterator::handleGetTrailSurrogate() {

								    if(state != IN_NORMALIZED) { return 0; }

								    U_ASSERT(pos < normalized.length());

								    UChar trail;

								    if(U16_IS_TRAIL(trail = normalized[pos])) { ++pos; }

								    return trail;

								}


								UBool

								FCDUTF8CollationIterator::foundNULTerminator() {

								    if(state == CHECK_FWD && length < 0) {

								        length = --pos;

								        return TRUE;

								    } else {

								        return FALSE;

								    }

								}


								UChar32

								FCDUTF8CollationIterator::nextCodePoint(UErrorCode &errorCode) {

								    UChar32 c;

								    for(;;) {

								        if(state == CHECK_FWD) {

								            if(pos == length || ((c = u8[pos]) == 0 && length < 0)) {

								                return U_SENTINEL;

								            }

								            if(c < 0x80) {

								                ++pos;

								                return c;

								            }

								            U8_NEXT_OR_FFFD(u8, pos, length, c);

								            if(CollationFCD::hasTccc(c <= 0xffff ? c : U16_LEAD(c)) &&

								                    (CollationFCD::maybeTibetanCompositeVowel(c) ||

								                        (pos != length && nextHasLccc()))) {

								                // c is not FCD-inert, therefore it is not U+FFFD and it has a valid byte sequence

								                // and we can use U8_LENGTH() rather than a previous-position variable.

								                pos -= U8_LENGTH(c);

								                if(!nextSegment(errorCode)) {

								                    return U_SENTINEL;

								                }

								                continue;

								            }

								            return c;

								        } else if(state == IN_FCD_SEGMENT && pos != limit) {

								            U8_NEXT_OR_FFFD(u8, pos, length, c);

								            return c;

								        } else if(state == IN_NORMALIZED && pos != normalized.length()) {

								            c = normalized.char32At(pos);

								            pos += U16_LENGTH(c);

								            return c;

								        } else {

								            switchToForward();

								        }

								    }

								}


								UChar32

								FCDUTF8CollationIterator::previousCodePoint(UErrorCode &errorCode) {

								    UChar32 c;

								    for(;;) {

								        if(state == CHECK_BWD) {

								            if(pos == 0) {

								                return U_SENTINEL;

								            }

								            if((c = u8[pos - 1]) < 0x80) {

								                --pos;

								                return c;

								            }

								            U8_PREV_OR_FFFD(u8, 0, pos, c);

								            if(CollationFCD::hasLccc(c <= 0xffff ? c : U16_LEAD(c)) &&

								                    (CollationFCD::maybeTibetanCompositeVowel(c) ||

								                        (pos != 0 && previousHasTccc()))) {

								                // c is not FCD-inert, therefore it is not U+FFFD and it has a valid byte sequence

								                // and we can use U8_LENGTH() rather than a previous-position variable.

								                pos += U8_LENGTH(c);

								                if(!previousSegment(errorCode)) {

								                    return U_SENTINEL;

								                }

								                continue;

								            }

								            return c;

								        } else if(state == IN_FCD_SEGMENT && pos != start) {

								            U8_PREV_OR_FFFD(u8, 0, pos, c);

								            return c;

								        } else if(state >= IN_NORMALIZED && pos != 0) {

								            c = normalized.char32At(pos - 1);

								            pos -= U16_LENGTH(c);

								            return c;

								        } else {

								            switchToBackward();

								        }

								    }

								}


								void

								FCDUTF8CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) {

								    // Specify the class to avoid a virtual-function indirection.

								    // In Java, we would declare this class final.

								    while(num > 0 && FCDUTF8CollationIterator::nextCodePoint(errorCode) >= 0) {

								        --num;

								    }

								}


								void

								FCDUTF8CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) {

								    // Specify the class to avoid a virtual-function indirection.

								    // In Java, we would declare this class final.

								    while(num > 0 && FCDUTF8CollationIterator::previousCodePoint(errorCode) >= 0) {

								        --num;

								    }

								}


								void

								FCDUTF8CollationIterator::switchToForward() {

								    U_ASSERT(state == CHECK_BWD ||

								             (state == IN_FCD_SEGMENT && pos == limit) ||

								             (state == IN_NORMALIZED && pos == normalized.length()));

								    if(state == CHECK_BWD) {

								        // Turn around from backward checking.

								        start = pos;

								        if(pos == limit) {

								            state = CHECK_FWD;  // Check forward.

								        } else {  // pos < limit

								            state = IN_FCD_SEGMENT;  // Stay in FCD segment.

								        }

								    } else {

								        // Reached the end of the FCD segment.

								        if(state == IN_FCD_SEGMENT) {

								            // The input text segment is FCD, extend it forward.

								        } else {

								            // The input text segment needed to be normalized.

								            // Switch to checking forward from it.

								            start = pos = limit;

								        }

								        state = CHECK_FWD;

								    }

								}


								UBool

								FCDUTF8CollationIterator::nextSegment(UErrorCode &errorCode) {

								    if(U_FAILURE(errorCode)) { return FALSE; }

								    U_ASSERT(state == CHECK_FWD && pos != length);

								    // The input text [start..pos[ passes the FCD check.

								    int32_t segmentStart = pos;

								    // Collect the characters being checked, in case they need to be normalized.

								    UnicodeString s;

								    uint8_t prevCC = 0;

								    for(;;) {

								        // Fetch the next character and its fcd16 value.

								        int32_t cpStart = pos;

								        UChar32 c;

								        U8_NEXT_OR_FFFD(u8, pos, length, c);

								        uint16_t fcd16 = nfcImpl.getFCD16(c);

								        uint8_t leadCC = (uint8_t)(fcd16 >> 8);

								        if(leadCC == 0 && cpStart != segmentStart) {

								            // FCD boundary before this character.

								            pos = cpStart;

								            break;

								        }

								        s.append(c);

								        if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {

								            // Fails FCD check. Find the next FCD boundary and normalize.

								            while(pos != length) {

								                cpStart = pos;

								                U8_NEXT_OR_FFFD(u8, pos, length, c);

								                if(nfcImpl.getFCD16(c) <= 0xff) {

								                    pos = cpStart;

								                    break;

								                }

								                s.append(c);

								            }

								            if(!normalize(s, errorCode)) { return FALSE; }

								            start = segmentStart;

								            limit = pos;

								            state = IN_NORMALIZED;

								            pos = 0;

								            return TRUE;

								        }

								        prevCC = (uint8_t)fcd16;

								        if(pos == length || prevCC == 0) {

								            // FCD boundary after the last character.

								            break;

								        }

								    }

								    limit = pos;

								    pos = segmentStart;

								    U_ASSERT(pos != limit);

								    state = IN_FCD_SEGMENT;

								    return TRUE;

								}


								void

								FCDUTF8CollationIterator::switchToBackward() {

								    U_ASSERT(state == CHECK_FWD ||

								             (state == IN_FCD_SEGMENT && pos == start) ||

								             (state >= IN_NORMALIZED && pos == 0));

								    if(state == CHECK_FWD) {

								        // Turn around from forward checking.

								        limit = pos;

								        if(pos == start) {

								            state = CHECK_BWD;  // Check backward.

								        } else {  // pos > start

								            state = IN_FCD_SEGMENT;  // Stay in FCD segment.

								        }

								    } else {

								        // Reached the start of the FCD segment.

								        if(state == IN_FCD_SEGMENT) {

								            // The input text segment is FCD, extend it backward.

								        } else {

								            // The input text segment needed to be normalized.

								            // Switch to checking backward from it.

								            limit = pos = start;

								        }

								        state = CHECK_BWD;

								    }

								}


								UBool

								FCDUTF8CollationIterator::previousSegment(UErrorCode &errorCode) {

								    if(U_FAILURE(errorCode)) { return FALSE; }

								    U_ASSERT(state == CHECK_BWD && pos != 0);

								    // The input text [pos..limit[ passes the FCD check.

								    int32_t segmentLimit = pos;

								    // Collect the characters being checked, in case they need to be normalized.

								    UnicodeString s;

								    uint8_t nextCC = 0;

								    for(;;) {

								        // Fetch the previous character and its fcd16 value.

								        int32_t cpLimit = pos;

								        UChar32 c;

								        U8_PREV_OR_FFFD(u8, 0, pos, c);

								        uint16_t fcd16 = nfcImpl.getFCD16(c);

								        uint8_t trailCC = (uint8_t)fcd16;

								        if(trailCC == 0 && cpLimit != segmentLimit) {

								            // FCD boundary after this character.

								            pos = cpLimit;

								            break;

								        }

								        s.append(c);

								        if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||

								                            CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {

								            // Fails FCD check. Find the previous FCD boundary and normalize.

								            while(fcd16 > 0xff && pos != 0) {

								                cpLimit = pos;

								                U8_PREV_OR_FFFD(u8, 0, pos, c);

								                fcd16 = nfcImpl.getFCD16(c);

								                if(fcd16 == 0) {

								                    pos = cpLimit;

								                    break;

								                }

								                s.append(c);

								            }

								            s.reverse();

								            if(!normalize(s, errorCode)) { return FALSE; }

								            limit = segmentLimit;

								            start = pos;

								            state = IN_NORMALIZED;

								            pos = normalized.length();

								            return TRUE;

								        }

								        nextCC = (uint8_t)(fcd16 >> 8);

								        if(pos == 0 || nextCC == 0) {

								            // FCD boundary before the following character.

								            break;

								        }

								    }

								    start = pos;

								    pos = segmentLimit;

								    U_ASSERT(pos != start);

								    state = IN_FCD_SEGMENT;

								    return TRUE;

								}


								UBool

								FCDUTF8CollationIterator::normalize(const UnicodeString &s, UErrorCode &errorCode) {

								    // NFD without argument checking.

								    U_ASSERT(U_SUCCESS(errorCode));

								    nfcImpl.decompose(s, normalized, errorCode);

								    return U_SUCCESS(errorCode);

								}


								U_NAMESPACE_END


								#endif  // !UCONFIG_NO_COLLATION