|
|
|
// Copyright (C) 2016 and later: Unicode, Inc. and others.
|
|
|
|
// License & terms of use: http://www.unicode.org/copyright.html
|
|
|
|
/*
|
|
|
|
*******************************************************************************
|
|
|
|
* Copyright (C) 2009-2014, International Business Machines Corporation and
|
|
|
|
* others. All Rights Reserved.
|
|
|
|
*******************************************************************************
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "unicode/utypes.h"
|
|
|
|
|
|
|
|
#if !UCONFIG_NO_COLLATION
|
|
|
|
|
|
|
|
#include "unicode/alphaindex.h"
|
|
|
|
#include "unicode/coll.h"
|
|
|
|
#include "unicode/localpointer.h"
|
|
|
|
#include "unicode/normalizer2.h"
|
|
|
|
#include "unicode/tblcoll.h"
|
|
|
|
#include "unicode/uchar.h"
|
|
|
|
#include "unicode/ulocdata.h"
|
|
|
|
#include "unicode/uniset.h"
|
|
|
|
#include "unicode/uobject.h"
|
|
|
|
#include "unicode/usetiter.h"
|
|
|
|
#include "unicode/utf16.h"
|
|
|
|
|
|
|
|
#include "cmemory.h"
|
|
|
|
#include "cstring.h"
|
|
|
|
#include "uassert.h"
|
|
|
|
#include "uvector.h"
|
|
|
|
#include "uvectr64.h"
|
|
|
|
|
|
|
|
//#include <string>
|
|
|
|
//#include <iostream>
|
|
|
|
|
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Prefix string for Chinese index buckets.
|
|
|
|
* See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes
|
|
|
|
*/
|
|
|
|
const UChar BASE[1] = { 0xFDD0 };
|
|
|
|
const int32_t BASE_LENGTH = 1;
|
|
|
|
|
|
|
|
UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer,
|
|
|
|
const UnicodeString &one, const UnicodeString &other);
|
|
|
|
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
static int32_t U_CALLCONV
|
|
|
|
collatorComparator(const void *context, const void *left, const void *right);
|
|
|
|
|
|
|
|
static int32_t U_CALLCONV
|
|
|
|
recordCompareFn(const void *context, const void *left, const void *right);
|
|
|
|
|
|
|
|
// UVector<Record *> support function, delete a Record.
|
|
|
|
static void U_CALLCONV
|
|
|
|
alphaIndex_deleteRecord(void *obj) {
|
|
|
|
delete static_cast<AlphabeticIndex::Record *>(obj);
|
|
|
|
}
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
UnicodeString *ownedString(const UnicodeString &s, LocalPointer<UnicodeString> &owned,
|
|
|
|
UErrorCode &errorCode) {
|
|
|
|
if (U_FAILURE(errorCode)) { return NULL; }
|
|
|
|
if (owned.isValid()) {
|
|
|
|
return owned.orphan();
|
|
|
|
}
|
|
|
|
UnicodeString *p = new UnicodeString(s);
|
|
|
|
if (p == NULL) {
|
|
|
|
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
|
|
|
}
|
|
|
|
return p;
|
|
|
|
}
|
|
|
|
|
|
|
|
inline UnicodeString *getString(const UVector &list, int32_t i) {
|
|
|
|
return static_cast<UnicodeString *>(list[i]);
|
|
|
|
}
|
|
|
|
|
|
|
|
inline AlphabeticIndex::Bucket *getBucket(const UVector &list, int32_t i) {
|
|
|
|
return static_cast<AlphabeticIndex::Bucket *>(list[i]);
|
|
|
|
}
|
|
|
|
|
|
|
|
inline AlphabeticIndex::Record *getRecord(const UVector &list, int32_t i) {
|
|
|
|
return static_cast<AlphabeticIndex::Record *>(list[i]);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Like Java Collections.binarySearch(List, String, Comparator).
|
|
|
|
*
|
|
|
|
* @return the index>=0 where the item was found,
|
|
|
|
* or the index<0 for inserting the string at ~index in sorted order
|
|
|
|
*/
|
|
|
|
int32_t binarySearch(const UVector &list, const UnicodeString &s, const Collator &coll) {
|
|
|
|
if (list.size() == 0) { return ~0; }
|
|
|
|
int32_t start = 0;
|
|
|
|
int32_t limit = list.size();
|
|
|
|
for (;;) {
|
|
|
|
int32_t i = (start + limit) / 2;
|
|
|
|
const UnicodeString *si = static_cast<UnicodeString *>(list.elementAt(i));
|
|
|
|
UErrorCode errorCode = U_ZERO_ERROR;
|
|
|
|
UCollationResult cmp = coll.compare(s, *si, errorCode);
|
|
|
|
if (cmp == UCOL_EQUAL) {
|
|
|
|
return i;
|
|
|
|
} else if (cmp < 0) {
|
|
|
|
if (i == start) {
|
|
|
|
return ~start; // insert s before *si
|
|
|
|
}
|
|
|
|
limit = i;
|
|
|
|
} else {
|
|
|
|
if (i == start) {
|
|
|
|
return ~(start + 1); // insert s after *si
|
|
|
|
}
|
|
|
|
start = i;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
// The BucketList is not in the anonymous namespace because only Clang
|
|
|
|
// seems to support its use in other classes from there.
|
|
|
|
// However, we also don't need U_I18N_API because it is not used from outside the i18n library.
|
|
|
|
class BucketList : public UObject {
|
|
|
|
public:
|
|
|
|
BucketList(UVector *bucketList, UVector *publicBucketList)
|
|
|
|
: bucketList_(bucketList), immutableVisibleList_(publicBucketList) {
|
|
|
|
int32_t displayIndex = 0;
|
|
|
|
for (int32_t i = 0; i < publicBucketList->size(); ++i) {
|
|
|
|
getBucket(*publicBucketList, i)->displayIndex_ = displayIndex++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// The virtual destructor must not be inline.
|
|
|
|
// See ticket #8454 for details.
|
|
|
|
virtual ~BucketList();
|
|
|
|
|
|
|
|
int32_t getBucketCount() const {
|
|
|
|
return immutableVisibleList_->size();
|
|
|
|
}
|
|
|
|
|
|
|
|
int32_t getBucketIndex(const UnicodeString &name, const Collator &collatorPrimaryOnly,
|
|
|
|
UErrorCode &errorCode) {
|
|
|
|
// binary search
|
|
|
|
int32_t start = 0;
|
|
|
|
int32_t limit = bucketList_->size();
|
|
|
|
while ((start + 1) < limit) {
|
|
|
|
int32_t i = (start + limit) / 2;
|
|
|
|
const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, i);
|
|
|
|
UCollationResult nameVsBucket =
|
|
|
|
collatorPrimaryOnly.compare(name, bucket->lowerBoundary_, errorCode);
|
|
|
|
if (nameVsBucket < 0) {
|
|
|
|
limit = i;
|
|
|
|
} else {
|
|
|
|
start = i;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, start);
|
|
|
|
if (bucket->displayBucket_ != NULL) {
|
|
|
|
bucket = bucket->displayBucket_;
|
|
|
|
}
|
|
|
|
return bucket->displayIndex_;
|
|
|
|
}
|
|
|
|
|
|
|
|
/** All of the buckets, visible and invisible. */
|
|
|
|
UVector *bucketList_;
|
|
|
|
/** Just the visible buckets. */
|
|
|
|
UVector *immutableVisibleList_;
|
|
|
|
};
|
|
|
|
|
|
|
|
BucketList::~BucketList() {
|
|
|
|
delete bucketList_;
|
|
|
|
if (immutableVisibleList_ != bucketList_) {
|
|
|
|
delete immutableVisibleList_;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
AlphabeticIndex::ImmutableIndex::~ImmutableIndex() {
|
|
|
|
delete buckets_;
|
|
|
|
delete collatorPrimaryOnly_;
|
|
|
|
}
|
|
|
|
|
|
|
|
int32_t
|
|
|
|
AlphabeticIndex::ImmutableIndex::getBucketCount() const {
|
|
|
|
return buckets_->getBucketCount();
|
|
|
|
}
|
|
|
|
|
|
|
|
int32_t
|
|
|
|
AlphabeticIndex::ImmutableIndex::getBucketIndex(
|
|
|
|
const UnicodeString &name, UErrorCode &errorCode) const {
|
|
|
|
return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, errorCode);
|
|
|
|
}
|
|
|
|
|
|
|
|
const AlphabeticIndex::Bucket *
|
|
|
|
AlphabeticIndex::ImmutableIndex::getBucket(int32_t index) const {
|
|
|
|
if (0 <= index && index < buckets_->getBucketCount()) {
|
|
|
|
return icu::getBucket(*buckets_->immutableVisibleList_, index);
|
|
|
|
} else {
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status)
|
|
|
|
: inputList_(NULL),
|
|
|
|
labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL),
|
|
|
|
maxLabelCount_(99),
|
|
|
|
initialLabels_(NULL), firstCharsInScripts_(NULL),
|
|
|
|
collator_(NULL), collatorPrimaryOnly_(NULL),
|
|
|
|
buckets_(NULL) {
|
|
|
|
init(&locale, status);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
AlphabeticIndex::AlphabeticIndex(RuleBasedCollator *collator, UErrorCode &status)
|
|
|
|
: inputList_(NULL),
|
|
|
|
labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL),
|
|
|
|
maxLabelCount_(99),
|
|
|
|
initialLabels_(NULL), firstCharsInScripts_(NULL),
|
|
|
|
collator_(collator), collatorPrimaryOnly_(NULL),
|
|
|
|
buckets_(NULL) {
|
|
|
|
init(NULL, status);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
AlphabeticIndex::~AlphabeticIndex() {
|
|
|
|
delete collator_;
|
|
|
|
delete collatorPrimaryOnly_;
|
|
|
|
delete firstCharsInScripts_;
|
|
|
|
delete buckets_;
|
|
|
|
delete inputList_;
|
|
|
|
delete initialLabels_;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) {
|
|
|
|
if (U_FAILURE(status)) {
|
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
initialLabels_->addAll(additions);
|
|
|
|
clearBuckets();
|
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) {
|
|
|
|
addIndexExemplars(locale, status);
|
|
|
|
clearBuckets();
|
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
AlphabeticIndex::ImmutableIndex *AlphabeticIndex::buildImmutableIndex(UErrorCode &errorCode) {
|
|
|
|
if (U_FAILURE(errorCode)) { return NULL; }
|
|
|
|
// In C++, the ImmutableIndex must own its copy of the BucketList,
|
|
|
|
// even if it contains no records, for proper memory management.
|
|
|
|
// We could clone the buckets_ if they are not NULL,
|
|
|
|
// but that would be worth it only if this method is called multiple times,
|
|
|
|
// or called after using the old-style bucket iterator API.
|
|
|
|
LocalPointer<BucketList> immutableBucketList(createBucketList(errorCode));
|
|
|
|
LocalPointer<RuleBasedCollator> coll(
|
|
|
|
static_cast<RuleBasedCollator *>(collatorPrimaryOnly_->clone()));
|
|
|
|
if (immutableBucketList.isNull() || coll.isNull()) {
|
|
|
|
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
ImmutableIndex *immIndex = new ImmutableIndex(immutableBucketList.getAlias(), coll.getAlias());
|
|
|
|
if (immIndex == NULL) {
|
|
|
|
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
// The ImmutableIndex adopted its parameter objects.
|
|
|
|
immutableBucketList.orphan();
|
|
|
|
coll.orphan();
|
|
|
|
return immIndex;
|
|
|
|
}
|
|
|
|
|
|
|
|
int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) {
|
|
|
|
initBuckets(status);
|
|
|
|
if (U_FAILURE(status)) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
return buckets_->getBucketCount();
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) {
|
|
|
|
if (U_FAILURE(status) || inputList_ == NULL) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
return inputList_->size();
|
|
|
|
}
|
|
|
|
|
|
|
|
void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode) const {
|
|
|
|
const Normalizer2 *nfkdNormalizer = Normalizer2::getNFKDInstance(errorCode);
|
|
|
|
if (U_FAILURE(errorCode)) { return; }
|
|
|
|
|
|
|
|
const UnicodeString &firstScriptBoundary = *getString(*firstCharsInScripts_, 0);
|
|
|
|
const UnicodeString &overflowBoundary =
|
|
|
|
*getString(*firstCharsInScripts_, firstCharsInScripts_->size() - 1);
|
|
|
|
|
|
|
|
// We make a sorted array of elements.
|
|
|
|
// Some of the input may be redundant.
|
|
|
|
// That is, we might have c, ch, d, where "ch" sorts just like "c", "h".
|
|
|
|
// We filter out those cases.
|
|
|
|
UnicodeSetIterator iter(*initialLabels_);
|
|
|
|
while (iter.next()) {
|
|
|
|
const UnicodeString *item = &iter.getString();
|
|
|
|
LocalPointer<UnicodeString> ownedItem;
|
|
|
|
UBool checkDistinct;
|
|
|
|
int32_t itemLength = item->length();
|
|
|
|
if (!item->hasMoreChar32Than(0, itemLength, 1)) {
|
|
|
|
checkDistinct = FALSE;
|
|
|
|
} else if(item->charAt(itemLength - 1) == 0x2a && // '*'
|
|
|
|
item->charAt(itemLength - 2) != 0x2a) {
|
|
|
|
// Use a label if it is marked with one trailing star,
|
|
|
|
// even if the label string sorts the same when all contractions are suppressed.
|
|
|
|
ownedItem.adoptInstead(new UnicodeString(*item, 0, itemLength - 1));
|
|
|
|
item = ownedItem.getAlias();
|
|
|
|
if (item == NULL) {
|
|
|
|
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
checkDistinct = FALSE;
|
|
|
|
} else {
|
|
|
|
checkDistinct = TRUE;
|
|
|
|
}
|
|
|
|
if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) {
|
|
|
|
// Ignore a primary-ignorable or non-alphabetic index character.
|
|
|
|
} else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= 0) {
|
|
|
|
// Ignore an index character that will land in the overflow bucket.
|
|
|
|
} else if (checkDistinct &&
|
|
|
|
collatorPrimaryOnly_->compare(*item, separated(*item), errorCode) == 0) {
|
|
|
|
// Ignore a multi-code point index character that does not sort distinctly
|
|
|
|
// from the sequence of its separate characters.
|
|
|
|
} else {
|
|
|
|
int32_t insertionPoint = binarySearch(indexCharacters, *item, *collatorPrimaryOnly_);
|
|
|
|
if (insertionPoint < 0) {
|
|
|
|
indexCharacters.insertElementAt(
|
|
|
|
ownedString(*item, ownedItem, errorCode), ~insertionPoint, errorCode);
|
|
|
|
} else {
|
|
|
|
const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint);
|
|
|
|
if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlreadyIn)) {
|
|
|
|
indexCharacters.setElementAt(
|
|
|
|
ownedString(*item, ownedItem, errorCode), insertionPoint);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (U_FAILURE(errorCode)) { return; }
|
|
|
|
|
|
|
|
// if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element
|
|
|
|
|
|
|
|
int32_t size = indexCharacters.size() - 1;
|
|
|
|
if (size > maxLabelCount_) {
|
|
|
|
int32_t count = 0;
|
|
|
|
int32_t old = -1;
|
|
|
|
for (int32_t i = 0; i < indexCharacters.size();) {
|
|
|
|
++count;
|
|
|
|
int32_t bump = count * maxLabelCount_ / size;
|
|
|
|
if (bump == old) {
|
|
|
|
indexCharacters.removeElementAt(i);
|
|
|
|
} else {
|
|
|
|
old = bump;
|
|
|
|
++i;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
const UnicodeString &fixLabel(const UnicodeString ¤t, UnicodeString &temp) {
|
|
|
|
if (!current.startsWith(BASE, BASE_LENGTH)) {
|
|
|
|
return current;
|
|
|
|
}
|
|
|
|
UChar rest = current.charAt(BASE_LENGTH);
|
|
|
|
if (0x2800 < rest && rest <= 0x28FF) { // stroke count
|
|
|
|
int32_t count = rest-0x2800;
|
|
|
|
temp.setTo((UChar)(0x30 + count % 10));
|
|
|
|
if (count >= 10) {
|
|
|
|
count /= 10;
|
|
|
|
temp.insert(0, (UChar)(0x30 + count % 10));
|
|
|
|
if (count >= 10) {
|
|
|
|
count /= 10;
|
|
|
|
temp.insert(0, (UChar)(0x30 + count));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return temp.append((UChar)0x5283);
|
|
|
|
}
|
|
|
|
return temp.setTo(current, BASE_LENGTH);
|
|
|
|
}
|
|
|
|
|
|
|
|
UBool hasMultiplePrimaryWeights(
|
|
|
|
const RuleBasedCollator &coll, uint32_t variableTop,
|
|
|
|
const UnicodeString &s, UVector64 &ces, UErrorCode &errorCode) {
|
|
|
|
ces.removeAllElements();
|
|
|
|
coll.internalGetCEs(s, ces, errorCode);
|
|
|
|
if (U_FAILURE(errorCode)) { return FALSE; }
|
|
|
|
UBool seenPrimary = FALSE;
|
|
|
|
for (int32_t i = 0; i < ces.size(); ++i) {
|
|
|
|
int64_t ce = ces.elementAti(i);
|
|
|
|
uint32_t p = (uint32_t)(ce >> 32);
|
|
|
|
if (p > variableTop) {
|
|
|
|
// not primary ignorable
|
|
|
|
if (seenPrimary) {
|
|
|
|
return TRUE;
|
|
|
|
}
|
|
|
|
seenPrimary = TRUE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const {
|
|
|
|
// Initialize indexCharacters.
|
|
|
|
UVector indexCharacters(errorCode);
|
|
|
|
indexCharacters.setDeleter(uprv_deleteUObject);
|
|
|
|
initLabels(indexCharacters, errorCode);
|
|
|
|
if (U_FAILURE(errorCode)) { return NULL; }
|
|
|
|
|
|
|
|
// Variables for hasMultiplePrimaryWeights().
|
|
|
|
UVector64 ces(errorCode);
|
|
|
|
uint32_t variableTop;
|
|
|
|
if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) == UCOL_SHIFTED) {
|
|
|
|
variableTop = collatorPrimaryOnly_->getVariableTop(errorCode);
|
|
|
|
} else {
|
|
|
|
variableTop = 0;
|
|
|
|
}
|
|
|
|
UBool hasInvisibleBuckets = FALSE;
|
|
|
|
|
|
|
|
// Helper arrays for Chinese Pinyin collation.
|
|
|
|
Bucket *asciiBuckets[26] = {
|
|
|
|
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
|
|
|
|
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
|
|
|
|
};
|
|
|
|
Bucket *pinyinBuckets[26] = {
|
|
|
|
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
|
|
|
|
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
|
|
|
|
};
|
|
|
|
UBool hasPinyin = FALSE;
|
|
|
|
|
|
|
|
LocalPointer<UVector> bucketList(new UVector(errorCode), errorCode);
|
|
|
|
if (U_FAILURE(errorCode)) {
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
bucketList->setDeleter(uprv_deleteUObject);
|
|
|
|
|
|
|
|
// underflow bucket
|
|
|
|
Bucket *bucket = new Bucket(getUnderflowLabel(), emptyString_, U_ALPHAINDEX_UNDERFLOW);
|
|
|
|
if (bucket == NULL) {
|
|
|
|
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
bucketList->addElement(bucket, errorCode);
|
|
|
|
if (U_FAILURE(errorCode)) { return NULL; }
|
|
|
|
|
|
|
|
UnicodeString temp;
|
|
|
|
|
|
|
|
// fix up the list, adding underflow, additions, overflow
|
|
|
|
// Insert inflow labels as needed.
|
|
|
|
int32_t scriptIndex = -1;
|
|
|
|
const UnicodeString *scriptUpperBoundary = &emptyString_;
|
|
|
|
for (int32_t i = 0; i < indexCharacters.size(); ++i) {
|
|
|
|
UnicodeString ¤t = *getString(indexCharacters, i);
|
|
|
|
if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) >= 0) {
|
|
|
|
// We crossed the script boundary into a new script.
|
|
|
|
const UnicodeString &inflowBoundary = *scriptUpperBoundary;
|
|
|
|
UBool skippedScript = FALSE;
|
|
|
|
for (;;) {
|
|
|
|
scriptUpperBoundary = getString(*firstCharsInScripts_, ++scriptIndex);
|
|
|
|
if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) < 0) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
skippedScript = TRUE;
|
|
|
|
}
|
|
|
|
if (skippedScript && bucketList->size() > 1) {
|
|
|
|
// We are skipping one or more scripts,
|
|
|
|
// and we are not just getting out of the underflow label.
|
|
|
|
bucket = new Bucket(getInflowLabel(), inflowBoundary, U_ALPHAINDEX_INFLOW);
|
|
|
|
if (bucket == NULL) {
|
|
|
|
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
bucketList->addElement(bucket, errorCode);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// Add a bucket with the current label.
|
|
|
|
bucket = new Bucket(fixLabel(current, temp), current, U_ALPHAINDEX_NORMAL);
|
|
|
|
if (bucket == NULL) {
|
|
|
|
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
bucketList->addElement(bucket, errorCode);
|
|
|
|
// Remember ASCII and Pinyin buckets for Pinyin redirects.
|
|
|
|
UChar c;
|
|
|
|
if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5A) { // A-Z
|
|
|
|
asciiBuckets[c - 0x41] = bucket;
|
|
|
|
} else if (current.length() == BASE_LENGTH + 1 && current.startsWith(BASE, BASE_LENGTH) &&
|
|
|
|
0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) {
|
|
|
|
pinyinBuckets[c - 0x41] = bucket;
|
|
|
|
hasPinyin = TRUE;
|
|
|
|
}
|
|
|
|
// Check for multiple primary weights.
|
|
|
|
if (!current.startsWith(BASE, BASE_LENGTH) &&
|
|
|
|
hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop, current,
|
|
|
|
ces, errorCode) &&
|
|
|
|
current.charAt(current.length() - 1) != 0xFFFF /* !current.endsWith("\uffff") */) {
|
|
|
|
// "AE-ligature" or "Sch" etc.
|
|
|
|
for (int32_t i = bucketList->size() - 2;; --i) {
|
|
|
|
Bucket *singleBucket = getBucket(*bucketList, i);
|
|
|
|
if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
|
|
|
|
// There is no single-character bucket since the last
|
|
|
|
// underflow or inflow label.
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (singleBucket->displayBucket_ == NULL &&
|
|
|
|
!hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop,
|
|
|
|
singleBucket->lowerBoundary_,
|
|
|
|
ces, errorCode)) {
|
|
|
|
// Add an invisible bucket that redirects strings greater than the expansion
|
|
|
|
// to the previous single-character bucket.
|
|
|
|
// For example, after ... Q R S Sch we add Sch\uFFFF->S
|
|
|
|
// and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S.
|
|
|
|
bucket = new Bucket(emptyString_,
|
|
|
|
UnicodeString(current).append((UChar)0xFFFF),
|
|
|
|
U_ALPHAINDEX_NORMAL);
|
|
|
|
if (bucket == NULL) {
|
|
|
|
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
bucket->displayBucket_ = singleBucket;
|
|
|
|
bucketList->addElement(bucket, errorCode);
|
|
|
|
hasInvisibleBuckets = TRUE;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (U_FAILURE(errorCode)) { return NULL; }
|
|
|
|
if (bucketList->size() == 1) {
|
|
|
|
// No real labels, show only the underflow label.
|
|
|
|
BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias());
|
|
|
|
if (bl == NULL) {
|
|
|
|
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
bucketList.orphan();
|
|
|
|
return bl;
|
|
|
|
}
|
|
|
|
// overflow bucket
|
|
|
|
bucket = new Bucket(getOverflowLabel(), *scriptUpperBoundary, U_ALPHAINDEX_OVERFLOW);
|
|
|
|
if (bucket == NULL) {
|
|
|
|
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
bucketList->addElement(bucket, errorCode); // final
|
|
|
|
|
|
|
|
if (hasPinyin) {
|
|
|
|
// Redirect Pinyin buckets.
|
|
|
|
Bucket *asciiBucket = NULL;
|
|
|
|
for (int32_t i = 0; i < 26; ++i) {
|
|
|
|
if (asciiBuckets[i] != NULL) {
|
|
|
|
asciiBucket = asciiBuckets[i];
|
|
|
|
}
|
|
|
|
if (pinyinBuckets[i] != NULL && asciiBucket != NULL) {
|
|
|
|
pinyinBuckets[i]->displayBucket_ = asciiBucket;
|
|
|
|
hasInvisibleBuckets = TRUE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (U_FAILURE(errorCode)) { return NULL; }
|
|
|
|
if (!hasInvisibleBuckets) {
|
|
|
|
BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias());
|
|
|
|
if (bl == NULL) {
|
|
|
|
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
bucketList.orphan();
|
|
|
|
return bl;
|
|
|
|
}
|
|
|
|
// Merge inflow buckets that are visually adjacent.
|
|
|
|
// Iterate backwards: Merge inflow into overflow rather than the other way around.
|
|
|
|
int32_t i = bucketList->size() - 1;
|
|
|
|
Bucket *nextBucket = getBucket(*bucketList, i);
|
|
|
|
while (--i > 0) {
|
|
|
|
bucket = getBucket(*bucketList, i);
|
|
|
|
if (bucket->displayBucket_ != NULL) {
|
|
|
|
continue; // skip invisible buckets
|
|
|
|
}
|
|
|
|
if (bucket->labelType_ == U_ALPHAINDEX_INFLOW) {
|
|
|
|
if (nextBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
|
|
|
|
bucket->displayBucket_ = nextBucket;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
nextBucket = bucket;
|
|
|
|
}
|
|
|
|
|
|
|
|
LocalPointer<UVector> publicBucketList(new UVector(errorCode), errorCode);
|
|
|
|
if (U_FAILURE(errorCode)) {
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
// Do not call publicBucketList->setDeleter():
|
|
|
|
// This vector shares its objects with the bucketList.
|
|
|
|
for (int32_t i = 0; i < bucketList->size(); ++i) {
|
|
|
|
bucket = getBucket(*bucketList, i);
|
|
|
|
if (bucket->displayBucket_ == NULL) {
|
|
|
|
publicBucketList->addElement(bucket, errorCode);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (U_FAILURE(errorCode)) { return NULL; }
|
|
|
|
BucketList *bl = new BucketList(bucketList.getAlias(), publicBucketList.getAlias());
|
|
|
|
if (bl == NULL) {
|
|
|
|
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
bucketList.orphan();
|
|
|
|
publicBucketList.orphan();
|
|
|
|
return bl;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Creates an index, and buckets and sorts the list of records into the index.
|
|
|
|
*/
|
|
|
|
void AlphabeticIndex::initBuckets(UErrorCode &errorCode) {
|
|
|
|
if (U_FAILURE(errorCode) || buckets_ != NULL) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
buckets_ = createBucketList(errorCode);
|
|
|
|
if (U_FAILURE(errorCode) || inputList_ == NULL || inputList_->isEmpty()) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Sort the records by name.
|
|
|
|
// Stable sort preserves input order of collation duplicates.
|
|
|
|
inputList_->sortWithUComparator(recordCompareFn, collator_, errorCode);
|
|
|
|
|
|
|
|
// Now, we traverse all of the input, which is now sorted.
|
|
|
|
// If the item doesn't go in the current bucket, we find the next bucket that contains it.
|
|
|
|
// This makes the process order n*log(n), since we just sort the list and then do a linear process.
|
|
|
|
// However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so
|
|
|
|
// we need to improve it for that case.
|
|
|
|
|
|
|
|
Bucket *currentBucket = getBucket(*buckets_->bucketList_, 0);
|
|
|
|
int32_t bucketIndex = 1;
|
|
|
|
Bucket *nextBucket;
|
|
|
|
const UnicodeString *upperBoundary;
|
|
|
|
if (bucketIndex < buckets_->bucketList_->size()) {
|
|
|
|
nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
|
|
|
|
upperBoundary = &nextBucket->lowerBoundary_;
|
|
|
|
} else {
|
|
|
|
nextBucket = NULL;
|
|
|
|
upperBoundary = NULL;
|
|
|
|
}
|
|
|
|
for (int32_t i = 0; i < inputList_->size(); ++i) {
|
|
|
|
Record *r = getRecord(*inputList_, i);
|
|
|
|
// if the current bucket isn't the right one, find the one that is
|
|
|
|
// We have a special flag for the last bucket so that we don't look any further
|
|
|
|
while (upperBoundary != NULL &&
|
|
|
|
collatorPrimaryOnly_->compare(r->name_, *upperBoundary, errorCode) >= 0) {
|
|
|
|
currentBucket = nextBucket;
|
|
|
|
// now reset the boundary that we compare against
|
|
|
|
if (bucketIndex < buckets_->bucketList_->size()) {
|
|
|
|
nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
|
|
|
|
upperBoundary = &nextBucket->lowerBoundary_;
|
|
|
|
} else {
|
|
|
|
upperBoundary = NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// now put the record into the bucket.
|
|
|
|
Bucket *bucket = currentBucket;
|
|
|
|
if (bucket->displayBucket_ != NULL) {
|
|
|
|
bucket = bucket->displayBucket_;
|
|
|
|
}
|
|
|
|
if (bucket->records_ == NULL) {
|
|
|
|
bucket->records_ = new UVector(errorCode);
|
|
|
|
if (bucket->records_ == NULL) {
|
|
|
|
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
bucket->records_->addElement(r, errorCode);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void AlphabeticIndex::clearBuckets() {
|
|
|
|
if (buckets_ != NULL) {
|
|
|
|
delete buckets_;
|
|
|
|
buckets_ = NULL;
|
|
|
|
internalResetBucketIterator();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void AlphabeticIndex::internalResetBucketIterator() {
|
|
|
|
labelsIterIndex_ = -1;
|
|
|
|
currentBucket_ = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) {
|
|
|
|
LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status));
|
|
|
|
if (U_FAILURE(status)) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
UnicodeSet exemplars;
|
|
|
|
ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status);
|
|
|
|
if (U_SUCCESS(status)) {
|
|
|
|
initialLabels_->addAll(exemplars);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
status = U_ZERO_ERROR; // Clear out U_MISSING_RESOURCE_ERROR
|
|
|
|
|
|
|
|
// The locale data did not include explicit Index characters.
|
|
|
|
// Synthesize a set of them from the locale's standard exemplar characters.
|
|
|
|
ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status);
|
|
|
|
if (U_FAILURE(status)) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// question: should we add auxiliary exemplars?
|
|
|
|
if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.size() == 0) {
|
|
|
|
exemplars.add(0x61, 0x7A);
|
|
|
|
}
|
|
|
|
if (exemplars.containsSome(0xAC00, 0xD7A3)) { // Hangul syllables
|
|
|
|
// cut down to small list
|
|
|
|
exemplars.remove(0xAC00, 0xD7A3).
|
|
|
|
add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).
|
|
|
|
add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544).
|
|
|
|
add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).
|
|
|
|
add(0xD30C).add(0xD558);
|
|
|
|
}
|
|
|
|
if (exemplars.containsSome(0x1200, 0x137F)) { // Ethiopic block
|
|
|
|
// cut down to small list
|
|
|
|
// make use of the fact that Ethiopic is allocated in 8's, where
|
|
|
|
// the base is 0 mod 8.
|
|
|
|
UnicodeSet ethiopic(
|
|
|
|
UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"), status);
|
|
|
|
UnicodeSetIterator it(ethiopic);
|
|
|
|
while (it.next() && !it.isString()) {
|
|
|
|
if ((it.getCodepoint() & 0x7) != 0) {
|
|
|
|
exemplars.remove(it.getCodepoint());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Upper-case any that aren't already so.
|
|
|
|
// (We only do this for synthesized index characters.)
|
|
|
|
UnicodeSetIterator it(exemplars);
|
|
|
|
UnicodeString upperC;
|
|
|
|
while (it.next()) {
|
|
|
|
const UnicodeString &exemplarC = it.getString();
|
|
|
|
upperC = exemplarC;
|
|
|
|
upperC.toUpper(locale);
|
|
|
|
initialLabels_->add(upperC);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) {
|
|
|
|
UnicodeSet contractions;
|
|
|
|
collatorPrimaryOnly_->internalAddContractions(BASE[0], contractions, errorCode);
|
|
|
|
if (U_FAILURE(errorCode) || contractions.isEmpty()) { return FALSE; }
|
|
|
|
initialLabels_->addAll(contractions);
|
|
|
|
UnicodeSetIterator iter(contractions);
|
|
|
|
while (iter.next()) {
|
|
|
|
const UnicodeString &s = iter.getString();
|
|
|
|
U_ASSERT (s.startsWith(BASE, BASE_LENGTH));
|
|
|
|
UChar c = s.charAt(s.length() - 1);
|
|
|
|
if (0x41 <= c && c <= 0x5A) { // A-Z
|
|
|
|
// There are Pinyin labels, add ASCII A-Z labels as well.
|
|
|
|
initialLabels_->add(0x41, 0x5A); // A-Z
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return TRUE;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Return the string with interspersed CGJs. Input must have more than 2 codepoints.
|
|
|
|
*/
|
|
|
|
static const UChar CGJ = 0x034F;
|
|
|
|
UnicodeString AlphabeticIndex::separated(const UnicodeString &item) {
|
|
|
|
UnicodeString result;
|
|
|
|
if (item.length() == 0) {
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
int32_t i = 0;
|
|
|
|
for (;;) {
|
|
|
|
UChar32 cp = item.char32At(i);
|
|
|
|
result.append(cp);
|
|
|
|
i = item.moveIndex32(i, 1);
|
|
|
|
if (i >= item.length()) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
result.append(CGJ);
|
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
UBool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const {
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const {
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
const RuleBasedCollator &AlphabeticIndex::getCollator() const {
|
|
|
|
return *collator_;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
const UnicodeString &AlphabeticIndex::getInflowLabel() const {
|
|
|
|
return inflowLabel_;
|
|
|
|
}
|
|
|
|
|
|
|
|
const UnicodeString &AlphabeticIndex::getOverflowLabel() const {
|
|
|
|
return overflowLabel_;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
const UnicodeString &AlphabeticIndex::getUnderflowLabel() const {
|
|
|
|
return underflowLabel_;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
|
|
|
|
inflowLabel_ = label;
|
|
|
|
clearBuckets();
|
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
|
|
|
|
overflowLabel_ = label;
|
|
|
|
clearBuckets();
|
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
|
|
|
|
underflowLabel_ = label;
|
|
|
|
clearBuckets();
|
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int32_t AlphabeticIndex::getMaxLabelCount() const {
|
|
|
|
return maxLabelCount_;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) {
|
|
|
|
if (U_FAILURE(status)) {
|
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
if (maxLabelCount <= 0) {
|
|
|
|
status = U_ILLEGAL_ARGUMENT_ERROR;
|
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
maxLabelCount_ = maxLabelCount;
|
|
|
|
clearBuckets();
|
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
//
|
|
|
|
// init() - Common code for constructors.
|
|
|
|
//
|
|
|
|
|
|
|
|
void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) {
|
|
|
|
if (U_FAILURE(status)) { return; }
|
|
|
|
if (locale == NULL && collator_ == NULL) {
|
|
|
|
status = U_ILLEGAL_ARGUMENT_ERROR;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
initialLabels_ = new UnicodeSet();
|
|
|
|
if (initialLabels_ == NULL) {
|
|
|
|
status = U_MEMORY_ALLOCATION_ERROR;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
inflowLabel_.setTo((UChar)0x2026); // Ellipsis
|
|
|
|
overflowLabel_ = inflowLabel_;
|
|
|
|
underflowLabel_ = inflowLabel_;
|
|
|
|
|
|
|
|
if (collator_ == NULL) {
|
|
|
|
Collator *coll = Collator::createInstance(*locale, status);
|
|
|
|
if (U_FAILURE(status)) {
|
|
|
|
delete coll;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if (coll == NULL) {
|
|
|
|
status = U_MEMORY_ALLOCATION_ERROR;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
collator_ = dynamic_cast<RuleBasedCollator *>(coll);
|
|
|
|
if (collator_ == NULL) {
|
|
|
|
delete coll;
|
|
|
|
status = U_UNSUPPORTED_ERROR;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
collatorPrimaryOnly_ = static_cast<RuleBasedCollator *>(collator_->clone());
|
|
|
|
if (collatorPrimaryOnly_ == NULL) {
|
|
|
|
status = U_MEMORY_ALLOCATION_ERROR;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status);
|
|
|
|
firstCharsInScripts_ = firstStringsInScript(status);
|
|
|
|
if (U_FAILURE(status)) { return; }
|
|
|
|
firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status);
|
|
|
|
// Guard against a degenerate collator where
|
|
|
|
// some script boundary strings are primary ignorable.
|
|
|
|
for (;;) {
|
|
|
|
if (U_FAILURE(status)) { return; }
|
|
|
|
if (firstCharsInScripts_->isEmpty()) {
|
|
|
|
// AlphabeticIndex requires some non-ignorable script boundary strings.
|
|
|
|
status = U_ILLEGAL_ARGUMENT_ERROR;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if (collatorPrimaryOnly_->compare(
|
|
|
|
*static_cast<UnicodeString *>(firstCharsInScripts_->elementAt(0)),
|
|
|
|
emptyString_, status) == UCOL_EQUAL) {
|
|
|
|
firstCharsInScripts_->removeElementAt(0);
|
|
|
|
} else {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Chinese index characters, which are specific to each of the several Chinese tailorings,
|
|
|
|
// take precedence over the single locale data exemplar set per language.
|
|
|
|
if (!addChineseIndexCharacters(status) && locale != NULL) {
|
|
|
|
addIndexExemplars(*locale, status);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
//
|
|
|
|
// Comparison function for UVector<UnicodeString *> sorting with a collator.
|
|
|
|
//
|
|
|
|
static int32_t U_CALLCONV
|
|
|
|
collatorComparator(const void *context, const void *left, const void *right) {
|
|
|
|
const UElement *leftElement = static_cast<const UElement *>(left);
|
|
|
|
const UElement *rightElement = static_cast<const UElement *>(right);
|
|
|
|
const UnicodeString *leftString = static_cast<const UnicodeString *>(leftElement->pointer);
|
|
|
|
const UnicodeString *rightString = static_cast<const UnicodeString *>(rightElement->pointer);
|
|
|
|
|
|
|
|
if (leftString == rightString) {
|
|
|
|
// Catches case where both are NULL
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
if (leftString == NULL) {
|
|
|
|
return 1;
|
|
|
|
};
|
|
|
|
if (rightString == NULL) {
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
const Collator *col = static_cast<const Collator *>(context);
|
|
|
|
UErrorCode errorCode = U_ZERO_ERROR;
|
|
|
|
return col->compare(*leftString, *rightString, errorCode);
|
|
|
|
}
|
|
|
|
|
|
|
|
//
|
|
|
|
// Comparison function for UVector<Record *> sorting with a collator.
|
|
|
|
//
|
|
|
|
static int32_t U_CALLCONV
|
|
|
|
recordCompareFn(const void *context, const void *left, const void *right) {
|
|
|
|
const UElement *leftElement = static_cast<const UElement *>(left);
|
|
|
|
const UElement *rightElement = static_cast<const UElement *>(right);
|
|
|
|
const AlphabeticIndex::Record *leftRec = static_cast<const AlphabeticIndex::Record *>(leftElement->pointer);
|
|
|
|
const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex::Record *>(rightElement->pointer);
|
|
|
|
const Collator *col = static_cast<const Collator *>(context);
|
|
|
|
UErrorCode errorCode = U_ZERO_ERROR;
|
|
|
|
return col->compare(leftRec->name_, rightRec->name_, errorCode);
|
|
|
|
}
|
|
|
|
|
|
|
|
UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) {
|
|
|
|
if (U_FAILURE(status)) {
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
LocalPointer<UVector> dest(new UVector(status), status);
|
|
|
|
if (U_FAILURE(status)) {
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
dest->setDeleter(uprv_deleteUObject);
|
|
|
|
// Fetch the script-first-primary contractions which are defined in the root collator.
|
|
|
|
// They all start with U+FDD1.
|
|
|
|
UnicodeSet set;
|
|
|
|
collatorPrimaryOnly_->internalAddContractions(0xFDD1, set, status);
|
|
|
|
if (U_FAILURE(status)) {
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
if (set.isEmpty()) {
|
|
|
|
status = U_UNSUPPORTED_ERROR;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
UnicodeSetIterator iter(set);
|
|
|
|
while (iter.next()) {
|
|
|
|
const UnicodeString &boundary = iter.getString();
|
|
|
|
uint32_t gcMask = U_GET_GC_MASK(boundary.char32At(1));
|
|
|
|
if ((gcMask & (U_GC_L_MASK | U_GC_CN_MASK)) == 0) {
|
|
|
|
// Ignore boundaries for the special reordering groups.
|
|
|
|
// Take only those for "real scripts" (where the sample character is a Letter,
|
|
|
|
// and the one for unassigned implicit weights (Cn).
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
UnicodeString *s = new UnicodeString(boundary);
|
|
|
|
if (s == NULL) {
|
|
|
|
status = U_MEMORY_ALLOCATION_ERROR;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
dest->addElement(s, status);
|
|
|
|
}
|
|
|
|
return dest.orphan();
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Returns true if one index character string is "better" than the other.
|
|
|
|
* Shorter NFKD is better, and otherwise NFKD-binary-less-than is
|
|
|
|
* better, and otherwise binary-less-than is better.
|
|
|
|
*/
|
|
|
|
UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer,
|
|
|
|
const UnicodeString &one, const UnicodeString &other) {
|
|
|
|
// This is called with primary-equal strings, but never with one.equals(other).
|
|
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
|
|
UnicodeString n1 = nfkdNormalizer.normalize(one, status);
|
|
|
|
UnicodeString n2 = nfkdNormalizer.normalize(other, status);
|
|
|
|
if (U_FAILURE(status)) { return FALSE; }
|
|
|
|
int32_t result = n1.countChar32() - n2.countChar32();
|
|
|
|
if (result != 0) {
|
|
|
|
return result < 0;
|
|
|
|
}
|
|
|
|
result = n1.compareCodePointOrder(n2);
|
|
|
|
if (result != 0) {
|
|
|
|
return result < 0;
|
|
|
|
}
|
|
|
|
return one.compareCodePointOrder(other) < 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
//
|
|
|
|
// Constructor & Destructor for AlphabeticIndex::Record
|
|
|
|
//
|
|
|
|
// Records are internal only, instances are not directly surfaced in the public API.
|
|
|
|
// This class is mostly struct-like, with all public fields.
|
|
|
|
|
|
|
|
AlphabeticIndex::Record::Record(const UnicodeString &name, const void *data)
|
|
|
|
: name_(name), data_(data) {}
|
|
|
|
|
|
|
|
AlphabeticIndex::Record::~Record() {
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) {
|
|
|
|
if (U_FAILURE(status)) {
|
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
if (inputList_ == NULL) {
|
|
|
|
inputList_ = new UVector(status);
|
|
|
|
if (inputList_ == NULL) {
|
|
|
|
status = U_MEMORY_ALLOCATION_ERROR;
|
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
inputList_->setDeleter(alphaIndex_deleteRecord);
|
|
|
|
}
|
|
|
|
Record *r = new Record(name, data);
|
|
|
|
if (r == NULL) {
|
|
|
|
status = U_MEMORY_ALLOCATION_ERROR;
|
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
inputList_->addElement(r, status);
|
|
|
|
clearBuckets();
|
|
|
|
//std::string ss;
|
|
|
|
//std::string ss2;
|
|
|
|
//std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" <<
|
|
|
|
// " sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl;
|
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) {
|
|
|
|
if (U_SUCCESS(status) && inputList_ != NULL && !inputList_->isEmpty()) {
|
|
|
|
inputList_->removeAllElements();
|
|
|
|
clearBuckets();
|
|
|
|
}
|
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
|
|
|
|
int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) {
|
|
|
|
initBuckets(status);
|
|
|
|
if (U_FAILURE(status)) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, status);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int32_t AlphabeticIndex::getBucketIndex() const {
|
|
|
|
return labelsIterIndex_;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
UBool AlphabeticIndex::nextBucket(UErrorCode &status) {
|
|
|
|
if (U_FAILURE(status)) {
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
if (buckets_ == NULL && currentBucket_ != NULL) {
|
|
|
|
status = U_ENUM_OUT_OF_SYNC_ERROR;
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
initBuckets(status);
|
|
|
|
if (U_FAILURE(status)) {
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
++labelsIterIndex_;
|
|
|
|
if (labelsIterIndex_ >= buckets_->getBucketCount()) {
|
|
|
|
labelsIterIndex_ = buckets_->getBucketCount();
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
currentBucket_ = getBucket(*buckets_->immutableVisibleList_, labelsIterIndex_);
|
|
|
|
resetRecordIterator();
|
|
|
|
return TRUE;
|
|
|
|
}
|
|
|
|
|
|
|
|
const UnicodeString &AlphabeticIndex::getBucketLabel() const {
|
|
|
|
if (currentBucket_ != NULL) {
|
|
|
|
return currentBucket_->label_;
|
|
|
|
} else {
|
|
|
|
return emptyString_;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const {
|
|
|
|
if (currentBucket_ != NULL) {
|
|
|
|
return currentBucket_->labelType_;
|
|
|
|
} else {
|
|
|
|
return U_ALPHAINDEX_NORMAL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int32_t AlphabeticIndex::getBucketRecordCount() const {
|
|
|
|
if (currentBucket_ != NULL && currentBucket_->records_ != NULL) {
|
|
|
|
return currentBucket_->records_->size();
|
|
|
|
} else {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) {
|
|
|
|
if (U_FAILURE(status)) {
|
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
internalResetBucketIterator();
|
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
UBool AlphabeticIndex::nextRecord(UErrorCode &status) {
|
|
|
|
if (U_FAILURE(status)) {
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
if (currentBucket_ == NULL) {
|
|
|
|
// We are trying to iterate over the items in a bucket, but there is no
|
|
|
|
// current bucket from the enumeration of buckets.
|
|
|
|
status = U_INVALID_STATE_ERROR;
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
if (buckets_ == NULL) {
|
|
|
|
status = U_ENUM_OUT_OF_SYNC_ERROR;
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
if (currentBucket_->records_ == NULL) {
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
++itemsIterIndex_;
|
|
|
|
if (itemsIterIndex_ >= currentBucket_->records_->size()) {
|
|
|
|
itemsIterIndex_ = currentBucket_->records_->size();
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
return TRUE;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
const UnicodeString &AlphabeticIndex::getRecordName() const {
|
|
|
|
const UnicodeString *retStr = &emptyString_;
|
|
|
|
if (currentBucket_ != NULL && currentBucket_->records_ != NULL &&
|
|
|
|
itemsIterIndex_ >= 0 &&
|
|
|
|
itemsIterIndex_ < currentBucket_->records_->size()) {
|
|
|
|
Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
|
|
|
|
retStr = &item->name_;
|
|
|
|
}
|
|
|
|
return *retStr;
|
|
|
|
}
|
|
|
|
|
|
|
|
const void *AlphabeticIndex::getRecordData() const {
|
|
|
|
const void *retPtr = NULL;
|
|
|
|
if (currentBucket_ != NULL && currentBucket_->records_ != NULL &&
|
|
|
|
itemsIterIndex_ >= 0 &&
|
|
|
|
itemsIterIndex_ < currentBucket_->records_->size()) {
|
|
|
|
Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
|
|
|
|
retPtr = item->data_;
|
|
|
|
}
|
|
|
|
return retPtr;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
AlphabeticIndex & AlphabeticIndex::resetRecordIterator() {
|
|
|
|
itemsIterIndex_ = -1;
|
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
AlphabeticIndex::Bucket::Bucket(const UnicodeString &label,
|
|
|
|
const UnicodeString &lowerBoundary,
|
|
|
|
UAlphabeticIndexLabelType type)
|
|
|
|
: label_(label), lowerBoundary_(lowerBoundary), labelType_(type),
|
|
|
|
displayBucket_(NULL), displayIndex_(-1),
|
|
|
|
records_(NULL) {
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
AlphabeticIndex::Bucket::~Bucket() {
|
|
|
|
delete records_;
|
|
|
|
}
|
|
|
|
|
|
|
|
U_NAMESPACE_END
|
|
|
|
|
|
|
|
#endif // !UCONFIG_NO_COLLATION
|