|
|
|
// © 2016 and later: Unicode, Inc. and others.
|
|
|
|
// License & terms of use: http://www.unicode.org/copyright.html
|
|
|
|
/*
|
|
|
|
**********************************************************************
|
|
|
|
* Copyright (C) 2008-2015, International Business Machines
|
|
|
|
* Corporation and others. All Rights Reserved.
|
|
|
|
**********************************************************************
|
|
|
|
* Date Name Description
|
|
|
|
* 05/11/2008 Andy Heninger Port from Java
|
|
|
|
**********************************************************************
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "unicode/utypes.h"
|
|
|
|
|
|
|
|
#if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION
|
|
|
|
|
|
|
|
#include "unicode/brkiter.h"
|
|
|
|
#include "unicode/localpointer.h"
|
|
|
|
#include "unicode/uchar.h"
|
|
|
|
#include "unicode/unifilt.h"
|
|
|
|
#include "unicode/uniset.h"
|
|
|
|
|
|
|
|
#include "brktrans.h"
|
|
|
|
#include "cmemory.h"
|
|
|
|
#include "mutex.h"
|
|
|
|
#include "uprops.h"
|
|
|
|
#include "uinvchar.h"
|
|
|
|
#include "util.h"
|
|
|
|
#include "uvectr32.h"
|
|
|
|
|
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
|
|
|
|
// ICU RTTI boilerplate for BreakTransliterator.
// NOTE(review): presumably supplies the class-ID accessors declared in the
// class header -- confirm against the macro definition in unicode/uobject.h.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator)
|
|
|
|
|
|
|
|
// Default insertion text for a newly constructed BreakTransliterator:
// the single code unit U+0020 (used to initialize fInsertion).
static const UChar SPACE = 32; // ' '
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Constructs a transliterator with the default delimiters '{' and
|
|
|
|
* '}'.
|
|
|
|
*/
|
|
|
|
BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) :
|
|
|
|
Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter),
|
|
|
|
cachedBI(NULL), cachedBoundaries(NULL), fInsertion(SPACE) {
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Destructor.
|
|
|
|
*/
|
|
|
|
BreakTransliterator::~BreakTransliterator() {
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Copy constructor.
|
|
|
|
*/
|
|
|
|
BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) :
|
|
|
|
Transliterator(o), cachedBI(NULL), cachedBoundaries(NULL), fInsertion(o.fInsertion) {
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Transliterator API.
|
|
|
|
*/
|
|
|
|
Transliterator* BreakTransliterator::clone(void) const {
|
|
|
|
return new BreakTransliterator(*this);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Implements {@link Transliterator#handleTransliterate}.
|
|
|
|
*/
|
|
|
|
void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
|
|
|
|
UBool isIncremental ) const {
|
|
|
|
|
|
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
|
|
LocalPointer<BreakIterator> bi;
|
|
|
|
LocalPointer<UVector32> boundaries;
|
|
|
|
|
|
|
|
{
|
|
|
|
Mutex m;
|
|
|
|
BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this);
|
|
|
|
boundaries.moveFrom(nonConstThis->cachedBoundaries);
|
|
|
|
bi.moveFrom(nonConstThis->cachedBI);
|
|
|
|
}
|
|
|
|
if (bi.isNull()) {
|
|
|
|
bi.adoptInstead(BreakIterator::createWordInstance(Locale::getEnglish(), status));
|
|
|
|
}
|
|
|
|
if (boundaries.isNull()) {
|
|
|
|
boundaries.adoptInstead(new UVector32(status));
|
|
|
|
}
|
|
|
|
|
|
|
|
if (bi.isNull() || boundaries.isNull() || U_FAILURE(status)) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
boundaries->removeAllElements();
|
|
|
|
UnicodeString sText = replaceableAsString(text);
|
|
|
|
bi->setText(sText);
|
|
|
|
bi->preceding(offsets.start);
|
|
|
|
|
|
|
|
// To make things much easier, we will stack the boundaries, and then insert at the end.
|
|
|
|
// generally, we won't need too many, since we will be filtered.
|
|
|
|
|
|
|
|
int32_t boundary;
|
|
|
|
for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) {
|
|
|
|
if (boundary == 0) continue;
|
|
|
|
// HACK: Check to see that preceeding item was a letter
|
|
|
|
|
|
|
|
UChar32 cp = sText.char32At(boundary-1);
|
|
|
|
int type = u_charType(cp);
|
|
|
|
//System.out.println(Integer.toString(cp,16) + " (before): " + type);
|
|
|
|
if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
|
|
|
|
|
|
|
|
cp = sText.char32At(boundary);
|
|
|
|
type = u_charType(cp);
|
|
|
|
//System.out.println(Integer.toString(cp,16) + " (after): " + type);
|
|
|
|
if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
|
|
|
|
|
|
|
|
boundaries->addElement(boundary, status);
|
|
|
|
// printf("Boundary at %d\n", boundary);
|
|
|
|
}
|
|
|
|
|
|
|
|
int delta = 0;
|
|
|
|
int lastBoundary = 0;
|
|
|
|
|
|
|
|
if (boundaries->size() != 0) { // if we found something, adjust
|
|
|
|
delta = boundaries->size() * fInsertion.length();
|
|
|
|
lastBoundary = boundaries->lastElementi();
|
|
|
|
|
|
|
|
// we do this from the end backwards, so that we don't have to keep updating.
|
|
|
|
|
|
|
|
while (boundaries->size() > 0) {
|
|
|
|
boundary = boundaries->popi();
|
|
|
|
text.handleReplaceBetween(boundary, boundary, fInsertion);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Now fix up the return values
|
|
|
|
offsets.contextLimit += delta;
|
|
|
|
offsets.limit += delta;
|
|
|
|
offsets.start = isIncremental ? lastBoundary + delta : offsets.limit;
|
|
|
|
|
|
|
|
// Return break iterator & boundaries vector to the cache.
|
|
|
|
{
|
|
|
|
Mutex m;
|
|
|
|
BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this);
|
|
|
|
if (nonConstThis->cachedBI.isNull()) {
|
|
|
|
nonConstThis->cachedBI.moveFrom(bi);
|
|
|
|
}
|
|
|
|
if (nonConstThis->cachedBoundaries.isNull()) {
|
|
|
|
nonConstThis->cachedBoundaries.moveFrom(boundaries);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// TODO: do something with U_FAILURE(status);
|
|
|
|
// (need to look at transliterators overall, not just here.)
|
|
|
|
}
|
|
|
|
|
|
|
|
//
|
|
|
|
// getInsertion()
|
|
|
|
//
|
|
|
|
const UnicodeString &BreakTransliterator::getInsertion() const {
|
|
|
|
return fInsertion;
|
|
|
|
}
|
|
|
|
|
|
|
|
//
|
|
|
|
// setInsertion()
|
|
|
|
//
|
|
|
|
void BreakTransliterator::setInsertion(const UnicodeString &insertion) {
|
|
|
|
this->fInsertion = insertion;
|
|
|
|
}
|
|
|
|
|
|
|
|
//
|
|
|
|
// replaceableAsString Hack to let break iterators work
|
|
|
|
// on the replaceable text from transliterators.
|
|
|
|
// In practice, the only real Replaceable type that we
|
|
|
|
// will be seeing is UnicodeString, so this function
|
|
|
|
// will normally be efficient.
|
|
|
|
//
|
|
|
|
UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) {
|
|
|
|
UnicodeString s;
|
|
|
|
UnicodeString *rs = dynamic_cast<UnicodeString *>(&r);
|
|
|
|
if (rs != NULL) {
|
|
|
|
s = *rs;
|
|
|
|
} else {
|
|
|
|
r.extractBetween(0, r.length(), s);
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
U_NAMESPACE_END
|
|
|
|
|
|
|
|
#endif /* #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION */
|