node/deps/icu-small/source/i18n/rbt_rule.h


								/*

								* Copyright (C) {1999-2001}, International Business Machines Corporation and others. All Rights Reserved.

								**********************************************************************

								*   Date        Name        Description

								*   11/17/99    aliu        Creation.

								**********************************************************************

								*/

								#ifndef RBT_RULE_H

								#define RBT_RULE_H


								#include "unicode/utypes.h"


								#if !UCONFIG_NO_TRANSLITERATION


								#include "unicode/uobject.h"

								#include "unicode/unistr.h"

								#include "unicode/utrans.h"

								#include "unicode/unimatch.h"


								U_NAMESPACE_BEGIN


								// Forward declarations: this header refers to these types only through

								// pointers, references, or a friend declaration, so their full

								// definitions are not needed here.

								class Replaceable;

								class TransliterationRuleData;

								class StringMatcher;

								class UnicodeFunctor;


								/**

								 * A transliteration rule used by

								 * <code>RuleBasedTransliterator</code>.

								 * <code>TransliterationRule</code> is an immutable object.

								 *

								 * <p>A rule consists of an input pattern and an output string.  When

								 * the input pattern is matched, the output string is emitted.  The

								 * input pattern consists of zero or more characters which are matched

								 * exactly (the key) and optional context.  Context must match if it

								 * is specified.  Context may be specified before the key, after the

								 * key, or both.  The key, preceding context, and following context

								 * may contain variables.  Variables represent a set of Unicode

								 * characters, such as the letters <i>a</i> through <i>z</i>.

								 * Variables are detected by looking up each character in a supplied

								 * variable list to see if it has been so defined.

								 *

								 * <p>A rule may contain segments in its input string and segment

								 * references in its output string.  A segment is a substring of the

								 * input pattern, indicated by an offset and limit.  The segment may

								 * be in the preceding or following context.  It may not span a

								 * context boundary.  A segment reference is a special character in

								 * the output string that causes a segment of the input string (not

								 * the input pattern) to be copied to the output string.  The range of

								 * special characters that represent segment references is defined by

								 * RuleBasedTransliterator.Data.

								 *

								 * @author Alan Liu

								 */

								class TransliterationRule : public UMemory {


								private:


								    // TODO Eliminate the pattern and keyLength data members.  They

								    // are used only by masks() and getIndexValue() which are called

								    // only during build time, not during run-time.  Perhaps these

								    // methods and pattern/keyLength can be isolated into a separate

								    // object.

								    //

								    // NOTE(review): the class comment describes this object as

								    // immutable, but setData() below re-points the rule at a

								    // different data object.


								    /**

								     * The match that must occur before the key, or null if there is no

								     * preceding context.

								     */

								    StringMatcher *anteContext;


								    /**

								     * The matcher object for the key.  If null, then the key is empty.

								     */

								    StringMatcher *key;


								    /**

								     * The match that must occur after the key, or null if there is no

								     * following context.

								     */

								    StringMatcher *postContext;


								    /**

								     * The object that performs the replacement if the key,

								     * anteContext, and postContext are matched.  Never null.

								     */

								    UnicodeFunctor* output;


								    /**

								     * The string that must be matched, consisting of the anteContext, key,

								     * and postContext, concatenated together, in that order.  Some components

								     * may be empty (zero length).

								     * @see anteContextLength

								     * @see keyLength

								     */

								    UnicodeString pattern;


								    /**

								     * An array of matcher objects corresponding to the input pattern

								     * segments.  If there are no segments this is null.  N.B. The

								     * elements are declared as UnicodeFunctor for generality, but in

								     * practice each is always a StringMatcher.  In the future we may

								     * generalize this, but for now we sometimes cast down to

								     * StringMatcher.

								     *

								     * The array is owned, but the pointers within it are not.

								     */

								    UnicodeFunctor** segments;


								    /**

								     * The number of elements in segments[] or zero if segments is NULL.

								     */

								    int32_t segmentsCount;


								    /**

								     * The length of the string that must match before the key.  If

								     * zero, then there is no matching requirement before the key.

								     * Substring [0,anteContextLength) of pattern is the anteContext.

								     */

								    int32_t anteContextLength;


								    /**

								     * The length of the key.  Substring [anteContextLength,

								     * anteContextLength + keyLength) of pattern is the key.

								     */

								    int32_t keyLength;


								    /**

								     * Miscellaneous attributes.

								     * NOTE(review): presumably a bitwise OR of the flag values

								     * declared in the enum below -- confirm against the implementation.

								     */

								    int8_t flags;


								    /**

								     * Flag attributes.  The values are distinct bits (1 and 2).

								     */

								    enum {

								        ANCHOR_START = 1,

								        ANCHOR_END   = 2

								    };


								    /**

								     * An alias pointer to the data for this rule.  The data provides

								     * lookup services for matchers and segments.

								     */

								    const TransliterationRuleData* data;


								public:


								    /**

								     * Construct a new rule with the given input, output text, and other

								     * attributes.  A cursor position may be specified for the output text.

								     * @param input          input string, including key and optional ante and

								     *                       post context.

								     * @param anteContextPos offset into input to end of ante context, or -1 if

								     *                       none.  Must be <= input.length() if not -1.

								     * @param postContextPos offset into input to start of post context, or -1

								     *                       if none.  Must be <= input.length() if not -1, and must be >=

								     *                       anteContextPos.

								     * @param outputStr      output string.

								     * @param cursorPosition offset into output at which cursor is located, or -1 if

								     *                       none.  If less than zero, then the cursor is placed after the

								     *                       <code>outputStr</code>; that is, -1 is equivalent to

								     *                       <code>outputStr.length()</code>.  If greater than

								     *                       <code>outputStr.length()</code> then this is an error.

								     *                       NOTE(review): the earlier wording said "an exception is

								     *                       thrown"; presumably the failure is reported through

								     *                       <code>status</code> -- confirm.

								     * @param cursorOffset   an offset to be added to cursorPosition to position the

								     *                       cursor either in the ante context, if < 0, or in the post context, if >

								     *                       0.  For example, the rule "abc{def} > | @@@ xyz;" changes "def" to

								     *                       "xyz" and moves the cursor to before "a".  It would have a cursorOffset

								     *                       of -3.

								     * @param segs           array of UnicodeFunctor pointers corresponding to input

								     *                       pattern segments, or null if there are none.  The array itself

								     *                       is adopted, but the pointers within it are not.

								     * @param segsCount      number of elements in segs[].

								     * @param anchorStart    TRUE if the rule is anchored on the left to

								     *                       the context start.

								     * @param anchorEnd      TRUE if the rule is anchored on the right to the

								     *                       context limit.

								     * @param data           the rule data.

								     * @param status         Output parameter filled in with success or failure status.

								     */

								    TransliterationRule(const UnicodeString& input,

								                        int32_t anteContextPos, int32_t postContextPos,

								                        const UnicodeString& outputStr,

								                        int32_t cursorPosition, int32_t cursorOffset,

								                        UnicodeFunctor** segs,

								                        int32_t segsCount,

								                        UBool anchorStart, UBool anchorEnd,

								                        const TransliterationRuleData* data,

								                        UErrorCode& status);


								    /**

								     * Copy constructor.

								     * @param other    the object to be copied.

								     *

								     * NOTE(review): <code>other</code> is taken by non-const reference,

								     * so const rules and temporaries cannot be copied.  A const

								     * reference would be conventional, but changing it requires

								     * updating the out-of-line definition as well.

								     */

								    TransliterationRule(TransliterationRule& other);


								    /**

								     * Destructor.

								     */

								    virtual ~TransliterationRule();


								    /**

								     * Change the data object that this rule belongs to.  Used

								     * internally by the TransliterationRuleData copy constructor.

								     * @param data    the new data value to be set.

								     */

								    void setData(const TransliterationRuleData* data);


								    /**

								     * Return the preceding context length.  This method is needed to

								     * support the <code>Transliterator</code> method

								     * <code>getMaximumContextLength()</code>.  Internally, this is

								     * implemented as the anteContextLength, optionally plus one if

								     * there is a start anchor.  The one character anchor gap is

								     * needed to make repeated incremental transliteration with

								     * anchors work.

								     * @return    the preceding context length.

								     */

								    virtual int32_t getContextLength(void) const;


								    /**

								     * Internal method.  Returns 8-bit index value for this rule.

								     * This is the low byte of the first character of the key,

								     * unless the first character of the key is a set.  If it's a

								     * set, or otherwise can match multiple keys, the index value is -1.

								     * The return type is wider than 8 bits so that -1 can be

								     * represented alongside the byte values.

								     * @return    8-bit index value for this rule, or -1.

								     */

								    int16_t getIndexValue() const;


								    /**

								     * Internal method.  Returns true if this rule matches the given

								     * index value.  The index value is an 8-bit integer, 0..255,

								     * representing the low byte of the first character of the key.

								     * It matches this rule if it matches the first character of the

								     * key, or if the first character of the key is a set, and the set

								     * contains any character with a low byte equal to the index

								     * value.  If the rule contains only ante context, as in foo)>bar,

								     * then it will match any key.

								     * @param v    the given index value.

								     * @return     true if this rule matches the given index value.

								     */

								    UBool matchesIndexValue(uint8_t v) const;


								    /**

								     * Return true if this rule masks another rule.  If r1 masks r2 then

								     * r1 matches any input string that r2 matches.  If r1 masks r2 and r2 masks

								     * r1 then r1 == r2.  Examples: "a>x" masks "ab>y".  "a>x" masks "a[b]>y".

								     * "[c]a>x" masks "[dc]a>y".

								     * @param r2  the given rule to be compared with.

								     * @return    true if this rule masks 'r2'

								     */

								    virtual UBool masks(const TransliterationRule& r2) const;


								    /**

								     * Attempt a match and replacement at the given position.  Return

								     * the degree of match between this rule and the given text.  The

								     * degree of match may be mismatch, a partial match, or a full

								     * match.  A mismatch means at least one character of the text

								     * does not match the context or key.  A partial match means some

								     * context and key characters match, but the text is not long

								     * enough to match all of them.  A full match means all context

								     * and key characters match.

								     *

								     * If a full match is obtained, perform a replacement, update pos,

								     * and return U_MATCH.  Otherwise both text and pos are unchanged.

								     *

								     * @param text the text; modified in place on a full match

								     * @param pos the position indices; updated on a full match

								     * @param incremental if TRUE, test for partial matches that may

								     * be completed by additional text inserted at pos.limit.

								     * @return one of <code>U_MISMATCH</code>,

								     * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>.  If

								     * incremental is FALSE then U_PARTIAL_MATCH will not be returned.

								     */

								    UMatchDegree matchAndReplace(Replaceable& text,

								                                 UTransPosition& pos,

								                                 UBool incremental) const;


								    /**

								     * Create a rule string that represents this rule object.  Append

								     * it to the given string.

								     * @param pat               the string to which the rule text is appended.

								     * @param escapeUnprintable NOTE(review): presumably, if TRUE, unprintable

								     *                          characters are emitted in escaped form -- confirm

								     *                          against the implementation.

								     * @return                  a UnicodeString reference; presumably

								     *                          <code>pat</code> itself -- confirm.

								     */

								    virtual UnicodeString& toRule(UnicodeString& pat,

								                                  UBool escapeUnprintable) const;


								    /**

								     * Union the set of all characters that may be modified by this rule

								     * into the given set.

								     * @param toUnionTo    the set that receives the characters.

								     */

								    void addSourceSetTo(UnicodeSet& toUnionTo) const;


								    /**

								     * Union the set of all characters that may be emitted by this rule

								     * into the given set.

								     * @param toUnionTo    the set that receives the characters.

								     */

								    void addTargetSetTo(UnicodeSet& toUnionTo) const;


								 private:


								    // StringMatcher is granted access to this class's private members.

								    friend class StringMatcher;


								    TransliterationRule &operator=(const TransliterationRule &other); // declared private to forbid assignment; copy construction remains public

								};


								U_NAMESPACE_END


								#endif /* #if !UCONFIG_NO_TRANSLITERATION */


								#endif