node/deps/icu-small/source/common/util.cpp


								/*

								**********************************************************************

								*   Copyright (c) 2001-2011, International Business Machines

								*   Corporation and others.  All Rights Reserved.

								**********************************************************************

								*   Date        Name        Description

								*   11/19/2001  aliu        Creation.

								**********************************************************************

								*/


								#include "unicode/unimatch.h"

								#include "unicode/utf16.h"

								#include "patternprops.h"

								#include "util.h"


								// Define UChar constants using hex for EBCDIC compatibility.

								// Each value is the Unicode code point of the character named in the

								// trailing comment.


								static const UChar BACKSLASH  = 0x005C; /*\*/

								static const UChar UPPER_U    = 0x0055; /*U*/

								static const UChar LOWER_U    = 0x0075; /*u*/

								static const UChar APOSTROPHE = 0x0027; // '\''

								static const UChar SPACE      = 0x0020; // ' '


								// "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"

								// Digit characters given as decimal code points: 48-57 are '0'-'9' and

								// 65-90 are 'A'-'Z'.  The table has 36 entries and is indexed by digit

								// value (0-35) when numbers and hex escapes are formatted below.

								static const UChar DIGITS[] = {

								    48,49,50,51,52,53,54,55,56,57,

								    65,66,67,68,69,70,71,72,73,74,

								    75,76,77,78,79,80,81,82,83,84,

								    85,86,87,88,89,90
								
								};


								U_NAMESPACE_BEGIN


								/**
								 * Append the representation of n in the given radix to result, using
								 * the digits '0'-'9' followed by 'A'-'Z'.
								 * @param result the string the digits are appended to
								 * @param n the number to format; if negative, a '-' is appended before
								 * the digits.  The full int32_t range, including the most negative
								 * value, is supported.
								 * @param radix the radix, from 2 to 36 inclusive.  For any other value
								 * a single '?' is appended and nothing else.
								 * @param minDigits the minimum number of digits to emit, not counting
								 * any '-'; shorter numbers are left-padded with '0'.  At least one
								 * digit is always emitted, so values below 2 have no effect.
								 * @return a reference to result
								 */
								UnicodeString& ICU_Utility::appendNumber(UnicodeString& result, int32_t n,
								                                     int32_t radix, int32_t minDigits) {
								    if (radix < 2 || radix > 36) {
								        // Bogus radix
								        return result.append((UChar)63/*?*/);
								    }
								    // Work on the magnitude as an unsigned value.  Negating the most
								    // negative int32_t as a signed value overflows (undefined behavior)
								    // and would leave a negative number that indexes DIGITS out of
								    // bounds; unsigned negation is well defined and yields 2^31.
								    uint32_t magnitude = (uint32_t)n;
								    if (n < 0) {
								        magnitude = 0U - magnitude;
								        result.append((UChar)45/*-*/);
								    }
								    const uint32_t base = (uint32_t)radix;
								    // Find the place value of the most significant digit and count the
								    // digits.  place never exceeds magnitude, so it cannot overflow.
								    uint32_t place = 1;
								    int32_t numDigits = 1;
								    for (uint32_t rest = magnitude; rest >= base; rest /= base) {
								        place *= base;
								        ++numDigits;
								    }
								    // Left-pad with zeros up to minDigits.  Counting up from numDigits
								    // (rather than decrementing minDigits) avoids signed underflow when
								    // minDigits is extremely negative.
								    for (; numDigits < minDigits; ++numDigits) {
								        result.append(DIGITS[0]);
								    }
								    // Emit the digits, most significant first.
								    while (place > 0) {
								        const uint32_t digit = magnitude / place;
								        result.append(DIGITS[digit]);
								        magnitude -= digit * place;
								        place /= base;
								    }
								    return result;
								}


								/**
								 * Report whether c lies outside the printable ASCII range
								 * U+0020..U+007E (inclusive).
								 * @return true for any value below 0x20 or above 0x7E.
								 */
								UBool ICU_Utility::isUnprintable(UChar32 c) {
								    return c < 0x20 || c > 0x7E;
								}


								/**
								 * Append a backslash escape for c to result when c is not printable
								 * ASCII.  Values that fit in the low 16 bits are written as \uxxxx
								 * (four upper-case hex digits); any value with a bit set above the
								 * low 16 is written as \Uxxxxxxxx (eight upper-case hex digits).
								 * @param result the string the escape is appended to
								 * @param c the code point to examine
								 * @return TRUE if an escape was appended; FALSE if c is printable
								 * ASCII, in which case result is left untouched.
								 */
								UBool ICU_Utility::escapeUnprintable(UnicodeString& result, UChar32 c) {
								    if (!isUnprintable(c)) {
								        return FALSE;
								    }
								    result.append(BACKSLASH);
								    // Pick the escape letter and the bit offset of the first nibble.
								    int32_t shift;
								    if ((c & ~0xFFFF) != 0) {
								        result.append(UPPER_U);
								        shift = 28;
								    } else {
								        result.append(LOWER_U);
								        shift = 12;
								    }
								    // Emit one hex digit per nibble, most significant first.
								    for (; shift >= 0; shift -= 4) {
								        result.append(DIGITS[0xF & (c >> shift)]);
								    }
								    return TRUE;
								}


								/**

								 * Returns the index of a character, ignoring quoted text.

								 * For example, in the string "abc'hide'h", the 'h' in "hide" will not be

								 * found by a search for 'h'.

								 */

								// FOR FUTURE USE.  DISABLE FOR NOW for coverage reasons.

								/*

								int32_t ICU_Utility::quotedIndexOf(const UnicodeString& text,

								                               int32_t start, int32_t limit,

								                               UChar charToFind) {

								    for (int32_t i=start; i<limit; ++i) {

								        UChar c = text.charAt(i);

								        if (c == BACKSLASH) {

								            ++i;

								        } else if (c == APOSTROPHE) {

								            while (++i < limit

								                   && text.charAt(i) != APOSTROPHE) {}

								        } else if (c == charToFind) {

								            return i;

								        }

								    }

								    return -1;

								}

								*/


								/**
								 * Locate the end of the run of white space (as recognized by
								 * PatternProps::skipWhiteSpace) that begins at pos.
								 * @param str the string to scan
								 * @param pos the index at which to start scanning.  Updated to the
								 * returned index only when advance is true; otherwise left unchanged.
								 * @param advance if true, store the resulting index back into pos
								 * @return the index of the first non-white-space character at or
								 * after pos, or str.length() if there is none.
								 */
								int32_t ICU_Utility::skipWhitespace(const UnicodeString& str, int32_t& pos,
								                                    UBool advance) {
								    const UChar* base = str.getBuffer();
								    const UChar* firstNonSpace =
								        PatternProps::skipWhiteSpace(base + pos, str.length() - pos);
								    // Convert the returned pointer back into an index into str.
								    int32_t next = (int32_t)(firstNonSpace - base);
								    if (advance) {
								        pos = next;
								    }
								    return next;
								}


								/**

								 * Skip over Pattern_White_Space in a Replaceable.

								 * Skipping may be done in the forward or

								 * reverse direction.  In either case, the leftmost index will be

								 * inclusive, and the rightmost index will be exclusive.  That is,

								 * given a range defined as [start, limit), the call

								 * skipWhitespace(text, start, limit) will advance start past leading

								 * whitespace, whereas the call skipWhitespace(text, limit, start),

								 * will back up limit past trailing whitespace.

								 * @param text the text to be analyzed

								 * @param pos either the start or limit of a range of 'text', to skip

								 * leading or trailing whitespace, respectively

								 * @param stop either the limit or start of a range of 'text', to skip

								 * leading or trailing whitespace, respectively

								 * @return the new start or limit, depending on what was passed in to

								 * 'pos'

								 */

								//?FOR FUTURE USE.  DISABLE FOR NOW for coverage reasons.

								//?int32_t ICU_Utility::skipWhitespace(const Replaceable& text,

								//?                                    int32_t pos, int32_t stop) {

								//?    UChar32 c;

								//?    UBool isForward = (stop >= pos);

								//?

								//?    if (!isForward) {

								//?        --pos; // pos is a limit, so back up by one

								//?    }

								//?

								//?    while (pos != stop &&

								//?           PatternProps::isWhiteSpace(c = text.char32At(pos))) {

								//?        if (isForward) {

								//?            pos += U16_LENGTH(c);

								//?        } else {

								//?            pos -= U16_LENGTH(c);

								//?        }

								//?    }

								//?

								//?    if (!isForward) {

								//?        ++pos; // make pos back into a limit

								//?    }

								//?

								//?    return pos;

								//?}


								/**
								 * Parse the single character 'ch', optionally preceded by white
								 * space.
								 * @param id the string to be parsed
								 * @param pos INPUT-OUTPUT parameter.  On input, the offset of the
								 * first character to be parsed.  On a successful parse, the index
								 * just past the matched character.  If the parse fails, pos is
								 * unchanged.
								 * @param ch the non-whitespace character to be parsed.
								 * @return true if 'ch' is seen preceded by zero or more
								 * whitespace characters.
								 */
								UBool ICU_Utility::parseChar(const UnicodeString& id, int32_t& pos, UChar ch) {
								    // Peek past any white space without committing the new position,
								    // so a failed parse needs no restore step.
								    const int32_t candidate = skipWhitespace(id, pos, FALSE);
								    if (candidate != id.length() && id.charAt(candidate) == ch) {
								        pos = candidate + 1;
								        return TRUE;
								    }
								    return FALSE;
								}


								/**
								 * Match the text in [index, limit) against a parsing pattern.
								 * Pattern characters are matched literally and case-sensitively,
								 * except for the following special character:
								 *
								 * ~  zero or more Pattern_White_Space chars
								 *
								 * If the end of the pattern is reached with every element matched,
								 * the index of the first unparsed text position is returned.
								 * Otherwise, including when the text runs out before the pattern
								 * does, -1 is returned.  An empty pattern matches immediately.
								 * @param pat pattern that controls parsing
								 * @param text text to be parsed, starting at index
								 * @param index offset to first character to parse
								 * @param limit offset after last character to parse
								 * @return index after last parsed character, or -1 on parse failure.
								 */
								int32_t ICU_Utility::parsePattern(const UnicodeString& pat,
								                                  const Replaceable& text,
								                                  int32_t index,
								                                  int32_t limit) {
								    const int32_t patLength = pat.length();

								    // Nothing to match: succeed without consuming any text.
								    if (patLength == 0) {
								        return index;
								    }

								    int32_t patPos = 0;
								    UChar32 patChar = pat.char32At(patPos);

								    while (index < limit) {
								        const UChar32 textChar = text.char32At(index);

								        if (patChar == 126 /*~*/) {
								            // '~' absorbs white space one code point at a time.
								            if (PatternProps::isWhiteSpace(textChar)) {
								                index += U16_LENGTH(textChar);
								                continue;
								            }
								            // The white-space run is over: step past the '~' and
								            // retry the same text character against the next
								            // pattern element.
								            ++patPos;
								            if (patPos == patLength) {
								                return index; // success; textChar left unparsed
								            }
								        } else if (textChar == patChar) {
								            // Literal match: consume it in both text and pattern.
								            index += U16_LENGTH(textChar);
								            patPos += U16_LENGTH(patChar);
								            if (patPos == patLength) {
								                return index; // success; textChar parsed
								            }
								        } else {
								            // Literal mismatch.
								            return -1;
								        }

								        patChar = pat.char32At(patPos);
								    }

								    return -1; // text ended before end of pat
								}


								/**

								 * Append a character to a rule that is being built up.  To flush

								 * the quoteBuf to rule, make one final call with isLiteral == TRUE.

								 * If there is no final character, pass in (UChar32)-1 as c.

								 *

								 * Depending on the arguments, the character is handled in one of

								 * four ways: (1) literals, and unprintables when escaping is

								 * requested, first flush any pending quoteBuf to rule and are then

								 * emitted outside of quotes; (2) an apostrophe or backslash seen

								 * while quoteBuf is empty is emitted with a preceding backslash;

								 * (3) printable ASCII other than [0-9A-Za-z], white space, and any

								 * character arriving while quoteBuf is non-empty are accumulated in

								 * quoteBuf; (4) anything else is appended to rule directly.

								 * @param rule the string to append the character to

								 * @param c the character to append, or (UChar32)-1 if none.

								 * @param isLiteral if true, then the given character should not be

								 * quoted or escaped.  Usually this means it is a syntactic element

								 * such as > or $

								 * @param escapeUnprintable if true, then unprintable characters

								 * should be escaped using \uxxxx or \Uxxxxxxxx.  These escapes will

								 * appear outside of quotes.

								 * @param quoteBuf a buffer which is used to build up quoted

								 * substrings.  The caller should initially supply an empty buffer,

								 * and thereafter should not modify the buffer.  The buffer should be

								 * cleared out by, at the end, calling this method with a literal

								 * character.

								 */

								void ICU_Utility::appendToRule(UnicodeString& rule,

								                               UChar32 c,

								                               UBool isLiteral,

								                               UBool escapeUnprintable,

								                               UnicodeString& quoteBuf) {

								    // If we are escaping unprintables, then escape them outside

								    // quotes.  \u and \U are not recognized within quotes.  The same

								    // logic applies to literals, but literals are never escaped.

								    if (isLiteral ||

								        (escapeUnprintable && ICU_Utility::isUnprintable(c))) {

								        // Flush any pending quoted text to the rule first.

								        if (quoteBuf.length() > 0) {

								            // We prefer backslash APOSTROPHE to double APOSTROPHE

								            // (more readable, less similar to ") so if there are

								            // double APOSTROPHEs at the ends, we pull them outside

								            // of the quote.


								            // If the first thing in the quoteBuf is APOSTROPHE

								            // (doubled) then pull it out.

								            while (quoteBuf.length() >= 2 &&

								                   quoteBuf.charAt(0) == APOSTROPHE &&

								                   quoteBuf.charAt(1) == APOSTROPHE) {

								                rule.append(BACKSLASH).append(APOSTROPHE);

								                quoteBuf.remove(0, 2);

								            }

								            // If the last thing in the quoteBuf is APOSTROPHE

								            // (doubled) then remove and count it and add it after.

								            int32_t trailingCount = 0;

								            while (quoteBuf.length() >= 2 &&

								                   quoteBuf.charAt(quoteBuf.length()-2) == APOSTROPHE &&

								                   quoteBuf.charAt(quoteBuf.length()-1) == APOSTROPHE) {

								                quoteBuf.truncate(quoteBuf.length()-2);

								                ++trailingCount;

								            }

								            // Whatever remains is emitted between apostrophes, and

								            // the buffer is emptied for the next quoted run.

								            if (quoteBuf.length() > 0) {

								                rule.append(APOSTROPHE);

								                rule.append(quoteBuf);

								                rule.append(APOSTROPHE);

								                quoteBuf.truncate(0);

								            }

								            // Re-emit each stripped trailing pair as backslash

								            // APOSTROPHE after the closing quote.

								            while (trailingCount-- > 0) {

								                rule.append(BACKSLASH).append(APOSTROPHE);

								            }

								        }

								        // (UChar32)-1 means "flush only": there is no character to emit.

								        if (c != (UChar32)-1) {

								            /* Since spaces are ignored during parsing, they are

								             * emitted only for readability.  We emit one here

								             * only if there isn't already one at the end of the

								             * rule.  Nothing is emitted if the rule is still empty.

								             */

								            if (c == SPACE) {

								                int32_t len = rule.length();

								                if (len > 0 && rule.charAt(len-1) != c) {

								                    rule.append(c);

								                }

								            // The escapeUnprintable parameter hides the static member

								            // function of the same name, hence the explicit

								            // ICU_Utility:: qualification on the call.  The function

								            // appends the escape itself and returns TRUE when it did.

								            } else if (!escapeUnprintable || !ICU_Utility::escapeUnprintable(rule, c)) {

								                rule.append(c);

								            }

								        }

								    }


								    // Escape ' and '\' and don't begin a quote just for them

								    else if (quoteBuf.length() == 0 &&

								             (c == APOSTROPHE || c == BACKSLASH)) {

								        rule.append(BACKSLASH);

								        rule.append(c);

								    }


								    // Specials (printable ascii that isn't [0-9a-zA-Z]) and

								    // whitespace need quoting.  Also append stuff to quotes if we are

								    // building up a quoted substring already.

								    else if (quoteBuf.length() > 0 ||

								             (c >= 0x0021 && c <= 0x007E &&

								              !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) ||

								                (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) ||

								                (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) ||

								             PatternProps::isWhiteSpace(c)) {

								        quoteBuf.append(c);

								        // Double ' within a quote

								        if (c == APOSTROPHE) {

								            quoteBuf.append(c);

								        }

								    }


								    // Otherwise just append

								    else {

								        rule.append(c);

								    }

								}


								/**
								 * Append every UTF-16 code unit of text to a rule that is being
								 * built up, by passing each unit in order to the single-character
								 * appendToRule with the same isLiteral, escapeUnprintable and
								 * quoteBuf arguments.
								 * @param rule the string to append to
								 * @param text the code units to append
								 * @param isLiteral forwarded unchanged for every code unit
								 * @param escapeUnprintable forwarded unchanged for every code unit
								 * @param quoteBuf the quoting buffer shared across all the calls
								 */
								void ICU_Utility::appendToRule(UnicodeString& rule,
								                               const UnicodeString& text,
								                               UBool isLiteral,
								                               UBool escapeUnprintable,
								                               UnicodeString& quoteBuf) {
								    int32_t offset = 0;
								    // The length is re-read on every pass, as in a plain index loop.
								    while (offset < text.length()) {
								        appendToRule(rule, text.charAt(offset), isLiteral,
								                     escapeUnprintable, quoteBuf);
								        ++offset;
								    }
								}


								/**
								 * Append the pattern of a matcher to the given rule as literal
								 * text.  A null matcher is allowed and results in no change.
								 * @param rule the string to append to
								 * @param matcher the matcher whose pattern is appended, or NULL
								 * @param escapeUnprintable passed both to the matcher's toPattern()
								 * and to the string-appending appendToRule
								 * @param quoteBuf the quoting buffer used while building the rule
								 */
								void ICU_Utility::appendToRule(UnicodeString& rule,
								                               const UnicodeMatcher* matcher,
								                               UBool escapeUnprintable,
								                               UnicodeString& quoteBuf) {
								    if (matcher == NULL) {
								        return;
								    }
								    UnicodeString pat;
								    // Append whatever toPattern() returns, marked as literal.
								    const UnicodeString& pattern = matcher->toPattern(pat, escapeUnprintable);
								    appendToRule(rule, pattern, TRUE, escapeUnprintable, quoteBuf);
								}


								U_NAMESPACE_END