node/deps/icu-small/source/i18n/uspoof_wsconf.cpp


								/*

								******************************************************************************

								*

								*   Copyright (C) 2008-2013, International Business Machines

								*   Corporation and others.  All Rights Reserved.

								*

								******************************************************************************

								*   file name:  uspoof_wsconf.cpp

								*   encoding:   US-ASCII

								*   tab size:   8 (not used)

								*   indentation:4

								*

								*   created on: 2009Jan05  (refactoring earlier files)

								*   created by: Andy Heninger

								*

								*   Internal functions for compiling Whole Script confusable source data

								*   into its binary (runtime) form.  The binary data format is described

								*   in uspoof_impl.h

								*/


								#include "unicode/utypes.h"

								#include "unicode/uspoof.h"


								#if !UCONFIG_NO_NORMALIZATION


								#if !UCONFIG_NO_REGULAR_EXPRESSIONS


								#include "unicode/unorm.h"

								#include "unicode/uregex.h"

								#include "unicode/ustring.h"

								#include "cmemory.h"

								#include "scriptset.h"

								#include "uspoof_impl.h"

								#include "uhash.h"

								#include "uvector.h"

								#include "uassert.h"

								#include "uspoof_wsconf.h"


								U_NAMESPACE_USE


								// Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt
								// Example Lines:
								//   006F          ; Latn; Deva; A #      (o)  LATIN SMALL LETTER O
								//   0048..0049    ; Latn; Grek; A #  [2] (H..I)  LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
								//    |               |     |    |
								//    |               |     |    |---- Which table, Any Case or Lower Case (A or L)
								//    |               |     |----------Target script.   We need this.
								//    |               |----------------Src script.  Should match the script of the source
								//    |                                code points.  Beyond checking that, we don't keep it.
								//    |--------------------------------Source code points or range.
								//
								// The expression will match _all_ lines, including erroneous lines.
								// The result of the parse is returned via the contents of the (match) groups:
								//    Group 1      a blank or comment-only line
								//    Groups 2, 3  first and (optional) last code point of the range, in hex
								//    Groups 4, 5  source and target script names
								//    Group 6 / 7  table selector, A or L respectively
								//    Group 8      any other line, i.e. a syntax error
								//
								// The range separator is matched as two literal dots; left unescaped, ".." would
								// accept any two characters between the range end points.
								static const char *parseExp =
								        "(?m)"                                             // Multi-line mode
								        "^([ \\t]*(?:#.*?)?)$"                             // A blank or comment line.  Matches Group 1.
								        "|^(?:"                                            //   OR
								        "\\s*([0-9A-F]{4,})(?:\\.\\.([0-9A-F]{4,}))?\\s*;" // Code point range.  Groups 2 and 3.
								        "\\s*([A-Za-z]+)\\s*;"                             // The source script.  Group 4.
								        "\\s*([A-Za-z]+)\\s*;"                             // The target script.  Group 5.
								        "\\s*(?:(A)|(L))"                                  // The table A or L.   Group 6 or 7
								        "[ \\t]*(?:#.*?)?"                                 // Trailing comment
								        ")$|"                                              //   OR
								        "^(.*?)$";                                         // An error line.      Group 8.
								                                                           //    Any line not matching the preceding
								                                                           //    parts of the expression will match
								                                                           //    this, and thus be flagged as an error


								// Extract a regular expression match group into a char * string.
								//    The group must contain only invariant characters.
								//    Used for script names
								//
								//    e             The regular expression holding the current match.
								//    group         Number of the capture group to extract.
								//    destBuf       Receives the group text.  Left as an empty string when status
								//                  indicates a failure, when uregex_group() reports -1, or when
								//                  the text does not fit with room to spare in destCapacity.
								//    destCapacity  Size of destBuf, in chars.  Must be at least 1.
								//    status        ICU error code, in/out.  Set to U_ILLEGAL_ARGUMENT_ERROR if
								//                  destBuf is unusable; otherwise only uregex_group() changes it.
								//
								static void extractGroup(
								    URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) {

								    if (destBuf == NULL || destCapacity <= 0) {
								        // There is no room even for an empty string; do not touch destBuf[0].
								        if (U_SUCCESS(status)) {
								            status = U_ILLEGAL_ARGUMENT_ERROR;
								        }
								        return;
								    }
								    destBuf[0] = 0;

								    UChar ubuf[50];
								    const int32_t ubufCapacity = static_cast<int32_t>(sizeof(ubuf) / sizeof(ubuf[0]));
								    ubuf[0] = 0;
								    int32_t len = uregex_group(e, group, ubuf, ubufCapacity, &status);
								    if (U_FAILURE(status) || len == -1 || len >= destCapacity) {
								        return;
								    }
								    UnicodeString s(FALSE, ubuf, len);   // Aliasing constructor
								    s.extract(0, len, destBuf, destCapacity, US_INV);
								}


								U_NAMESPACE_BEGIN


								//  Build the Whole Script Confusable data
								//
								//  Parses the UTF-8 source text with parseExp, one line per match, and builds
								//    - an "any case" and a "lower case" trie mapping code points to an index, and
								//    - the table of script sets that those indexes select,
								//  then appends the serialized tries and script sets to spImpl->fSpoofData and
								//  re-opens the tries from that serialized form.
								//
								//  spImpl            Spoof checker whose fSpoofData receives the built data.
								//  confusablesWS     The source data, UTF-8.
								//  confusablesWSLen  Length of confusablesWS, as passed on to u_strFromUTF8().
								//  pe                On failure, pe->line receives the count of input lines matched
								//                    so far.  May be NULL.
								//  status            ICU error code, in/out.  Nothing is done if it already
								//                    indicates a failure.  Set to U_PARSE_ERROR for a line with bad
								//                    syntax, U_INVALID_FORMAT_ERROR for an unknown script name or a
								//                    code point whose script is not the stated source script, and
								//                    U_MEMORY_ALLOCATION_ERROR when an allocation fails.
								//
								//     TODO:  Reorganize.  Either get rid of the WSConfusableDataBuilder class,
								//                         because everything is local to this one build function anyhow,
								//                           OR
								//                         break this function into more reasonably sized pieces, with
								//                         state in WSConfusableDataBuilder.
								//
								void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
								          int32_t confusablesWSLen, UParseError *pe, UErrorCode &status)
								{
								    if (U_FAILURE(status)) {
								        return;
								    }
								    URegularExpression *parseRegexp = NULL;
								    int32_t             inputLen    = 0;
								    UChar              *input       = NULL;
								    int32_t             lineNum     = 0;

								    UVector            *scriptSets        = NULL;
								    uint32_t            rtScriptSetsCount = 2;

								    UTrie2             *anyCaseTrie   = NULL;
								    UTrie2             *lowerCaseTrie = NULL;

								    anyCaseTrie = utrie2_open(0, 0, &status);
								    lowerCaseTrie = utrie2_open(0, 0, &status);

								    // NOTE: no initialized function-scope locals may be declared below this
								    //       point; the "goto cleanup" statements must not jump over one.
								    UnicodeString pattern(parseExp, -1, US_INV);

								    // The scriptSets vector provides a mapping from TRIE values to the set of scripts.
								    //
								    // Reserved TRIE values:
								    //   0:  Code point has no whole script confusables.
								    //   1:  Code point is of script Common or Inherited.
								    //       These code points do not participate in whole script confusable detection.
								    //       (This is logically equivalent to saying that they contain confusables in
								    //        all scripts)
								    //
								    // Because Trie values are indexes into the ScriptSets vector, pre-fill
								    // vector positions 0 and 1 to avoid conflicts with the reserved values.

								    scriptSets = new UVector(status);
								    if (scriptSets == NULL) {
								        status = U_MEMORY_ALLOCATION_ERROR;
								        goto cleanup;
								    }
								    scriptSets->addElement((void *)NULL, status);
								    scriptSets->addElement((void *)NULL, status);

								    // Convert the user input data from UTF-8 to UChar (UTF-16).
								    //   The first call only measures; it is expected to report a buffer overflow.
								    //   Any other outcome (an earlier failure, or nothing to convert) ends the build.
								    u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status);
								    if (status != U_BUFFER_OVERFLOW_ERROR) {
								        goto cleanup;
								    }
								    status = U_ZERO_ERROR;
								    input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
								    if (input == NULL) {
								        status = U_MEMORY_ALLOCATION_ERROR;
								        goto cleanup;
								    }
								    u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status);

								    parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);

								    // Zap any Byte Order Mark at the start of input.  Changing it to a space is benign
								    //   given the syntax of the input.
								    if (*input == 0xfeff) {
								        *input = 0x20;
								    }

								    // Parse the input, one line per iteration of this loop.
								    uregex_setText(parseRegexp, input, inputLen, &status);
								    if (U_FAILURE(status)) {
								        // Conversion or regular expression set up failed; there is nothing to parse.
								        goto cleanup;
								    }
								    while (uregex_findNext(parseRegexp, &status)) {
								        lineNum++;
								        if (uregex_start(parseRegexp, 1, &status) >= 0) {
								            // this was a blank or comment line.
								            continue;
								        }
								        if (uregex_start(parseRegexp, 8, &status) >= 0) {
								            // input file syntax error.
								            status = U_PARSE_ERROR;
								            goto cleanup;
								        }
								        if (U_FAILURE(status)) {
								            goto cleanup;
								        }

								        // Pick up the start and optional range end code points from the parsed line.
								        UChar32  startCodePoint = SpoofImpl::ScanHex(
								            input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status);
								        UChar32  endCodePoint = startCodePoint;
								        if (uregex_start(parseRegexp, 3, &status) >=0) {
								            endCodePoint = SpoofImpl::ScanHex(
								                input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status);
								        }

								        // Extract the two script names from the source line.  We need these in an 8 bit
								        //   default encoding (will be EBCDIC on IBM mainframes) in order to pass them on
								        //   to the ICU u_getPropertyValueEnum() function.  Ugh.
								        char  srcScriptName[20];
								        char  targScriptName[20];
								        extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status);
								        extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status);
								        UScriptCode srcScript  =
								            static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName));
								        UScriptCode targScript =
								            static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName));
								        if (U_FAILURE(status)) {
								            goto cleanup;
								        }
								        if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) {
								            status = U_INVALID_FORMAT_ERROR;
								            goto cleanup;
								        }

								        // select the table - (A) any case or (L) lower case only
								        UTrie2 *table = anyCaseTrie;
								        if (uregex_start(parseRegexp, 7, &status) >= 0) {
								            table = lowerCaseTrie;
								        }

								        // Build the set of scripts containing confusable characters for
								        //   the code point(s) specified in this input line.
								        // Sanity check that the script of the source code point is the same
								        //   as the source script indicated in the input file.  Failure of this check is
								        //   an error in the input file.
								        // Include the source script in the set (needed for Mixed Script Confusable detection).
								        //
								        UChar32 cp;
								        for (cp=startCodePoint; cp<=endCodePoint; cp++) {
								            int32_t setIndex = utrie2_get32(table, cp);
								            BuilderScriptSet *bsset = NULL;
								            if (setIndex > 0) {
								                // This code point already has a script set; add to it.
								                U_ASSERT(setIndex < scriptSets->size());
								                bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex));
								            } else {
								                // First mention of this code point in this table: give it a new set.
								                bsset = new BuilderScriptSet();
								                if (bsset == NULL) {
								                    status = U_MEMORY_ALLOCATION_ERROR;
								                    goto cleanup;
								                }
								                bsset->codePoint = cp;
								                bsset->trie = table;
								                bsset->sset = new ScriptSet();
								                setIndex = scriptSets->size();
								                bsset->index = setIndex;
								                bsset->rindex = 0;
								                if (bsset->sset == NULL) {
								                    // Not in scriptSets yet, so cleanup would never see it.
								                    delete bsset;
								                    status = U_MEMORY_ALLOCATION_ERROR;
								                    goto cleanup;
								                }
								                scriptSets->addElement(bsset, status);
								                if (scriptSets->elementAt(setIndex) != bsset) {
								                    // The vector did not take the element; we still own it.
								                    delete bsset;
								                    if (U_SUCCESS(status)) {
								                        status = U_MEMORY_ALLOCATION_ERROR;
								                    }
								                    goto cleanup;
								                }
								                utrie2_set32(table, cp, setIndex, &status);
								            }
								            bsset->sset->set(targScript, status);
								            bsset->sset->set(srcScript, status);

								            if (U_FAILURE(status)) {
								                goto cleanup;
								            }
								            UScriptCode cpScript = uscript_getScript(cp, &status);
								            if (cpScript != srcScript) {
								                status = U_INVALID_FORMAT_ERROR;
								                goto cleanup;
								            }
								        }
								    }
								    if (U_FAILURE(status)) {
								        // The regular expression search itself failed.
								        goto cleanup;
								    }

								    // Eliminate duplicate script sets.  At this point we have a separate
								    // script set for every code point that had data in the input file.
								    //
								    // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them.
								    // A wrapper whose set was merged away keeps its slot in scriptSets, but its
								    // index and rindex are redirected to the surviving set and it no longer owns
								    // the ScriptSet it points to.
								    //
								    // printf("Number of scriptSets: %d\n", scriptSets->size());
								    {
								        rtScriptSetsCount = 2;
								        for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) {
								            BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri));
								            if (outerSet->index != static_cast<uint32_t>(outeri)) {
								                // This set was already identified as a duplicate.
								                //   It will not be allocated a position in the runtime array of ScriptSets.
								                continue;
								            }
								            outerSet->rindex = rtScriptSetsCount++;
								            for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) {
								                BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri));
								                if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) {
								                    delete innerSet->sset;
								                    innerSet->scriptSetOwned = FALSE;
								                    innerSet->sset = outerSet->sset;
								                    innerSet->index = outeri;
								                    innerSet->rindex = outerSet->rindex;
								                }
								                // The trie still holds the build time index; that is fixed up below.
								            }
								        }
								        // printf("Number of distinct script sets: %d\n", rtScriptSetsCount);
								    }

								    // Update the Trie values to reflect the run time script indexes (after duplicate merging).
								    //    (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets
								    //     are unused, which is why the loop index starts at 2.)
								    {
								        for (int32_t i=2; i<scriptSets->size(); i++) {
								            BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
								            if (bSet->rindex != static_cast<uint32_t>(i)) {
								                utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status);
								            }
								        }
								    }

								    // For code points with script==Common or script==Inherited,
								    //   Set the reserved value of 1 into both Tries.  These characters do not participate
								    //   in Whole Script Confusable detection; this reserved value is the means
								    //   by which they are detected.
								    {
								        UnicodeSet ignoreSet;
								        ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
								        UnicodeSet inheritedSet;
								        inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
								        ignoreSet.addAll(inheritedSet);
								        for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) {
								            UChar32 rangeStart = ignoreSet.getRangeStart(rn);
								            UChar32 rangeEnd   = ignoreSet.getRangeEnd(rn);
								            utrie2_setRange32(anyCaseTrie,   rangeStart, rangeEnd, 1, TRUE, &status);
								            utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
								        }
								    }

								    // Serialize the data to the Spoof Detector
								    {
								        // Each trie is frozen, measured with a sizing call to utrie2_serialize()
								        //   (which is expected to report a buffer overflow), and then written
								        //   into space reserved at the end of the spoof data.
								        utrie2_freeze(anyCaseTrie,   UTRIE2_16_VALUE_BITS, &status);
								        int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status);
								        // printf("Any case Trie size: %d\n", size);
								        if (status != U_BUFFER_OVERFLOW_ERROR) {
								            goto cleanup;
								        }
								        status = U_ZERO_ERROR;
								        spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit;
								        spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size;
								        // NOTE(review): the spoof data temporarily refers to the builder trie,
								        //   presumably so that reserveSpace() does not try to open a trie from
								        //   space that has not been filled in yet - confirm against SpoofData.
								        //   The reference is replaced (or dropped, on failure) further down.
								        spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie;
								        void *where = spImpl->fSpoofData->reserveSpace(size, status);
								        utrie2_serialize(anyCaseTrie, where, size, &status);

								        utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status);
								        size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status);
								        // printf("Lower case Trie size: %d\n", size);
								        if (status != U_BUFFER_OVERFLOW_ERROR) {
								            goto cleanup;
								        }
								        status = U_ZERO_ERROR;
								        spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit;
								        spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size;
								        spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie;
								        where = spImpl->fSpoofData->reserveSpace(size, status);
								        utrie2_serialize(lowerCaseTrie, where, size, &status);

								        spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit;
								        spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount;
								        ScriptSet *rtScriptSets =  static_cast<ScriptSet *>
								            (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status));
								        if (U_FAILURE(status) || rtScriptSets == NULL) {
								            // Never write through a pointer that was not successfully reserved.
								            if (U_SUCCESS(status)) {
								                status = U_MEMORY_ALLOCATION_ERROR;
								            }
								            goto cleanup;
								        }
								        uint32_t rindex = 2;
								        for (int32_t i=2; i<scriptSets->size(); i++) {
								            BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
								            if (bSet->rindex < rindex) {
								                // We have already copied this script set to the serialized data.
								                continue;
								            }
								            U_ASSERT(rindex == bSet->rindex);
								            rtScriptSets[rindex] = *bSet->sset;   // Assignment of a ScriptSet just copies the bits.
								            rindex++;
								        }
								    }

								    // Open new utrie2s from the serialized data.  We don't want to keep the ones
								    //   we just built because we would then have two copies of the data, one internal to
								    //   the utries that we have already constructed, and one in the serialized data area.
								    //   An alternative would be to not pre-serialize the Trie data, but that makes the
								    //   spoof detector data different, depending on how the detector was constructed.
								    //   It's simpler to keep the data always the same.

								    spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized(
								            UTRIE2_16_VALUE_BITS,
								            reinterpret_cast<const char *>(spImpl->fSpoofData->fRawData) +
								                spImpl->fSpoofData->fRawData->fAnyCaseTrie,
								            spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
								            NULL,
								            &status);

								    // The length passed here must be that of the lower case trie itself.
								    spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized(
								            UTRIE2_16_VALUE_BITS,
								            reinterpret_cast<const char *>(spImpl->fSpoofData->fRawData) +
								                spImpl->fSpoofData->fRawData->fLowerCaseTrie,
								            spImpl->fSpoofData->fRawData->fLowerCaseTrieLength,
								            NULL,
								            &status);

								cleanup:
								    if (U_FAILURE(status) && pe != NULL) {
								        pe->line = lineNum;
								    }
								    uregex_close(parseRegexp);
								    uprv_free(input);

								    if (scriptSets != NULL) {
								        // The vector holds plain pointers; the wrappers are deleted by hand.
								        //   Slots 0 and 1 are NULL, which delete accepts.
								        for (int32_t i=0; i<scriptSets->size(); i++) {
								            BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
								            delete bsset;
								        }
								        delete scriptSets;
								    }

								    // If we bailed out while the spoof data still referred to a builder trie,
								    //   detach it so that it is not left pointing at a trie that is closed below.
								    if (spImpl != NULL && spImpl->fSpoofData != NULL) {
								        if (anyCaseTrie != NULL && spImpl->fSpoofData->fAnyCaseTrie == anyCaseTrie) {
								            spImpl->fSpoofData->fAnyCaseTrie = NULL;
								        }
								        if (lowerCaseTrie != NULL && spImpl->fSpoofData->fLowerCaseTrie == lowerCaseTrie) {
								            spImpl->fSpoofData->fLowerCaseTrie = NULL;
								        }
								    }
								    utrie2_close(anyCaseTrie);
								    utrie2_close(lowerCaseTrie);
								    return;
								}


								U_NAMESPACE_END


								// Construct an empty entry: no code point (-1), no trie, no script set,
								//   both indexes zero, and flagged as the owner of whatever script set
								//   is attached later.
								BuilderScriptSet::BuilderScriptSet()
								    : codePoint(-1),
								      trie(NULL),
								      sset(NULL),
								      index(0),
								      rindex(0),
								      scriptSetOwned(TRUE) {
								}


								// Delete the script set, but only when this entry is still flagged as its owner.
								BuilderScriptSet::~BuilderScriptSet() {
								    if (!scriptSetOwned) {
								        return;
								    }
								    delete sset;
								}


								#endif

								#endif //  !UCONFIG_NO_NORMALIZATION