node/deps/icu-small/source/i18n/collationdatareader.h


								// Copyright (C) 2016 and later: Unicode, Inc. and others.

								// License & terms of use: http://www.unicode.org/copyright.html

								/*

								*******************************************************************************

								* Copyright (C) 2013-2015, International Business Machines

								* Corporation and others.  All Rights Reserved.

								*******************************************************************************

								* collationdatareader.h

								*

								* created on: 2013feb07

								* created by: Markus W. Scherer

								*/


								#ifndef __COLLATIONDATAREADER_H__

								#define __COLLATIONDATAREADER_H__


								#include "unicode/utypes.h"


								#if !UCONFIG_NO_COLLATION


								#include "unicode/udata.h"


								struct UDataMemory;


								U_NAMESPACE_BEGIN


								struct CollationTailoring;


								/**

								 * Collation binary data reader.

								 *

								 * This struct declares only static members plus a private constructor,

								 * so it serves as a holder for the index constants and the reader entry points.

								 * The unnamed enum names the slots of the int32_t indexes[] array

								 * that begins the collation data; the overall data layout is described in the

								 * "Format of collation data" comment further down in this header.

								 */

								struct U_I18N_API CollationDataReader /* all static */ {

								    // The following constants are also copied into source/common/ucol_swp.cpp.

								    // Keep them in sync!

								    // The enumerators rely on implicit numbering (0, 1, 2, ...), so inserting,

								    // removing, or reordering one changes the value of every enumerator after it.

								    enum {

								        /**

								         * Number of int32_t indexes.

								         *

								         * Can be 2 if there are only options.

								         * Can be 7 or 8 if there are only options and a script reordering.

								         * The loader treats any index>=indexes[IX_INDEXES_LENGTH] as 0.

								         */

								        IX_INDEXES_LENGTH,  // 0

								        /**

								         * Bits 31..24: numericPrimary, for numeric collation

								         *      23..16: fast Latin format version (0 = no fast Latin table)

								         *      15.. 0: options bit set

								         */

								        IX_OPTIONS,

								        /** Indexes 2 and 3: unused/reserved/0 according to the format description in this header. */

								        IX_RESERVED2,

								        IX_RESERVED3,


								        /** Array offset to Jamo CE32s in ce32s[], or <0 if none. */

								        IX_JAMO_CE32S_START,  // 4


								        // Byte offsets from the start of the data, after the generic header.

								        // The indexes[] are at byte offset 0, other data follows.

								        // Each data item is aligned properly.

								        // The data items should be in descending order of unit size,

								        // to minimize the need for padding.

								        // Each item's byte length is given by the difference between its offset and

								        // the next index/offset value.

								        /** Byte offset to int32_t reorderCodes[]. */

								        IX_REORDER_CODES_OFFSET,

								        /**

								         * Byte offset to uint8_t reorderTable[].

								         * Empty table if <256 bytes (padding only).

								         * Otherwise 256 bytes or more (with padding).

								         */

								        IX_REORDER_TABLE_OFFSET,

								        /** Byte offset to the collation trie. Its length is a multiple of 8 bytes. */

								        IX_TRIE_OFFSET,


								        /**

								         * Reserved slot among the byte offsets.

								         * The format description in this header lists no data part for it.

								         */

								        IX_RESERVED8_OFFSET,  // 8

								        /** Byte offset to int64_t ces[]. */

								        IX_CES_OFFSET,

								        /**

								         * Reserved slot among the byte offsets.

								         * The format description in this header lists no data part for it.

								         */

								        IX_RESERVED10_OFFSET,

								        /** Byte offset to uint32_t ce32s[]. */

								        IX_CE32S_OFFSET,


								        /** Byte offset to uint32_t rootElements[]. */

								        IX_ROOT_ELEMENTS_OFFSET,  // 12

								        /** Byte offset to UChar *contexts[]. */

								        IX_CONTEXTS_OFFSET,

								        /** Byte offset to uint16_t [] with serialized unsafeBackwardSet. */

								        IX_UNSAFE_BWD_OFFSET,

								        /** Byte offset to uint16_t fastLatinTable[]. */

								        IX_FAST_LATIN_TABLE_OFFSET,


								        /** Byte offset to uint16_t scripts[]. */

								        IX_SCRIPTS_OFFSET,  // 16

								        /**

								         * Byte offset to UBool compressibleBytes[].

								         * Empty table if <256 bytes (padding only).

								         * Otherwise 256 bytes or more (with padding).

								         */

								        IX_COMPRESSIBLE_BYTES_OFFSET,

								        /**

								         * Reserved slot among the byte offsets.

								         * The format description in this header lists no data part for it.

								         */

								        IX_RESERVED18_OFFSET,

								        /**

								         * Index 19, the last of the byte-offset slots.

								         * Per the format description in this header, the last byte offset that is stored

								         * in the indexes indicates the total size of the data (starting with the indexes).

								         */

								        IX_TOTAL_SIZE

								    };


								    /**

								     * Reads collation binary data. Only declared here; the definition is not part of this header.

								     *

								     * NOTE(review): the parameter descriptions below are inferred from the names and types

								     * in this declaration only -- confirm against the implementation.

								     *

								     * @param base      presumably the base (root) tailoring that the data being read builds on

								     * @param inBytes   presumably the start of the binary collation data

								     * @param inLength  presumably the number of bytes available at inBytes

								     * @param tailoring presumably the object that receives the data that was read

								     * @param errorCode ICU error code, passed by non-const reference

								     */

								    static void read(const CollationTailoring *base, const uint8_t *inBytes, int32_t inLength,

								                     CollationTailoring &tailoring, UErrorCode &errorCode);


								    /**

								     * Callback-style predicate taking a data item's type, name and UDataInfo.

								     * Only declared here; the acceptance criteria are not visible in this header.

								     *

								     * NOTE(review): the signature and U_CALLCONV look like the UDataMemoryIsAcceptable

								     * callback type from unicode/udata.h (included above), presumably for use with

								     * udata_openChoice() -- confirm against the implementation.

								     */

								    static UBool U_CALLCONV

								    isAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo);


								private:

								    // Declared private so that code outside this struct cannot construct an instance;

								    // all members are static.

								    CollationDataReader();  // no constructor

								};


								/*

								 * Format of collation data (ucadata.icu, binary data in coll/ *.res files).

								 * Format version 5.

								 *

								 * The root collation data is stored in the ucadata.icu file.

								 * Tailorings are stored inside .res resource bundle files, with a complete file header.

								 *

								 * Collation data begins with a standard ICU data file header

								 * (DataHeader, see ucmndata.h and unicode/udata.h).

								 * The UDataInfo.dataVersion field contains the UCA and other version numbers,

								 * see the comments for CollationTailoring.version.

								 *

								 * After the header, the file contains the following parts.

								 * Constants are defined as enum values of the CollationDataReader class.

								 * See also the Collation class.

								 *

								 * int32_t indexes[indexesLength];

								 *      The indexes array has variable length.

								 *      Some tailorings only need the length and the options,

								 *      others only add reorderCodes and the reorderTable,

								 *      some need to store mappings.

								 *      Only as many indexes are stored as needed to read all of the data.

								 *

								 *      Index 0: indexesLength

								 *      Index 1: numericPrimary, CollationFastLatin::VERSION, and options: see IX_OPTIONS

								 *      Index 2..3: Unused/reserved/0.

								 *      Index 4: Index into the ce32s array where the CE32s of the conjoining Jamo

								 *               are stored in a short, contiguous part of the ce32s array.

								 *

								 *      Indexes 5..19 are byte offsets in ascending order.

								 *      Each byte offset marks the start of the next part in the data file,

								 *      and the end of the previous one.

								 *      When two consecutive byte offsets are the same (or too short),

								 *      then the corresponding part is empty.

								 *      Byte offsets are offsets from after the header,

								 *      that is, from the beginning of the indexes[].

								 *      Each part starts at an offset with proper alignment for its data.

								 *      If necessary, the previous part may include padding bytes to achieve this alignment.

								 *      The last byte offset that is stored in the indexes indicates the total size of the data

								 *      (starting with the indexes).

								 *

								 * int32_t reorderCodes[]; -- empty in root

								 *      The list of script and reordering codes.

								 *

								 *      Beginning with format version 5, this array may optionally

								 *      have trailing entries with a full list of reorder ranges

								 *      as described for CollationSettings::reorderRanges.

								 *

								 *      Script or reorder codes are first and do not exceed 16-bit values.

								 *      Range limits are stored in the upper 16 bits, and are never 0.

								 *      Split this array into reorder codes and ranges at the first entry

								 *      with non-zero upper 16 bits.

								 *

								 *      If the ranges are missing but needed for split-reordered primary lead bytes,

								 *      then they are regenerated at load time.

								 *

								 * uint8_t reorderTable[256]; -- empty in root; can be longer to include padding bytes

								 *      Primary-weight lead byte permutation table.

								 *      Normally present when the reorderCodes are, but can be built at load time.

								 *

								 *      Beginning with format version 5, a 0 entry at a non-zero index

								 *      (which is otherwise an illegal value)

								 *      means that the primary lead byte is "split"

								 *      (there are different offsets for primaries that share that lead byte)

								 *      and the reordering offset must be determined via the reorder ranges

								 *      that are either stored as part of the reorderCodes array

								 *      or regenerated at load time.

								 *

								 * UTrie2 trie; -- see utrie2_impl.h and utrie2.h

								 *      The trie holds the main collation data. Each code point is mapped to a 32-bit value.

								 *      It encodes a simple collation element (CE) in compact form, unless bits 7..6 are both set,

								 *      in which case it is a special CE32 and contains a 4-bit tag and further data.

								 *      See the Collation class for details.

								 *

								 *      The trie has a value for each lead surrogate code unit with some bits encoding

								 *      collective properties of the 1024 supplementary characters whose UTF-16 form starts with

								 *      the lead surrogate. See Collation::LEAD_SURROGATE_TAG.

								 *

								 * int64_t ces[];

								 *      64-bit CEs and expansions that cannot be stored in a more compact form.

								 *

								 * uint32_t ce32s[];

								 *      CE32s for expansions in compact form, and for characters whose trie values

								 *      contain special data.

								 *

								 * uint32_t rootElements[]; -- empty in all tailorings

								 *      Compact storage for all of the CEs that occur in the root collation.

								 *      See the CollationRootElements class.

								 *

								 * UChar *contexts[];

								 *      Serialized UCharsTrie structures with prefix (pre-context) and contraction mappings.

								 *

								 * uint16_t unsafeBackwardSet[]; -- see UnicodeSet::serialize()

								 *      Serialized form of characters that are unsafe when iterating backwards,

								 *      and at the end of an identical string prefix.

								 *      Back up to a safe character.

								 *      Lead surrogates are "unsafe" when any of their corresponding supplementary

								 *      code points are unsafe.

								 *      Does not include [:^lccc=0:][:^tccc=0:].

								 *      For each tailoring, the root unsafeBackwardSet is subtracted.

								 *      (As a result, in many tailorings no set needs to be stored.)

								 *

								 * uint16_t fastLatinTable[];

								 *      Optional optimization for Latin text.

								 *      See the CollationFastLatin class.

								 *

								 * uint16_t scripts[]; -- empty in all tailorings

								 *      Format version 5:

								 *      uint16_t numScripts;

								 *      uint16_t scriptsIndex[numScripts+16];

								 *      uint16_t scriptStarts[];

								 *      See CollationData::numScripts etc.

								 *

								 *      Format version 4:

								 *      Table of the reordering groups with their first and last lead bytes,

								 *      and their script and reordering codes.

								 *      See CollationData::scripts.

								 *

								 * UBool compressibleBytes[]; -- empty in all tailorings

								 *      Flag for getSortKey(), indicating primary weight lead bytes that are compressible.

								 *

								 * -----------------

								 * Changes for formatVersion 5 (ICU 55)

								 *

								 * Reordering moves single scripts, not groups of scripts.

								 * Reorder ranges are optionally appended to the reorderCodes,

								 * and a 0 entry in the reorderTable indicates a split lead byte.

								 * The scripts data has a new format.

								 *

								 * The rootElements may contain secondary and tertiary weights below common=05.

								 * (Used for small Hiragana letters.)

								 * Where it occurs, there is also an explicit unit with common secondary & tertiary weights.

								 * There are no other data structure changes, but builder code needs to be able to handle such data.

								 *

								 * The collation element for the merge separator code point U+FFFE

								 * does not necessarily have special, unique secondary/tertiary weights any more.

								 */


								U_NAMESPACE_END


								#endif  // !UCONFIG_NO_COLLATION

								#endif  // __COLLATIONDATAREADER_H__