// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2013-2015, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * collationdatareader.h * * created on: 2013feb07 * created by: Markus W. Scherer */ #ifndef __COLLATIONDATAREADER_H__ #define __COLLATIONDATAREADER_H__ #include "unicode/utypes.h" #if !UCONFIG_NO_COLLATION #include "unicode/udata.h" struct UDataMemory; U_NAMESPACE_BEGIN struct CollationTailoring; /** * Collation binary data reader. */ struct U_I18N_API CollationDataReader /* all static */ { // The following constants are also copied into source/common/ucol_swp.cpp. // Keep them in sync! enum { /** * Number of int32_t indexes. * * Can be 2 if there are only options. * Can be 7 or 8 if there are only options and a script reordering. * The loader treats any index>=indexes[IX_INDEXES_LENGTH] as 0. */ IX_INDEXES_LENGTH, // 0 /** * Bits 31..24: numericPrimary, for numeric collation * 23..16: fast Latin format version (0 = no fast Latin table) * 15.. 0: options bit set */ IX_OPTIONS, IX_RESERVED2, IX_RESERVED3, /** Array offset to Jamo CE32s in ce32s[], or <0 if none. */ IX_JAMO_CE32S_START, // 4 // Byte offsets from the start of the data, after the generic header. // The indexes[] are at byte offset 0, other data follows. // Each data item is aligned properly. // The data items should be in descending order of unit size, // to minimize the need for padding. // Each item's byte length is given by the difference between its offset and // the next index/offset value. /** Byte offset to int32_t reorderCodes[]. */ IX_REORDER_CODES_OFFSET, /** * Byte offset to uint8_t reorderTable[]. * Empty table if <256 bytes (padding only). * Otherwise 256 bytes or more (with padding). 
*/ IX_REORDER_TABLE_OFFSET, /** Byte offset to the collation trie. Its length is a multiple of 8 bytes. */ IX_TRIE_OFFSET, IX_RESERVED8_OFFSET, // 8 /** Byte offset to int64_t ces[]. */ IX_CES_OFFSET, IX_RESERVED10_OFFSET, /** Byte offset to uint32_t ce32s[]. */ IX_CE32S_OFFSET, /** Byte offset to uint32_t rootElements[]. */ IX_ROOT_ELEMENTS_OFFSET, // 12 /** Byte offset to char16_t *contexts[]. */ IX_CONTEXTS_OFFSET, /** Byte offset to uint16_t [] with serialized unsafeBackwardSet. */ IX_UNSAFE_BWD_OFFSET, /** Byte offset to uint16_t fastLatinTable[]. */ IX_FAST_LATIN_TABLE_OFFSET, /** Byte offset to uint16_t scripts[]. */ IX_SCRIPTS_OFFSET, // 16 /** * Byte offset to UBool compressibleBytes[]. * Empty table if <256 bytes (padding only). * Otherwise 256 bytes or more (with padding). */ IX_COMPRESSIBLE_BYTES_OFFSET, IX_RESERVED18_OFFSET, IX_TOTAL_SIZE }; static void read(const CollationTailoring *base, const uint8_t *inBytes, int32_t inLength, CollationTailoring &tailoring, UErrorCode &errorCode); static UBool U_CALLCONV isAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo); private: CollationDataReader() = delete; // no constructor }; /* * Format of collation data (ucadata.icu, binary data in coll/ *.res files). * Format version 5. * * The root collation data is stored in the ucadata.icu file. * Tailorings are stored inside .res resource bundle files, with a complete file header. * * Collation data begins with a standard ICU data file header * (DataHeader, see ucmndata.h and unicode/udata.h). * The UDataInfo.dataVersion field contains the UCA and other version numbers, * see the comments for CollationTailoring.version. * * After the header, the file contains the following parts. * Constants are defined as enum values of the CollationDataReader class. * See also the Collation class. * * int32_t indexes[indexesLength]; * The indexes array has variable length. 
* Some tailorings only need the length and the options, * others only add reorderCodes and the reorderTable, * some need to store mappings. * Only as many indexes are stored as needed to read all of the data. * * Index 0: indexesLength * Index 1: numericPrimary, CollationFastLatin::VERSION, and options: see IX_OPTIONS * Index 2..3: Unused/reserved/0. * Index 4: Index into the ce32s array where the CE32s of the conjoining Jamo * are stored in a short, contiguous part of the ce32s array. * * Indexes 5..19 are byte offsets in ascending order. * Each byte offset marks the start of the next part in the data file, * and the end of the previous one. * When two consecutive byte offsets are the same (or too short), * then the corresponding part is empty. * Byte offsets are offsets from after the header, * that is, from the beginning of the indexes[]. * Each part starts at an offset with proper alignment for its data. * If necessary, the previous part may include padding bytes to achieve this alignment. * The last byte offset that is stored in the indexes indicates the total size of the data * (starting with the indexes). * * int32_t reorderCodes[]; -- empty in root * The list of script and reordering codes. * * Beginning with format version 5, this array may optionally * have trailing entries with a full list of reorder ranges * as described for CollationSettings::reorderRanges. * * Script or reorder codes are first and do not exceed 16-bit values. * Range limits are stored in the upper 16 bits, and are never 0. * Split this array into reorder codes and ranges at the first entry * with non-zero upper 16 bits. * * If the ranges are missing but needed for split-reordered primary lead bytes, * then they are regenerated at load time. * * uint8_t reorderTable[256]; -- empty in root; can be longer to include padding bytes * Primary-weight lead byte permutation table. * Normally present when the reorderCodes are, but can be built at load time. 
*
*      Beginning with format version 5, a 0 entry at a non-zero index
*      (which is otherwise an illegal value)
*      means that the primary lead byte is "split"
*      (there are different offsets for primaries that share that lead byte)
*      and the reordering offset must be determined via the reorder ranges
*      that are either stored as part of the reorderCodes array
*      or regenerated at load time.
*
* UTrie2 trie; -- see utrie2_impl.h and utrie2.h
*      The trie holds the main collation data. Each code point is mapped to a 32-bit value.
*      It encodes a simple collation element (CE) in compact form, unless bits 7..6 are both set,
*      in which case it is a special CE32 and contains a 4-bit tag and further data.
*      See the Collation class for details.
*
*      The trie has a value for each lead surrogate code unit with some bits encoding
*      collective properties of the 1024 supplementary characters whose UTF-16 form starts with
*      the lead surrogate. See Collation::LEAD_SURROGATE_TAG.
*
* int64_t ces[];
*      64-bit CEs and expansions that cannot be stored in a more compact form.
*
* uint32_t ce32s[];
*      CE32s for expansions in compact form, and for characters whose trie values
*      contain special data.
*
* uint32_t rootElements[]; -- empty in all tailorings
*      Compact storage for all of the CEs that occur in the root collation.
*      See the CollationRootElements class.
*
* char16_t *contexts[];
*      Serialized UCharsTrie structures with prefix (pre-context) and contraction mappings.
*
* uint16_t unsafeBackwardSet[]; -- see UnicodeSet::serialize()
*      Serialized form of characters that are unsafe when iterating backwards,
*      and at the end of an identical string prefix.
*      Back up to a safe character.
*      Lead surrogates are "unsafe" when any of their corresponding supplementary
*      code points are unsafe.
*      Does not include [:^lccc=0:][:^tccc=0:].
*      For each tailoring, the root unsafeBackwardSet is subtracted.
*      (As a result, in many tailorings no set needs to be stored.)
*
* uint16_t fastLatinTable[];
*      Optional optimization for Latin text.
*      See the CollationFastLatin class.
*
* uint16_t scripts[]; -- empty in all tailorings
*      Format version 5:
*      uint16_t numScripts;
*      uint16_t scriptsIndex[numScripts+16];
*      uint16_t scriptStarts[];
*      See CollationData::numScripts etc.
*
*      Format version 4:
*      Table of the reordering groups with their first and last lead bytes,
*      and their script and reordering codes.
*      See CollationData::scripts.
*
* UBool compressibleBytes[]; -- empty in all tailorings
*      Flag for getSortKey(), indicating primary weight lead bytes that are compressible.
*
* -----------------
* Changes for formatVersion 5 (ICU 55)
*
* Reordering moves single scripts, not groups of scripts.
* Reorder ranges are optionally appended to the reorderCodes,
* and a 0 entry in the reorderTable indicates a split lead byte.
* The scripts data has a new format.
*
* The rootElements may contain secondary and tertiary weights below common=05.
* (Used for small Hiragana letters.)
* Where this occurs, there is also an explicit unit with common secondary & tertiary weights.
* There are no other data structure changes, but builder code needs to be able to handle such data.
*
* The collation element for the merge separator code point U+FFFE
* does not necessarily have special, unique secondary/tertiary weights any more.
*/

U_NAMESPACE_END

#endif  // !UCONFIG_NO_COLLATION
#endif  // __COLLATIONDATAREADER_H__