diff options
Diffstat (limited to 'intl/icu/source/tools/icuexportdata/icuexportdata.cpp')
-rw-r--r-- | intl/icu/source/tools/icuexportdata/icuexportdata.cpp | 1566 |
1 files changed, 1566 insertions, 0 deletions
diff --git a/intl/icu/source/tools/icuexportdata/icuexportdata.cpp b/intl/icu/source/tools/icuexportdata/icuexportdata.cpp new file mode 100644 index 0000000000..a286040eef --- /dev/null +++ b/intl/icu/source/tools/icuexportdata/icuexportdata.cpp @@ -0,0 +1,1566 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +#include <cstddef> +#include <cstdint> +#include <cstdio> +#include <iostream> +#include <unicode/localpointer.h> +#include <unicode/umachine.h> +#include <unicode/unistr.h> +#include <unicode/urename.h> +#include <unicode/uset.h> +#include <vector> +#include <algorithm> +#include "toolutil.h" +#include "uoptions.h" +#include "cmemory.h" +#include "charstr.h" +#include "cstring.h" +#include "unicode/uchar.h" +#include "unicode/errorcode.h" +#include "unicode/uniset.h" +#include "unicode/uscript.h" +#include "unicode/putil.h" +#include "unicode/umutablecptrie.h" +#include "unicode/ucharstriebuilder.h" +#include "ucase.h" +#include "unicode/normalizer2.h" +#include "normalizer2impl.h" +#include "writesrc.h" + +U_NAMESPACE_USE + +/* + * Global - verbosity + */ +UBool VERBOSE = false; +UBool QUIET = false; + +UBool haveCopyright = true; +UCPTrieType trieType = UCPTRIE_TYPE_SMALL; +const char* destdir = ""; + +// Mask constants for modified values in the Script CodePointTrie, values are logically 12-bits. +int16_t DATAEXPORT_SCRIPT_X_WITH_COMMON = 0x0400; +int16_t DATAEXPORT_SCRIPT_X_WITH_INHERITED = 0x0800; +int16_t DATAEXPORT_SCRIPT_X_WITH_OTHER = 0x0c00; + +// TODO(ICU-21821): Replace this with a call to a library function +int32_t scxCodePoints[] = { + 7415, 7377, 7380, 7387, 7390, 7391, 7394, 7395, 7396, 7397, + 7398, 7399, 7400, 7403, 7404, 7406, 7407, 7408, 7409, 113824, + 113825, 113826, 113827, 834, 837, 7616, 7617, 12294, 12350, 12351, + 12688, 12689, 12690, 12691, 12692, 12693, 12694, 12695, 12696, 12697, + 12698, 12699, 12700, 12701, 12702, 12703, 12736, 12737, 12738, 12739, + 12740, 12741, 12742, 12743, 12744, 12745, 12746, 12747, 12748, 12749, + 12750, 12751, 12752, 12753, 12754, 12755, 12756, 12757, 12758, 12759, + 12760, 12761, 12762, 12763, 12764, 12765, 12766, 12767, 12768, 12769, + 12770, 12771, 12832, 12833, 12834, 12835, 12836, 12837, 12838, 12839, + 12840, 12841, 12842, 12843, 12844, 12845, 12846, 12847, 12848, 12849, + 12850, 12851, 12852, 12853, 12854, 12855, 12856, 12857, 12858, 12859, + 12860, 12861, 12862, 12863, 12864, 12865, 12866, 12867, 12868, 12869, + 12870, 12871, 12928, 12929, 12930, 12931, 12932, 12933, 12934, 12935, + 12936, 12937, 12938, 12939, 12940, 12941, 12942, 12943, 12944, 12945, + 12946, 12947, 12948, 12949, 12950, 12951, 12952, 12953, 12954, 12955, + 12956, 12957, 12958, 12959, 12960, 12961, 12962, 12963, 12964, 12965, + 12966, 12967, 12968, 12969, 12970, 12971, 12972, 12973, 12974, 12975, + 12976, 12992, 12993, 12994, 12995, 12996, 12997, 12998, 12999, 13000, + 13001, 13002, 13003, 13055, 13144, 13145, 13146, 13147, 13148, 13149, + 13150, 13151, 13152, 13153, 13154, 13155, 13156, 13157, 13158, 13159, + 13160, 13161, 13162, 13163, 13164, 13165, 13166, 13167, 13168, 13179, + 13180, 13181, 13182, 13183, 13280, 13281, 13282, 13283, 13284, 13285, + 13286, 13287, 13288, 13289, 13290, 13291, 13292, 13293, 13294, 13295, + 13296, 13297, 13298, 13299, 13300, 13301, 13302, 13303, 13304, 13305, + 13306, 13307, 13308, 13309, 13310, 119648, 119649, 119650, 119651, 119652, + 119653, 119654, 119655, 119656, 119657, 119658, 119659, 119660, 119661, 119662, + 119663, 119664, 119665, 127568, 127569, 867, 868, 869, 870, 871, + 872, 873, 874, 875, 876, 877, 878, 879, 7418, 7674, + 66272, 66273, 66274, 66275, 66276, 66277, 66278, 66279, 66280, 66281, + 66282, 66283, 66284, 66285, 66286, 66287, 66288, 66289, 66290, 66291, + 66292, 66293, 66294, 66295, 66296, 66297, 66298, 66299, 1748, 64830, + 64831, 1611, 1612, 1613, 1614, 1615, 1616, 1617, 1618, 1619, + 1620, 1621, 1648, 65010, 65021, 7381, 7382, 7384, 7393, 7402, + 7405, 7413, 7414, 43249, 12330, 12331, 12332, 12333, 43471, 65794, + 65847, 65848, 65849, 65850, 65851, 65852, 65853, 65854, 65855, 1156, + 1159, 11843, 42607, 1157, 1158, 1155, 7672, 7379, 7411, 7416, + 7417, 7401, 7383, 7385, 7388, 7389, 7392, 43251, 4347, 3046, + 3047, 3048, 3049, 3050, 3051, 3052, 3053, 3054, 3055, 3056, + 3057, 3058, 3059, 70401, 70403, 70459, 70460, 73680, 73681, 73683, + 2790, 2791, 2792, 2793, 2794, 2795, 2796, 2797, 2798, 2799, + 2662, 2663, 2664, 2665, 2666, 2667, 2668, 2669, 2670, 2671, + 42752, 42753, 42754, 42755, 42756, 42757, 42758, 42759, 12337, 12338, + 12339, 12340, 12341, 12441, 12442, 12443, 12444, 12448, 12540, 65392, + 65438, 65439, 3302, 3303, 3304, 3305, 3306, 3307, 3308, 3309, + 3310, 3311, 8239, 68338, 6146, 6147, 6149, 1564, 1632, 1633, + 1634, 1635, 1636, 1637, 1638, 1639, 1640, 1641, 2534, 2535, + 2536, 2537, 2538, 2539, 2540, 2541, 2542, 2543, 4160, 4161, + 4162, 4163, 4164, 4165, 4166, 4167, 4168, 4169, 65792, 65793, + 65799, 65800, 65801, 65802, 65803, 65804, 65805, 65806, 65807, 65808, + 65809, 65810, 65811, 65812, 65813, 65814, 65815, 65816, 65817, 65818, + 65819, 65820, 65821, 65822, 65823, 65824, 65825, 65826, 65827, 65828, + 65829, 65830, 65831, 65832, 65833, 65834, 65835, 65836, 65837, 65838, + 65839, 65840, 65841, 65842, 65843, 7412, 8432, 12348, 12349, 43310, + 7376, 7378, 5941, 5942, 2406, 2407, 2408, 2409, 2410, 2411, + 2412, 2413, 2414, 2415, 12291, 12307, 12316, 12317, 12318, 12319, + 12336, 12343, 65093, 65094, 1548, 1563, 12289, 12290, 12296, 12297, + 12298, 12299, 12300, 12301, 12302, 12303, 12304, 12305, 12308, 12309, + 12310, 12311, 12312, 12313, 12314, 12315, 12539, 65377, 65378, 65379, + 65380, 65381, 7386, 1567, 7410, 1600, 43062, 43063, 43064, 43065, + 2386, 2385, 43059, 43060, 43061, 43056, 43057, 43058, 2404, 2405 + }; + +void handleError(ErrorCode& status, const char* context) { + if (status.isFailure()) { + std::cerr << "Error: " << context << ": " << status.errorName() << std::endl; + exit(status.reset()); + } +} + +class PropertyValueNameGetter : public ValueNameGetter { +public: + PropertyValueNameGetter(UProperty prop) : property(prop) {} + ~PropertyValueNameGetter() override; + const char *getName(uint32_t value) override { + return u_getPropertyValueName(property, value, U_SHORT_PROPERTY_NAME); + } + +private: + UProperty property; +}; + +PropertyValueNameGetter::~PropertyValueNameGetter() {} + +// Dump an aliases = [...] key for properties with aliases +void dumpPropertyAliases(UProperty uproperty, FILE* f) { + int i = U_LONG_PROPERTY_NAME + 1; + + while(true) { + // The API works by having extra names after U_LONG_PROPERTY_NAME, sequentially, + // and returning null after that + const char* alias = u_getPropertyName(uproperty, (UPropertyNameChoice) i); + if (!alias) { + break; + } + if (i == U_LONG_PROPERTY_NAME + 1) { + fprintf(f, "aliases = [\"%s\"", alias); + } else { + fprintf(f, ", \"%s\"", alias); + } + i++; + } + if (i != U_LONG_PROPERTY_NAME + 1) { + fprintf(f, "]\n"); + } +} + +void dumpBinaryProperty(UProperty uproperty, FILE* f) { + IcuToolErrorCode status("icuexportdata: dumpBinaryProperty"); + const char* fullPropName = u_getPropertyName(uproperty, U_LONG_PROPERTY_NAME); + const char* shortPropName = u_getPropertyName(uproperty, U_SHORT_PROPERTY_NAME); + const USet* uset = u_getBinaryPropertySet(uproperty, status); + handleError(status, fullPropName); + + fputs("[[binary_property]]\n", f); + fprintf(f, "long_name = \"%s\"\n", fullPropName); + if (shortPropName) fprintf(f, "short_name = \"%s\"\n", shortPropName); + fprintf(f, "uproperty_discr = 0x%X\n", uproperty); + dumpPropertyAliases(uproperty, f); + usrc_writeUnicodeSet(f, uset, UPRV_TARGET_SYNTAX_TOML); +} + +// If the value exists, dump an indented entry of the format +// `" {discr = <discriminant>, long = <longname>, short = <shortname>, aliases = [<aliases>]},"` +void dumpValueEntry(UProperty uproperty, int v, bool is_mask, FILE* f) { + const char* fullValueName = u_getPropertyValueName(uproperty, v, U_LONG_PROPERTY_NAME); + const char* shortValueName = u_getPropertyValueName(uproperty, v, U_SHORT_PROPERTY_NAME); + if (!fullValueName) { + return; + } + if (is_mask) { + fprintf(f, " {discr = 0x%X", v); + } else { + fprintf(f, " {discr = %i", v); + } + fprintf(f, ", long = \"%s\"", fullValueName); + if (shortValueName) { + fprintf(f, ", short = \"%s\"", shortValueName); + } + int i = U_LONG_PROPERTY_NAME + 1; + while(true) { + // The API works by having extra names after U_LONG_PROPERTY_NAME, sequentially, + // and returning null after that + const char* alias = u_getPropertyValueName(uproperty, v, (UPropertyNameChoice) i); + if (!alias) { + break; + } + if (i == U_LONG_PROPERTY_NAME + 1) { + fprintf(f, ", aliases = [\"%s\"", alias); + } else { + fprintf(f, ", \"%s\"", alias); + } + i++; + } + if (i != U_LONG_PROPERTY_NAME + 1) { + fprintf(f, "]"); + } + fprintf(f, "},\n"); +} + +void dumpEnumeratedProperty(UProperty uproperty, FILE* f) { + IcuToolErrorCode status("icuexportdata: dumpEnumeratedProperty"); + const char* fullPropName = u_getPropertyName(uproperty, U_LONG_PROPERTY_NAME); + const char* shortPropName = u_getPropertyName(uproperty, U_SHORT_PROPERTY_NAME); + const UCPMap* umap = u_getIntPropertyMap(uproperty, status); + handleError(status, fullPropName); + + fputs("[[enum_property]]\n", f); + fprintf(f, "long_name = \"%s\"\n", fullPropName); + if (shortPropName) fprintf(f, "short_name = \"%s\"\n", shortPropName); + fprintf(f, "uproperty_discr = 0x%X\n", uproperty); + dumpPropertyAliases(uproperty, f); + + int32_t minValue = u_getIntPropertyMinValue(uproperty); + U_ASSERT(minValue >= 0); + int32_t maxValue = u_getIntPropertyMaxValue(uproperty); + U_ASSERT(maxValue >= 0); + + fprintf(f, "values = [\n"); + for (int v = minValue; v <= maxValue; v++) { + dumpValueEntry(uproperty, v, false, f); + } + fprintf(f, "]\n"); + + PropertyValueNameGetter valueNameGetter(uproperty); + usrc_writeUCPMap(f, umap, &valueNameGetter, UPRV_TARGET_SYNTAX_TOML); + fputs("\n", f); + + + UCPTrieValueWidth width = UCPTRIE_VALUE_BITS_32; + if (maxValue <= 0xff) { + width = UCPTRIE_VALUE_BITS_8; + } else if (maxValue <= 0xffff) { + width = UCPTRIE_VALUE_BITS_16; + } + LocalUMutableCPTriePointer builder(umutablecptrie_fromUCPMap(umap, status)); + LocalUCPTriePointer utrie(umutablecptrie_buildImmutable( + builder.getAlias(), + trieType, + width, + status)); + handleError(status, fullPropName); + + fputs("[enum_property.code_point_trie]\n", f); + usrc_writeUCPTrie(f, shortPropName, utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML); +} + +/* +* Export Bidi_Mirroring_Glyph values (code points) in a similar way to how enumerated +* properties are dumped to file. +* Note: the data will store 0 for code points without a value defined for +* Bidi_Mirroring_Glyph. +*/ +void dumpBidiMirroringGlyph(FILE* f) { + UProperty uproperty = UCHAR_BIDI_MIRRORING_GLYPH; + IcuToolErrorCode status("icuexportdata: dumpBidiMirroringGlyph"); + const char* fullPropName = u_getPropertyName(uproperty, U_LONG_PROPERTY_NAME); + const char* shortPropName = u_getPropertyName(uproperty, U_SHORT_PROPERTY_NAME); + handleError(status, fullPropName); + + // Store 21-bit code point as is + UCPTrieValueWidth width = UCPTRIE_VALUE_BITS_32; + + // note: unlike dumpEnumeratedProperty, which can get inversion map data using + // u_getIntPropertyMap(uproperty), the only reliable way to get Bidi_Mirroring_Glyph + // is to use u_charMirror(cp) over the code point space. + LocalUMutableCPTriePointer builder(umutablecptrie_open(0, 0, status)); + for(UChar32 c = UCHAR_MIN_VALUE; c <= UCHAR_MAX_VALUE; c++) { + UChar32 mirroringGlyph = u_charMirror(c); + // The trie builder code throws an error when it cannot compress the data sufficiently. + // Therefore, when the value is undefined for a code point, keep a 0 in the trie + // instead of the ICU API behavior of returning the code point value. Using 0 + // results in a relatively significant space savings by not including redundant data. + if (c != mirroringGlyph) { + umutablecptrie_set(builder.getAlias(), c, mirroringGlyph, status); + } + } + + LocalUCPTriePointer utrie(umutablecptrie_buildImmutable( + builder.getAlias(), + trieType, + width, + status)); + handleError(status, fullPropName); + + // currently a trie and inversion map are the same (as relied upon in characterproperties.cpp) + const UCPMap* umap = reinterpret_cast<UCPMap *>(utrie.getAlias()); + + fputs("[[enum_property]]\n", f); + fprintf(f, "long_name = \"%s\"\n", fullPropName); + if (shortPropName) { + fprintf(f, "short_name = \"%s\"\n", shortPropName); + } + fprintf(f, "uproperty_discr = 0x%X\n", uproperty); + dumpPropertyAliases(uproperty, f); + + usrc_writeUCPMap(f, umap, nullptr, UPRV_TARGET_SYNTAX_TOML); + fputs("\n", f); + + fputs("[enum_property.code_point_trie]\n", f); + usrc_writeUCPTrie(f, shortPropName, utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML); +} + +// After printing property value `v`, print `mask` if and only if `mask` comes immediately +// after the property in the listing +void maybeDumpMaskValue(UProperty uproperty, uint32_t v, uint32_t mask, FILE* f) { + if (U_MASK(v) < mask && U_MASK(v + 1) > mask) + dumpValueEntry(uproperty, mask, true, f); +} + +void dumpGeneralCategoryMask(FILE* f) { + IcuToolErrorCode status("icuexportdata: dumpGeneralCategoryMask"); + UProperty uproperty = UCHAR_GENERAL_CATEGORY_MASK; + + fputs("[[mask_property]]\n", f); + const char* fullPropName = u_getPropertyName(uproperty, U_LONG_PROPERTY_NAME); + const char* shortPropName = u_getPropertyName(uproperty, U_SHORT_PROPERTY_NAME); + fprintf(f, "long_name = \"%s\"\n", fullPropName); + if (shortPropName) fprintf(f, "short_name = \"%s\"\n", shortPropName); + fprintf(f, "uproperty_discr = 0x%X\n", uproperty); + dumpPropertyAliases(uproperty, f); + + + fprintf(f, "mask_for = \"General_Category\"\n"); + uint32_t minValue = u_getIntPropertyMinValue(UCHAR_GENERAL_CATEGORY); + U_ASSERT(minValue >= 0); + uint32_t maxValue = u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY); + U_ASSERT(maxValue >= 0); + + fprintf(f, "values = [\n"); + for (uint32_t v = minValue; v <= maxValue; v++) { + dumpValueEntry(uproperty, U_MASK(v), true, f); + + // We want to dump these masks "in order", which means they + // should come immediately after every property they contain + maybeDumpMaskValue(uproperty, v, U_GC_L_MASK, f); + maybeDumpMaskValue(uproperty, v, U_GC_LC_MASK, f); + maybeDumpMaskValue(uproperty, v, U_GC_M_MASK, f); + maybeDumpMaskValue(uproperty, v, U_GC_N_MASK, f); + maybeDumpMaskValue(uproperty, v, U_GC_Z_MASK, f); + maybeDumpMaskValue(uproperty, v, U_GC_C_MASK, f); + maybeDumpMaskValue(uproperty, v, U_GC_P_MASK, f); + maybeDumpMaskValue(uproperty, v, U_GC_S_MASK, f); + } + fprintf(f, "]\n"); +} + +void dumpScriptExtensions(FILE* f) { + IcuToolErrorCode status("icuexportdata: dumpScriptExtensions"); + + fputs("[[script_extensions]]\n", f); + const char* scxFullPropName = u_getPropertyName(UCHAR_SCRIPT_EXTENSIONS, U_LONG_PROPERTY_NAME); + const char* scxShortPropName = u_getPropertyName(UCHAR_SCRIPT_EXTENSIONS, U_SHORT_PROPERTY_NAME); + fprintf(f, "long_name = \"%s\"\n", scxFullPropName); + if (scxShortPropName) fprintf(f, "short_name = \"%s\"\n", scxShortPropName); + fprintf(f, "uproperty_discr = 0x%X\n", UCHAR_SCRIPT_EXTENSIONS); + dumpPropertyAliases(UCHAR_SCRIPT_EXTENSIONS, f); + + // We want to use 16 bits for our exported trie of sc/scx data because we + // need 12 bits to match the 12 bits of data stored for sc/scx in the trie + // in the uprops.icu data file. + UCPTrieValueWidth scWidth = UCPTRIE_VALUE_BITS_16; + + // Create a mutable UCPTrie builder populated with Script property values data. + const UCPMap* scInvMap = u_getIntPropertyMap(UCHAR_SCRIPT, status); + handleError(status, scxFullPropName); + LocalUMutableCPTriePointer builder(umutablecptrie_fromUCPMap(scInvMap, status)); + handleError(status, scxFullPropName); + + // The values for the output scx companion array. + // Invariant is that all subvectors are distinct. + std::vector< std::vector<uint16_t> > outputDedupVec; + + // The sc/scx companion array is an array of arrays (of script codes) + fputs("script_code_array = [\n", f); + for(const UChar32 cp : scxCodePoints) { + // Get the Script value + uint32_t scVal = umutablecptrie_get(builder.getAlias(), cp); + // Get the Script_Extensions value (array of Script codes) + const int32_t SCX_ARRAY_CAPACITY = 32; + UScriptCode scxValArray[SCX_ARRAY_CAPACITY]; + int32_t numScripts = uscript_getScriptExtensions(cp, scxValArray, SCX_ARRAY_CAPACITY, status); + handleError(status, scxFullPropName); + + // Convert the scx array into a vector + std::vector<uint16_t> scxValVec; + for(int i = 0; i < numScripts; i++) { + scxValVec.push_back(scxValArray[i]); + } + // Ensure that it is sorted + std::sort(scxValVec.begin(), scxValVec.end()); + // Copy the Script value into the first position of the scx array only + // if we have the "other" case (Script value is not Common nor Inherited). + // This offers faster access when users want only the Script value. + if (scVal != USCRIPT_COMMON && scVal != USCRIPT_INHERITED) { + scxValVec.insert(scxValVec.begin(), scVal); + } + + // See if there is already an scx value array matching the newly built one. + // If there is, then use its index. + // If not, then append the new value array. + bool isScxValUnique = true; + size_t outputIndex = 0; + for (outputIndex = 0; outputIndex < outputDedupVec.size(); outputIndex++) { + if (outputDedupVec[outputIndex] == scxValVec) { + isScxValUnique = false; + break; + } + } + + if (isScxValUnique) { + outputDedupVec.push_back(scxValVec); + usrc_writeArray(f, " [", scxValVec.data(), 16, scxValVec.size(), " ", "],\n"); + } + + // We must update the value in the UCPTrie for the code point to contain: + // 9..0 the Script code in the lower 10 bits when 11..10 is 0, else it is + // the index into the companion array + // 11..10 the same higher-order 2 bits in the trie in uprops.icu indicating whether + // 3: other + // 2: Script=Inherited + // 1: Script=Common + // 0: Script=value in 9..0 (N/A because we are in this loop to create the companion array for non-0 cases) + uint16_t mask = 0; + if (scVal == USCRIPT_COMMON) { + mask = DATAEXPORT_SCRIPT_X_WITH_COMMON; + } else if (scVal == USCRIPT_INHERITED) { + mask = DATAEXPORT_SCRIPT_X_WITH_INHERITED; + } else { + mask = DATAEXPORT_SCRIPT_X_WITH_OTHER; + } + + // The new trie value is the index into the new array with the high order bits set + uint32_t newScVal = outputIndex | mask; + + // Update the code point in the mutable trie builder with the trie value + umutablecptrie_set(builder.getAlias(), cp, newScVal, status); + handleError(status, scxFullPropName); + } + fputs("]\n\n", f); // Print the TOML close delimiter for the outer array. + + // Convert from mutable trie builder to immutable trie. + LocalUCPTriePointer utrie(umutablecptrie_buildImmutable( + builder.getAlias(), + trieType, + scWidth, + status)); + handleError(status, scxFullPropName); + + fputs("[script_extensions.code_point_trie]\n", f); + usrc_writeUCPTrie(f, scxShortPropName, utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML); +} + +FILE* prepareOutputFile(const char* basename) { + IcuToolErrorCode status("icuexportdata"); + CharString outFileName; + if (destdir != nullptr && *destdir != 0) { + outFileName.append(destdir, status).ensureEndsWithFileSeparator(status); + } + outFileName.append(basename, status); + outFileName.append(".toml", status); + handleError(status, basename); + + FILE* f = fopen(outFileName.data(), "w"); + if (f == nullptr) { + std::cerr << "Unable to open file: " << outFileName.data() << std::endl; + exit(U_FILE_ACCESS_ERROR); + } + if (!QUIET) { + std::cout << "Writing to: " << outFileName.data() << std::endl; + } + + if (haveCopyright) { + usrc_writeCopyrightHeader(f, "#", 2021); + } + usrc_writeFileNameGeneratedBy(f, "#", basename, "icuexportdata.cpp"); + + return f; +} + +#if !UCONFIG_NO_NORMALIZATION + +struct PendingDescriptor { + UChar32 scalar; + uint32_t descriptor; + UBool supplementary; +}; + +void writeCanonicalCompositions(USet* backwardCombiningStarters) { + IcuToolErrorCode status("icuexportdata: computeCanonicalCompositions"); + const char* basename = "compositions"; + FILE* f = prepareOutputFile(basename); + + LocalPointer<UCharsTrieBuilder> backwardBuilder(new UCharsTrieBuilder(status), status); + + const int32_t DECOMPOSITION_BUFFER_SIZE = 20; + UChar32 utf32[DECOMPOSITION_BUFFER_SIZE]; + + const Normalizer2* nfc = Normalizer2::getNFCInstance(status); + for (UChar32 c = 0; c <= 0x10FFFF; ++c) { + if (c >= 0xD800 && c < 0xE000) { + // Surrogate + continue; + } + UnicodeString decomposition; + if (!nfc->getRawDecomposition(c, decomposition)) { + continue; + } + int32_t len = decomposition.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status); + if (len != 2) { + continue; + } + UChar32 starter = utf32[0]; + UChar32 second = utf32[1]; + UChar32 composite = nfc->composePair(starter, second); + if (composite < 0) { + continue; + } + if (c != composite) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + if (!u_getCombiningClass(second)) { + uset_add(backwardCombiningStarters, second); + } + if (composite >= 0xAC00 && composite <= 0xD7A3) { + // Hangul syllable + continue; + } + + UnicodeString backward; + backward.append(second); + backward.append(starter); + backwardBuilder->add(backward, int32_t(composite), status); + } + UnicodeString canonicalCompositionTrie; + backwardBuilder->buildUnicodeString(USTRINGTRIE_BUILD_SMALL, canonicalCompositionTrie, status); + + usrc_writeArray(f, "compositions = [\n ", canonicalCompositionTrie.getBuffer(), 16, canonicalCompositionTrie.length(), " ", "\n]\n"); + fclose(f); + handleError(status, basename); +} + +void writeDecompositionTables(const char* basename, const uint16_t* ptr16, size_t len16, const uint32_t* ptr32, size_t len32) { + FILE* f = prepareOutputFile(basename); + usrc_writeArray(f, "scalars16 = [\n ", ptr16, 16, len16, " ", "\n]\n"); + usrc_writeArray(f, "scalars32 = [\n ", ptr32, 32, len32, " ", "\n]\n"); + fclose(f); +} + +void writeDecompositionData(const char* basename, uint32_t baseSize16, uint32_t baseSize32, uint32_t supplementSize16, USet* uset, USet* reference, const std::vector<PendingDescriptor>& pendingTrieInsertions, char16_t passthroughCap) { + IcuToolErrorCode status("icuexportdata: writeDecompositionData"); + FILE* f = prepareOutputFile(basename); + + // Zero is a magic number that means the character decomposes to itself. + LocalUMutableCPTriePointer builder(umutablecptrie_open(0, 0, status)); + + // Iterate backwards to insert lower code points in the trie first in case it matters + // for trie block allocation. + for (int32_t i = pendingTrieInsertions.size() - 1; i >= 0; --i) { + const PendingDescriptor& pending = pendingTrieInsertions[i]; + uint32_t additional = 0; + if (!(pending.descriptor & 0xFFFE0000)) { + uint32_t offset = pending.descriptor & 0xFFF; + if (!pending.supplementary) { + if (offset >= baseSize16) { + // This is a offset to supplementary 16-bit data. We have + // 16-bit base data and 32-bit base data before. However, + // the 16-bit base data length is already part of offset. + additional = baseSize32; + } + } else { + if (offset >= baseSize32) { + // This is an offset to supplementary 32-bit data. We have 16-bit + // base data, 32-bit base data, and 16-bit supplementary data before. + // However, the 32-bit base data length is already part + // of offset. + additional = baseSize16 + supplementSize16; + } else { + // This is an offset to 32-bit base data. We have 16-bit + // base data before. + additional = baseSize16; + } + } + if (offset + additional > 0xFFF) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + } + // It turns out it's better to swap the halves compared to the initial + // idea in order to put special marker values close to zero so that + // an important marker value becomes 1, so it's efficient to compare + // "1 or 0". Unfortunately, going through all the code to swap + // things is too error prone, so let's do the swapping here in one + // place. + uint32_t oldTrieValue = pending.descriptor + additional; + uint32_t swappedTrieValue = (oldTrieValue >> 16) | (oldTrieValue << 16); + umutablecptrie_set(builder.getAlias(), pending.scalar, swappedTrieValue, status); + } + LocalUCPTriePointer utrie(umutablecptrie_buildImmutable( + builder.getAlias(), + trieType, + UCPTRIE_VALUE_BITS_32, + status)); + handleError(status, basename); + + if (reference) { + if (uset_contains(reference, 0xFF9E) || uset_contains(reference, 0xFF9F) || !uset_contains(reference, 0x0345)) { + // NFD expectations don't hold. The set must not contain the half-width + // kana voicing marks and must contain iota subscript. + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + + USet* halfWidthVoicing = uset_openEmpty(); + uset_add(halfWidthVoicing, 0xFF9E); + uset_add(halfWidthVoicing, 0xFF9F); + + USet* iotaSubscript = uset_openEmpty(); + uset_add(iotaSubscript, 0x0345); + + uint8_t flags = 0; + + USet* halfWidthCheck = uset_cloneAsThawed(uset); + uset_removeAll(halfWidthCheck, reference); + if (uset_equals(halfWidthCheck, halfWidthVoicing)) { + flags |= 1; + } else if (!uset_isEmpty(halfWidthCheck)) { + // The result was neither empty nor contained exactly + // the two half-width voicing marks. The ICU4X + // normalizer doesn't know how to deal with this case. + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + uset_close(halfWidthCheck); + + USet* iotaCheck = uset_cloneAsThawed(reference); + uset_removeAll(iotaCheck, uset); + if (!(uset_equals(iotaCheck, iotaSubscript)) && !uset_isEmpty(iotaCheck)) { + // The result was neither empty nor contained exactly + // the iota subscript. The ICU4X normalizer doesn't + // know how to deal with this case. + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + uset_close(halfWidthCheck); + + uset_close(iotaSubscript); + uset_close(halfWidthVoicing); + + fprintf(f, "flags = 0x%X\n", flags); + fprintf(f, "cap = 0x%X\n", passthroughCap); + } + fprintf(f, "[trie]\n"); + usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML); + fclose(f); + handleError(status, basename); +} + +// Special marker for the NFKD form of U+FDFA +const int32_t FDFA_MARKER = 3; + +// Special marker for characters whose decomposition starts with a non-starter +// and the decomposition isn't the character itself. +const int32_t SPECIAL_NON_STARTER_DECOMPOSITION_MARKER = 2; + +// Special marker for starters that decompose to themselves but that may +// combine backwards under canonical composition +const int32_t BACKWARD_COMBINING_STARTER_MARKER = 1; + +/// Marker that a complex decomposition isn't round-trippable +/// under re-composition. +const uint32_t NON_ROUND_TRIP_MARKER = 1; + +UBool permissibleBmpPair(UBool knownToRoundTrip, UChar32 c, UChar32 second) { + if (knownToRoundTrip) { + return true; + } + // Nuktas, Hebrew presentation forms and polytonic Greek with oxia + // are special-cased in ICU4X. + if (c >= 0xFB1D && c <= 0xFB4E) { + // Hebrew presentation forms + return true; + } + if (c >= 0x1F71 && c <= 0x1FFB) { + // Polytonic Greek with oxia + return true; + } + if ((second & 0x7F) == 0x3C && second >= 0x0900 && second <= 0x0BFF) { + // Nukta + return true; + } + // To avoid more branchiness, 4 characters that decompose to + // a BMP starter followed by a BMP non-starter are excluded + // from being encoded directly into the trie value and are + // handled as complex decompositions instead. These are: + // U+0F76 TIBETAN VOWEL SIGN VOCALIC R + // U+0F78 TIBETAN VOWEL SIGN VOCALIC L + // U+212B ANGSTROM SIGN + // U+2ADC FORKING + return false; +} + +// Computes data for canonical decompositions +void computeDecompositions(const char* basename, + const USet* backwardCombiningStarters, + std::vector<uint16_t>& storage16, + std::vector<uint32_t>& storage32, + USet* decompositionStartsWithNonStarter, + USet* decompositionStartsWithBackwardCombiningStarter, + std::vector<PendingDescriptor>& pendingTrieInsertions, + UChar32& decompositionPassthroughBound, + UChar32& compositionPassthroughBound) { + IcuToolErrorCode status("icuexportdata: computeDecompositions"); + const Normalizer2* mainNormalizer; + const Normalizer2* nfdNormalizer = Normalizer2::getNFDInstance(status); + const Normalizer2* nfcNormalizer = Normalizer2::getNFCInstance(status); + FILE* f = nullptr; + std::vector<uint32_t> nonRecursive32; + LocalUMutableCPTriePointer nonRecursiveBuilder(umutablecptrie_open(0, 0, status)); + + if (uprv_strcmp(basename, "nfkd") == 0) { + mainNormalizer = Normalizer2::getNFKDInstance(status); + } else if (uprv_strcmp(basename, "uts46d") == 0) { + mainNormalizer = Normalizer2::getInstance(nullptr, "uts46", UNORM2_COMPOSE, status); + } else { + mainNormalizer = nfdNormalizer; + f = prepareOutputFile("decompositionex"); + } + + // Max length as of Unicode 14 is 4 for NFD. For NFKD the max + // is 18 (U+FDFA; special-cased), and the next longest is 8 (U+FDFB). + const int32_t LONGEST_ENCODABLE_LENGTH_16 = 9; + const int32_t LONGEST_ENCODABLE_LENGTH_32 = 8; + const int32_t DECOMPOSITION_BUFFER_SIZE = 20; + UChar32 utf32[DECOMPOSITION_BUFFER_SIZE]; + const int32_t RAW_DECOMPOSITION_BUFFER_SIZE = 2; + UChar32 rawUtf32[RAW_DECOMPOSITION_BUFFER_SIZE]; + + // Iterate over all scalar values excluding Hangul syllables. + // + // We go backwards in order to better find overlapping decompositions. + // + // As of Unicode 14: + // Iterate forward without overlap search: + // nfd: 16 size: 896, 32 size: 173 + // nfkd: 16 size: 3854, 32 size: 179 + // + // Iterate forward with overlap search: + // nfd: 16 size: 888, 32 size: 173 + // nfkd: 16 size: 3266, 32 size: 179 + // + // Iterate backward with overlap search: + // nfd: 16 size: 776, 32 size: 173 + // nfkd: 16 size: 2941, 32 size: 179 + // + // UChar32 is signed! + for (UChar32 c = 0x10FFFF; c >= 0; --c) { + if (c >= 0xAC00 && c <= 0xD7A3) { + // Hangul syllable + continue; + } + if (c >= 0xD800 && c < 0xE000) { + // Surrogate + continue; + } + UnicodeString src; + UnicodeString dst; + // True if we're building non-NFD or we're building NFD but + // the `c` round trips to NFC. + // False if we're building NFD and `c` does not round trip to NFC. + UBool nonNfdOrRoundTrips = true; + src.append(c); + if (mainNormalizer != nfdNormalizer) { + UnicodeString inter; + mainNormalizer->normalize(src, inter, status); + nfdNormalizer->normalize(inter, dst, status); + } else { + nfdNormalizer->normalize(src, dst, status); + UnicodeString nfc; + nfcNormalizer->normalize(dst, nfc, status); + nonNfdOrRoundTrips = (src == nfc); + } + int32_t len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status); + if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) { + // Characters that normalize to nothing or to U+FFFD (without the + // input being U+FFFD) in ICU4C's UTS 46 normalization normalize + // as in NFD in ICU4X's UTF 46 normalization in the interest + // of data size and ICU4X's normalizer being unable to handle + // normalizing to nothing. + // When UTS 46 is implemented on top of ICU4X, a preprocessing + // step is supposed to remove these characters before the + // normalization step. + if (uprv_strcmp(basename, "uts46d") != 0) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + nfdNormalizer->normalize(src, dst, status); + len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status); + if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + } + if (len > DECOMPOSITION_BUFFER_SIZE) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + uint8_t firstCombiningClass = u_getCombiningClass(utf32[0]); + bool specialNonStarterDecomposition = false; + bool startsWithBackwardCombiningStarter = false; + if (firstCombiningClass) { + decompositionPassthroughBound = c; + compositionPassthroughBound = c; + uset_add(decompositionStartsWithNonStarter, c); + if (src != dst) { + if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344 || c == 0x0F73 || c == 0x0F75 || c == 0x0F81 || c == 0xFF9E || c == 0xFF9F) { + specialNonStarterDecomposition = true; + } else { + // A character whose decomposition starts with a non-starter and isn't the same as the character itself and isn't already hard-coded into ICU4X. + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + } + } else if (uset_contains(backwardCombiningStarters, utf32[0])) { + compositionPassthroughBound = c; + startsWithBackwardCombiningStarter = true; + uset_add(decompositionStartsWithBackwardCombiningStarter, c); + } + if (c != BACKWARD_COMBINING_STARTER_MARKER && len == 1 && utf32[0] == BACKWARD_COMBINING_STARTER_MARKER) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + if (c != SPECIAL_NON_STARTER_DECOMPOSITION_MARKER && len == 1 && utf32[0] == SPECIAL_NON_STARTER_DECOMPOSITION_MARKER) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + if (c != FDFA_MARKER && len == 1 && utf32[0] == FDFA_MARKER) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + if (mainNormalizer != nfdNormalizer) { + UnicodeString nfd; + nfdNormalizer->normalize(src, nfd, status); + if (dst == nfd) { + continue; + } + decompositionPassthroughBound = c; + compositionPassthroughBound = c; + } else if (firstCombiningClass) { + len = 1; + if (specialNonStarterDecomposition) { + utf32[0] = SPECIAL_NON_STARTER_DECOMPOSITION_MARKER; // magic value + } else { + // Use the surrogate range to store the canonical combining class + utf32[0] = 0xD800 | UChar32(firstCombiningClass); + } + } else { + if (src == dst) { + if (startsWithBackwardCombiningStarter) { + pendingTrieInsertions.push_back({c, BACKWARD_COMBINING_STARTER_MARKER << 16, false}); + } + continue; + } + decompositionPassthroughBound = c; + // ICU4X hard-codes ANGSTROM SIGN + if (c != 0x212B) { + UnicodeString raw; + if (!nfdNormalizer->getRawDecomposition(c, raw)) { + // We're always supposed to have a non-recursive decomposition + // if we had a recursive one. + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + // In addition to actual difference, put the whole range that contains characters + // with oxia into the non-recursive trie in order to catch cases where characters + // with oxia have singleton decompositions to corresponding characters with tonos. + // This way, the run-time decision to fall through can be done on the range + // without checking for individual characters inside the range. + if (raw != dst || (c >= 0x1F71 && c <= 0x1FFB)) { + int32_t rawLen = raw.toUTF32(rawUtf32, RAW_DECOMPOSITION_BUFFER_SIZE, status); + if (!rawLen) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + if (rawLen == 1) { + if (c >= 0xFFFF) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, uint32_t(rawUtf32[0]), status); + } else if (rawUtf32[0] <= 0xFFFF && rawUtf32[1] <= 0xFFFF) { + if (!rawUtf32[0] || !rawUtf32[1]) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + // Swapped for consistency with the primary trie + uint32_t bmpPair = uint32_t(rawUtf32[1]) << 16 | uint32_t(rawUtf32[0]); + umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, bmpPair, status); + } else { + // Let's add 1 to index to make it always non-zero to distinguish + // it from the default zero. + uint32_t index = nonRecursive32.size() + 1; + nonRecursive32.push_back(uint32_t(rawUtf32[0])); + nonRecursive32.push_back(uint32_t(rawUtf32[1])); + if (index > 0xFFFF) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, index << 16, status); + } + } + } + } + if (!nonNfdOrRoundTrips) { + compositionPassthroughBound = c; + } + if (len == 1 && utf32[0] <= 0xFFFF) { + if (startsWithBackwardCombiningStarter) { + if (mainNormalizer == nfdNormalizer) { + // Not supposed to happen in NFD + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } else if (!((utf32[0] >= 0x1161 && utf32[0] <= 0x1175) || (utf32[0] >= 0x11A8 && utf32[0] <= 0x11C2))) { + // Other than conjoining jamo vowels and trails + // unsupported for non-NFD. + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + } + pendingTrieInsertions.push_back({c, uint32_t(utf32[0]) << 16, false}); + } else if (len == 2 && + utf32[0] <= 0xFFFF && + utf32[1] <= 0xFFFF && + !u_getCombiningClass(utf32[0]) && + u_getCombiningClass(utf32[1]) && + permissibleBmpPair(nonNfdOrRoundTrips, c, utf32[1])) { + for (int32_t i = 0; i < len; ++i) { + if (((utf32[i] == 0x0345) && (uprv_strcmp(basename, "uts46d") == 0)) || utf32[i] == 0xFF9E || utf32[i] == 0xFF9F) { + // Assert that iota subscript and half-width voicing marks never occur in these + // expansions in the normalization forms where they are special. + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + } + if (startsWithBackwardCombiningStarter) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + pendingTrieInsertions.push_back({c, (uint32_t(utf32[0]) << 16) | uint32_t(utf32[1]), false}); + } else { + if (startsWithBackwardCombiningStarter) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + + UBool supplementary = false; + UBool nonInitialStarter = false; + for (int32_t i = 0; i < len; ++i) { + if (((utf32[i] == 0x0345) && (uprv_strcmp(basename, "uts46d") == 0)) || utf32[i] == 0xFF9E || utf32[i] == 0xFF9F) { + // Assert that iota subscript and half-width voicing marks never occur in these + // expansions in the normalization forms where they are special. + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + + if (utf32[i] > 0xFFFF) { + supplementary = true; + } + if (utf32[i] == 0) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + if (i != 0 && !u_getCombiningClass(utf32[i])) { + nonInitialStarter = true; + } + } + if (!supplementary) { + if (len > LONGEST_ENCODABLE_LENGTH_16 || !len || len == 1) { + if (len == 18 && c == 0xFDFA) { + // Special marker for the one character whose decomposition + // is too long. + pendingTrieInsertions.push_back({c, FDFA_MARKER << 16, supplementary}); + continue; + } else { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + } + } else if (len > LONGEST_ENCODABLE_LENGTH_32 || !len) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + // Complex decomposition + // Format for 16-bit value: + // 15..13: length minus two for 16-bit case and length minus one for + // the 32-bit case. Length 8 needs to fit in three bits in + // the 16-bit case, and this way the value is future-proofed + // up to 9 in the 16-bit case. Zero is unused and length one + // in the 16-bit case goes directly into the trie. + // 12: 1 if all trailing characters are guaranteed non-starters, + // 0 if no guarantees about non-starterness. + // Note: The bit choice is this way around to allow for + // dynamically falling back to not having this but instead + // having one more bit for length by merely choosing + // different masks. + // 11..0: Start offset in storage. The offset is to the logical + // sequence of scalars16, scalars32, supplementary_scalars16, + // supplementary_scalars32. + uint32_t descriptor = uint32_t(!nonInitialStarter) << 12; + if (!supplementary) { + descriptor |= (uint32_t(len) - 2) << 13; + } else { + descriptor |= (uint32_t(len) - 1) << 13; + } + if (descriptor & 0xFFF) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + size_t index = 0; + bool writeToStorage = false; + // Sadly, C++ lacks break and continue by label, so using goto in the + // inner loops to break or continue the outer loop. + if (!supplementary) { + outer16: for (;;) { + if (index == storage16.size()) { + writeToStorage = true; + break; + } + if (storage16[index] == utf32[0]) { + for (int32_t i = 1; i < len; ++i) { + if (storage16[index + i] != uint32_t(utf32[i])) { + ++index; + // continue outer + goto outer16; + } + } + // break outer + goto after; + } + ++index; + } + } else { + outer32: for (;;) { + if (index == storage32.size()) { + writeToStorage = true; + break; + } + if (storage32[index] == uint32_t(utf32[0])) { + for (int32_t i = 1; i < len; ++i) { + if (storage32[index + i] != uint32_t(utf32[i])) { + ++index; + // continue outer + goto outer32; + } + } + // break outer + goto after; + } + ++index; + } + } + after: + if (index > 0xFFF) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + descriptor |= uint32_t(index); + if (!descriptor || descriptor > 0xFFFF) { + // > 0xFFFF should never happen if the code above is correct. + // == 0 should not happen due to the nature of the data. + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + if (writeToStorage) { + if (!supplementary) { + for (int32_t i = 0; i < len; ++i) { + storage16.push_back(uint16_t(utf32[i])); + } + } else { + for (int32_t i = 0; i < len; ++i) { + storage32.push_back(uint32_t(utf32[i])); + } + } + } + + uint32_t nonRoundTripMarker = 0; + if (!nonNfdOrRoundTrips) { + nonRoundTripMarker = (NON_ROUND_TRIP_MARKER << 16); + } + pendingTrieInsertions.push_back({c, descriptor | nonRoundTripMarker, supplementary}); + } + } + if (storage16.size() + storage32.size() > 0xFFF) { + status.set(U_INTERNAL_PROGRAM_ERROR); + } + if (f) { + usrc_writeArray(f, "scalars32 = [\n ", nonRecursive32.data(), 32, nonRecursive32.size(), " ", "\n]\n"); + + LocalUCPTriePointer utrie(umutablecptrie_buildImmutable( + nonRecursiveBuilder.getAlias(), + trieType, + UCPTRIE_VALUE_BITS_32, + status)); + handleError(status, basename); + + fprintf(f, "[trie]\n"); + usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML); + + fclose(f); + } + handleError(status, basename); +} + +#endif // !UCONFIG_NO_NORMALIZATION + +enum { + OPT_HELP_H, + OPT_HELP_QUESTION_MARK, + OPT_MODE, + OPT_TRIE_TYPE, + OPT_VERSION, + OPT_DESTDIR, + OPT_ALL, + OPT_INDEX, + OPT_COPYRIGHT, + OPT_VERBOSE, + OPT_QUIET, + + OPT_COUNT +}; + +#define UOPTION_MODE UOPTION_DEF("mode", 'm', UOPT_REQUIRES_ARG) +#define UOPTION_TRIE_TYPE UOPTION_DEF("trie-type", '\1', UOPT_REQUIRES_ARG) +#define UOPTION_ALL UOPTION_DEF("all", '\1', UOPT_NO_ARG) +#define UOPTION_INDEX UOPTION_DEF("index", '\1', UOPT_NO_ARG) + +static UOption options[]={ + UOPTION_HELP_H, + UOPTION_HELP_QUESTION_MARK, + UOPTION_MODE, + UOPTION_TRIE_TYPE, + UOPTION_VERSION, + UOPTION_DESTDIR, + UOPTION_ALL, + UOPTION_INDEX, + UOPTION_COPYRIGHT, + UOPTION_VERBOSE, + UOPTION_QUIET, +}; + +void printHelp(FILE* stdfile, const char* program) { + fprintf(stdfile, + "usage: %s -m mode [-options] [--all | properties...]\n" + "\tdump Unicode property data to .toml files\n" + "options:\n" + "\t-h or -? or --help this usage text\n" + "\t-V or --version show a version message\n" + "\t-m or --mode mode: currently only 'uprops', 'ucase', and 'norm', but more may be added\n" + "\t --trie-type set the trie type (small or fast, default small)\n" + "\t-d or --destdir destination directory, followed by the path\n" + "\t --all write out all properties known to icuexportdata\n" + "\t --index write an _index.toml summarizing all data exported\n" + "\t-c or --copyright include a copyright notice\n" + "\t-v or --verbose Turn on verbose output\n" + "\t-q or --quiet do not display warnings and progress\n", + program); +} + +int exportUprops(int argc, char* argv[]) { + // Load list of Unicode properties + std::vector<const char*> propNames; + for (int i=1; i<argc; i++) { + propNames.push_back(argv[i]); + } + if (options[OPT_ALL].doesOccur) { + int i = UCHAR_BINARY_START; + while (true) { + if (i == UCHAR_BINARY_LIMIT) { + i = UCHAR_INT_START; + } + if (i == UCHAR_INT_LIMIT) { + i = UCHAR_GENERAL_CATEGORY_MASK; + } + if (i == UCHAR_GENERAL_CATEGORY_MASK + 1) { + i = UCHAR_BIDI_MIRRORING_GLYPH; + } + if (i == UCHAR_BIDI_MIRRORING_GLYPH + 1) { + i = UCHAR_SCRIPT_EXTENSIONS; + } + if (i == UCHAR_SCRIPT_EXTENSIONS + 1) { + break; + } + UProperty uprop = static_cast<UProperty>(i); + const char* propName = u_getPropertyName(uprop, U_SHORT_PROPERTY_NAME); + if (propName == nullptr) { + propName = u_getPropertyName(uprop, U_LONG_PROPERTY_NAME); + if (propName != nullptr && VERBOSE) { + std::cerr << "Note: falling back to long name for: " << propName << std::endl; + } + } + if (propName != nullptr) { + propNames.push_back(propName); + } else { + std::cerr << "Warning: Could not find name for: " << uprop << std::endl; + } + i++; + } + } + + if (propNames.empty() + || options[OPT_HELP_H].doesOccur + || options[OPT_HELP_QUESTION_MARK].doesOccur + || !options[OPT_MODE].doesOccur) { + FILE *stdfile=argc<0 ? stderr : stdout; + fprintf(stdfile, + "usage: %s -m uprops [-options] [--all | properties...]\n" + "\tdump Unicode property data to .toml files\n" + "options:\n" + "\t-h or -? or --help this usage text\n" + "\t-V or --version show a version message\n" + "\t-m or --mode mode: currently only 'uprops', but more may be added\n" + "\t --trie-type set the trie type (small or fast, default small)\n" + "\t-d or --destdir destination directory, followed by the path\n" + "\t --all write out all properties known to icuexportdata\n" + "\t --index write an _index.toml summarizing all data exported\n" + "\t-c or --copyright include a copyright notice\n" + "\t-v or --verbose Turn on verbose output\n" + "\t-q or --quiet do not display warnings and progress\n", + argv[0]); + return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; + } + + const char* mode = options[OPT_MODE].value; + if (uprv_strcmp(mode, "uprops") != 0) { + fprintf(stderr, "Invalid option for --mode (must be uprops)\n"); + return U_ILLEGAL_ARGUMENT_ERROR; + } + + if (options[OPT_TRIE_TYPE].doesOccur) { + if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "fast") == 0) { + trieType = UCPTRIE_TYPE_FAST; + } else if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "small") == 0) { + trieType = UCPTRIE_TYPE_SMALL; + } else { + fprintf(stderr, "Invalid option for --trie-type (must be small or fast)\n"); + return U_ILLEGAL_ARGUMENT_ERROR; + } + } + + for (const char* propName : propNames) { + UProperty propEnum = u_getPropertyEnum(propName); + if (propEnum == UCHAR_INVALID_CODE) { + std::cerr << "Error: Invalid property alias: " << propName << std::endl; + return U_ILLEGAL_ARGUMENT_ERROR; + } + + FILE* f = prepareOutputFile(propName); + + UVersionInfo versionInfo; + u_getUnicodeVersion(versionInfo); + char uvbuf[U_MAX_VERSION_STRING_LENGTH]; + u_versionToString(versionInfo, uvbuf); + fprintf(f, "icu_version = \"%s\"\nunicode_version = \"%s\"\n\n", + U_ICU_VERSION, + uvbuf); + + if (propEnum < UCHAR_BINARY_LIMIT) { + dumpBinaryProperty(propEnum, f); + } else if (UCHAR_INT_START <= propEnum && propEnum <= UCHAR_INT_LIMIT) { + dumpEnumeratedProperty(propEnum, f); + } else if (propEnum == UCHAR_GENERAL_CATEGORY_MASK) { + dumpGeneralCategoryMask(f); + } else if (propEnum == UCHAR_BIDI_MIRRORING_GLYPH) { + dumpBidiMirroringGlyph(f); + } else if (propEnum == UCHAR_SCRIPT_EXTENSIONS) { + dumpScriptExtensions(f); + } else { + std::cerr << "Don't know how to write property: " << propEnum << std::endl; + return U_INTERNAL_PROGRAM_ERROR; + } + + fclose(f); + } + + if (options[OPT_INDEX].doesOccur) { + FILE* f = prepareOutputFile("_index"); + fprintf(f, "index = [\n"); + for (const char* propName : propNames) { + // At this point, propName is a valid property name, so it should be alphanum ASCII + fprintf(f, " { filename=\"%s.toml\" },\n", propName); + } + fprintf(f, "]\n"); + fclose(f); + } + + return 0; +} + +struct AddRangeHelper { + UMutableCPTrie* ucptrie; +}; + +static UBool U_CALLCONV +addRangeToUCPTrie(const void* context, UChar32 start, UChar32 end, uint32_t value) { + IcuToolErrorCode status("addRangeToUCPTrie"); + UMutableCPTrie* ucptrie = ((const AddRangeHelper*) context)->ucptrie; + umutablecptrie_setRange(ucptrie, start, end, value, status); + handleError(status, "setRange"); + + return true; +} + +int exportCase(int argc, char* argv[]) { + if (argc > 1) { + fprintf(stderr, "ucase mode does not expect additional arguments\n"); + return U_ILLEGAL_ARGUMENT_ERROR; + } + (void) argv; // Suppress unused variable warning + + IcuToolErrorCode status("icuexportdata"); + LocalUMutableCPTriePointer builder(umutablecptrie_open(0, 0, status)); + handleError(status, "exportCase"); + + int32_t exceptionsLength, unfoldLength; + const UCaseProps *caseProps = ucase_getSingleton(&exceptionsLength, &unfoldLength); + const UTrie2* caseTrie = &caseProps->trie; + + AddRangeHelper helper = { builder.getAlias() }; + utrie2_enum(caseTrie, nullptr, addRangeToUCPTrie, &helper); + + UCPTrieValueWidth width = UCPTRIE_VALUE_BITS_16; + LocalUCPTriePointer utrie(umutablecptrie_buildImmutable( + builder.getAlias(), + trieType, + width, + status)); + handleError(status, "exportCase"); + + FILE* f = prepareOutputFile("ucase"); + + UVersionInfo versionInfo; + u_getUnicodeVersion(versionInfo); + char uvbuf[U_MAX_VERSION_STRING_LENGTH]; + u_versionToString(versionInfo, uvbuf); + fprintf(f, "icu_version = \"%s\"\nunicode_version = \"%s\"\n\n", + U_ICU_VERSION, + uvbuf); + + fputs("[ucase.code_point_trie]\n", f); + usrc_writeUCPTrie(f, "case_trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML); + fputs("\n", f); + + const char* indent = " "; + const char* suffix = "\n]\n"; + + fputs("[ucase.exceptions]\n", f); + const char* exceptionsPrefix = "exceptions = [\n "; + int32_t exceptionsWidth = 16; + usrc_writeArray(f, exceptionsPrefix, caseProps->exceptions, exceptionsWidth, + exceptionsLength, indent, suffix); + fputs("\n", f); + + fputs("[ucase.unfold]\n", f); + const char* unfoldPrefix = "unfold = [\n "; + int32_t unfoldWidth = 16; + usrc_writeArray(f, unfoldPrefix, caseProps->unfold, unfoldWidth, + unfoldLength, indent, suffix); + + return 0; +} + +#if !UCONFIG_NO_NORMALIZATION + +int exportNorm() { + IcuToolErrorCode status("icuexportdata: exportNorm"); + USet* backwardCombiningStarters = uset_openEmpty(); + writeCanonicalCompositions(backwardCombiningStarters); + + std::vector<uint16_t> storage16; + std::vector<uint32_t> storage32; + + // Note: the USets are not exported. They are only used to check that a new + // Unicode version doesn't violate expectations that are hard-coded in ICU4X. + USet* nfdDecompositionStartsWithNonStarter = uset_openEmpty(); + USet* nfdDecompositionStartsWithBackwardCombiningStarter = uset_openEmpty(); + std::vector<PendingDescriptor> nfdPendingTrieInsertions; + UChar32 nfdBound = 0x10FFFF; + UChar32 nfcBound = 0x10FFFF; + computeDecompositions("nfd", + backwardCombiningStarters, + storage16, + storage32, + nfdDecompositionStartsWithNonStarter, + nfdDecompositionStartsWithBackwardCombiningStarter, + nfdPendingTrieInsertions, + nfdBound, + nfcBound); + if (!(nfdBound == 0xC0 && nfcBound == 0x300)) { + // Unexpected bounds for NFD/NFC. + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, "exportNorm"); + } + + uint32_t baseSize16 = storage16.size(); + uint32_t baseSize32 = storage32.size(); + + USet* nfkdDecompositionStartsWithNonStarter = uset_openEmpty(); + USet* nfkdDecompositionStartsWithBackwardCombiningStarter = uset_openEmpty(); + std::vector<PendingDescriptor> nfkdPendingTrieInsertions; + UChar32 nfkdBound = 0x10FFFF; + UChar32 nfkcBound = 0x10FFFF; + computeDecompositions("nfkd", + backwardCombiningStarters, + storage16, + storage32, + nfkdDecompositionStartsWithNonStarter, + nfkdDecompositionStartsWithBackwardCombiningStarter, + nfkdPendingTrieInsertions, + nfkdBound, + nfkcBound); + if (!(nfkdBound <= 0xC0 && nfkcBound <= 0x300)) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, "exportNorm"); + } + if (nfkcBound > 0xC0) { + if (nfkdBound != 0xC0) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, "exportNorm"); + } + } else { + if (nfkdBound != nfkcBound) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, "exportNorm"); + } + } + + USet* uts46DecompositionStartsWithNonStarter = uset_openEmpty(); + USet* uts46DecompositionStartsWithBackwardCombiningStarter = uset_openEmpty(); + std::vector<PendingDescriptor> uts46PendingTrieInsertions; + UChar32 uts46dBound = 0x10FFFF; + UChar32 uts46Bound = 0x10FFFF; + computeDecompositions("uts46d", + backwardCombiningStarters, + storage16, + storage32, + uts46DecompositionStartsWithNonStarter, + uts46DecompositionStartsWithBackwardCombiningStarter, + uts46PendingTrieInsertions, + uts46dBound, + uts46Bound); + if (!(uts46dBound <= 0xC0 && uts46Bound <= 0x300)) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, "exportNorm"); + } + if (uts46Bound > 0xC0) { + if (uts46dBound != 0xC0) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, "exportNorm"); + } + } else { + if (uts46dBound != uts46Bound) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, "exportNorm"); + } + } + + uint32_t supplementSize16 = storage16.size() - baseSize16; + uint32_t supplementSize32 = storage32.size() - baseSize32; + + writeDecompositionData("nfd", baseSize16, baseSize32, supplementSize16, nfdDecompositionStartsWithNonStarter, nullptr, nfdPendingTrieInsertions, char16_t(nfcBound)); + writeDecompositionData("nfkd", baseSize16, baseSize32, supplementSize16, nfkdDecompositionStartsWithNonStarter, nfdDecompositionStartsWithNonStarter, nfkdPendingTrieInsertions, char16_t(nfkcBound)); + writeDecompositionData("uts46d", baseSize16, baseSize32, supplementSize16, uts46DecompositionStartsWithNonStarter, nfdDecompositionStartsWithNonStarter, uts46PendingTrieInsertions, char16_t(uts46Bound)); + + writeDecompositionTables("nfdex", storage16.data(), baseSize16, storage32.data(), baseSize32); + writeDecompositionTables("nfkdex", storage16.data() + baseSize16, supplementSize16, storage32.data() + baseSize32, supplementSize32); + + uset_close(nfdDecompositionStartsWithNonStarter); + uset_close(nfkdDecompositionStartsWithNonStarter); + uset_close(uts46DecompositionStartsWithNonStarter); + + uset_close(nfdDecompositionStartsWithBackwardCombiningStarter); + uset_close(nfkdDecompositionStartsWithBackwardCombiningStarter); + uset_close(uts46DecompositionStartsWithBackwardCombiningStarter); + + uset_close(backwardCombiningStarters); + handleError(status, "exportNorm"); + return 0; +} + +#endif // !UCONFIG_NO_NORMALIZATION + +int main(int argc, char* argv[]) { + U_MAIN_INIT_ARGS(argc, argv); + + /* preset then read command line options */ + options[OPT_DESTDIR].value=u_getDataDirectory(); + argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options); + + if(options[OPT_VERSION].doesOccur) { + printf("icuexportdata version %s, ICU tool to dump data files for external consumers\n", + U_ICU_DATA_VERSION); + printf("%s\n", U_COPYRIGHT_STRING); + exit(0); + } + + /* error handling, printing usage message */ + if(argc<0) { + fprintf(stderr, + "error in command line argument \"%s\"\n", + argv[-argc]); + } + + if (argc < 0 + || options[OPT_HELP_H].doesOccur + || options[OPT_HELP_QUESTION_MARK].doesOccur + || !options[OPT_MODE].doesOccur) { + FILE *stdfile=argc<0 ? stderr : stdout; + printHelp(stdfile, argv[0]); + return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; + } + + /* get the options values */ + haveCopyright = options[OPT_COPYRIGHT].doesOccur; + destdir = options[OPT_DESTDIR].value; + VERBOSE = options[OPT_VERBOSE].doesOccur; + QUIET = options[OPT_QUIET].doesOccur; + + if (options[OPT_TRIE_TYPE].doesOccur) { + if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "fast") == 0) { + trieType = UCPTRIE_TYPE_FAST; + } else if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "small") == 0) { + trieType = UCPTRIE_TYPE_SMALL; + } else { + fprintf(stderr, "Invalid option for --trie-type (must be small or fast)\n"); + return U_ILLEGAL_ARGUMENT_ERROR; + } + } + + const char* mode = options[OPT_MODE].value; + if (uprv_strcmp(mode, "norm") == 0) { +#if !UCONFIG_NO_NORMALIZATION + return exportNorm(); +#else + fprintf(stderr, "Exporting normalization data not supported when compiling without normalization support.\n"); + return U_ILLEGAL_ARGUMENT_ERROR; +#endif + } + if (uprv_strcmp(mode, "uprops") == 0) { + return exportUprops(argc, argv); + } else if (uprv_strcmp(mode, "ucase") == 0) { + return exportCase(argc, argv); + } + + fprintf(stderr, "Invalid option for --mode (must be uprops, ucase, or norm)\n"); + return U_ILLEGAL_ARGUMENT_ERROR; +} |