diff options
Diffstat (limited to '')
-rw-r--r-- | intl/icu/source/common/rbbisetb.h | 147 |
1 files changed, 147 insertions, 0 deletions
diff --git a/intl/icu/source/common/rbbisetb.h b/intl/icu/source/common/rbbisetb.h
new file mode 100644
index 0000000000..cd09d3317a
--- /dev/null
+++ b/intl/icu/source/common/rbbisetb.h
@@ -0,0 +1,147 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+//
+// rbbisetb.h
+/*
+**********************************************************************
+* Copyright (c) 2001-2005, International Business Machines
+* Corporation and others. All Rights Reserved.
+**********************************************************************
+*/
+
+#ifndef RBBISETB_H
+#define RBBISETB_H
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_BREAK_ITERATION
+
+#include "unicode/ucptrie.h"
+#include "unicode/umutablecptrie.h"
+#include "unicode/uobject.h"
+#include "rbbirb.h"
+#include "uvector.h"
+
+U_NAMESPACE_BEGIN
+
+//
+// RBBISetBuilder Derives the character categories used by the runtime RBBI engine
+// from the Unicode Sets appearing in the source RBBI rules, and
+// creates the TRIE table used to map from Unicode to the
+// character categories.
+//
+
+
+//
+// RangeDescriptor
+//
+// Each of the non-overlapping character ranges gets one of these descriptors.
+// All of them are strung together in a linked list, which is kept in order
+// (by character)
+//
+class RangeDescriptor : public UMemory {
+public:
+    UChar32 fStartChar {}; // Start of range, unicode 32 bit value.
+    UChar32 fEndChar {}; // End of range, unicode 32 bit value.
+    int32_t fNum {0}; // runtime-mapped input value for this range.
+    bool fIncludesDict {false}; // True if the range includes $dictionary.
+    bool fFirstInGroup {false}; // True if first range in a group with the same fNum.
+    UVector *fIncludesSets {nullptr}; // Vector of the original
+    // Unicode sets that include this range.
+    // (Contains ptrs to uset nodes)
+    RangeDescriptor *fNext {nullptr}; // Next RangeDescriptor in the linked list.
+
+    RangeDescriptor(UErrorCode &status);
+    RangeDescriptor(const RangeDescriptor &other, UErrorCode &status); // Copy-style constructor that also takes an error code.
+    ~RangeDescriptor();
+    void split(UChar32 where, UErrorCode &status); // Split this range in two at "where", with
+    // where appearing in the second (higher) part.
+    bool isDictionaryRange(); // Check whether this range appears as part of
+    // the Unicode set named "dictionary"
+
+    RangeDescriptor(const RangeDescriptor &other) = delete; // forbid default copying of this class
+    RangeDescriptor &operator=(const RangeDescriptor &other) = delete; // forbid assigning of this class
+};
+
+
+//
+// RBBISetBuilder Handles processing of Unicode Sets from RBBI rules.
+//
+// Starting with the rules parse tree from the scanner,
+//
+// - Enumerate the set of UnicodeSets that are referenced
+// by the RBBI rules.
+// - compute a derived set of non-overlapping UnicodeSets
+// that will correspond to columns in the state table for
+// the RBBI execution engine.
+// - construct the trie table that maps input characters
+// to set numbers in the non-overlapping set of sets.
+//
+
+
+class RBBISetBuilder : public UMemory {
+public:
+    RBBISetBuilder(RBBIRuleBuilder *rb);
+    ~RBBISetBuilder();
+
+    void buildRanges(); // NOTE(review): presumably computes the non-overlapping range list (fRangeList) - confirm in rbbisetb.cpp.
+    void buildTrie(); // NOTE(review): presumably builds the character-to-category trie from the ranges - confirm in rbbisetb.cpp.
+    void addValToSets(UVector *sets, uint32_t val); // NOTE(review): looks like it applies addValToSet() to each entry of "sets" - confirm.
+    void addValToSet (RBBINode *usetNode, uint32_t val); // NOTE(review): presumably records category "val" on the given uset node - confirm.
+    int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the
+    // runtime state machine, which are the same as
+    // columns in the DFA state table
+    int32_t getDictCategoriesStart() const; // First char category that includes $dictionary, or
+    // last category + 1 if there are no dictionary categories.
+    int32_t getTrieSize() /*const*/; // Size in bytes of the serialized Trie.
+    void serializeTrie(uint8_t *where); // write out the serialized Trie.
+    UChar32 getFirstChar(int32_t val) const; // NOTE(review): presumably the first character mapped to category "val" - confirm.
+    UBool sawBOF() const; // Indicate whether any references to the {bof} pseudo
+    // character were encountered.
+    /**
+     * Merge two character categories that have been identified as having equivalent behavior.
+     * The ranges belonging to the second category (table column) will be added to the first.
+     * @param categories the pair of categories to be merged.
+     */
+    void mergeCategories(IntPair categories);
+
+#ifdef RBBI_DEBUG
+    void printSets();
+    void printRanges();
+    void printRangeGroups();
+#else
+    #define printSets()
+    #define printRanges()
+    #define printRangeGroups()
+#endif
+
+private:
+    RBBIRuleBuilder *fRB; // The RBBI Rule Compiler that owns us.
+    UErrorCode *fStatus; // NOTE(review): presumably the error code shared with fRB - confirm in the constructor.
+
+    RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors
+
+    UMutableCPTrie *fMutableTrie; // The mapping TRIE that is the end result of processing
+    UCPTrie *fTrie; // the Unicode Sets.
+    uint32_t fTrieSize; // NOTE(review): presumably the byte size reported by getTrieSize() - confirm.
+
+    // Number of range groups, which are groups of ranges that are in the same original UnicodeSets.
+    int32_t fGroupCount;
+
+    // The number of the first dictionary char category.
+    // If there are no Dictionary categories, set to the last category + 1.
+    int32_t fDictCategoriesStart;
+
+    UBool fSawBOF; // NOTE(review): presumably the flag returned by sawBOF() - confirm.
+
+    RBBISetBuilder(const RBBISetBuilder &other) = delete; // forbid copying of this class
+    RBBISetBuilder &operator=(const RBBISetBuilder &other) = delete; // forbid copying of this class
+};
+
+
+
+U_NAMESPACE_END
+
+#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
+
+#endif |