summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/common/rbbirb.h
diff options
context:
space:
mode:
Diffstat (limited to 'intl/icu/source/common/rbbirb.h')
-rw-r--r--intl/icu/source/common/rbbirb.h237
1 files changed, 237 insertions, 0 deletions
diff --git a/intl/icu/source/common/rbbirb.h b/intl/icu/source/common/rbbirb.h
new file mode 100644
index 0000000000..d983a184b6
--- /dev/null
+++ b/intl/icu/source/common/rbbirb.h
@@ -0,0 +1,237 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+//
+// rbbirb.h
+//
+// Copyright (C) 2002-2008, International Business Machines Corporation and others.
+// All Rights Reserved.
+//
+// This file contains declarations for several classes from the
+// Rule Based Break Iterator rule builder.
+//
+
+
+#ifndef RBBIRB_H
+#define RBBIRB_H
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_BREAK_ITERATION
+
+#include <utility>
+
+#include "unicode/uobject.h"
+#include "unicode/rbbi.h"
+#include "unicode/uniset.h"
+#include "unicode/parseerr.h"
+#include "uhash.h"
+#include "uvector.h"
+#include "unicode/symtable.h"// For UnicodeSet parsing: the interface that
+ // looks up references to $variables within a set.
+
+
+U_NAMESPACE_BEGIN
+
+class RBBIRuleScanner;
+struct RBBIRuleTableEl;
+class RBBISetBuilder;
+class RBBINode;
+class RBBITableBuilder;
+
+
+
+//--------------------------------------------------------------------------------
+//
+// RBBISymbolTable. Implements SymbolTable interface that is used by the
+// UnicodeSet parser to resolve references to $variables.
+//
+//--------------------------------------------------------------------------------
+// One record of the symbol table: the symbol table's hash table holds one of
+// these objects per entry, pairing a key string with an RBBINode pointer.
+class RBBISymbolTableEntry : public UMemory {
+public:
+    // Constructor and destructor are only declared in this header.
+    RBBISymbolTableEntry();
+    ~RBBISymbolTableEntry();
+
+    // Payload of the entry (declaration order: key, then val).
+    UnicodeString  key;
+    RBBINode      *val;
+
+private:
+    // Entries cannot be copied.
+    RBBISymbolTableEntry(const RBBISymbolTableEntry &other) = delete;
+    RBBISymbolTableEntry &operator=(const RBBISymbolTableEntry &other) = delete;
+};
+
+
+// Symbol table used by the rule builder.  It implements the SymbolTable
+// interface so that the UnicodeSet parser can resolve references to $variables.
+class RBBISymbolTable : public UMemory, public SymbolTable {
+private:
+    const UnicodeString      &fRules;
+    UHashtable               *fHashTable;
+    RBBIRuleScanner          *fRuleScanner;
+
+    // These next two fields are part of the mechanism for passing references to
+    // already-constructed UnicodeSets back to the UnicodeSet constructor
+    // when the pattern includes $variable references.
+    const UnicodeString       ffffString;      // = "/uffff"
+    UnicodeSet               *fCachedSetLookup;
+
+public:
+    // API inherited from class SymbolTable
+    virtual const UnicodeString*  lookup(const UnicodeString& s) const override;
+    virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const override;
+    virtual UnicodeString parseReference(const UnicodeString& text,
+                                         ParsePosition& pos, int32_t limit) const override;
+
+    // Additional Functions (not part of the SymbolTable interface)
+    RBBISymbolTable(RBBIRuleScanner *, const UnicodeString &fRules, UErrorCode &status);
+    virtual ~RBBISymbolTable();
+
+    virtual RBBINode *lookupNode(const UnicodeString &key) const;
+    virtual void      addEntry  (const UnicodeString &key, RBBINode *val, UErrorCode &err);
+
+#ifdef RBBI_DEBUG
+    virtual void      rbbiSymtablePrint() const;
+#else
+    // A do-nothing inline function for non-debug builds.  Member funcs can't be empty
+    // or the call sites won't compile.
+    // In this branch rbbiSymtablePrint() is a macro that expands to an
+    // assignment to fFakeField (note the trailing semicolon in the expansion).
+    int32_t fFakeField;
+    #define rbbiSymtablePrint() fFakeField=0;
+#endif
+
+private:
+    // Copying is forbidden.  The operations are explicitly deleted, as for the
+    // other non-copyable classes in this header, so that any attempted copy is
+    // rejected at compile time instead of surfacing as an unresolved symbol.
+    RBBISymbolTable(const RBBISymbolTable &other) = delete;
+    RBBISymbolTable &operator=(const RBBISymbolTable &other) = delete;
+};
+
+
+//--------------------------------------------------------------------------------
+//
+// class RBBIRuleBuilder The top-level class handling RBBI rule compiling.
+//
+//--------------------------------------------------------------------------------
+class RBBIRuleBuilder : public UMemory { // Top-level class handling RBBI rule compiling.
+public:
+
+ // Create a rule based break iterator from a set of rules.
+ // This function is the main entry point into the rule builder. The
+ // public ICU API for creating RBBIs uses this function to do the actual work.
+ // Takes the rule source text, a UParseError pointer and a UErrorCode reference; returns a BreakIterator pointer.
+ static BreakIterator * createRuleBasedBreakIterator( const UnicodeString &rules,
+ UParseError *parseError,
+ UErrorCode &status);
+
+public:
+ // The "public" functions and data members that appear below are accessed
+ // (and shared) by the various parts that make up the rule builder. They
+ // are NOT intended to be accessed by anything outside of the
+ // rule builder implementation.
+ RBBIRuleBuilder(const UnicodeString &rules,
+ UParseError *parseErr,
+ UErrorCode &status
+ );
+
+ virtual ~RBBIRuleBuilder();
+
+ /**
+ * Build the state tables and char class Trie from the source rules.
+ */
+ RBBIDataHeader *build(UErrorCode &status);
+
+
+ /**
+ * Fold together redundant character classes (table columns) and
+ * redundant states (table rows). Done after initial table generation,
+ * before serializing the result.
+ */
+ void optimizeTables();
+
+ char *fDebugEnv; // controls debug trace output
+ UErrorCode *fStatus; // Error reporting. Keeping status
+ UParseError *fParseError; // here avoids passing it everywhere.
+ const UnicodeString &fRules; // The rule string that we are compiling
+ UnicodeString fStrippedRules; // The rule string, with comments stripped.
+
+ RBBIRuleScanner *fScanner; // The scanner.
+ RBBINode *fForwardTree; // The parse trees, generated by the scanner,
+ RBBINode *fReverseTree; // then manipulated by subsequent steps.
+ RBBINode *fSafeFwdTree; // NOTE(review): by name, the "safe forward" parse tree - confirm against the scanner.
+ RBBINode *fSafeRevTree; // NOTE(review): by name, the "safe reverse" parse tree - confirm against the scanner.
+
+ RBBINode **fDefaultTree; // For rules not qualified with a !
+ // the tree to which they belong.
+
+ UBool fChainRules; // True for chained Unicode TR style rules.
+ // False for traditional regexp rules.
+
+ UBool fLBCMNoChain; // True: suppress chaining of rules on
+ // chars with LineBreak property == CM.
+
+ UBool fLookAheadHardBreak; // True: Look ahead matches cause an
+ // immediate break, no continuing for the
+ // longest match.
+
+ RBBISetBuilder *fSetBuilder; // Set and Character Category builder.
+ UVector *fUSetNodes; // Vector of all uset nodes.
+
+ RBBITableBuilder *fForwardTable; // State transition table, build time form.
+
+ UVector *fRuleStatusVals; // The values that can be returned
+ // from getRuleStatus().
+
+ RBBIDataHeader *flattenData(); // Create the flattened (runtime format)
+ // data tables.
+private:
+ RBBIRuleBuilder(const RBBIRuleBuilder &other) = delete; // forbid copying of this class
+ RBBIRuleBuilder &operator=(const RBBIRuleBuilder &other) = delete; // forbid copying of this class
+};
+
+
+
+
+//----------------------------------------------------------------------------
+//
+// RBBISetTableEl is an entry in the hash table of UnicodeSets that have
+// been encountered. The val Node will be of nodetype uset
+// and contain pointers to the actual UnicodeSets.
+// The Key is the source string for initializing the set.
+//
+// The hash table is used to avoid creating duplicate
+// unnamed (not $var references) UnicodeSets.
+//
+// Memory Management:
+// The Hash Table owns these RBBISetTableEl structs and
+// the key strings. It does NOT own the val nodes.
+//
+//----------------------------------------------------------------------------
+struct RBBISetTableEl { // One entry in the hash table of UnicodeSets that have been encountered.
+ UnicodeString *key; // Source string for initializing the set; owned by the hash table.
+ RBBINode *val; // Node of nodetype uset; NOT owned by the hash table.
+};
+
+/**
+ * IntPair bundles two int32_t values; it is used for pairs of states
+ * or pairs of character classes.
+ */
+using IntPair = std::pair<int32_t, int32_t>;
+
+
+//----------------------------------------------------------------------------
+//
+// RBBIDebugPrintf Printf equivalent, for debugging output.
+// Conditional compilation of the implementation lets us
+// get rid of the stdio dependency in environments where it
+// is unavailable.
+//
+//----------------------------------------------------------------------------
+#ifdef RBBI_DEBUG
+#include <stdio.h>
+#define RBBIDebugPrintf printf // Debug builds: RBBIDebugPrintf is stdio printf.
+#define RBBIDebugPuts puts // Debug builds: RBBIDebugPuts is stdio puts.
+#else
+#undef RBBIDebugPrintf // Non-debug builds: RBBIDebugPrintf is left undefined (no stdio dependency).
+#define RBBIDebugPuts(arg) // Non-debug builds: RBBIDebugPuts(arg) expands to nothing.
+#endif
+
+U_NAMESPACE_END
+
+#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
+
+#endif
+
+
+