diff options
Diffstat (limited to 'intl/icu/source/common/rbbirb.cpp')
-rw-r--r-- | intl/icu/source/common/rbbirb.cpp | 361 |
1 files changed, 361 insertions, 0 deletions
diff --git a/intl/icu/source/common/rbbirb.cpp b/intl/icu/source/common/rbbirb.cpp new file mode 100644 index 0000000000..7177254ec4 --- /dev/null +++ b/intl/icu/source/common/rbbirb.cpp @@ -0,0 +1,361 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +// +// file: rbbirb.cpp +// +// Copyright (C) 2002-2011, International Business Machines Corporation and others. +// All Rights Reserved. +// +// This file contains the RBBIRuleBuilder class implementation. This is the main class for +// building (compiling) break rules into the tables required by the runtime +// RBBI engine. +// + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_BREAK_ITERATION + +#include "unicode/brkiter.h" +#include "unicode/rbbi.h" +#include "unicode/ubrk.h" +#include "unicode/unistr.h" +#include "unicode/uniset.h" +#include "unicode/uchar.h" +#include "unicode/uchriter.h" +#include "unicode/ustring.h" +#include "unicode/parsepos.h" +#include "unicode/parseerr.h" + +#include "cmemory.h" +#include "cstring.h" +#include "rbbirb.h" +#include "rbbinode.h" +#include "rbbiscan.h" +#include "rbbisetb.h" +#include "rbbitblb.h" +#include "rbbidata.h" +#include "uassert.h" + + +U_NAMESPACE_BEGIN + + +//---------------------------------------------------------------------------------------- +// +// Constructor. +// +//---------------------------------------------------------------------------------------- +RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules, + UParseError *parseErr, + UErrorCode &status) + : fRules(rules), fStrippedRules(rules) +{ + fStatus = &status; // status is checked below + fParseError = parseErr; + fDebugEnv = nullptr; +#ifdef RBBI_DEBUG + fDebugEnv = getenv("U_RBBIDEBUG"); +#endif + + + fForwardTree = nullptr; + fReverseTree = nullptr; + fSafeFwdTree = nullptr; + fSafeRevTree = nullptr; + fDefaultTree = &fForwardTree; + fForwardTable = nullptr; + fRuleStatusVals = nullptr; + fChainRules = false; + fLBCMNoChain = false; + fLookAheadHardBreak = false; + fUSetNodes = nullptr; + fRuleStatusVals = nullptr; + fScanner = nullptr; + fSetBuilder = nullptr; + if (parseErr) { + uprv_memset(parseErr, 0, sizeof(UParseError)); + } + + if (U_FAILURE(status)) { + return; + } + + fUSetNodes = new UVector(status); // bcos status gets overwritten here + fRuleStatusVals = new UVector(status); + fScanner = new RBBIRuleScanner(this); + fSetBuilder = new RBBISetBuilder(this); + if (U_FAILURE(status)) { + return; + } + if(fSetBuilder == 0 || fScanner == 0 || fUSetNodes == 0 || fRuleStatusVals == 0) { + status = U_MEMORY_ALLOCATION_ERROR; + } +} + + + +//---------------------------------------------------------------------------------------- +// +// Destructor +// +//---------------------------------------------------------------------------------------- +RBBIRuleBuilder::~RBBIRuleBuilder() { + + int i; + for (i=0; ; i++) { + RBBINode *n = (RBBINode *)fUSetNodes->elementAt(i); + if (n==nullptr) { + break; + } + delete n; + } + + delete fUSetNodes; + delete fSetBuilder; + delete fForwardTable; + delete fForwardTree; + delete fReverseTree; + delete fSafeFwdTree; + delete fSafeRevTree; + delete fScanner; + delete fRuleStatusVals; +} + + + + + +//---------------------------------------------------------------------------------------- +// +// flattenData() - Collect up the compiled RBBI rule data and put it into +// the format for saving in ICU data files, +// which is also the format needed by the RBBI runtime engine. +// +//---------------------------------------------------------------------------------------- +static int32_t align8(int32_t i) {return (i+7) & 0xfffffff8;} + +RBBIDataHeader *RBBIRuleBuilder::flattenData() { + int32_t i; + + if (U_FAILURE(*fStatus)) { + return nullptr; + } + + // Remove whitespace from the rules to make it smaller. + // The rule parser has already removed comments. + fStrippedRules = fScanner->stripRules(fStrippedRules); + + // Calculate the size of each section in the data. + // Sizes here are padded up to a multiple of 8 for better memory alignment. + // Sections sizes actually stored in the header are for the actual data + // without the padding. + // + int32_t headerSize = align8(sizeof(RBBIDataHeader)); + int32_t forwardTableSize = align8(fForwardTable->getTableSize()); + int32_t reverseTableSize = align8(fForwardTable->getSafeTableSize()); + int32_t trieSize = align8(fSetBuilder->getTrieSize()); + int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t)); + + int32_t rulesLengthInUTF8 = 0; + u_strToUTF8WithSub(0, 0, &rulesLengthInUTF8, + fStrippedRules.getBuffer(), fStrippedRules.length(), + 0xfffd, nullptr, fStatus); + *fStatus = U_ZERO_ERROR; + + int32_t rulesSize = align8((rulesLengthInUTF8+1)); + + int32_t totalSize = headerSize + + forwardTableSize + + reverseTableSize + + statusTableSize + trieSize + rulesSize; + +#ifdef RBBI_DEBUG + if (fDebugEnv && uprv_strstr(fDebugEnv, "size")) { + RBBIDebugPrintf("Header Size: %8d\n", headerSize); + RBBIDebugPrintf("Forward Table Size: %8d\n", forwardTableSize); + RBBIDebugPrintf("Reverse Table Size: %8d\n", reverseTableSize); + RBBIDebugPrintf("Trie Size: %8d\n", trieSize); + RBBIDebugPrintf("Status Table Size: %8d\n", statusTableSize); + RBBIDebugPrintf("Rules Size: %8d\n", rulesSize); + RBBIDebugPrintf("-----------------------------\n"); + RBBIDebugPrintf("Total Size: %8d\n", totalSize); + } +#endif + + RBBIDataHeader *data = (RBBIDataHeader *)uprv_malloc(totalSize); + if (data == nullptr) { + *fStatus = U_MEMORY_ALLOCATION_ERROR; + return nullptr; + } + uprv_memset(data, 0, totalSize); + + + data->fMagic = 0xb1a0; + data->fFormatVersion[0] = RBBI_DATA_FORMAT_VERSION[0]; + data->fFormatVersion[1] = RBBI_DATA_FORMAT_VERSION[1]; + data->fFormatVersion[2] = RBBI_DATA_FORMAT_VERSION[2]; + data->fFormatVersion[3] = RBBI_DATA_FORMAT_VERSION[3]; + data->fLength = totalSize; + data->fCatCount = fSetBuilder->getNumCharCategories(); + + data->fFTable = headerSize; + data->fFTableLen = forwardTableSize; + + data->fRTable = data->fFTable + data->fFTableLen; + data->fRTableLen = reverseTableSize; + + data->fTrie = data->fRTable + data->fRTableLen; + data->fTrieLen = trieSize; + data->fStatusTable = data->fTrie + data->fTrieLen; + data->fStatusTableLen= statusTableSize; + data->fRuleSource = data->fStatusTable + statusTableSize; + data->fRuleSourceLen = rulesLengthInUTF8; + + uprv_memset(data->fReserved, 0, sizeof(data->fReserved)); + + fForwardTable->exportTable((uint8_t *)data + data->fFTable); + fForwardTable->exportSafeTable((uint8_t *)data + data->fRTable); + fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie); + + int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable); + for (i=0; i<fRuleStatusVals->size(); i++) { + ruleStatusTable[i] = fRuleStatusVals->elementAti(i); + } + + u_strToUTF8WithSub((char *)data+data->fRuleSource, rulesSize, &rulesLengthInUTF8, + fStrippedRules.getBuffer(), fStrippedRules.length(), + 0xfffd, nullptr, fStatus); + if (U_FAILURE(*fStatus)) { + return nullptr; + } + + return data; +} + + +//---------------------------------------------------------------------------------------- +// +// createRuleBasedBreakIterator construct from source rules that are passed in +// in a UnicodeString +// +//---------------------------------------------------------------------------------------- +BreakIterator * +RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules, + UParseError *parseError, + UErrorCode &status) +{ + // + // Read the input rules, generate a parse tree, symbol table, + // and list of all Unicode Sets referenced by the rules. + // + RBBIRuleBuilder builder(rules, parseError, status); + if (U_FAILURE(status)) { // status checked here bcos build below doesn't + return nullptr; + } + + RBBIDataHeader *data = builder.build(status); + + if (U_FAILURE(status)) { + return nullptr; + } + + // + // Create a break iterator from the compiled rules. + // (Identical to creation from stored pre-compiled rules) + // + // status is checked after init in construction. + RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status); + if (U_FAILURE(status)) { + delete This; + This = nullptr; + } + else if(This == nullptr) { // test for nullptr + status = U_MEMORY_ALLOCATION_ERROR; + } + return This; +} + +RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) { + if (U_FAILURE(status)) { + return nullptr; + } + + fScanner->parse(); + if (U_FAILURE(status)) { + return nullptr; + } + + // + // UnicodeSet processing. + // Munge the Unicode Sets to create an initial set of character categories. + // + fSetBuilder->buildRanges(); + + // + // Generate the DFA state transition table. + // + fForwardTable = new RBBITableBuilder(this, &fForwardTree, status); + if (fForwardTable == nullptr) { + status = U_MEMORY_ALLOCATION_ERROR; + return nullptr; + } + + fForwardTable->buildForwardTable(); + + // State table and character category optimization. + // Merge equivalent rows and columns. + // Note that this process alters the initial set of character categories, + // causing the representation of UnicodeSets in the parse tree to become invalid. + + optimizeTables(); + fForwardTable->buildSafeReverseTable(status); + + +#ifdef RBBI_DEBUG + if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) { + fForwardTable->printStates(); + fForwardTable->printRuleStatusTable(); + fForwardTable->printReverseTable(); + } +#endif + + // Generate the mapping tables (TRIE) from input code points to + // the character categories. + // + fSetBuilder->buildTrie(); + + // + // Package up the compiled data into a memory image + // in the run-time format. + // + RBBIDataHeader *data = flattenData(); // returns nullptr if error + if (U_FAILURE(status)) { + return nullptr; + } + return data; +} + +void RBBIRuleBuilder::optimizeTables() { + bool didSomething; + do { + didSomething = false; + + // Begin looking for duplicates with char class 3. + // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively, + // and should not have other categories merged into them. + IntPair duplPair = {3, 0}; + while (fForwardTable->findDuplCharClassFrom(&duplPair)) { + fSetBuilder->mergeCategories(duplPair); + fForwardTable->removeColumn(duplPair.second); + didSomething = true; + } + + while (fForwardTable->removeDuplicateStates() > 0) { + didSomething = true; + } + } while (didSomething); +} + +U_NAMESPACE_END + +#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |