diff options
Diffstat (limited to 'intl/icu/source/common/brkeng.cpp')
-rw-r--r-- | intl/icu/source/common/brkeng.cpp | 284 |
1 files changed, 284 insertions, 0 deletions
diff --git a/intl/icu/source/common/brkeng.cpp b/intl/icu/source/common/brkeng.cpp new file mode 100644 index 0000000000..78492db662 --- /dev/null +++ b/intl/icu/source/common/brkeng.cpp @@ -0,0 +1,284 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* + ************************************************************************************ + * Copyright (C) 2006-2016, International Business Machines Corporation + * and others. All Rights Reserved. + ************************************************************************************ + */ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_BREAK_ITERATION + +#include "unicode/uchar.h" +#include "unicode/uniset.h" +#include "unicode/chariter.h" +#include "unicode/ures.h" +#include "unicode/udata.h" +#include "unicode/putil.h" +#include "unicode/ustring.h" +#include "unicode/uscript.h" +#include "unicode/ucharstrie.h" +#include "unicode/bytestrie.h" + +#include "brkeng.h" +#include "cmemory.h" +#include "dictbe.h" +#include "charstr.h" +#include "dictionarydata.h" +#include "mutex.h" +#include "uvector.h" +#include "umutex.h" +#include "uresimp.h" +#include "ubrkimpl.h" + +U_NAMESPACE_BEGIN + +/* + ****************************************************************** + */ + +LanguageBreakEngine::LanguageBreakEngine() { +} + +LanguageBreakEngine::~LanguageBreakEngine() { +} + +/* + ****************************************************************** + */ + +LanguageBreakFactory::LanguageBreakFactory() { +} + +LanguageBreakFactory::~LanguageBreakFactory() { +} + +/* + ****************************************************************** + */ + +UnhandledEngine::UnhandledEngine(UErrorCode &status) : fHandled(nullptr) { + (void)status; +} + +UnhandledEngine::~UnhandledEngine() { + delete fHandled; + fHandled = nullptr; +} + +UBool +UnhandledEngine::handles(UChar32 c) const { + return fHandled && fHandled->contains(c); +} + +int32_t +UnhandledEngine::findBreaks( UText *text, + int32_t /* startPos */, + int32_t endPos, + UVector32 &/*foundBreaks*/ ) const { + UChar32 c = utext_current32(text); + while((int32_t)utext_getNativeIndex(text) < endPos && fHandled->contains(c)) { + utext_next32(text); // TODO: recast loop to work with post-increment operations. + c = utext_current32(text); + } + return 0; +} + +void +UnhandledEngine::handleCharacter(UChar32 c) { + if (fHandled == nullptr) { + fHandled = new UnicodeSet(); + if (fHandled == nullptr) { + return; + } + } + if (!fHandled->contains(c)) { + UErrorCode status = U_ZERO_ERROR; + // Apply the entire script of the character. + int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT); + fHandled->applyIntPropertyValue(UCHAR_SCRIPT, script, status); + } +} + +/* + ****************************************************************** + */ + +ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) { + fEngines = 0; +} + +ICULanguageBreakFactory::~ICULanguageBreakFactory() { + if (fEngines != 0) { + delete fEngines; + } +} + +U_NAMESPACE_END +U_CDECL_BEGIN +static void U_CALLCONV _deleteEngine(void *obj) { + delete (const icu::LanguageBreakEngine *) obj; +} +U_CDECL_END +U_NAMESPACE_BEGIN + +const LanguageBreakEngine * +ICULanguageBreakFactory::getEngineFor(UChar32 c) { + const LanguageBreakEngine *lbe = NULL; + UErrorCode status = U_ZERO_ERROR; + + static UMutex gBreakEngineMutex; + Mutex m(&gBreakEngineMutex); + + if (fEngines == NULL) { + UStack *engines = new UStack(_deleteEngine, NULL, status); + if (U_FAILURE(status) || engines == NULL) { + // Note: no way to return error code to caller. + delete engines; + return NULL; + } + fEngines = engines; + } else { + int32_t i = fEngines->size(); + while (--i >= 0) { + lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i)); + if (lbe != NULL && lbe->handles(c)) { + return lbe; + } + } + } + + // We didn't find an engine. Create one. + lbe = loadEngineFor(c); + if (lbe != NULL) { + fEngines->push((void *)lbe, status); + } + return lbe; +} + +const LanguageBreakEngine * +ICULanguageBreakFactory::loadEngineFor(UChar32 c) { + UErrorCode status = U_ZERO_ERROR; + UScriptCode code = uscript_getScript(c, &status); + if (U_SUCCESS(status)) { + DictionaryMatcher *m = loadDictionaryMatcherFor(code); + if (m != NULL) { + const LanguageBreakEngine *engine = NULL; + switch(code) { + case USCRIPT_THAI: + engine = new ThaiBreakEngine(m, status); + break; + case USCRIPT_LAO: + engine = new LaoBreakEngine(m, status); + break; + case USCRIPT_MYANMAR: + engine = new BurmeseBreakEngine(m, status); + break; + case USCRIPT_KHMER: + engine = new KhmerBreakEngine(m, status); + break; + +#if !UCONFIG_NO_NORMALIZATION + // CJK not available w/o normalization + case USCRIPT_HANGUL: + engine = new CjkBreakEngine(m, kKorean, status); + break; + + // use same BreakEngine and dictionary for both Chinese and Japanese + case USCRIPT_HIRAGANA: + case USCRIPT_KATAKANA: + case USCRIPT_HAN: + engine = new CjkBreakEngine(m, kChineseJapanese, status); + break; +#if 0 + // TODO: Have to get some characters with script=common handled + // by CjkBreakEngine (e.g. U+309B). Simply subjecting + // them to CjkBreakEngine does not work. The engine has to + // special-case them. + case USCRIPT_COMMON: + { + UBlockCode block = ublock_getCode(code); + if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA) + engine = new CjkBreakEngine(dict, kChineseJapanese, status); + break; + } +#endif +#endif + + default: + break; + } + if (engine == NULL) { + delete m; + } + else if (U_FAILURE(status)) { + delete engine; + engine = NULL; + } + return engine; + } + } + return NULL; +} + +DictionaryMatcher * +ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) { + UErrorCode status = U_ZERO_ERROR; + // open root from brkitr tree. + UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status); + b = ures_getByKeyWithFallback(b, "dictionaries", b, &status); + int32_t dictnlength = 0; + const UChar *dictfname = + ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status); + if (U_FAILURE(status)) { + ures_close(b); + return NULL; + } + CharString dictnbuf; + CharString ext; + const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength); // last dot + if (extStart != NULL) { + int32_t len = (int32_t)(extStart - dictfname); + ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status); + dictnlength = len; + } + dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status); + ures_close(b); + + UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status); + if (U_SUCCESS(status)) { + // build trie + const uint8_t *data = (const uint8_t *)udata_getMemory(file); + const int32_t *indexes = (const int32_t *)data; + const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET]; + const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK; + DictionaryMatcher *m = NULL; + if (trieType == DictionaryData::TRIE_TYPE_BYTES) { + const int32_t transform = indexes[DictionaryData::IX_TRANSFORM]; + const char *characters = (const char *)(data + offset); + m = new BytesDictionaryMatcher(characters, transform, file); + } + else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) { + const UChar *characters = (const UChar *)(data + offset); + m = new UCharsDictionaryMatcher(characters, file); + } + if (m == NULL) { + // no matcher exists to take ownership - either we are an invalid + // type or memory allocation failed + udata_close(file); + } + return m; + } else if (dictfname != NULL) { + // we don't have a dictionary matcher. + // returning NULL here will cause us to fail to find a dictionary break engine, as expected + status = U_ZERO_ERROR; + return NULL; + } + return NULL; +} + +U_NAMESPACE_END + +#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |