summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/common/dictbe.h
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-19 01:47:29 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-19 01:47:29 +0000
commit0ebf5bdf043a27fd3dfb7f92e0cb63d88954c44d (patch)
treea31f07c9bcca9d56ce61e9a1ffd30ef350d513aa /intl/icu/source/common/dictbe.h
parentInitial commit. (diff)
downloadfirefox-esr-0ebf5bdf043a27fd3dfb7f92e0cb63d88954c44d.tar.xz
firefox-esr-0ebf5bdf043a27fd3dfb7f92e0cb63d88954c44d.zip
Adding upstream version 115.8.0esr.upstream/115.8.0esr
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'intl/icu/source/common/dictbe.h')
-rw-r--r--intl/icu/source/common/dictbe.h433
1 files changed, 433 insertions, 0 deletions
diff --git a/intl/icu/source/common/dictbe.h b/intl/icu/source/common/dictbe.h
new file mode 100644
index 0000000000..a2c761bdc3
--- /dev/null
+++ b/intl/icu/source/common/dictbe.h
@@ -0,0 +1,433 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/**
+ *******************************************************************************
+ * Copyright (C) 2006-2014, International Business Machines Corporation *
+ * and others. All Rights Reserved. *
+ *******************************************************************************
+ */
+
+#ifndef DICTBE_H
+#define DICTBE_H
+
+#include "unicode/utypes.h"
+#include "unicode/uniset.h"
+#include "unicode/utext.h"
+
+#include "brkeng.h"
+#include "hash.h"
+#include "mlbe.h"
+#include "uvectr32.h"
+
+U_NAMESPACE_BEGIN
+
+class DictionaryMatcher;
+class MlBreakEngine;
+class Normalizer2;
+
+/*******************************************************************
+ * DictionaryBreakEngine
+ */
+
+/**
+ * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
+ * dictionary to determine language-specific breaks.</p>
+ *
+ * <p>After it is constructed a DictionaryBreakEngine may be shared between
+ * threads without synchronization.</p>
+ */
+class DictionaryBreakEngine : public LanguageBreakEngine {
+ private:
+ /**
+ * The set of characters handled by this engine
+ * @internal
+ */
+
+ UnicodeSet fSet;
+
+ public:
+
+ /**
+ * <p>Constructor </p>
+ */
+ DictionaryBreakEngine();
+
+ /**
+ * <p>Virtual destructor.</p>
+ */
+ virtual ~DictionaryBreakEngine();
+
+ /**
+ * <p>Indicate whether this engine handles a particular character for
+ * a particular kind of break.</p>
+ *
+ * @param c A character which begins a run that the engine might handle
+ * @return true if this engine handles the particular character and break
+ * type.
+ */
+ virtual UBool handles(UChar32 c) const override;
+
+ /**
+ * <p>Find any breaks within a run in the supplied text.</p>
+ *
+ * @param text A UText representing the text. The iterator is left at
+ * the end of the run of characters which the engine is capable of handling
+ * that starts from the first character in the range.
+ * @param startPos The start of the run within the supplied text.
+ * @param endPos The end of the run within the supplied text.
+ * @param foundBreaks vector of int32_t to receive the break positions
+ * @param status Information on any errors encountered.
+ * @return The number of breaks found.
+ */
+ virtual int32_t findBreaks( UText *text,
+ int32_t startPos,
+ int32_t endPos,
+ UVector32 &foundBreaks,
+ UBool isPhraseBreaking,
+ UErrorCode& status ) const override;
+
+ protected:
+
+ /**
+ * <p>Set the character set handled by this engine.</p>
+ *
+ * @param set A UnicodeSet of the set of characters handled by the engine
+ */
+ virtual void setCharacters( const UnicodeSet &set );
+
+ /**
+ * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
+ *
+ * @param text A UText representing the text
+ * @param rangeStart The start of the range of dictionary characters
+ * @param rangeEnd The end of the range of dictionary characters
+ * @param foundBreaks Output of C array of int32_t break positions, or 0
+ * @param status Information on any errors encountered.
+ * @return The number of breaks found
+ */
+ virtual int32_t divideUpDictionaryRange( UText *text,
+ int32_t rangeStart,
+ int32_t rangeEnd,
+ UVector32 &foundBreaks,
+ UBool isPhraseBreaking,
+ UErrorCode& status) const = 0;
+
+};
+
+/*******************************************************************
+ * ThaiBreakEngine
+ */
+
+/**
+ * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
+ * dictionary and heuristics to determine Thai-specific breaks.</p>
+ *
+ * <p>After it is constructed a ThaiBreakEngine may be shared between
+ * threads without synchronization.</p>
+ */
+class ThaiBreakEngine : public DictionaryBreakEngine {
+ private:
+ /**
+ * The set of characters handled by this engine
+ * @internal
+ */
+
+ UnicodeSet fEndWordSet;
+ UnicodeSet fBeginWordSet;
+ UnicodeSet fSuffixSet;
+ UnicodeSet fMarkSet;
+ DictionaryMatcher *fDictionary;
+
+ public:
+
+ /**
+ * <p>Default constructor.</p>
+ *
+ * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
+ * engine is deleted.
+ */
+ ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
+
+ /**
+ * <p>Virtual destructor.</p>
+ */
+ virtual ~ThaiBreakEngine();
+
+ protected:
+ /**
+ * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
+ *
+ * @param text A UText representing the text
+ * @param rangeStart The start of the range of dictionary characters
+ * @param rangeEnd The end of the range of dictionary characters
+ * @param foundBreaks Output of C array of int32_t break positions, or 0
+ * @param status Information on any errors encountered.
+ * @return The number of breaks found
+ */
+ virtual int32_t divideUpDictionaryRange( UText *text,
+ int32_t rangeStart,
+ int32_t rangeEnd,
+ UVector32 &foundBreaks,
+ UBool isPhraseBreaking,
+ UErrorCode& status) const override;
+
+};
+
+/*******************************************************************
+ * LaoBreakEngine
+ */
+
+/**
+ * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
+ * dictionary and heuristics to determine Lao-specific breaks.</p>
+ *
+ * <p>After it is constructed a LaoBreakEngine may be shared between
+ * threads without synchronization.</p>
+ */
+class LaoBreakEngine : public DictionaryBreakEngine {
+ private:
+ /**
+ * The set of characters handled by this engine
+ * @internal
+ */
+
+ UnicodeSet fEndWordSet;
+ UnicodeSet fBeginWordSet;
+ UnicodeSet fMarkSet;
+ DictionaryMatcher *fDictionary;
+
+ public:
+
+ /**
+ * <p>Default constructor.</p>
+ *
+ * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
+ * engine is deleted.
+ */
+ LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
+
+ /**
+ * <p>Virtual destructor.</p>
+ */
+ virtual ~LaoBreakEngine();
+
+ protected:
+ /**
+ * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
+ *
+ * @param text A UText representing the text
+ * @param rangeStart The start of the range of dictionary characters
+ * @param rangeEnd The end of the range of dictionary characters
+ * @param foundBreaks Output of C array of int32_t break positions, or 0
+ * @param status Information on any errors encountered.
+ * @return The number of breaks found
+ */
+ virtual int32_t divideUpDictionaryRange( UText *text,
+ int32_t rangeStart,
+ int32_t rangeEnd,
+ UVector32 &foundBreaks,
+ UBool isPhraseBreaking,
+ UErrorCode& status) const override;
+
+};
+
+/*******************************************************************
+ * BurmeseBreakEngine
+ */
+
+/**
+ * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
+ * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
+ *
+ * <p>After it is constructed a BurmeseBreakEngine may be shared between
+ * threads without synchronization.</p>
+ */
+class BurmeseBreakEngine : public DictionaryBreakEngine {
+ private:
+ /**
+ * The set of characters handled by this engine
+ * @internal
+ */
+
+ UnicodeSet fEndWordSet;
+ UnicodeSet fBeginWordSet;
+ UnicodeSet fMarkSet;
+ DictionaryMatcher *fDictionary;
+
+ public:
+
+ /**
+ * <p>Default constructor.</p>
+ *
+ * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
+ * engine is deleted.
+ */
+ BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
+
+ /**
+ * <p>Virtual destructor.</p>
+ */
+ virtual ~BurmeseBreakEngine();
+
+ protected:
+ /**
+ * <p>Divide up a range of known dictionary characters.</p>
+ *
+ * @param text A UText representing the text
+ * @param rangeStart The start of the range of dictionary characters
+ * @param rangeEnd The end of the range of dictionary characters
+ * @param foundBreaks Output of C array of int32_t break positions, or 0
+ * @param status Information on any errors encountered.
+ * @return The number of breaks found
+ */
+ virtual int32_t divideUpDictionaryRange( UText *text,
+ int32_t rangeStart,
+ int32_t rangeEnd,
+ UVector32 &foundBreaks,
+ UBool isPhraseBreaking,
+ UErrorCode& status) const override;
+
+};
+
+/*******************************************************************
+ * KhmerBreakEngine
+ */
+
+/**
+ * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
+ * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
+ *
+ * <p>After it is constructed a KhmerBreakEngine may be shared between
+ * threads without synchronization.</p>
+ */
+class KhmerBreakEngine : public DictionaryBreakEngine {
+ private:
+ /**
+ * The set of characters handled by this engine
+ * @internal
+ */
+
+ UnicodeSet fEndWordSet;
+ UnicodeSet fBeginWordSet;
+ UnicodeSet fMarkSet;
+ DictionaryMatcher *fDictionary;
+
+ public:
+
+ /**
+ * <p>Default constructor.</p>
+ *
+ * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
+ * engine is deleted.
+ */
+ KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
+
+ /**
+ * <p>Virtual destructor.</p>
+ */
+ virtual ~KhmerBreakEngine();
+
+ protected:
+ /**
+ * <p>Divide up a range of known dictionary characters.</p>
+ *
+ * @param text A UText representing the text
+ * @param rangeStart The start of the range of dictionary characters
+ * @param rangeEnd The end of the range of dictionary characters
+ * @param foundBreaks Output of C array of int32_t break positions, or 0
+ * @param status Information on any errors encountered.
+ * @return The number of breaks found
+ */
+ virtual int32_t divideUpDictionaryRange( UText *text,
+ int32_t rangeStart,
+ int32_t rangeEnd,
+ UVector32 &foundBreaks,
+ UBool isPhraseBreaking,
+ UErrorCode& status) const override;
+
+};
+
+#if !UCONFIG_NO_NORMALIZATION
+
+/*******************************************************************
+ * CjkBreakEngine
+ */
+
+//indicates language/script that the CjkBreakEngine will handle
+enum LanguageType {
+ kKorean,
+ kChineseJapanese
+};
+
+/**
+ * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
+ * dictionary with costs associated with each word and
+ * Viterbi decoding to determine CJK-specific breaks.</p>
+ */
+class CjkBreakEngine : public DictionaryBreakEngine {
+ protected:
+ /**
+ * The set of characters handled by this engine
+ * @internal
+ */
+ UnicodeSet fHangulWordSet;
+ UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;
+ UnicodeSet fClosePunctuationSet;
+
+ DictionaryMatcher *fDictionary;
+ const Normalizer2 *nfkcNorm2;
+ MlBreakEngine *fMlBreakEngine;
+ bool isCj;
+
+ private:
+ // Load Japanese extensions.
+ void loadJapaneseExtensions(UErrorCode& error);
+ // Load Japanese Hiragana.
+ void loadHiragana(UErrorCode& error);
+ // Initialize fSkipSet by loading Japanese Hiragana and extensions.
+ void initJapanesePhraseParameter(UErrorCode& error);
+
+ Hashtable fSkipSet;
+
+ public:
+
+ /**
+ * <p>Default constructor.</p>
+ *
+ * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
+ * engine is deleted. The DictionaryMatcher must contain costs for each word
+ * in order for the dictionary to work properly.
+ */
+ CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
+
+ /**
+ * <p>Virtual destructor.</p>
+ */
+ virtual ~CjkBreakEngine();
+
+ protected:
+ /**
+ * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
+ *
+ * @param text A UText representing the text
+ * @param rangeStart The start of the range of dictionary characters
+ * @param rangeEnd The end of the range of dictionary characters
+ * @param foundBreaks Output of C array of int32_t break positions, or 0
+ * @param status Information on any errors encountered.
+ * @return The number of breaks found
+ */
+ virtual int32_t divideUpDictionaryRange( UText *text,
+ int32_t rangeStart,
+ int32_t rangeEnd,
+ UVector32 &foundBreaks,
+ UBool isPhraseBreaking,
+ UErrorCode& status) const override;
+
+};
+
+#endif
+
+U_NAMESPACE_END
+
+ /* DICTBE_H */
+#endif