summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/common/mlbe.h
diff options
context:
space:
mode:
Diffstat (limited to 'intl/icu/source/common/mlbe.h')
-rw-r--r--intl/icu/source/common/mlbe.h116
1 files changed, 116 insertions, 0 deletions
diff --git a/intl/icu/source/common/mlbe.h b/intl/icu/source/common/mlbe.h
new file mode 100644
index 0000000000..38de47e5f5
--- /dev/null
+++ b/intl/icu/source/common/mlbe.h
@@ -0,0 +1,116 @@
+// © 2022 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#ifndef MLBREAKENGINE_H
+#define MLBREAKENGINE_H
+
+#include "hash.h"
+#include "unicode/resbund.h"
+#include "unicode/uniset.h"
+#include "unicode/utext.h"
+#include "uvectr32.h"
+
+U_NAMESPACE_BEGIN
+
+#if !UCONFIG_NO_BREAK_ITERATION
+
+/**
+ * A machine learning break engine for the phrase breaking in Japanese.
+ */
+class MlBreakEngine : public UMemory {
+ public:
+ /**
+ * Constructor.
+ *
+ * @param digitOrOpenPunctuationOrAlphabetSet An UnicodeSet with the digit, open punctuation and
+ * alphabet.
+ * @param closePunctuationSet An UnicodeSet with close punctuation.
+ * @param status Information on any errors encountered.
+ */
+ MlBreakEngine(const UnicodeSet &digitOrOpenPunctuationOrAlphabetSet,
+ const UnicodeSet &closePunctuationSet, UErrorCode &status);
+
+ /**
+ * Virtual destructor.
+ */
+ virtual ~MlBreakEngine();
+
+ public:
+ /**
+ * Divide up a range of characters handled by this break engine.
+ *
+ * @param inText A UText representing the text
+ * @param rangeStart The start of the range of the characters
+ * @param rangeEnd The end of the range of the characters
+ * @param foundBreaks Output of C array of int32_t break positions, or 0
+ * @param inString The normalized string of text ranging from rangeStart to rangeEnd
+ * @param inputMap The vector storing the native index of inText
+ * @param status Information on any errors encountered.
+ * @return The number of breaks found
+ */
+ int32_t divideUpRange(UText *inText, int32_t rangeStart, int32_t rangeEnd,
+ UVector32 &foundBreaks, const UnicodeString &inString,
+ const LocalPointer<UVector32> &inputMap, UErrorCode &status) const;
+
+ private:
+ /**
+ * Load the machine learning's model file.
+ *
+ * @param error Information on any errors encountered.
+ */
+ void loadMLModel(UErrorCode &error);
+
+ /**
+ * In the machine learning's model file, specify the name of the key and value to load the
+ * corresponding feature and its score.
+ *
+ * @param rb A ResouceBundle corresponding to the model file.
+ * @param keyName The kay name in the model file.
+ * @param valueName The value name in the model file.
+ * @param model A hashtable to store the pairs of the feature and its score.
+ * @param error Information on any errors encountered.
+ */
+ void initKeyValue(UResourceBundle *rb, const char *keyName, const char *valueName,
+ Hashtable &model, UErrorCode &error);
+
+ /**
+ * Initialize the index list from the input string.
+ *
+ * @param inString A input string to be segmented.
+ * @param indexList A code unit index list of inString.
+ * @param status Information on any errors encountered.
+ * @return The number of code units of the first four characters in inString.
+ */
+ int32_t initIndexList(const UnicodeString &inString, int32_t *indexList,
+ UErrorCode &status) const;
+
+ /**
+ * Evaluate whether the index is a potential breakpoint.
+ *
+ * @param inString A input string to be segmented.
+ * @param indexList A code unit index list of the inString.
+ * @param startIdx The start index of the indexList.
+ * @param numCodeUnits The current code unit boundary of the indexList.
+ * @param numBreaks The accumulated number of breakpoints.
+ * @param boundary A vector including the index of the breakpoint.
+ * @param status Information on any errors encountered.
+ * @return The number of breakpoints
+ */
+ int32_t evaluateBreakpoint(const UnicodeString &inString, int32_t *indexList, int32_t startIdx,
+ int32_t numCodeUnits, int32_t numBreaks, UVector32 &boundary,
+ UErrorCode &status) const;
+
+ void printUnicodeString(const UnicodeString &s) const;
+
+ UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;
+ UnicodeSet fClosePunctuationSet;
+ Hashtable fModel[13]; // {UW1, UW2, ... UW6, BW1, ... BW3, TW1, TW2, ... TW4} 6+3+4= 13
+ int32_t fNegativeSum;
+};
+
+#endif
+
+U_NAMESPACE_END
+
+/* MLBREAKENGINE_H */
+#endif