summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/common/lstmbe.h
diff options
context:
space:
mode:
Diffstat (limited to 'intl/icu/source/common/lstmbe.h')
-rw-r--r--intl/icu/source/common/lstmbe.h88
1 files changed, 88 insertions, 0 deletions
diff --git a/intl/icu/source/common/lstmbe.h b/intl/icu/source/common/lstmbe.h
new file mode 100644
index 0000000000..77c97d85fa
--- /dev/null
+++ b/intl/icu/source/common/lstmbe.h
@@ -0,0 +1,88 @@
+// © 2021 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#ifndef LSTMBE_H
+#define LSTMBE_H
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_BREAK_ITERATION
+
+#include "unicode/uniset.h"
+#include "unicode/ures.h"
+#include "unicode/utext.h"
+#include "unicode/utypes.h"
+
+#include "brkeng.h"
+#include "dictbe.h"
+#include "uvectr32.h"
+
+U_NAMESPACE_BEGIN
+
+class Vectorizer;
+struct LSTMData;
+
+/*******************************************************************
+ * LSTMBreakEngine
+ */
+
+/**
+ * <p>LSTMBreakEngine is a kind of DictionaryBreakEngine that uses an
+ * LSTM to determine language-specific breaks.</p>
+ *
+ * <p>After it is constructed an LSTMBreakEngine may be shared between
+ * threads without synchronization.</p>
+ */
+class LSTMBreakEngine : public DictionaryBreakEngine {
+public:
+ /**
+ * <p>Constructor.</p>
+ *
+ * @param data The LSTM model data used by this engine.
+ * NOTE(review): ownership of data is not visible in this header; confirm in lstmbe.cpp.
+ * @param set A UnicodeSet; presumably the characters this engine handles (TODO confirm).
+ * @param status Information on any errors encountered.
+ */
+ LSTMBreakEngine(const LSTMData* data, const UnicodeSet& set, UErrorCode &status);
+
+ /**
+ * <p>Virtual destructor.</p>
+ */
+ virtual ~LSTMBreakEngine();
+
+ virtual const char16_t* name() const; // NOTE(review): presumably the engine/model name (cf. LSTMDataName); confirm in lstmbe.cpp
+
+protected:
+ /**
+ * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
+ *
+ * @param text A UText representing the text
+ * @param rangeStart The start of the range of dictionary characters
+ * @param rangeEnd The end of the range of dictionary characters
+ * @param foundBreaks Output vector (UVector32) that receives the break positions
+ * @param isPhraseBreaking NOTE(review): presumably requests phrase-based breaking;
+ * confirm against the implementation and the base-class declaration.
+ * @param status Information on any errors encountered.
+ * @return The number of breaks found
+ */
+ virtual int32_t divideUpDictionaryRange(UText *text,
+ int32_t rangeStart,
+ int32_t rangeEnd,
+ UVector32 &foundBreaks,
+ UBool isPhraseBreaking,
+ UErrorCode& status) const override;
+private:
+ const LSTMData* fData; // LSTM model data; NOTE(review): ownership not visible in this header, confirm
+ const Vectorizer* fVectorizer; // NOTE(review): presumably converts text into model input; Vectorizer is only forward-declared here
+};
+/**
+ * Exported factory (U_CAPI / U_EXPORT2) that takes a script code, LSTM model
+ * data and an error code, and returns a pointer to a const LanguageBreakEngine.
+ * NOTE(review): ownership of `data` and of the returned engine is not stated
+ * in this header; confirm in lstmbe.cpp.
+ */
+U_CAPI const LanguageBreakEngine* U_EXPORT2 CreateLSTMBreakEngine(
+ UScriptCode script, const LSTMData* data, UErrorCode& status);
+/**
+ * Exported function that takes a resource bundle and an error code and
+ * returns a pointer to const LSTMData.
+ * NOTE(review): presumably builds the model data from `rb`, and the result is
+ * presumably released with DeleteLSTMData; confirm in lstmbe.cpp.
+ */
+U_CAPI const LSTMData* U_EXPORT2 CreateLSTMData(
+ UResourceBundle* rb, UErrorCode& status);
+/**
+ * Exported function that takes a script code and an error code and returns
+ * a pointer to const LSTMData.
+ * NOTE(review): presumably looks up the model for `script`; whether it can
+ * return a null pointer for unsupported scripts is not visible here; confirm.
+ */
+U_CAPI const LSTMData* U_EXPORT2 CreateLSTMDataForScript(
+ UScriptCode script, UErrorCode& status);
+/**
+ * DeleteLSTMData takes a const LSTMData pointer and returns nothing;
+ * LSTMDataName takes a const LSTMData pointer and returns a const char16_t pointer.
+ * NOTE(review): presumably the release counterpart of the CreateLSTMData*
+ * functions and an accessor for the model's name, respectively; confirm.
+ */
+U_CAPI void U_EXPORT2 DeleteLSTMData(const LSTMData* data);
+U_CAPI const char16_t* U_EXPORT2 LSTMDataName(const LSTMData* data);
+
+U_NAMESPACE_END
+
+#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
+
+#endif /* LSTMBE_H */