summary refs log tree commit diff stats
path: root/intl/icu/source/i18n/regexst.cpp
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r-- intl/icu/source/i18n/regexst.cpp 172
1 files changed, 172 insertions, 0 deletions
diff --git a/intl/icu/source/i18n/regexst.cpp b/intl/icu/source/i18n/regexst.cpp
new file mode 100644
index 0000000000..9103230544
--- /dev/null
+++ b/intl/icu/source/i18n/regexst.cpp
@@ -0,0 +1,172 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+//
+// regexst.cpp
+//
+// Copyright (C) 2004-2015, International Business Machines Corporation and others.
+// All Rights Reserved.
+//
+// This file contains class RegexStaticSets
+//
+// This class is internal to the regular expression implementation.
+// For the public Regular Expression API, see the file "unicode/regex.h"
+//
+// RegexStaticSets groups together the common UnicodeSets that are needed
+// for compiling or executing RegularExpressions. This grouping simplifies
+// the thread safe lazy creation and sharing of these sets across
+// all instances of regular expressions.
+//
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_REGULAR_EXPRESSIONS
+
+#include "unicode/unistr.h"
+#include "unicode/uniset.h"
+#include "unicode/uchar.h"
+#include "unicode/regex.h"
+#include "uprops.h"
+#include "cmemory.h"
+#include "cstring.h"
+#include "uassert.h"
+#include "ucln_in.h"
+#include "umutex.h"
+
+#include "regexcst.h" // Contains state table for the regex pattern parser.
+ // generated by a Perl script.
+#include "regexst.h"
+
+U_NAMESPACE_BEGIN
+
+// "Rule Char" Characters are those with special meaning, and therefore
+// need to be escaped to appear as literals in a regexp.
+constexpr const char16_t *gRuleSet_rule_chars = u"*?+[(){}^$|\\.";
+
+// Letters that, when they follow a backslash, form an escape sequence that
+// ICU's unescape() function will handle.
+constexpr const char16_t *gUnescapeChars = u"acefnrtuUx";
+
+// UnicodeSet pattern for the regular expression \w class.
+constexpr const char16_t *gIsWordPattern = u"[\\p{Alphabetic}\\p{M}\\p{Nd}\\p{Pc}\\u200c\\u200d]";
+
+// UnicodeSet pattern for the regular expression \s class.
+constexpr const char16_t *gIsSpacePattern = u"[\\p{WhiteSpace}]";
+
+// UnicodeSet patterns used by the implementation of grapheme cluster
+// detection, \X.
+//   Control: the Zl, Zp, Cc and Cf general categories, minus Grapheme_Extend.
+//   Extend:  the characters with the Grapheme_Extend property.
+//   L, V, T, LV, LVT: one set per Hangul_Syllable_Type value.
+constexpr const char16_t *gGC_ControlPattern = u"[[:Zl:][:Zp:][:Cc:][:Cf:]-[:Grapheme_Extend:]]";
+constexpr const char16_t *gGC_ExtendPattern = u"[\\p{Grapheme_Extend}]";
+constexpr const char16_t *gGC_LPattern = u"[\\p{Hangul_Syllable_Type=L}]";
+constexpr const char16_t *gGC_VPattern = u"[\\p{Hangul_Syllable_Type=V}]";
+constexpr const char16_t *gGC_TPattern = u"[\\p{Hangul_Syllable_Type=T}]";
+constexpr const char16_t *gGC_LVPattern = u"[\\p{Hangul_Syllable_Type=LV}]";
+constexpr const char16_t *gGC_LVTPattern = u"[\\p{Hangul_Syllable_Type=LVT}]";
+
+
+RegexStaticSets *RegexStaticSets::gStaticSets = nullptr;  // Shared instance; created by initStaticSets(), deleted by regex_cleanup().
+UInitOnce gStaticSetsInitOnce {};  // Init-once state passed to umtx_initOnce() in initGlobals(); reset by regex_cleanup().
+
+
+// Populates every shared set held by this object.
+//
+// status  In/out ICU error code. It is handed to each applyPattern() call
+//         and to utext_openUChars(), which report failures through it.
+RegexStaticSets::RegexStaticSets(UErrorCode *status) {
+    // Characters that unescape() accepts after a backslash.
+    fUnescapeCharSet.addAll(UnicodeString(true, gUnescapeChars, -1)).freeze();
+
+    // Property sets that are defined directly by a UnicodeSet pattern.
+    // Each pattern string is aliased (not copied) by the UnicodeString,
+    // and every set is frozen once its pattern has been applied.
+    const struct {
+        int32_t         fSetIndex;
+        const char16_t *fPattern;
+    } patternSets[] = {
+        {URX_ISWORD_SET,  gIsWordPattern},
+        {URX_ISSPACE_SET, gIsSpacePattern},
+        {URX_GC_EXTEND,   gGC_ExtendPattern},
+        {URX_GC_CONTROL,  gGC_ControlPattern},
+        {URX_GC_L,        gGC_LPattern},
+        {URX_GC_V,        gGC_VPattern},
+        {URX_GC_T,        gGC_TPattern},
+        {URX_GC_LV,       gGC_LVPattern},
+        {URX_GC_LVT,      gGC_LVTPattern},
+    };
+    for (const auto &entry : patternSets) {
+        fPropSets[entry.fSetIndex]
+            .applyPattern(UnicodeString(true, entry.fPattern, -1), *status)
+            .freeze();
+    }
+
+    // "Normal" is the set of characters that need no special handling when
+    // finding grapheme cluster boundaries: everything, less the range
+    // 0xac00-0xd7a4 and less the Control, L, V and T sets built above.
+    // NOTE(review): the Hangul Syllables block ends at U+D7A3, so the upper
+    //               bound 0xd7a4 looks one code point too large - confirm
+    //               before changing.
+    fPropSets[URX_GC_NORMAL]
+        .complement()
+        .remove(0xac00, 0xd7a4)
+        .removeAll(fPropSets[URX_GC_CONTROL])
+        .removeAll(fPropSets[URX_GC_L])
+        .removeAll(fPropSets[URX_GC_V])
+        .removeAll(fPropSets[URX_GC_T])
+        .freeze();
+
+    // Set up the 8-bit fast bit sets from their parallel full UnicodeSets.
+    //
+    // TODO: 25 Oct 2019 are these fast 8-bit sets worth keeping?
+    //       Measured 3.5% gain on (non) matching with the pattern
+    //       "x(?:\\S+)+x". That pattern runs in exponential time, which
+    //       makes it easy to adjust the run time for convenient measuring.
+    //
+    //       The 8 bit optimization dates from the early days of ICU, with
+    //       a less optimized UnicodeSet; at the time the difference was
+    //       substantial.
+    for (int32_t setIx = 0; setIx < URX_LAST_SET; ++setIx) {
+        fPropSets8[setIx].init(&fPropSets[setIx]);
+    }
+
+    // Sets used while parsing rules, but not referenced from the parse
+    // state table. The rule_char set is the complement of the characters
+    // that have special meaning in a pattern.
+    // (The kRuleSet_* values are offset by 128 to index fRuleSets;
+    //  presumably the numbering starts at 128 - verify in regexcst.h.)
+    fRuleSets[kRuleSet_rule_char-128]
+        .addAll(UnicodeString(gRuleSet_rule_chars))
+        .complement()
+        .freeze();
+    fRuleSets[kRuleSet_digit_char-128].add(u'0', u'9').freeze();
+    fRuleSets[kRuleSet_ascii_letter-128].add(u'A', u'Z').add(u'a', u'z').freeze();
+
+    // Non-owning shortcut to the digit set created just above.
+    fRuleDigitsAlias = &fRuleSets[kRuleSet_digit_char-128];
+
+    // Lastly, open an empty UText string for utility purposes.
+    fEmptyText = utext_openUChars(nullptr, nullptr, 0, status);
+}
+
+
+// Releases what the constructor acquired.
+RegexStaticSets::~RegexStaticSets() {
+    // Close the empty utility UText opened by the constructor.
+    utext_close(fEmptyText);
+
+    // The alias only points at an element of fRuleSets, so there is
+    // nothing to free; just drop the pointer.
+    fRuleDigitsAlias = nullptr;
+}
+
+
+//------------------------------------------------------------------------------
+//
+// regex_cleanup Memory cleanup function, free/delete all
+// cached memory. Called by ICU's u_cleanup() function.
+//
+//------------------------------------------------------------------------------
+
+U_CDECL_BEGIN
+static UBool U_CALLCONV
+regex_cleanup() {
+    // Detach the shared instance from the global before destroying it.
+    RegexStaticSets *sets = RegexStaticSets::gStaticSets;
+    RegexStaticSets::gStaticSets = nullptr;
+    delete sets;
+
+    // Reset the init-once state so that initGlobals() can build the
+    // sets again later.
+    gStaticSetsInitOnce.reset();
+    return true;
+}
+
+// Creates the shared RegexStaticSets instance and registers regex_cleanup()
+// with the i18n cleanup registry. Invoked through umtx_initOnce() from
+// RegexStaticSets::initGlobals().
+//
+// status  In/out ICU error code. On return it is a failure code whenever
+//         RegexStaticSets::gStaticSets has been left as nullptr.
+static void U_CALLCONV initStaticSets(UErrorCode &status) {
+    U_ASSERT(RegexStaticSets::gStaticSets == nullptr);
+    ucln_i18n_registerCleanup(UCLN_I18N_REGEX, regex_cleanup);
+
+    RegexStaticSets *sets = new RegexStaticSets(&status);
+    if (sets == nullptr) {
+        // No object was produced; report it as an allocation failure
+        // unless an error has already been recorded.
+        if (U_SUCCESS(status)) {
+            status = U_MEMORY_ALLOCATION_ERROR;
+        }
+    } else if (U_FAILURE(status)) {
+        // Construction reported an error: discard the incomplete object.
+        delete sets;
+        sets = nullptr;
+    }
+    RegexStaticSets::gStaticSets = sets;
+}
+U_CDECL_END
+
+// Runs initStaticSets() through umtx_initOnce(), using gStaticSetsInitOnce
+// as the init-once state.
+//
+// status  In/out ICU error code; must not be null, it is dereferenced here.
+void RegexStaticSets::initGlobals(UErrorCode *status) {
+    UErrorCode &errorCode = *status;
+    umtx_initOnce(gStaticSetsInitOnce, &initStaticSets, errorCode);
+}
+
+U_NAMESPACE_END
+#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS