// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html // // regexst.h // // Copyright (C) 2004-2015, International Business Machines Corporation and others. // All Rights Reserved. // // This file contains class RegexStaticSets // // This class is internal to the regular expression implementation. // For the public Regular Expression API, see the file "unicode/regex.h" // // RegexStaticSets groups together the common UnicodeSets that are needed // for compiling or executing RegularExpressions. This grouping simplifies // the thread safe lazy creation and sharing of these sets across // all instances of regular expressions. // #include "unicode/utypes.h" #if !UCONFIG_NO_REGULAR_EXPRESSIONS #include "unicode/unistr.h" #include "unicode/uniset.h" #include "unicode/uchar.h" #include "unicode/regex.h" #include "uprops.h" #include "cmemory.h" #include "cstring.h" #include "uassert.h" #include "ucln_in.h" #include "umutex.h" #include "regexcst.h" // Contains state table for the regex pattern parser. // generated by a Perl script. #include "regexst.h" U_NAMESPACE_BEGIN // "Rule Char" Characters are those with special meaning, and therefore // need to be escaped to appear as literals in a regexp. constexpr char16_t const *gRuleSet_rule_chars = u"*?+[(){}^$|\\."; // // The backslash escape characters that ICU's unescape() function will handle. // constexpr char16_t const *gUnescapeChars = u"acefnrtuUx"; // // Unicode Set pattern for Regular Expression \w // constexpr char16_t const *gIsWordPattern = u"[\\p{Alphabetic}\\p{M}\\p{Nd}\\p{Pc}\\u200c\\u200d]"; // // Unicode Set Definitions for Regular Expression \s // constexpr char16_t const *gIsSpacePattern = u"[\\p{WhiteSpace}]"; // // UnicodeSets used in implementation of Grapheme Cluster detection, \X // constexpr char16_t const *gGC_ControlPattern = u"[[:Zl:][:Zp:][:Cc:][:Cf:]-[:Grapheme_Extend:]]"; constexpr char16_t const *gGC_ExtendPattern = u"[\\p{Grapheme_Extend}]"; constexpr char16_t const *gGC_LPattern = u"[\\p{Hangul_Syllable_Type=L}]"; constexpr char16_t const *gGC_VPattern = u"[\\p{Hangul_Syllable_Type=V}]"; constexpr char16_t const *gGC_TPattern = u"[\\p{Hangul_Syllable_Type=T}]"; constexpr char16_t const *gGC_LVPattern = u"[\\p{Hangul_Syllable_Type=LV}]"; constexpr char16_t const *gGC_LVTPattern = u"[\\p{Hangul_Syllable_Type=LVT}]"; RegexStaticSets *RegexStaticSets::gStaticSets = nullptr; UInitOnce gStaticSetsInitOnce {}; RegexStaticSets::RegexStaticSets(UErrorCode *status) { // Initialize the shared static sets to their correct values. fUnescapeCharSet.addAll(UnicodeString(true, gUnescapeChars, -1)).freeze(); fPropSets[URX_ISWORD_SET].applyPattern(UnicodeString(true, gIsWordPattern, -1), *status).freeze(); fPropSets[URX_ISSPACE_SET].applyPattern(UnicodeString(true, gIsSpacePattern, -1), *status).freeze(); fPropSets[URX_GC_EXTEND].applyPattern(UnicodeString(true, gGC_ExtendPattern, -1), *status).freeze(); fPropSets[URX_GC_CONTROL].applyPattern(UnicodeString(true, gGC_ControlPattern, -1), *status).freeze(); fPropSets[URX_GC_L].applyPattern(UnicodeString(true, gGC_LPattern, -1), *status).freeze(); fPropSets[URX_GC_V].applyPattern(UnicodeString(true, gGC_VPattern, -1), *status).freeze(); fPropSets[URX_GC_T].applyPattern(UnicodeString(true, gGC_TPattern, -1), *status).freeze(); fPropSets[URX_GC_LV].applyPattern(UnicodeString(true, gGC_LVPattern, -1), *status).freeze(); fPropSets[URX_GC_LVT].applyPattern(UnicodeString(true, gGC_LVTPattern, -1), *status).freeze(); // // "Normal" is the set of characters that don't need special handling // when finding grapheme cluster boundaries. // fPropSets[URX_GC_NORMAL].complement(); fPropSets[URX_GC_NORMAL].remove(0xac00, 0xd7a4); fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_CONTROL]); fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_L]); fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_V]); fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_T]); fPropSets[URX_GC_NORMAL].freeze(); // Initialize the 8-bit fast bit sets from the parallel full // UnicodeSets. // // TODO: 25 Oct 2019 are these fast 8-bit sets worth keeping? // Measured 3.5% gain on (non) matching with the pattern "x(?:\\S+)+x" // This runs in exponential time, making it easy to adjust the time for // convenient measuring. // // This 8 bit optimization dates from the early days of ICU, // with a less optimized UnicodeSet. At the time, the difference // was substantial. for (int32_t i=0; i