summaryrefslogtreecommitdiffstats
path: root/vendor/bstr/scripts/regex/word.sh
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/bstr/scripts/regex/word.sh')
-rw-r--r--vendor/bstr/scripts/regex/word.sh111
1 files changed, 111 insertions, 0 deletions
diff --git a/vendor/bstr/scripts/regex/word.sh b/vendor/bstr/scripts/regex/word.sh
new file mode 100644
index 000000000..78c7a05cf
--- /dev/null
+++ b/vendor/bstr/scripts/regex/word.sh
@@ -0,0 +1,111 @@
+#!/bin/sh
+
+# vim: indentexpr= nosmartindent autoindent
+# vim: tabstop=2 shiftwidth=2 softtabstop=2
+
+# See the comments in regex/sentence.sh for the general approach to how this
+# regex was written.
+#
+# Writing the regex for this was *hard*. It took me two days of hacking to get
+# this far, and that was after I had finished the sentence regex, so my brain
+# was fully cached on this. Unlike the sentence regex, the rules in the regex
+# below don't correspond as nicely to the rules in UAX #29. In particular, the
+# UAX #29 rules have a ton of overlap with each other, which requires crazy
+# stuff in the regex. I'm not even sure the regex below is 100% correct or even
+# minimal. However, I did compare this with the ICU word segmenter on a few
+# different corpora, and it produces identical results (in addition, of
+# course, to passing the UCD tests).
+#
+# In general, I consider this approach to be a failure. Firstly, this is
+# clearly a write-only regex. Secondly, building the minimized DFA for this is
+# incredibly slow. Thirdly, the DFA is itself very large (~240KB). Fourthly,
+# reversing this regex (for reverse word iteration) results in a >19MB DFA.
+# Yes. That's MB. Wat. And it took 5 minutes to build.
+#
+# I think we might consider changing our approach to this problem. The normal
+# path I've seen, I think, is to decode codepoints one at a time, and then
+# thread them through a state machine in the code itself. We could take this
+# approach, or possibly combine it with a DFA that tells us which Word_Break
+# value a codepoint has. I'd prefer the latter approach, but it requires adding
+# RegexSet support to regex-automata. Something that should definitely be done,
+# but is a fair amount of work.
+#
+# Gah.
+
+CR="\p{wb=CR}" # Regex fragments, one per property class; "wb" presumably abbreviates Word_Break.
+LF="\p{wb=LF}"
+Newline="\p{wb=Newline}"
+ZWJ="\p{wb=ZWJ}"
+RI="\p{wb=Regional_Indicator}"
+Katakana="\p{wb=Katakana}"
+HebrewLet="\p{wb=HebrewLetter}"
+ALetter="\p{wb=ALetter}"
+SingleQuote="\p{wb=SingleQuote}"
+DoubleQuote="\p{wb=DoubleQuote}"
+MidNumLet="\p{wb=MidNumLet}"
+MidLetter="\p{wb=MidLetter}"
+MidNum="\p{wb=MidNum}"
+Numeric="\p{wb=Numeric}"
+ExtendNumLet="\p{wb=ExtendNumLet}"
+WSegSpace="\p{wb=WSegSpace}"
+
+Any="\p{any}" # Used by the final catch-all alternative of the emitted regex.
+Ex="[\p{wb=Extend} \p{wb=Format} $ZWJ]" # Extend, Format or ZWJ; appended as $Ex* after most atoms below.
+ExtendPict="\p{Extended_Pictographic}" # Only referenced directly after $ZWJ in the emitted regex.
+AHLetter="[$ALetter $HebrewLet]" # Either ALetter or HebrewLetter.
+MidNumLetQ="[$MidNumLet $SingleQuote]" # Either MidNumLet or SingleQuote.
+
+AHLetterRepeat="$AHLetter $Ex* ([$MidLetter $MidNumLetQ] $Ex* $AHLetter $Ex*)*" # AHLetter, then zero or more (MidLetter|MidNumLetQ, AHLetter) pairs.
+NumericRepeat="$Numeric $Ex* ([$MidNum $MidNumLetQ] $Ex* $Numeric $Ex*)*" # Same shape for Numeric, joined by MidNum|MidNumLetQ.
+ # Print the full regex on stdout; the shell expands the fragment variables inside the double quotes, and (?x) is presumably the verbose flag that makes the layout whitespace insignificant.
+echo "(?x)
+$CR $LF
+|
+[$Newline $CR $LF]
+|
+$WSegSpace $WSegSpace+
+|
+(
+ ([^$Newline $CR $LF]? $Ex* $ZWJ $ExtendPict $Ex*)+
+ |
+ ($ExtendNumLet $Ex*)* $AHLetter $Ex*
+ (
+ (
+ ($NumericRepeat | $ExtendNumLet $Ex*)*
+ |
+ [$MidLetter $MidNumLetQ] $Ex*
+ )
+ $AHLetter $Ex*
+ )+
+ ($NumericRepeat | $ExtendNumLet $Ex*)*
+ |
+ ($ExtendNumLet $Ex*)* $AHLetter $Ex* ($NumericRepeat | $ExtendNumLet $Ex*)+
+ |
+ ($ExtendNumLet $Ex*)* $Numeric $Ex*
+ (
+ (
+ ($AHLetterRepeat | $ExtendNumLet $Ex*)*
+ |
+ [$MidNum $MidNumLetQ] $Ex*
+ )
+ $Numeric $Ex*
+ )+
+ ($AHLetterRepeat | $ExtendNumLet $Ex*)*
+ |
+ ($ExtendNumLet $Ex*)* $Numeric $Ex* ($AHLetterRepeat | $ExtendNumLet $Ex*)+
+ |
+ $Katakana $Ex*
+ (($Katakana | $ExtendNumLet) $Ex*)+
+ |
+ $ExtendNumLet $Ex*
+ (($ExtendNumLet | $AHLetter | $Numeric | $Katakana) $Ex*)+
+)+
+|
+$HebrewLet $Ex* $SingleQuote $Ex*
+|
+($HebrewLet $Ex* $DoubleQuote $Ex*)+ $HebrewLet $Ex*
+|
+$RI $Ex* $RI $Ex*
+|
+$Any $Ex*
+" # NOTE(review): the output contains backslashes (\p{...}); echo's backslash handling varies between sh implementations, so printf '%s\n' may be more portable - confirm.