diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 12:02:58 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 12:02:58 +0000 |
commit | 698f8c2f01ea549d77d7dc3338a12e04c11057b9 (patch) | |
tree | 173a775858bd501c378080a10dca74132f05bc50 /vendor/bstr/scripts/regex/word.sh | |
parent | Initial commit. (diff) | |
download | rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.tar.xz rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.zip |
Adding upstream version 1.64.0+dfsg1.upstream/1.64.0+dfsg1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/bstr/scripts/regex/word.sh')
-rw-r--r-- | vendor/bstr/scripts/regex/word.sh | 111 |
1 files changed, 111 insertions, 0 deletions
diff --git a/vendor/bstr/scripts/regex/word.sh b/vendor/bstr/scripts/regex/word.sh new file mode 100644 index 000000000..78c7a05cf --- /dev/null +++ b/vendor/bstr/scripts/regex/word.sh @@ -0,0 +1,111 @@ +#!/bin/sh + +# vim: indentexpr= nosmartindent autoindent +# vim: tabstop=2 shiftwidth=2 softtabstop=2 + +# See the comments in regex/sentence.sh for the general approach to how this +# regex was written. +# +# Writing the regex for this was *hard*. It took me two days of hacking to get +# this far, and that was after I had finished the sentence regex, so my brain +# was fully cached on this. Unlike the sentence regex, the rules in the regex +# below don't correspond as nicely to the rules in UAX #29. In particular, the +# UAX #29 rules have a ton of overlap with each other, which requires crazy +# stuff in the regex. I'm not even sure the regex below is 100% correct or even +# minimal, however, I did compare this with the ICU word segmenter on a few +# different corpora, and it produces identical results. (In addition to of +# course passing the UCD tests.) +# +# In general, I consider this approach to be a failure. Firstly, this is +# clearly a write-only regex. Secondly, building the minimized DFA for this is +# incredibly slow. Thirdly, the DFA is itself very large (~240KB). Fourthly, +# reversing this regex (for reverse word iteration) results in a >19MB DFA. +# Yes. That's MB. Wat. And it took 5 minutes to build. +# +# I think we might consider changing our approach to this problem. The normal +# path I've seen, I think, is to decode codepoints one at a time, and then +# thread them through a state machine in the code itself. We could take this +# approach, or possibly combine it with a DFA that tells us which Word_Break +# value a codepoint has. I'd prefer the latter approach, but it requires adding +# RegexSet support to regex-automata. Something that should definitely be done, +# but is a fair amount of work. +# +# Gah. + +CR="\p{wb=CR}" +LF="\p{wb=LF}" +Newline="\p{wb=Newline}" +ZWJ="\p{wb=ZWJ}" +RI="\p{wb=Regional_Indicator}" +Katakana="\p{wb=Katakana}" +HebrewLet="\p{wb=HebrewLetter}" +ALetter="\p{wb=ALetter}" +SingleQuote="\p{wb=SingleQuote}" +DoubleQuote="\p{wb=DoubleQuote}" +MidNumLet="\p{wb=MidNumLet}" +MidLetter="\p{wb=MidLetter}" +MidNum="\p{wb=MidNum}" +Numeric="\p{wb=Numeric}" +ExtendNumLet="\p{wb=ExtendNumLet}" +WSegSpace="\p{wb=WSegSpace}" + +Any="\p{any}" +Ex="[\p{wb=Extend} \p{wb=Format} $ZWJ]" +ExtendPict="\p{Extended_Pictographic}" +AHLetter="[$ALetter $HebrewLet]" +MidNumLetQ="[$MidNumLet $SingleQuote]" + +AHLetterRepeat="$AHLetter $Ex* ([$MidLetter $MidNumLetQ] $Ex* $AHLetter $Ex*)*" +NumericRepeat="$Numeric $Ex* ([$MidNum $MidNumLetQ] $Ex* $Numeric $Ex*)*" + +echo "(?x) +$CR $LF +| +[$Newline $CR $LF] +| +$WSegSpace $WSegSpace+ +| +( + ([^$Newline $CR $LF]? $Ex* $ZWJ $ExtendPict $Ex*)+ + | + ($ExtendNumLet $Ex*)* $AHLetter $Ex* + ( + ( + ($NumericRepeat | $ExtendNumLet $Ex*)* + | + [$MidLetter $MidNumLetQ] $Ex* + ) + $AHLetter $Ex* + )+ + ($NumericRepeat | $ExtendNumLet $Ex*)* + | + ($ExtendNumLet $Ex*)* $AHLetter $Ex* ($NumericRepeat | $ExtendNumLet $Ex*)+ + | + ($ExtendNumLet $Ex*)* $Numeric $Ex* + ( + ( + ($AHLetterRepeat | $ExtendNumLet $Ex*)* + | + [$MidNum $MidNumLetQ] $Ex* + ) + $Numeric $Ex* + )+ + ($AHLetterRepeat | $ExtendNumLet $Ex*)* + | + ($ExtendNumLet $Ex*)* $Numeric $Ex* ($AHLetterRepeat | $ExtendNumLet $Ex*)+ + | + $Katakana $Ex* + (($Katakana | $ExtendNumLet) $Ex*)+ + | + $ExtendNumLet $Ex* + (($ExtendNumLet | $AHLetter | $Numeric | $Katakana) $Ex*)+ +)+ +| +$HebrewLet $Ex* $SingleQuote $Ex* +| +($HebrewLet $Ex* $DoubleQuote $Ex*)+ $HebrewLet $Ex* +| +$RI $Ex* $RI $Ex* +| +$Any $Ex* +" |