diff options
Diffstat (limited to 'vendor/bstr/scripts/regex')
-rw-r--r-- | vendor/bstr/scripts/regex/grapheme.sh | 50 | ||||
-rw-r--r-- | vendor/bstr/scripts/regex/sentence.sh | 176 | ||||
-rw-r--r-- | vendor/bstr/scripts/regex/word.sh | 111 |
3 files changed, 0 insertions, 337 deletions
diff --git a/vendor/bstr/scripts/regex/grapheme.sh b/vendor/bstr/scripts/regex/grapheme.sh deleted file mode 100644 index 0b2b54daa..000000000 --- a/vendor/bstr/scripts/regex/grapheme.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/sh - -# vim: indentexpr= nosmartindent autoindent -# vim: tabstop=2 shiftwidth=2 softtabstop=2 - -# This regex was manually written, derived from the rules in UAX #29. -# Particularly, from Table 1c, which lays out a regex for grapheme clusters. - -CR="\p{gcb=CR}" -LF="\p{gcb=LF}" -Control="\p{gcb=Control}" -Prepend="\p{gcb=Prepend}" -L="\p{gcb=L}" -V="\p{gcb=V}" -LV="\p{gcb=LV}" -LVT="\p{gcb=LVT}" -T="\p{gcb=T}" -RI="\p{gcb=RI}" -Extend="\p{gcb=Extend}" -ZWJ="\p{gcb=ZWJ}" -SpacingMark="\p{gcb=SpacingMark}" - -Any="\p{any}" -ExtendPict="\p{Extended_Pictographic}" - -echo "(?x) -$CR $LF -| -$Control -| -$Prepend* -( - ( - ($L* ($V+ | $LV $V* | $LVT) $T*) - | - $L+ - | - $T+ - ) - | - $RI $RI - | - $ExtendPict ($Extend* $ZWJ $ExtendPict)* - | - [^$Control $CR $LF] -) -[$Extend $ZWJ $SpacingMark]* -| -$Any -" diff --git a/vendor/bstr/scripts/regex/sentence.sh b/vendor/bstr/scripts/regex/sentence.sh deleted file mode 100644 index 689d1849f..000000000 --- a/vendor/bstr/scripts/regex/sentence.sh +++ /dev/null @@ -1,176 +0,0 @@ -#!/bin/sh - -# vim: indentexpr= nosmartindent autoindent -# vim: tabstop=2 shiftwidth=2 softtabstop=2 - -# This is a regex that I reverse engineered from the sentence boundary chain -# rules in UAX #29. Unlike the grapheme regex, which is essentially provided -# for us in UAX #29, no such sentence regex exists. -# -# I looked into how ICU achieves this, since UAX #29 hints that producing -# finite state machines for grapheme/sentence/word/line breaking is possible, -# but only easy to do for graphemes. ICU does this by implementing their own -# DSL for describing the break algorithms in terms of the chaining rules -# directly. You can see an example for sentences in -# icu4c/source/data/brkitr/rules/sent.txt. ICU then builds a finite state -# machine from those rules in a mostly standard way, but implements the -# "chaining" aspect of the rules by connecting overlapping end and start -# states. For example, given SB7: -# -# (Upper | Lower) ATerm x Upper -# -# Then the naive way to convert this into a regex would be something like -# -# [\p{sb=Upper}\p{sb=Lower}]\p{sb=ATerm}\p{sb=Upper} -# -# Unfortunately, this is incorrect. Why? Well, consider an example like so: -# -# U.S.A. -# -# A correct implementation of the sentence breaking algorithm should not insert -# any breaks here, exactly in accordance with repeatedly applying rule SB7 as -# given above. Our regex fails to do this because it will first match `U.S` -# without breaking them---which is correct---but will then start looking for -# its next rule beginning with a full stop (in ATerm) and followed by an -# uppercase letter (A). This will wind up triggering rule SB11 (without -# matching `A`), which inserts a break. -# -# The reason why this happens is because our initial application of rule SB7 -# "consumes" the next uppercase letter (S), which we want to reuse as a prefix -# in the next rule application. A natural way to express this would be with -# look-around, although it's not clear that works in every case since you -# ultimately might want to consume that ending uppercase letter. In any case, -# we can't use look-around in our truly regular regexes, so we must fix this. -# The approach we take is to explicitly repeat rules when a suffix of a rule -# is a prefix of another rule. In the case of SB7, the end of the rule, an -# uppercase letter, also happens to match the beginning of the rule. This can -# in turn be repeated indefinitely. Thus, our actual translation to a regex is: -# -# [\p{sb=Upper}\p{sb=Lower}]\p{sb=ATerm}\p{sb=Upper}(\p{sb=ATerm}\p{sb=Upper}* -# -# It turns out that this is exactly what ICU does, but in their case, they do -# it automatically. In our case, we connect the chaining rules manually. It's -# tedious. With that said, we do no implement Unicode line breaking with this -# approach, which is a far scarier beast. In that case, it would probably be -# worth writing the code to do what ICU does. -# -# In the case of sentence breaks, there aren't *too* many overlaps of this -# nature. We list them out exhaustively to make this clear, because it's -# essentially impossible to easily observe this in the regex. (It took me a -# full day to figure all of this out.) Rules marked with N/A mean that they -# specify a break, and this strategy only really applies to stringing together -# non-breaks. -# -# SB1 - N/A -# SB2 - N/A -# SB3 - None -# SB4 - N/A -# SB5 - None -# SB6 - None -# SB7 - End overlaps with beginning of SB7 -# SB8 - End overlaps with beginning of SB7 -# SB8a - End overlaps with beginning of SB6, SB8, SB8a, SB9, SB10, SB11 -# SB9 - None -# SB10 - None -# SB11 - None -# SB998 - N/A -# -# SB8a is in particular quite tricky to get right without look-ahead, since it -# allows ping-ponging between match rules SB8a and SB9-11, where SB9-11 -# otherwise indicate that a break has been found. In the regex below, we tackle -# this by only permitting part of SB8a to match inside our core non-breaking -# repetition. In particular, we only allow the parts of SB8a to match that -# permit the non-breaking components to continue. If a part of SB8a matches -# that guarantees a pop out to SB9-11, (like `STerm STerm`), then we let it -# happen. This still isn't correct because an SContinue might be seen which -# would allow moving back into SB998 and thus the non-breaking repetition, so -# we handle that case as well. -# -# Finally, the last complication here is the sprinkling of $Ex* everywhere. -# This essentially corresponds to the implementation of SB5 by following -# UAX #29's recommendation in S6.2. Essentially, we use it avoid ever breaking -# in the middle of a grapheme cluster. - -CR="\p{sb=CR}" -LF="\p{sb=LF}" -Sep="\p{sb=Sep}" -Close="\p{sb=Close}" -Sp="\p{sb=Sp}" -STerm="\p{sb=STerm}" -ATerm="\p{sb=ATerm}" -SContinue="\p{sb=SContinue}" -Numeric="\p{sb=Numeric}" -Upper="\p{sb=Upper}" -Lower="\p{sb=Lower}" -OLetter="\p{sb=OLetter}" - -Ex="[\p{sb=Extend}\p{sb=Format}]" -ParaSep="[$Sep $CR $LF]" -SATerm="[$STerm $ATerm]" - -LetterSepTerm="[$OLetter $Upper $Lower $ParaSep $SATerm]" - -echo "(?x) -( - # SB6 - $ATerm $Ex* - $Numeric - | - # SB7 - [$Upper $Lower] $Ex* $ATerm $Ex* - $Upper $Ex* - # overlap with SB7 - ($ATerm $Ex* $Upper $Ex*)* - | - # SB8 - $ATerm $Ex* $Close* $Ex* $Sp* $Ex* - ([^$LetterSepTerm] $Ex*)* $Lower $Ex* - # overlap with SB7 - ($ATerm $Ex* $Upper $Ex*)* - | - # SB8a - $SATerm $Ex* $Close* $Ex* $Sp* $Ex* - ( - $SContinue - | - $ATerm $Ex* - # Permit repetition of SB8a - (($Close $Ex*)* ($Sp $Ex*)* $SATerm)* - # In order to continue non-breaking matching, we now must observe - # a match with a rule that keeps us in SB6-8a. Otherwise, we've entered - # one of SB9-11 and know that a break must follow. - ( - # overlap with SB6 - $Numeric - | - # overlap with SB8 - ($Close $Ex*)* ($Sp $Ex*)* - ([^$LetterSepTerm] $Ex*)* $Lower $Ex* - # overlap with SB7 - ($ATerm $Ex* $Upper $Ex*)* - | - # overlap with SB8a - ($Close $Ex*)* ($Sp $Ex*)* $SContinue - ) - | - $STerm $Ex* - # Permit repetition of SB8a - (($Close $Ex*)* ($Sp $Ex*)* $SATerm)* - # As with ATerm above, in order to continue non-breaking matching, we - # must now observe a match with a rule that keeps us out of SB9-11. - # For STerm, the only such possibility is to see an SContinue. Anything - # else will result in a break. - ($Close $Ex*)* ($Sp $Ex*)* $SContinue - ) - | - # SB998 - # The logic behind this catch-all is that if we get to this point and - # see a Sep, CR, LF, STerm or ATerm, then it has to fall into one of - # SB9, SB10 or SB11. In the cases of SB9-11, we always find a break since - # SB11 acts as a catch-all to induce a break following a SATerm that isn't - # handled by rules SB6-SB8a. - [^$ParaSep $SATerm] -)* -# The following collapses rules SB3, SB4, part of SB8a, SB9, SB10 and SB11. -($SATerm $Ex* ($Close $Ex*)* ($Sp $Ex*)*)* ($CR $LF | $ParaSep)? -" diff --git a/vendor/bstr/scripts/regex/word.sh b/vendor/bstr/scripts/regex/word.sh deleted file mode 100644 index 78c7a05cf..000000000 --- a/vendor/bstr/scripts/regex/word.sh +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/sh - -# vim: indentexpr= nosmartindent autoindent -# vim: tabstop=2 shiftwidth=2 softtabstop=2 - -# See the comments in regex/sentence.sh for the general approach to how this -# regex was written. -# -# Writing the regex for this was *hard*. It took me two days of hacking to get -# this far, and that was after I had finished the sentence regex, so my brain -# was fully cached on this. Unlike the sentence regex, the rules in the regex -# below don't correspond as nicely to the rules in UAX #29. In particular, the -# UAX #29 rules have a ton of overlap with each other, which requires crazy -# stuff in the regex. I'm not even sure the regex below is 100% correct or even -# minimal, however, I did compare this with the ICU word segmenter on a few -# different corpora, and it produces identical results. (In addition to of -# course passing the UCD tests.) -# -# In general, I consider this approach to be a failure. Firstly, this is -# clearly a write-only regex. Secondly, building the minimized DFA for this is -# incredibly slow. Thirdly, the DFA is itself very large (~240KB). Fourthly, -# reversing this regex (for reverse word iteration) results in a >19MB DFA. -# Yes. That's MB. Wat. And it took 5 minutes to build. -# -# I think we might consider changing our approach to this problem. The normal -# path I've seen, I think, is to decode codepoints one at a time, and then -# thread them through a state machine in the code itself. We could take this -# approach, or possibly combine it with a DFA that tells us which Word_Break -# value a codepoint has. I'd prefer the latter approach, but it requires adding -# RegexSet support to regex-automata. Something that should definitely be done, -# but is a fair amount of work. -# -# Gah. - -CR="\p{wb=CR}" -LF="\p{wb=LF}" -Newline="\p{wb=Newline}" -ZWJ="\p{wb=ZWJ}" -RI="\p{wb=Regional_Indicator}" -Katakana="\p{wb=Katakana}" -HebrewLet="\p{wb=HebrewLetter}" -ALetter="\p{wb=ALetter}" -SingleQuote="\p{wb=SingleQuote}" -DoubleQuote="\p{wb=DoubleQuote}" -MidNumLet="\p{wb=MidNumLet}" -MidLetter="\p{wb=MidLetter}" -MidNum="\p{wb=MidNum}" -Numeric="\p{wb=Numeric}" -ExtendNumLet="\p{wb=ExtendNumLet}" -WSegSpace="\p{wb=WSegSpace}" - -Any="\p{any}" -Ex="[\p{wb=Extend} \p{wb=Format} $ZWJ]" -ExtendPict="\p{Extended_Pictographic}" -AHLetter="[$ALetter $HebrewLet]" -MidNumLetQ="[$MidNumLet $SingleQuote]" - -AHLetterRepeat="$AHLetter $Ex* ([$MidLetter $MidNumLetQ] $Ex* $AHLetter $Ex*)*" -NumericRepeat="$Numeric $Ex* ([$MidNum $MidNumLetQ] $Ex* $Numeric $Ex*)*" - -echo "(?x) -$CR $LF -| -[$Newline $CR $LF] -| -$WSegSpace $WSegSpace+ -| -( - ([^$Newline $CR $LF]? $Ex* $ZWJ $ExtendPict $Ex*)+ - | - ($ExtendNumLet $Ex*)* $AHLetter $Ex* - ( - ( - ($NumericRepeat | $ExtendNumLet $Ex*)* - | - [$MidLetter $MidNumLetQ] $Ex* - ) - $AHLetter $Ex* - )+ - ($NumericRepeat | $ExtendNumLet $Ex*)* - | - ($ExtendNumLet $Ex*)* $AHLetter $Ex* ($NumericRepeat | $ExtendNumLet $Ex*)+ - | - ($ExtendNumLet $Ex*)* $Numeric $Ex* - ( - ( - ($AHLetterRepeat | $ExtendNumLet $Ex*)* - | - [$MidNum $MidNumLetQ] $Ex* - ) - $Numeric $Ex* - )+ - ($AHLetterRepeat | $ExtendNumLet $Ex*)* - | - ($ExtendNumLet $Ex*)* $Numeric $Ex* ($AHLetterRepeat | $ExtendNumLet $Ex*)+ - | - $Katakana $Ex* - (($Katakana | $ExtendNumLet) $Ex*)+ - | - $ExtendNumLet $Ex* - (($ExtendNumLet | $AHLetter | $Numeric | $Katakana) $Ex*)+ -)+ -| -$HebrewLet $Ex* $SingleQuote $Ex* -| -($HebrewLet $Ex* $DoubleQuote $Ex*)+ $HebrewLet $Ex* -| -$RI $Ex* $RI $Ex* -| -$Any $Ex* -" |