summaryrefslogtreecommitdiffstats
path: root/src/regexp/syntax/regexp.go
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-16 19:25:22 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-16 19:25:22 +0000
commitf6ad4dcef54c5ce997a4bad5a6d86de229015700 (patch)
tree7cfa4e31ace5c2bd95c72b154d15af494b2bcbef /src/regexp/syntax/regexp.go
parentInitial commit. (diff)
downloadgolang-1.22-f6ad4dcef54c5ce997a4bad5a6d86de229015700.tar.xz
golang-1.22-f6ad4dcef54c5ce997a4bad5a6d86de229015700.zip
Adding upstream version 1.22.1.upstream/1.22.1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/regexp/syntax/regexp.go')
-rw-r--r--src/regexp/syntax/regexp.go477
1 files changed, 477 insertions, 0 deletions
diff --git a/src/regexp/syntax/regexp.go b/src/regexp/syntax/regexp.go
new file mode 100644
index 0000000..4fa7d0e
--- /dev/null
+++ b/src/regexp/syntax/regexp.go
@@ -0,0 +1,477 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syntax
+
+// Note to implementers:
+// In this package, re is always a *Regexp and r is always a rune.
+
+import (
+ "strconv"
+ "strings"
+ "unicode"
+)
+
+// A Regexp is a node in a regular expression syntax tree.
+type Regexp struct {
+ Op Op // operator
+ Flags Flags
+ Sub []*Regexp // subexpressions, if any
+ Sub0 [1]*Regexp // storage for short Sub
+ Rune []rune // matched runes, for OpLiteral, OpCharClass
+ Rune0 [2]rune // storage for short Rune
+ Min, Max int // min, max for OpRepeat
+ Cap int // capturing index, for OpCapture
+ Name string // capturing name, for OpCapture
+}
+
+//go:generate stringer -type Op -trimprefix Op
+
+// An Op is a single regular expression operator.
+type Op uint8
+
+// Operators are listed in precedence order, tightest binding to weakest.
+// Character class operators are listed simplest to most complex
+// (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar).
+
+const (
+ OpNoMatch Op = 1 + iota // matches no strings
+ OpEmptyMatch // matches empty string
+ OpLiteral // matches Runes sequence
+ OpCharClass // matches Runes interpreted as range pair list
+ OpAnyCharNotNL // matches any character except newline
+ OpAnyChar // matches any character
+ OpBeginLine // matches empty string at beginning of line
+ OpEndLine // matches empty string at end of line
+ OpBeginText // matches empty string at beginning of text
+ OpEndText // matches empty string at end of text
+ OpWordBoundary // matches word boundary `\b`
+ OpNoWordBoundary // matches word non-boundary `\B`
+ OpCapture // capturing subexpression with index Cap, optional name Name
+ OpStar // matches Sub[0] zero or more times
+ OpPlus // matches Sub[0] one or more times
+ OpQuest // matches Sub[0] zero or one times
+ OpRepeat // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit)
+ OpConcat // matches concatenation of Subs
+ OpAlternate // matches alternation of Subs
+)
+
+const opPseudo Op = 128 // where pseudo-ops start
+
+// Equal reports whether x and y have identical structure.
+func (x *Regexp) Equal(y *Regexp) bool {
+ if x == nil || y == nil {
+ return x == y
+ }
+ if x.Op != y.Op {
+ return false
+ }
+ switch x.Op {
+ case OpEndText:
+ // The parse flags remember whether this is \z or \Z.
+ if x.Flags&WasDollar != y.Flags&WasDollar {
+ return false
+ }
+
+ case OpLiteral, OpCharClass:
+ if len(x.Rune) != len(y.Rune) {
+ return false
+ }
+ for i, r := range x.Rune {
+ if r != y.Rune[i] {
+ return false
+ }
+ }
+
+ case OpAlternate, OpConcat:
+ if len(x.Sub) != len(y.Sub) {
+ return false
+ }
+ for i, sub := range x.Sub {
+ if !sub.Equal(y.Sub[i]) {
+ return false
+ }
+ }
+
+ case OpStar, OpPlus, OpQuest:
+ if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) {
+ return false
+ }
+
+ case OpRepeat:
+ if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) {
+ return false
+ }
+
+ case OpCapture:
+ if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) {
+ return false
+ }
+ }
+ return true
+}
+
+// printFlags is a bit set indicating which flags (including non-capturing parens) to print around a regexp.
+type printFlags uint8
+
+const (
+ flagI printFlags = 1 << iota // (?i:
+ flagM // (?m:
+ flagS // (?s:
+ flagOff // )
+ flagPrec // (?: )
+ negShift = 5 // flagI<<negShift is (?-i:
+)
+
+// addSpan enables the flags f around start..last,
+// by setting flags[start] = f and flags[last] = flagOff.
+func addSpan(start, last *Regexp, f printFlags, flags *map[*Regexp]printFlags) {
+ if *flags == nil {
+ *flags = make(map[*Regexp]printFlags)
+ }
+ (*flags)[start] = f
+ (*flags)[last] |= flagOff // maybe start==last
+}
+
+// calcFlags calculates the flags to print around each subexpression in re,
+// storing that information in (*flags)[sub] for each affected subexpression.
+// The first time an entry needs to be written to *flags, calcFlags allocates the map.
+// calcFlags also calculates the flags that must be active or can't be active
+// around re and returns those flags.
+func calcFlags(re *Regexp, flags *map[*Regexp]printFlags) (must, cant printFlags) {
+ switch re.Op {
+ default:
+ return 0, 0
+
+ case OpLiteral:
+ // If literal is fold-sensitive, return (flagI, 0) or (0, flagI)
+ // according to whether (?i) is active.
+ // If literal is not fold-sensitive, return 0, 0.
+ for _, r := range re.Rune {
+ if minFold <= r && r <= maxFold && unicode.SimpleFold(r) != r {
+ if re.Flags&FoldCase != 0 {
+ return flagI, 0
+ } else {
+ return 0, flagI
+ }
+ }
+ }
+ return 0, 0
+
+ case OpCharClass:
+ // If literal is fold-sensitive, return 0, flagI - (?i) has been compiled out.
+ // If literal is not fold-sensitive, return 0, 0.
+ for i := 0; i < len(re.Rune); i += 2 {
+ lo := max(minFold, re.Rune[i])
+ hi := min(maxFold, re.Rune[i+1])
+ for r := lo; r <= hi; r++ {
+ for f := unicode.SimpleFold(r); f != r; f = unicode.SimpleFold(f) {
+ if !(lo <= f && f <= hi) && !inCharClass(f, re.Rune) {
+ return 0, flagI
+ }
+ }
+ }
+ }
+ return 0, 0
+
+ case OpAnyCharNotNL: // (?-s).
+ return 0, flagS
+
+ case OpAnyChar: // (?s).
+ return flagS, 0
+
+ case OpBeginLine, OpEndLine: // (?m)^ (?m)$
+ return flagM, 0
+
+ case OpEndText:
+ if re.Flags&WasDollar != 0 { // (?-m)$
+ return 0, flagM
+ }
+ return 0, 0
+
+ case OpCapture, OpStar, OpPlus, OpQuest, OpRepeat:
+ return calcFlags(re.Sub[0], flags)
+
+ case OpConcat, OpAlternate:
+ // Gather the must and cant for each subexpression.
+ // When we find a conflicting subexpression, insert the necessary
+ // flags around the previously identified span and start over.
+ var must, cant, allCant printFlags
+ start := 0
+ last := 0
+ did := false
+ for i, sub := range re.Sub {
+ subMust, subCant := calcFlags(sub, flags)
+ if must&subCant != 0 || subMust&cant != 0 {
+ if must != 0 {
+ addSpan(re.Sub[start], re.Sub[last], must, flags)
+ }
+ must = 0
+ cant = 0
+ start = i
+ did = true
+ }
+ must |= subMust
+ cant |= subCant
+ allCant |= subCant
+ if subMust != 0 {
+ last = i
+ }
+ if must == 0 && start == i {
+ start++
+ }
+ }
+ if !did {
+ // No conflicts: pass the accumulated must and cant upward.
+ return must, cant
+ }
+ if must != 0 {
+ // Conflicts found; need to finish final span.
+ addSpan(re.Sub[start], re.Sub[last], must, flags)
+ }
+ return 0, allCant
+ }
+}
+
+// writeRegexp writes the Perl syntax for the regular expression re to b.
+func writeRegexp(b *strings.Builder, re *Regexp, f printFlags, flags map[*Regexp]printFlags) {
+ f |= flags[re]
+ if f&flagPrec != 0 && f&^(flagOff|flagPrec) != 0 && f&flagOff != 0 {
+ // flagPrec is redundant with other flags being added and terminated
+ f &^= flagPrec
+ }
+ if f&^(flagOff|flagPrec) != 0 {
+ b.WriteString(`(?`)
+ if f&flagI != 0 {
+ b.WriteString(`i`)
+ }
+ if f&flagM != 0 {
+ b.WriteString(`m`)
+ }
+ if f&flagS != 0 {
+ b.WriteString(`s`)
+ }
+ if f&((flagM|flagS)<<negShift) != 0 {
+ b.WriteString(`-`)
+ if f&(flagM<<negShift) != 0 {
+ b.WriteString(`m`)
+ }
+ if f&(flagS<<negShift) != 0 {
+ b.WriteString(`s`)
+ }
+ }
+ b.WriteString(`:`)
+ }
+ if f&flagOff != 0 {
+ defer b.WriteString(`)`)
+ }
+ if f&flagPrec != 0 {
+ b.WriteString(`(?:`)
+ defer b.WriteString(`)`)
+ }
+
+ switch re.Op {
+ default:
+ b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
+ case OpNoMatch:
+ b.WriteString(`[^\x00-\x{10FFFF}]`)
+ case OpEmptyMatch:
+ b.WriteString(`(?:)`)
+ case OpLiteral:
+ for _, r := range re.Rune {
+ escape(b, r, false)
+ }
+ case OpCharClass:
+ if len(re.Rune)%2 != 0 {
+ b.WriteString(`[invalid char class]`)
+ break
+ }
+ b.WriteRune('[')
+ if len(re.Rune) == 0 {
+ b.WriteString(`^\x00-\x{10FFFF}`)
+ } else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune && len(re.Rune) > 2 {
+ // Contains 0 and MaxRune. Probably a negated class.
+ // Print the gaps.
+ b.WriteRune('^')
+ for i := 1; i < len(re.Rune)-1; i += 2 {
+ lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
+ escape(b, lo, lo == '-')
+ if lo != hi {
+ if hi != lo+1 {
+ b.WriteRune('-')
+ }
+ escape(b, hi, hi == '-')
+ }
+ }
+ } else {
+ for i := 0; i < len(re.Rune); i += 2 {
+ lo, hi := re.Rune[i], re.Rune[i+1]
+ escape(b, lo, lo == '-')
+ if lo != hi {
+ if hi != lo+1 {
+ b.WriteRune('-')
+ }
+ escape(b, hi, hi == '-')
+ }
+ }
+ }
+ b.WriteRune(']')
+ case OpAnyCharNotNL, OpAnyChar:
+ b.WriteString(`.`)
+ case OpBeginLine:
+ b.WriteString(`^`)
+ case OpEndLine:
+ b.WriteString(`$`)
+ case OpBeginText:
+ b.WriteString(`\A`)
+ case OpEndText:
+ if re.Flags&WasDollar != 0 {
+ b.WriteString(`$`)
+ } else {
+ b.WriteString(`\z`)
+ }
+ case OpWordBoundary:
+ b.WriteString(`\b`)
+ case OpNoWordBoundary:
+ b.WriteString(`\B`)
+ case OpCapture:
+ if re.Name != "" {
+ b.WriteString(`(?P<`)
+ b.WriteString(re.Name)
+ b.WriteRune('>')
+ } else {
+ b.WriteRune('(')
+ }
+ if re.Sub[0].Op != OpEmptyMatch {
+ writeRegexp(b, re.Sub[0], flags[re.Sub[0]], flags)
+ }
+ b.WriteRune(')')
+ case OpStar, OpPlus, OpQuest, OpRepeat:
+ p := printFlags(0)
+ sub := re.Sub[0]
+ if sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
+ p = flagPrec
+ }
+ writeRegexp(b, sub, p, flags)
+
+ switch re.Op {
+ case OpStar:
+ b.WriteRune('*')
+ case OpPlus:
+ b.WriteRune('+')
+ case OpQuest:
+ b.WriteRune('?')
+ case OpRepeat:
+ b.WriteRune('{')
+ b.WriteString(strconv.Itoa(re.Min))
+ if re.Max != re.Min {
+ b.WriteRune(',')
+ if re.Max >= 0 {
+ b.WriteString(strconv.Itoa(re.Max))
+ }
+ }
+ b.WriteRune('}')
+ }
+ if re.Flags&NonGreedy != 0 {
+ b.WriteRune('?')
+ }
+ case OpConcat:
+ for _, sub := range re.Sub {
+ p := printFlags(0)
+ if sub.Op == OpAlternate {
+ p = flagPrec
+ }
+ writeRegexp(b, sub, p, flags)
+ }
+ case OpAlternate:
+ for i, sub := range re.Sub {
+ if i > 0 {
+ b.WriteRune('|')
+ }
+ writeRegexp(b, sub, 0, flags)
+ }
+ }
+}
+
+func (re *Regexp) String() string {
+ var b strings.Builder
+ var flags map[*Regexp]printFlags
+ must, cant := calcFlags(re, &flags)
+ must |= (cant &^ flagI) << negShift
+ if must != 0 {
+ must |= flagOff
+ }
+ writeRegexp(&b, re, must, flags)
+ return b.String()
+}
+
+const meta = `\.+*?()|[]{}^$`
+
+func escape(b *strings.Builder, r rune, force bool) {
+ if unicode.IsPrint(r) {
+ if strings.ContainsRune(meta, r) || force {
+ b.WriteRune('\\')
+ }
+ b.WriteRune(r)
+ return
+ }
+
+ switch r {
+ case '\a':
+ b.WriteString(`\a`)
+ case '\f':
+ b.WriteString(`\f`)
+ case '\n':
+ b.WriteString(`\n`)
+ case '\r':
+ b.WriteString(`\r`)
+ case '\t':
+ b.WriteString(`\t`)
+ case '\v':
+ b.WriteString(`\v`)
+ default:
+ if r < 0x100 {
+ b.WriteString(`\x`)
+ s := strconv.FormatInt(int64(r), 16)
+ if len(s) == 1 {
+ b.WriteRune('0')
+ }
+ b.WriteString(s)
+ break
+ }
+ b.WriteString(`\x{`)
+ b.WriteString(strconv.FormatInt(int64(r), 16))
+ b.WriteString(`}`)
+ }
+}
+
+// MaxCap walks the regexp to find the maximum capture index.
+func (re *Regexp) MaxCap() int {
+ m := 0
+ if re.Op == OpCapture {
+ m = re.Cap
+ }
+ for _, sub := range re.Sub {
+ if n := sub.MaxCap(); m < n {
+ m = n
+ }
+ }
+ return m
+}
+
+// CapNames walks the regexp to find the names of capturing groups.
+func (re *Regexp) CapNames() []string {
+ names := make([]string, re.MaxCap()+1)
+ re.capNames(names)
+ return names
+}
+
+func (re *Regexp) capNames(names []string) {
+ if re.Op == OpCapture {
+ names[re.Cap] = re.Name
+ }
+ for _, sub := range re.Sub {
+ sub.capNames(names)
+ }
+}