diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-16 19:25:22 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-16 19:25:22 +0000 |
commit | f6ad4dcef54c5ce997a4bad5a6d86de229015700 (patch) | |
tree | 7cfa4e31ace5c2bd95c72b154d15af494b2bcbef /src/regexp/syntax/regexp.go | |
parent | Initial commit. (diff) | |
download | golang-1.22-f6ad4dcef54c5ce997a4bad5a6d86de229015700.tar.xz golang-1.22-f6ad4dcef54c5ce997a4bad5a6d86de229015700.zip |
Adding upstream version 1.22.1.upstream/1.22.1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/regexp/syntax/regexp.go')
-rw-r--r-- | src/regexp/syntax/regexp.go | 477 |
1 files changed, 477 insertions, 0 deletions
diff --git a/src/regexp/syntax/regexp.go b/src/regexp/syntax/regexp.go new file mode 100644 index 0000000..4fa7d0e --- /dev/null +++ b/src/regexp/syntax/regexp.go @@ -0,0 +1,477 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package syntax + +// Note to implementers: +// In this package, re is always a *Regexp and r is always a rune. + +import ( + "strconv" + "strings" + "unicode" +) + +// A Regexp is a node in a regular expression syntax tree. +type Regexp struct { + Op Op // operator + Flags Flags + Sub []*Regexp // subexpressions, if any + Sub0 [1]*Regexp // storage for short Sub + Rune []rune // matched runes, for OpLiteral, OpCharClass + Rune0 [2]rune // storage for short Rune + Min, Max int // min, max for OpRepeat + Cap int // capturing index, for OpCapture + Name string // capturing name, for OpCapture +} + +//go:generate stringer -type Op -trimprefix Op + +// An Op is a single regular expression operator. +type Op uint8 + +// Operators are listed in precedence order, tightest binding to weakest. +// Character class operators are listed simplest to most complex +// (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar). + +const ( + OpNoMatch Op = 1 + iota // matches no strings + OpEmptyMatch // matches empty string + OpLiteral // matches Runes sequence + OpCharClass // matches Runes interpreted as range pair list + OpAnyCharNotNL // matches any character except newline + OpAnyChar // matches any character + OpBeginLine // matches empty string at beginning of line + OpEndLine // matches empty string at end of line + OpBeginText // matches empty string at beginning of text + OpEndText // matches empty string at end of text + OpWordBoundary // matches word boundary `\b` + OpNoWordBoundary // matches word non-boundary `\B` + OpCapture // capturing subexpression with index Cap, optional name Name + OpStar // matches Sub[0] zero or more times + OpPlus // matches Sub[0] one or more times + OpQuest // matches Sub[0] zero or one times + OpRepeat // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit) + OpConcat // matches concatenation of Subs + OpAlternate // matches alternation of Subs +) + +const opPseudo Op = 128 // where pseudo-ops start + +// Equal reports whether x and y have identical structure. +func (x *Regexp) Equal(y *Regexp) bool { + if x == nil || y == nil { + return x == y + } + if x.Op != y.Op { + return false + } + switch x.Op { + case OpEndText: + // The parse flags remember whether this is \z or \Z. + if x.Flags&WasDollar != y.Flags&WasDollar { + return false + } + + case OpLiteral, OpCharClass: + if len(x.Rune) != len(y.Rune) { + return false + } + for i, r := range x.Rune { + if r != y.Rune[i] { + return false + } + } + + case OpAlternate, OpConcat: + if len(x.Sub) != len(y.Sub) { + return false + } + for i, sub := range x.Sub { + if !sub.Equal(y.Sub[i]) { + return false + } + } + + case OpStar, OpPlus, OpQuest: + if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) { + return false + } + + case OpRepeat: + if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) { + return false + } + + case OpCapture: + if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) { + return false + } + } + return true +} + +// printFlags is a bit set indicating which flags (including non-capturing parens) to print around a regexp. +type printFlags uint8 + +const ( + flagI printFlags = 1 << iota // (?i: + flagM // (?m: + flagS // (?s: + flagOff // ) + flagPrec // (?: ) + negShift = 5 // flagI<<negShift is (?-i: +) + +// addSpan enables the flags f around start..last, +// by setting flags[start] = f and flags[last] = flagOff. +func addSpan(start, last *Regexp, f printFlags, flags *map[*Regexp]printFlags) { + if *flags == nil { + *flags = make(map[*Regexp]printFlags) + } + (*flags)[start] = f + (*flags)[last] |= flagOff // maybe start==last +} + +// calcFlags calculates the flags to print around each subexpression in re, +// storing that information in (*flags)[sub] for each affected subexpression. +// The first time an entry needs to be written to *flags, calcFlags allocates the map. +// calcFlags also calculates the flags that must be active or can't be active +// around re and returns those flags. +func calcFlags(re *Regexp, flags *map[*Regexp]printFlags) (must, cant printFlags) { + switch re.Op { + default: + return 0, 0 + + case OpLiteral: + // If literal is fold-sensitive, return (flagI, 0) or (0, flagI) + // according to whether (?i) is active. + // If literal is not fold-sensitive, return 0, 0. + for _, r := range re.Rune { + if minFold <= r && r <= maxFold && unicode.SimpleFold(r) != r { + if re.Flags&FoldCase != 0 { + return flagI, 0 + } else { + return 0, flagI + } + } + } + return 0, 0 + + case OpCharClass: + // If literal is fold-sensitive, return 0, flagI - (?i) has been compiled out. + // If literal is not fold-sensitive, return 0, 0. + for i := 0; i < len(re.Rune); i += 2 { + lo := max(minFold, re.Rune[i]) + hi := min(maxFold, re.Rune[i+1]) + for r := lo; r <= hi; r++ { + for f := unicode.SimpleFold(r); f != r; f = unicode.SimpleFold(f) { + if !(lo <= f && f <= hi) && !inCharClass(f, re.Rune) { + return 0, flagI + } + } + } + } + return 0, 0 + + case OpAnyCharNotNL: // (?-s). + return 0, flagS + + case OpAnyChar: // (?s). + return flagS, 0 + + case OpBeginLine, OpEndLine: // (?m)^ (?m)$ + return flagM, 0 + + case OpEndText: + if re.Flags&WasDollar != 0 { // (?-m)$ + return 0, flagM + } + return 0, 0 + + case OpCapture, OpStar, OpPlus, OpQuest, OpRepeat: + return calcFlags(re.Sub[0], flags) + + case OpConcat, OpAlternate: + // Gather the must and cant for each subexpression. + // When we find a conflicting subexpression, insert the necessary + // flags around the previously identified span and start over. + var must, cant, allCant printFlags + start := 0 + last := 0 + did := false + for i, sub := range re.Sub { + subMust, subCant := calcFlags(sub, flags) + if must&subCant != 0 || subMust&cant != 0 { + if must != 0 { + addSpan(re.Sub[start], re.Sub[last], must, flags) + } + must = 0 + cant = 0 + start = i + did = true + } + must |= subMust + cant |= subCant + allCant |= subCant + if subMust != 0 { + last = i + } + if must == 0 && start == i { + start++ + } + } + if !did { + // No conflicts: pass the accumulated must and cant upward. + return must, cant + } + if must != 0 { + // Conflicts found; need to finish final span. + addSpan(re.Sub[start], re.Sub[last], must, flags) + } + return 0, allCant + } +} + +// writeRegexp writes the Perl syntax for the regular expression re to b. +func writeRegexp(b *strings.Builder, re *Regexp, f printFlags, flags map[*Regexp]printFlags) { + f |= flags[re] + if f&flagPrec != 0 && f&^(flagOff|flagPrec) != 0 && f&flagOff != 0 { + // flagPrec is redundant with other flags being added and terminated + f &^= flagPrec + } + if f&^(flagOff|flagPrec) != 0 { + b.WriteString(`(?`) + if f&flagI != 0 { + b.WriteString(`i`) + } + if f&flagM != 0 { + b.WriteString(`m`) + } + if f&flagS != 0 { + b.WriteString(`s`) + } + if f&((flagM|flagS)<<negShift) != 0 { + b.WriteString(`-`) + if f&(flagM<<negShift) != 0 { + b.WriteString(`m`) + } + if f&(flagS<<negShift) != 0 { + b.WriteString(`s`) + } + } + b.WriteString(`:`) + } + if f&flagOff != 0 { + defer b.WriteString(`)`) + } + if f&flagPrec != 0 { + b.WriteString(`(?:`) + defer b.WriteString(`)`) + } + + switch re.Op { + default: + b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">") + case OpNoMatch: + b.WriteString(`[^\x00-\x{10FFFF}]`) + case OpEmptyMatch: + b.WriteString(`(?:)`) + case OpLiteral: + for _, r := range re.Rune { + escape(b, r, false) + } + case OpCharClass: + if len(re.Rune)%2 != 0 { + b.WriteString(`[invalid char class]`) + break + } + b.WriteRune('[') + if len(re.Rune) == 0 { + b.WriteString(`^\x00-\x{10FFFF}`) + } else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune && len(re.Rune) > 2 { + // Contains 0 and MaxRune. Probably a negated class. + // Print the gaps. + b.WriteRune('^') + for i := 1; i < len(re.Rune)-1; i += 2 { + lo, hi := re.Rune[i]+1, re.Rune[i+1]-1 + escape(b, lo, lo == '-') + if lo != hi { + if hi != lo+1 { + b.WriteRune('-') + } + escape(b, hi, hi == '-') + } + } + } else { + for i := 0; i < len(re.Rune); i += 2 { + lo, hi := re.Rune[i], re.Rune[i+1] + escape(b, lo, lo == '-') + if lo != hi { + if hi != lo+1 { + b.WriteRune('-') + } + escape(b, hi, hi == '-') + } + } + } + b.WriteRune(']') + case OpAnyCharNotNL, OpAnyChar: + b.WriteString(`.`) + case OpBeginLine: + b.WriteString(`^`) + case OpEndLine: + b.WriteString(`$`) + case OpBeginText: + b.WriteString(`\A`) + case OpEndText: + if re.Flags&WasDollar != 0 { + b.WriteString(`$`) + } else { + b.WriteString(`\z`) + } + case OpWordBoundary: + b.WriteString(`\b`) + case OpNoWordBoundary: + b.WriteString(`\B`) + case OpCapture: + if re.Name != "" { + b.WriteString(`(?P<`) + b.WriteString(re.Name) + b.WriteRune('>') + } else { + b.WriteRune('(') + } + if re.Sub[0].Op != OpEmptyMatch { + writeRegexp(b, re.Sub[0], flags[re.Sub[0]], flags) + } + b.WriteRune(')') + case OpStar, OpPlus, OpQuest, OpRepeat: + p := printFlags(0) + sub := re.Sub[0] + if sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 { + p = flagPrec + } + writeRegexp(b, sub, p, flags) + + switch re.Op { + case OpStar: + b.WriteRune('*') + case OpPlus: + b.WriteRune('+') + case OpQuest: + b.WriteRune('?') + case OpRepeat: + b.WriteRune('{') + b.WriteString(strconv.Itoa(re.Min)) + if re.Max != re.Min { + b.WriteRune(',') + if re.Max >= 0 { + b.WriteString(strconv.Itoa(re.Max)) + } + } + b.WriteRune('}') + } + if re.Flags&NonGreedy != 0 { + b.WriteRune('?') + } + case OpConcat: + for _, sub := range re.Sub { + p := printFlags(0) + if sub.Op == OpAlternate { + p = flagPrec + } + writeRegexp(b, sub, p, flags) + } + case OpAlternate: + for i, sub := range re.Sub { + if i > 0 { + b.WriteRune('|') + } + writeRegexp(b, sub, 0, flags) + } + } +} + +func (re *Regexp) String() string { + var b strings.Builder + var flags map[*Regexp]printFlags + must, cant := calcFlags(re, &flags) + must |= (cant &^ flagI) << negShift + if must != 0 { + must |= flagOff + } + writeRegexp(&b, re, must, flags) + return b.String() +} + +const meta = `\.+*?()|[]{}^$` + +func escape(b *strings.Builder, r rune, force bool) { + if unicode.IsPrint(r) { + if strings.ContainsRune(meta, r) || force { + b.WriteRune('\\') + } + b.WriteRune(r) + return + } + + switch r { + case '\a': + b.WriteString(`\a`) + case '\f': + b.WriteString(`\f`) + case '\n': + b.WriteString(`\n`) + case '\r': + b.WriteString(`\r`) + case '\t': + b.WriteString(`\t`) + case '\v': + b.WriteString(`\v`) + default: + if r < 0x100 { + b.WriteString(`\x`) + s := strconv.FormatInt(int64(r), 16) + if len(s) == 1 { + b.WriteRune('0') + } + b.WriteString(s) + break + } + b.WriteString(`\x{`) + b.WriteString(strconv.FormatInt(int64(r), 16)) + b.WriteString(`}`) + } +} + +// MaxCap walks the regexp to find the maximum capture index. +func (re *Regexp) MaxCap() int { + m := 0 + if re.Op == OpCapture { + m = re.Cap + } + for _, sub := range re.Sub { + if n := sub.MaxCap(); m < n { + m = n + } + } + return m +} + +// CapNames walks the regexp to find the names of capturing groups. +func (re *Regexp) CapNames() []string { + names := make([]string, re.MaxCap()+1) + re.capNames(names) + return names +} + +func (re *Regexp) capNames(names []string) { + if re.Op == OpCapture { + names[re.Cap] = re.Name + } + for _, sub := range re.Sub { + sub.capNames(names) + } +} |