summaryrefslogtreecommitdiffstats
path: root/src/regexp/syntax/regexp.go
diff options
context:
space:
mode:
Diffstat (limited to 'src/regexp/syntax/regexp.go')
-rw-r--r--src/regexp/syntax/regexp.go320
1 files changed, 320 insertions, 0 deletions
diff --git a/src/regexp/syntax/regexp.go b/src/regexp/syntax/regexp.go
new file mode 100644
index 0000000..3a4d2d2
--- /dev/null
+++ b/src/regexp/syntax/regexp.go
@@ -0,0 +1,320 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syntax
+
+// Note to implementers:
+// In this package, re is always a *Regexp and r is always a rune.
+
+import (
+ "strconv"
+ "strings"
+ "unicode"
+)
+
+// A Regexp is a node in a regular expression syntax tree.
+type Regexp struct {
+ Op Op // operator
+ Flags Flags
+ Sub []*Regexp // subexpressions, if any
+ Sub0 [1]*Regexp // storage for short Sub
+ Rune []rune // matched runes, for OpLiteral, OpCharClass
+ Rune0 [2]rune // storage for short Rune
+ Min, Max int // min, max for OpRepeat
+ Cap int // capturing index, for OpCapture
+ Name string // capturing name, for OpCapture
+}
+
+//go:generate stringer -type Op -trimprefix Op
+
+// An Op is a single regular expression operator.
+type Op uint8
+
+// Operators are listed in precedence order, tightest binding to weakest.
+// Character class operators are listed simplest to most complex
+// (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar).
+
+const (
+ OpNoMatch Op = 1 + iota // matches no strings
+ OpEmptyMatch // matches empty string
+ OpLiteral // matches Runes sequence
+ OpCharClass // matches Runes interpreted as range pair list
+ OpAnyCharNotNL // matches any character except newline
+ OpAnyChar // matches any character
+ OpBeginLine // matches empty string at beginning of line
+ OpEndLine // matches empty string at end of line
+ OpBeginText // matches empty string at beginning of text
+ OpEndText // matches empty string at end of text
+ OpWordBoundary // matches word boundary `\b`
+ OpNoWordBoundary // matches word non-boundary `\B`
+ OpCapture // capturing subexpression with index Cap, optional name Name
+ OpStar // matches Sub[0] zero or more times
+ OpPlus // matches Sub[0] one or more times
+ OpQuest // matches Sub[0] zero or one times
+ OpRepeat // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit)
+ OpConcat // matches concatenation of Subs
+ OpAlternate // matches alternation of Subs
+)
+
+const opPseudo Op = 128 // where pseudo-ops start
+
+// Equal reports whether x and y have identical structure.
+func (x *Regexp) Equal(y *Regexp) bool {
+ if x == nil || y == nil {
+ return x == y
+ }
+ if x.Op != y.Op {
+ return false
+ }
+ switch x.Op {
+ case OpEndText:
+ // The parse flags remember whether this is \z or \Z.
+ if x.Flags&WasDollar != y.Flags&WasDollar {
+ return false
+ }
+
+ case OpLiteral, OpCharClass:
+ if len(x.Rune) != len(y.Rune) {
+ return false
+ }
+ for i, r := range x.Rune {
+ if r != y.Rune[i] {
+ return false
+ }
+ }
+
+ case OpAlternate, OpConcat:
+ if len(x.Sub) != len(y.Sub) {
+ return false
+ }
+ for i, sub := range x.Sub {
+ if !sub.Equal(y.Sub[i]) {
+ return false
+ }
+ }
+
+ case OpStar, OpPlus, OpQuest:
+ if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) {
+ return false
+ }
+
+ case OpRepeat:
+ if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) {
+ return false
+ }
+
+ case OpCapture:
+ if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) {
+ return false
+ }
+ }
+ return true
+}
+
+// writeRegexp writes the Perl syntax for the regular expression re to b.
+func writeRegexp(b *strings.Builder, re *Regexp) {
+ switch re.Op {
+ default:
+ b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
+ case OpNoMatch:
+ b.WriteString(`[^\x00-\x{10FFFF}]`)
+ case OpEmptyMatch:
+ b.WriteString(`(?:)`)
+ case OpLiteral:
+ if re.Flags&FoldCase != 0 {
+ b.WriteString(`(?i:`)
+ }
+ for _, r := range re.Rune {
+ escape(b, r, false)
+ }
+ if re.Flags&FoldCase != 0 {
+ b.WriteString(`)`)
+ }
+ case OpCharClass:
+ if len(re.Rune)%2 != 0 {
+ b.WriteString(`[invalid char class]`)
+ break
+ }
+ b.WriteRune('[')
+ if len(re.Rune) == 0 {
+ b.WriteString(`^\x00-\x{10FFFF}`)
+ } else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune && len(re.Rune) > 2 {
+ // Contains 0 and MaxRune. Probably a negated class.
+ // Print the gaps.
+ b.WriteRune('^')
+ for i := 1; i < len(re.Rune)-1; i += 2 {
+ lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
+ escape(b, lo, lo == '-')
+ if lo != hi {
+ b.WriteRune('-')
+ escape(b, hi, hi == '-')
+ }
+ }
+ } else {
+ for i := 0; i < len(re.Rune); i += 2 {
+ lo, hi := re.Rune[i], re.Rune[i+1]
+ escape(b, lo, lo == '-')
+ if lo != hi {
+ b.WriteRune('-')
+ escape(b, hi, hi == '-')
+ }
+ }
+ }
+ b.WriteRune(']')
+ case OpAnyCharNotNL:
+ b.WriteString(`(?-s:.)`)
+ case OpAnyChar:
+ b.WriteString(`(?s:.)`)
+ case OpBeginLine:
+ b.WriteString(`(?m:^)`)
+ case OpEndLine:
+ b.WriteString(`(?m:$)`)
+ case OpBeginText:
+ b.WriteString(`\A`)
+ case OpEndText:
+ if re.Flags&WasDollar != 0 {
+ b.WriteString(`(?-m:$)`)
+ } else {
+ b.WriteString(`\z`)
+ }
+ case OpWordBoundary:
+ b.WriteString(`\b`)
+ case OpNoWordBoundary:
+ b.WriteString(`\B`)
+ case OpCapture:
+ if re.Name != "" {
+ b.WriteString(`(?P<`)
+ b.WriteString(re.Name)
+ b.WriteRune('>')
+ } else {
+ b.WriteRune('(')
+ }
+ if re.Sub[0].Op != OpEmptyMatch {
+ writeRegexp(b, re.Sub[0])
+ }
+ b.WriteRune(')')
+ case OpStar, OpPlus, OpQuest, OpRepeat:
+ if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
+ b.WriteString(`(?:`)
+ writeRegexp(b, sub)
+ b.WriteString(`)`)
+ } else {
+ writeRegexp(b, sub)
+ }
+ switch re.Op {
+ case OpStar:
+ b.WriteRune('*')
+ case OpPlus:
+ b.WriteRune('+')
+ case OpQuest:
+ b.WriteRune('?')
+ case OpRepeat:
+ b.WriteRune('{')
+ b.WriteString(strconv.Itoa(re.Min))
+ if re.Max != re.Min {
+ b.WriteRune(',')
+ if re.Max >= 0 {
+ b.WriteString(strconv.Itoa(re.Max))
+ }
+ }
+ b.WriteRune('}')
+ }
+ if re.Flags&NonGreedy != 0 {
+ b.WriteRune('?')
+ }
+ case OpConcat:
+ for _, sub := range re.Sub {
+ if sub.Op == OpAlternate {
+ b.WriteString(`(?:`)
+ writeRegexp(b, sub)
+ b.WriteString(`)`)
+ } else {
+ writeRegexp(b, sub)
+ }
+ }
+ case OpAlternate:
+ for i, sub := range re.Sub {
+ if i > 0 {
+ b.WriteRune('|')
+ }
+ writeRegexp(b, sub)
+ }
+ }
+}
+
+func (re *Regexp) String() string {
+ var b strings.Builder
+ writeRegexp(&b, re)
+ return b.String()
+}
+
+const meta = `\.+*?()|[]{}^$`
+
+func escape(b *strings.Builder, r rune, force bool) {
+ if unicode.IsPrint(r) {
+ if strings.ContainsRune(meta, r) || force {
+ b.WriteRune('\\')
+ }
+ b.WriteRune(r)
+ return
+ }
+
+ switch r {
+ case '\a':
+ b.WriteString(`\a`)
+ case '\f':
+ b.WriteString(`\f`)
+ case '\n':
+ b.WriteString(`\n`)
+ case '\r':
+ b.WriteString(`\r`)
+ case '\t':
+ b.WriteString(`\t`)
+ case '\v':
+ b.WriteString(`\v`)
+ default:
+ if r < 0x100 {
+ b.WriteString(`\x`)
+ s := strconv.FormatInt(int64(r), 16)
+ if len(s) == 1 {
+ b.WriteRune('0')
+ }
+ b.WriteString(s)
+ break
+ }
+ b.WriteString(`\x{`)
+ b.WriteString(strconv.FormatInt(int64(r), 16))
+ b.WriteString(`}`)
+ }
+}
+
+// MaxCap walks the regexp to find the maximum capture index.
+func (re *Regexp) MaxCap() int {
+ m := 0
+ if re.Op == OpCapture {
+ m = re.Cap
+ }
+ for _, sub := range re.Sub {
+ if n := sub.MaxCap(); m < n {
+ m = n
+ }
+ }
+ return m
+}
+
+// CapNames walks the regexp to find the names of capturing groups.
+func (re *Regexp) CapNames() []string {
+ names := make([]string, re.MaxCap()+1)
+ re.capNames(names)
+ return names
+}
+
+func (re *Regexp) capNames(names []string) {
+ if re.Op == OpCapture {
+ names[re.Cap] = re.Name
+ }
+ for _, sub := range re.Sub {
+ sub.capNames(names)
+ }
+}