summaryrefslogtreecommitdiffstats
path: root/src/regexp/syntax
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-16 19:23:18 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-16 19:23:18 +0000
commit43a123c1ae6613b3efeed291fa552ecd909d3acf (patch)
treefd92518b7024bc74031f78a1cf9e454b65e73665 /src/regexp/syntax
parentInitial commit. (diff)
downloadgolang-1.20-43a123c1ae6613b3efeed291fa552ecd909d3acf.tar.xz
golang-1.20-43a123c1ae6613b3efeed291fa552ecd909d3acf.zip
Adding upstream version 1.20.14.upstream/1.20.14upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/regexp/syntax')
-rw-r--r--src/regexp/syntax/compile.go296
-rw-r--r--src/regexp/syntax/doc.go141
-rwxr-xr-xsrc/regexp/syntax/make_perl_groups.pl113
-rw-r--r--src/regexp/syntax/op_string.go26
-rw-r--r--src/regexp/syntax/parse.go2115
-rw-r--r--src/regexp/syntax/parse_test.go586
-rw-r--r--src/regexp/syntax/perl_groups.go134
-rw-r--r--src/regexp/syntax/prog.go347
-rw-r--r--src/regexp/syntax/prog_test.go129
-rw-r--r--src/regexp/syntax/regexp.go320
-rw-r--r--src/regexp/syntax/simplify.go151
-rw-r--r--src/regexp/syntax/simplify_test.go151
12 files changed, 4509 insertions, 0 deletions
diff --git a/src/regexp/syntax/compile.go b/src/regexp/syntax/compile.go
new file mode 100644
index 0000000..c9f9fa0
--- /dev/null
+++ b/src/regexp/syntax/compile.go
@@ -0,0 +1,296 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syntax
+
+import "unicode"
+
+// A patchList is a list of instruction pointers that need to be filled in (patched).
+// Because the pointers haven't been filled in yet, we can reuse their storage
+// to hold the list. It's kind of sleazy, but works well in practice.
+// See https://swtch.com/~rsc/regexp/regexp1.html for inspiration.
+//
+// These aren't really pointers: they're integers, so we can reinterpret them
+// this way without using package unsafe. A value l.head denotes
+// p.inst[l.head>>1].Out (l.head&1==0) or .Arg (l.head&1==1).
+// head == 0 denotes the empty list, okay because we start every program
+// with a fail instruction, so we'll never want to point at its output link.
+type patchList struct {
+ head, tail uint32
+}
+
+func makePatchList(n uint32) patchList {
+ return patchList{n, n}
+}
+
+func (l patchList) patch(p *Prog, val uint32) {
+ head := l.head
+ for head != 0 {
+ i := &p.Inst[head>>1]
+ if head&1 == 0 {
+ head = i.Out
+ i.Out = val
+ } else {
+ head = i.Arg
+ i.Arg = val
+ }
+ }
+}
+
+func (l1 patchList) append(p *Prog, l2 patchList) patchList {
+ if l1.head == 0 {
+ return l2
+ }
+ if l2.head == 0 {
+ return l1
+ }
+
+ i := &p.Inst[l1.tail>>1]
+ if l1.tail&1 == 0 {
+ i.Out = l2.head
+ } else {
+ i.Arg = l2.head
+ }
+ return patchList{l1.head, l2.tail}
+}
+
+// A frag represents a compiled program fragment.
+type frag struct {
+ i uint32 // index of first instruction
+ out patchList // where to record end instruction
+ nullable bool // whether fragment can match empty string
+}
+
+type compiler struct {
+ p *Prog
+}
+
+// Compile compiles the regexp into a program to be executed.
+// The regexp should have been simplified already (returned from re.Simplify).
+func Compile(re *Regexp) (*Prog, error) {
+ var c compiler
+ c.init()
+ f := c.compile(re)
+ f.out.patch(c.p, c.inst(InstMatch).i)
+ c.p.Start = int(f.i)
+ return c.p, nil
+}
+
+func (c *compiler) init() {
+ c.p = new(Prog)
+ c.p.NumCap = 2 // implicit ( and ) for whole match $0
+ c.inst(InstFail)
+}
+
+var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
+var anyRune = []rune{0, unicode.MaxRune}
+
+func (c *compiler) compile(re *Regexp) frag {
+ switch re.Op {
+ case OpNoMatch:
+ return c.fail()
+ case OpEmptyMatch:
+ return c.nop()
+ case OpLiteral:
+ if len(re.Rune) == 0 {
+ return c.nop()
+ }
+ var f frag
+ for j := range re.Rune {
+ f1 := c.rune(re.Rune[j:j+1], re.Flags)
+ if j == 0 {
+ f = f1
+ } else {
+ f = c.cat(f, f1)
+ }
+ }
+ return f
+ case OpCharClass:
+ return c.rune(re.Rune, re.Flags)
+ case OpAnyCharNotNL:
+ return c.rune(anyRuneNotNL, 0)
+ case OpAnyChar:
+ return c.rune(anyRune, 0)
+ case OpBeginLine:
+ return c.empty(EmptyBeginLine)
+ case OpEndLine:
+ return c.empty(EmptyEndLine)
+ case OpBeginText:
+ return c.empty(EmptyBeginText)
+ case OpEndText:
+ return c.empty(EmptyEndText)
+ case OpWordBoundary:
+ return c.empty(EmptyWordBoundary)
+ case OpNoWordBoundary:
+ return c.empty(EmptyNoWordBoundary)
+ case OpCapture:
+ bra := c.cap(uint32(re.Cap << 1))
+ sub := c.compile(re.Sub[0])
+ ket := c.cap(uint32(re.Cap<<1 | 1))
+ return c.cat(c.cat(bra, sub), ket)
+ case OpStar:
+ return c.star(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
+ case OpPlus:
+ return c.plus(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
+ case OpQuest:
+ return c.quest(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
+ case OpConcat:
+ if len(re.Sub) == 0 {
+ return c.nop()
+ }
+ var f frag
+ for i, sub := range re.Sub {
+ if i == 0 {
+ f = c.compile(sub)
+ } else {
+ f = c.cat(f, c.compile(sub))
+ }
+ }
+ return f
+ case OpAlternate:
+ var f frag
+ for _, sub := range re.Sub {
+ f = c.alt(f, c.compile(sub))
+ }
+ return f
+ }
+ panic("regexp: unhandled case in compile")
+}
+
+func (c *compiler) inst(op InstOp) frag {
+ // TODO: impose length limit
+ f := frag{i: uint32(len(c.p.Inst)), nullable: true}
+ c.p.Inst = append(c.p.Inst, Inst{Op: op})
+ return f
+}
+
+func (c *compiler) nop() frag {
+ f := c.inst(InstNop)
+ f.out = makePatchList(f.i << 1)
+ return f
+}
+
+func (c *compiler) fail() frag {
+ return frag{}
+}
+
+func (c *compiler) cap(arg uint32) frag {
+ f := c.inst(InstCapture)
+ f.out = makePatchList(f.i << 1)
+ c.p.Inst[f.i].Arg = arg
+
+ if c.p.NumCap < int(arg)+1 {
+ c.p.NumCap = int(arg) + 1
+ }
+ return f
+}
+
+func (c *compiler) cat(f1, f2 frag) frag {
+ // concat of failure is failure
+ if f1.i == 0 || f2.i == 0 {
+ return frag{}
+ }
+
+ // TODO: elide nop
+
+ f1.out.patch(c.p, f2.i)
+ return frag{f1.i, f2.out, f1.nullable && f2.nullable}
+}
+
+func (c *compiler) alt(f1, f2 frag) frag {
+ // alt of failure is other
+ if f1.i == 0 {
+ return f2
+ }
+ if f2.i == 0 {
+ return f1
+ }
+
+ f := c.inst(InstAlt)
+ i := &c.p.Inst[f.i]
+ i.Out = f1.i
+ i.Arg = f2.i
+ f.out = f1.out.append(c.p, f2.out)
+ f.nullable = f1.nullable || f2.nullable
+ return f
+}
+
+func (c *compiler) quest(f1 frag, nongreedy bool) frag {
+ f := c.inst(InstAlt)
+ i := &c.p.Inst[f.i]
+ if nongreedy {
+ i.Arg = f1.i
+ f.out = makePatchList(f.i << 1)
+ } else {
+ i.Out = f1.i
+ f.out = makePatchList(f.i<<1 | 1)
+ }
+ f.out = f.out.append(c.p, f1.out)
+ return f
+}
+
+// loop returns the fragment for the main loop of a plus or star.
+// For plus, it can be used after changing the entry to f1.i.
+// For star, it can be used directly when f1 can't match an empty string.
+// (When f1 can match an empty string, f1* must be implemented as (f1+)?
+// to get the priority match order correct.)
+func (c *compiler) loop(f1 frag, nongreedy bool) frag {
+ f := c.inst(InstAlt)
+ i := &c.p.Inst[f.i]
+ if nongreedy {
+ i.Arg = f1.i
+ f.out = makePatchList(f.i << 1)
+ } else {
+ i.Out = f1.i
+ f.out = makePatchList(f.i<<1 | 1)
+ }
+ f1.out.patch(c.p, f.i)
+ return f
+}
+
+func (c *compiler) star(f1 frag, nongreedy bool) frag {
+ if f1.nullable {
+ // Use (f1+)? to get priority match order correct.
+ // See golang.org/issue/46123.
+ return c.quest(c.plus(f1, nongreedy), nongreedy)
+ }
+ return c.loop(f1, nongreedy)
+}
+
+func (c *compiler) plus(f1 frag, nongreedy bool) frag {
+ return frag{f1.i, c.loop(f1, nongreedy).out, f1.nullable}
+}
+
+func (c *compiler) empty(op EmptyOp) frag {
+ f := c.inst(InstEmptyWidth)
+ c.p.Inst[f.i].Arg = uint32(op)
+ f.out = makePatchList(f.i << 1)
+ return f
+}
+
+func (c *compiler) rune(r []rune, flags Flags) frag {
+ f := c.inst(InstRune)
+ f.nullable = false
+ i := &c.p.Inst[f.i]
+ i.Rune = r
+ flags &= FoldCase // only relevant flag is FoldCase
+ if len(r) != 1 || unicode.SimpleFold(r[0]) == r[0] {
+ // and sometimes not even that
+ flags &^= FoldCase
+ }
+ i.Arg = uint32(flags)
+ f.out = makePatchList(f.i << 1)
+
+ // Special cases for exec machine.
+ switch {
+ case flags&FoldCase == 0 && (len(r) == 1 || len(r) == 2 && r[0] == r[1]):
+ i.Op = InstRune1
+ case len(r) == 2 && r[0] == 0 && r[1] == unicode.MaxRune:
+ i.Op = InstRuneAny
+ case len(r) == 4 && r[0] == 0 && r[1] == '\n'-1 && r[2] == '\n'+1 && r[3] == unicode.MaxRune:
+ i.Op = InstRuneAnyNotNL
+ }
+
+ return f
+}
diff --git a/src/regexp/syntax/doc.go b/src/regexp/syntax/doc.go
new file mode 100644
index 0000000..f6a4b43
--- /dev/null
+++ b/src/regexp/syntax/doc.go
@@ -0,0 +1,141 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// DO NOT EDIT. This file is generated by mksyntaxgo from the RE2 distribution.
+
+/*
+Package syntax parses regular expressions into parse trees and compiles
+parse trees into programs. Most clients of regular expressions will use the
+facilities of package regexp (such as Compile and Match) instead of this package.
+
+# Syntax
+
+The regular expression syntax understood by this package when parsing with the Perl flag is as follows.
+Parts of the syntax can be disabled by passing alternate flags to Parse.
+
+Single characters:
+
+ . any character, possibly including newline (flag s=true)
+ [xyz] character class
+ [^xyz] negated character class
+ \d Perl character class
+ \D negated Perl character class
+ [[:alpha:]] ASCII character class
+ [[:^alpha:]] negated ASCII character class
+ \pN Unicode character class (one-letter name)
+ \p{Greek} Unicode character class
+ \PN negated Unicode character class (one-letter name)
+ \P{Greek} negated Unicode character class
+
+Composites:
+
+ xy x followed by y
+ x|y x or y (prefer x)
+
+Repetitions:
+
+ x* zero or more x, prefer more
+ x+ one or more x, prefer more
+ x? zero or one x, prefer one
+ x{n,m} n or n+1 or ... or m x, prefer more
+ x{n,} n or more x, prefer more
+ x{n} exactly n x
+ x*? zero or more x, prefer fewer
+ x+? one or more x, prefer fewer
+ x?? zero or one x, prefer zero
+ x{n,m}? n or n+1 or ... or m x, prefer fewer
+ x{n,}? n or more x, prefer fewer
+ x{n}? exactly n x
+
+Implementation restriction: The counting forms x{n,m}, x{n,}, and x{n}
+reject forms that create a minimum or maximum repetition count above 1000.
+Unlimited repetitions are not subject to this restriction.
+
+Grouping:
+
+ (re) numbered capturing group (submatch)
+ (?P<name>re) named & numbered capturing group (submatch)
+ (?:re) non-capturing group
+ (?flags) set flags within current group; non-capturing
+ (?flags:re) set flags during re; non-capturing
+
+ Flag syntax is xyz (set) or -xyz (clear) or xy-z (set xy, clear z). The flags are:
+
+ i case-insensitive (default false)
+ m multi-line mode: ^ and $ match begin/end line in addition to begin/end text (default false)
+ s let . match \n (default false)
+ U ungreedy: swap meaning of x* and x*?, x+ and x+?, etc (default false)
+
+Empty strings:
+
+ ^ at beginning of text or line (flag m=true)
+ $ at end of text (like \z not \Z) or line (flag m=true)
+ \A at beginning of text
+ \b at ASCII word boundary (\w on one side and \W, \A, or \z on the other)
+ \B not at ASCII word boundary
+ \z at end of text
+
+Escape sequences:
+
+ \a bell (== \007)
+ \f form feed (== \014)
+ \t horizontal tab (== \011)
+ \n newline (== \012)
+ \r carriage return (== \015)
+ \v vertical tab character (== \013)
+ \* literal *, for any punctuation character *
+ \123 octal character code (up to three digits)
+ \x7F hex character code (exactly two digits)
+ \x{10FFFF} hex character code
+ \Q...\E literal text ... even if ... has punctuation
+
+Character class elements:
+
+ x single character
+ A-Z character range (inclusive)
+ \d Perl character class
+ [:foo:] ASCII character class foo
+ \p{Foo} Unicode character class Foo
+ \pF Unicode character class F (one-letter name)
+
+Named character classes as character class elements:
+
+ [\d] digits (== \d)
+ [^\d] not digits (== \D)
+ [\D] not digits (== \D)
+ [^\D] not not digits (== \d)
+ [[:name:]] named ASCII class inside character class (== [:name:])
+ [^[:name:]] named ASCII class inside negated character class (== [:^name:])
+ [\p{Name}] named Unicode property inside character class (== \p{Name})
+ [^\p{Name}] named Unicode property inside negated character class (== \P{Name})
+
+Perl character classes (all ASCII-only):
+
+ \d digits (== [0-9])
+ \D not digits (== [^0-9])
+ \s whitespace (== [\t\n\f\r ])
+ \S not whitespace (== [^\t\n\f\r ])
+ \w word characters (== [0-9A-Za-z_])
+ \W not word characters (== [^0-9A-Za-z_])
+
+ASCII character classes:
+
+ [[:alnum:]] alphanumeric (== [0-9A-Za-z])
+ [[:alpha:]] alphabetic (== [A-Za-z])
+ [[:ascii:]] ASCII (== [\x00-\x7F])
+ [[:blank:]] blank (== [\t ])
+ [[:cntrl:]] control (== [\x00-\x1F\x7F])
+ [[:digit:]] digits (== [0-9])
+ [[:graph:]] graphical (== [!-~] == [A-Za-z0-9!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~])
+ [[:lower:]] lower case (== [a-z])
+ [[:print:]] printable (== [ -~] == [ [:graph:]])
+ [[:punct:]] punctuation (== [!-/:-@[-`{-~])
+ [[:space:]] whitespace (== [\t\n\v\f\r ])
+ [[:upper:]] upper case (== [A-Z])
+ [[:word:]] word characters (== [0-9A-Za-z_])
+ [[:xdigit:]] hex digit (== [0-9A-Fa-f])
+
+Unicode character classes are those in unicode.Categories and unicode.Scripts.
+*/
+package syntax
diff --git a/src/regexp/syntax/make_perl_groups.pl b/src/regexp/syntax/make_perl_groups.pl
new file mode 100755
index 0000000..80a2c9a
--- /dev/null
+++ b/src/regexp/syntax/make_perl_groups.pl
@@ -0,0 +1,113 @@
+#!/usr/bin/perl
+# Copyright 2008 The Go Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Modified version of RE2's make_perl_groups.pl.
+
+# Generate table entries giving character ranges
+# for POSIX/Perl character classes. Rather than
+# figure out what the definition is, it is easier to ask
+# Perl about each letter from 0-128 and write down
+# its answer.
+
+@posixclasses = (
+ "[:alnum:]",
+ "[:alpha:]",
+ "[:ascii:]",
+ "[:blank:]",
+ "[:cntrl:]",
+ "[:digit:]",
+ "[:graph:]",
+ "[:lower:]",
+ "[:print:]",
+ "[:punct:]",
+ "[:space:]",
+ "[:upper:]",
+ "[:word:]",
+ "[:xdigit:]",
+);
+
+@perlclasses = (
+ "\\d",
+ "\\s",
+ "\\w",
+);
+
+%overrides = (
+ # Prior to Perl 5.18, \s did not match vertical tab.
+ # RE2 preserves that original behaviour.
+ "\\s:11" => 0,
+);
+
+sub ComputeClass($) {
+ my @ranges;
+ my ($class) = @_;
+ my $regexp = "[$class]";
+ my $start = -1;
+ for (my $i=0; $i<=129; $i++) {
+ if ($i == 129) { $i = 256; }
+ if ($i <= 128 && ($overrides{"$class:$i"} // chr($i) =~ $regexp)) {
+ if ($start < 0) {
+ $start = $i;
+ }
+ } else {
+ if ($start >= 0) {
+ push @ranges, [$start, $i-1];
+ }
+ $start = -1;
+ }
+ }
+ return @ranges;
+}
+
+sub PrintClass($$@) {
+ my ($cname, $name, @ranges) = @_;
+ print "var code$cname = []rune{ /* $name */\n";
+ for (my $i=0; $i<@ranges; $i++) {
+ my @a = @{$ranges[$i]};
+ printf "\t0x%x, 0x%x,\n", $a[0], $a[1];
+ }
+ print "}\n\n";
+ my $n = @ranges;
+ $negname = $name;
+ if ($negname =~ /:/) {
+ $negname =~ s/:/:^/;
+ } else {
+ $negname =~ y/a-z/A-Z/;
+ }
+ return "\t`$name`: {+1, code$cname},\n" .
+ "\t`$negname`: {-1, code$cname},\n";
+}
+
+my $gen = 0;
+
+sub PrintClasses($@) {
+ my ($cname, @classes) = @_;
+ my @entries;
+ foreach my $cl (@classes) {
+ my @ranges = ComputeClass($cl);
+ push @entries, PrintClass(++$gen, $cl, @ranges);
+ }
+ print "var ${cname}Group = map[string]charGroup{\n";
+ foreach my $e (@entries) {
+ print $e;
+ }
+ print "}\n";
+ my $count = @entries;
+}
+
+print <<EOF;
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
+// make_perl_groups.pl >perl_groups.go
+
+package syntax
+
+EOF
+
+PrintClasses("perl", @perlclasses);
+PrintClasses("posix", @posixclasses);
diff --git a/src/regexp/syntax/op_string.go b/src/regexp/syntax/op_string.go
new file mode 100644
index 0000000..3952b2b
--- /dev/null
+++ b/src/regexp/syntax/op_string.go
@@ -0,0 +1,26 @@
+// Code generated by "stringer -type Op -trimprefix Op"; DO NOT EDIT.
+
+package syntax
+
+import "strconv"
+
+const (
+ _Op_name_0 = "NoMatchEmptyMatchLiteralCharClassAnyCharNotNLAnyCharBeginLineEndLineBeginTextEndTextWordBoundaryNoWordBoundaryCaptureStarPlusQuestRepeatConcatAlternate"
+ _Op_name_1 = "opPseudo"
+)
+
+var (
+ _Op_index_0 = [...]uint8{0, 7, 17, 24, 33, 45, 52, 61, 68, 77, 84, 96, 110, 117, 121, 125, 130, 136, 142, 151}
+)
+
+func (i Op) String() string {
+ switch {
+ case 1 <= i && i <= 19:
+ i -= 1
+ return _Op_name_0[_Op_index_0[i]:_Op_index_0[i+1]]
+ case i == 128:
+ return _Op_name_1
+ default:
+ return "Op(" + strconv.FormatInt(int64(i), 10) + ")"
+ }
+}
diff --git a/src/regexp/syntax/parse.go b/src/regexp/syntax/parse.go
new file mode 100644
index 0000000..accee9a
--- /dev/null
+++ b/src/regexp/syntax/parse.go
@@ -0,0 +1,2115 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syntax
+
+import (
+ "sort"
+ "strings"
+ "unicode"
+ "unicode/utf8"
+)
+
+// An Error describes a failure to parse a regular expression
+// and gives the offending expression.
+type Error struct {
+ Code ErrorCode
+ Expr string
+}
+
+func (e *Error) Error() string {
+ return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`"
+}
+
+// An ErrorCode describes a failure to parse a regular expression.
+type ErrorCode string
+
+const (
+ // Unexpected error
+ ErrInternalError ErrorCode = "regexp/syntax: internal error"
+
+ // Parse errors
+ ErrInvalidCharClass ErrorCode = "invalid character class"
+ ErrInvalidCharRange ErrorCode = "invalid character class range"
+ ErrInvalidEscape ErrorCode = "invalid escape sequence"
+ ErrInvalidNamedCapture ErrorCode = "invalid named capture"
+ ErrInvalidPerlOp ErrorCode = "invalid or unsupported Perl syntax"
+ ErrInvalidRepeatOp ErrorCode = "invalid nested repetition operator"
+ ErrInvalidRepeatSize ErrorCode = "invalid repeat count"
+ ErrInvalidUTF8 ErrorCode = "invalid UTF-8"
+ ErrMissingBracket ErrorCode = "missing closing ]"
+ ErrMissingParen ErrorCode = "missing closing )"
+ ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator"
+ ErrTrailingBackslash ErrorCode = "trailing backslash at end of expression"
+ ErrUnexpectedParen ErrorCode = "unexpected )"
+ ErrNestingDepth ErrorCode = "expression nests too deeply"
+ ErrLarge ErrorCode = "expression too large"
+)
+
+func (e ErrorCode) String() string {
+ return string(e)
+}
+
+// Flags control the behavior of the parser and record information about regexp context.
+type Flags uint16
+
+const (
+ FoldCase Flags = 1 << iota // case-insensitive match
+ Literal // treat pattern as literal string
+ ClassNL // allow character classes like [^a-z] and [[:space:]] to match newline
+ DotNL // allow . to match newline
+ OneLine // treat ^ and $ as only matching at beginning and end of text
+ NonGreedy // make repetition operators default to non-greedy
+ PerlX // allow Perl extensions
+ UnicodeGroups // allow \p{Han}, \P{Han} for Unicode group and negation
+ WasDollar // regexp OpEndText was $, not \z
+ Simple // regexp contains no counted repetition
+
+ MatchNL = ClassNL | DotNL
+
+ Perl = ClassNL | OneLine | PerlX | UnicodeGroups // as close to Perl as possible
+ POSIX Flags = 0 // POSIX syntax
+)
+
+// Pseudo-ops for parsing stack.
+const (
+ opLeftParen = opPseudo + iota
+ opVerticalBar
+)
+
+// maxHeight is the maximum height of a regexp parse tree.
+// It is somewhat arbitrarily chosen, but the idea is to be large enough
+// that no one will actually hit in real use but at the same time small enough
+// that recursion on the Regexp tree will not hit the 1GB Go stack limit.
+// The maximum amount of stack for a single recursive frame is probably
+// closer to 1kB, so this could potentially be raised, but it seems unlikely
+// that people have regexps nested even this deeply.
+// We ran a test on Google's C++ code base and turned up only
+// a single use case with depth > 100; it had depth 128.
+// Using depth 1000 should be plenty of margin.
+// As an optimization, we don't even bother calculating heights
+// until we've allocated at least maxHeight Regexp structures.
+const maxHeight = 1000
+
+// maxSize is the maximum size of a compiled regexp in Insts.
+// It too is somewhat arbitrarily chosen, but the idea is to be large enough
+// to allow significant regexps while at the same time small enough that
+// the compiled form will not take up too much memory.
+// 128 MB is enough for a 3.3 million Inst structures, which roughly
+// corresponds to a 3.3 MB regexp.
+const (
+ maxSize = 128 << 20 / instSize
+ instSize = 5 * 8 // byte, 2 uint32, slice is 5 64-bit words
+)
+
+// maxRunes is the maximum number of runes allowed in a regexp tree
+// counting the runes in all the nodes.
+// Ignoring character classes p.numRunes is always less than the length of the regexp.
+// Character classes can make it much larger: each \pL adds 1292 runes.
+// 128 MB is enough for 32M runes, which is over 26k \pL instances.
+// Note that repetitions do not make copies of the rune slices,
+// so \pL{1000} is only one rune slice, not 1000.
+// We could keep a cache of character classes we've seen,
+// so that all the \pL we see use the same rune list,
+// but that doesn't remove the problem entirely:
+// consider something like [\pL01234][\pL01235][\pL01236]...[\pL^&*()].
+// And because the Rune slice is exposed directly in the Regexp,
+// there is not an opportunity to change the representation to allow
+// partial sharing between different character classes.
+// So the limit is the best we can do.
+const (
+ maxRunes = 128 << 20 / runeSize
+ runeSize = 4 // rune is int32
+)
+
+type parser struct {
+ flags Flags // parse mode flags
+ stack []*Regexp // stack of parsed expressions
+ free *Regexp
+ numCap int // number of capturing groups seen
+ wholeRegexp string
+ tmpClass []rune // temporary char class work space
+ numRegexp int // number of regexps allocated
+ numRunes int // number of runes in char classes
+ repeats int64 // product of all repetitions seen
+ height map[*Regexp]int // regexp height, for height limit check
+ size map[*Regexp]int64 // regexp compiled size, for size limit check
+}
+
+func (p *parser) newRegexp(op Op) *Regexp {
+ re := p.free
+ if re != nil {
+ p.free = re.Sub0[0]
+ *re = Regexp{}
+ } else {
+ re = new(Regexp)
+ p.numRegexp++
+ }
+ re.Op = op
+ return re
+}
+
+func (p *parser) reuse(re *Regexp) {
+ if p.height != nil {
+ delete(p.height, re)
+ }
+ re.Sub0[0] = p.free
+ p.free = re
+}
+
+func (p *parser) checkLimits(re *Regexp) {
+ if p.numRunes > maxRunes {
+ panic(ErrLarge)
+ }
+ p.checkSize(re)
+ p.checkHeight(re)
+}
+
+func (p *parser) checkSize(re *Regexp) {
+ if p.size == nil {
+ // We haven't started tracking size yet.
+ // Do a relatively cheap check to see if we need to start.
+ // Maintain the product of all the repeats we've seen
+ // and don't track if the total number of regexp nodes
+ // we've seen times the repeat product is in budget.
+ if p.repeats == 0 {
+ p.repeats = 1
+ }
+ if re.Op == OpRepeat {
+ n := re.Max
+ if n == -1 {
+ n = re.Min
+ }
+ if n <= 0 {
+ n = 1
+ }
+ if int64(n) > maxSize/p.repeats {
+ p.repeats = maxSize
+ } else {
+ p.repeats *= int64(n)
+ }
+ }
+ if int64(p.numRegexp) < maxSize/p.repeats {
+ return
+ }
+
+ // We need to start tracking size.
+ // Make the map and belatedly populate it
+ // with info about everything we've constructed so far.
+ p.size = make(map[*Regexp]int64)
+ for _, re := range p.stack {
+ p.checkSize(re)
+ }
+ }
+
+ if p.calcSize(re, true) > maxSize {
+ panic(ErrLarge)
+ }
+}
+
+func (p *parser) calcSize(re *Regexp, force bool) int64 {
+ if !force {
+ if size, ok := p.size[re]; ok {
+ return size
+ }
+ }
+
+ var size int64
+ switch re.Op {
+ case OpLiteral:
+ size = int64(len(re.Rune))
+ case OpCapture, OpStar:
+ // star can be 1+ or 2+; assume 2 pessimistically
+ size = 2 + p.calcSize(re.Sub[0], false)
+ case OpPlus, OpQuest:
+ size = 1 + p.calcSize(re.Sub[0], false)
+ case OpConcat:
+ for _, sub := range re.Sub {
+ size += p.calcSize(sub, false)
+ }
+ case OpAlternate:
+ for _, sub := range re.Sub {
+ size += p.calcSize(sub, false)
+ }
+ if len(re.Sub) > 1 {
+ size += int64(len(re.Sub)) - 1
+ }
+ case OpRepeat:
+ sub := p.calcSize(re.Sub[0], false)
+ if re.Max == -1 {
+ if re.Min == 0 {
+ size = 2 + sub // x*
+ } else {
+ size = 1 + int64(re.Min)*sub // xxx+
+ }
+ break
+ }
+ // x{2,5} = xx(x(x(x)?)?)?
+ size = int64(re.Max)*sub + int64(re.Max-re.Min)
+ }
+
+ if size < 1 {
+ size = 1
+ }
+ p.size[re] = size
+ return size
+}
+
+func (p *parser) checkHeight(re *Regexp) {
+ if p.numRegexp < maxHeight {
+ return
+ }
+ if p.height == nil {
+ p.height = make(map[*Regexp]int)
+ for _, re := range p.stack {
+ p.checkHeight(re)
+ }
+ }
+ if p.calcHeight(re, true) > maxHeight {
+ panic(ErrNestingDepth)
+ }
+}
+
+func (p *parser) calcHeight(re *Regexp, force bool) int {
+ if !force {
+ if h, ok := p.height[re]; ok {
+ return h
+ }
+ }
+ h := 1
+ for _, sub := range re.Sub {
+ hsub := p.calcHeight(sub, false)
+ if h < 1+hsub {
+ h = 1 + hsub
+ }
+ }
+ p.height[re] = h
+ return h
+}
+
+// Parse stack manipulation.
+
+// push pushes the regexp re onto the parse stack and returns the regexp.
+func (p *parser) push(re *Regexp) *Regexp {
+ p.numRunes += len(re.Rune)
+ if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] {
+ // Single rune.
+ if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) {
+ return nil
+ }
+ re.Op = OpLiteral
+ re.Rune = re.Rune[:1]
+ re.Flags = p.flags &^ FoldCase
+ } else if re.Op == OpCharClass && len(re.Rune) == 4 &&
+ re.Rune[0] == re.Rune[1] && re.Rune[2] == re.Rune[3] &&
+ unicode.SimpleFold(re.Rune[0]) == re.Rune[2] &&
+ unicode.SimpleFold(re.Rune[2]) == re.Rune[0] ||
+ re.Op == OpCharClass && len(re.Rune) == 2 &&
+ re.Rune[0]+1 == re.Rune[1] &&
+ unicode.SimpleFold(re.Rune[0]) == re.Rune[1] &&
+ unicode.SimpleFold(re.Rune[1]) == re.Rune[0] {
+ // Case-insensitive rune like [Aa] or [Δδ].
+ if p.maybeConcat(re.Rune[0], p.flags|FoldCase) {
+ return nil
+ }
+
+ // Rewrite as (case-insensitive) literal.
+ re.Op = OpLiteral
+ re.Rune = re.Rune[:1]
+ re.Flags = p.flags | FoldCase
+ } else {
+ // Incremental concatenation.
+ p.maybeConcat(-1, 0)
+ }
+
+ p.stack = append(p.stack, re)
+ p.checkLimits(re)
+ return re
+}
+
+// maybeConcat implements incremental concatenation
+// of literal runes into string nodes. The parser calls this
+// before each push, so only the top fragment of the stack
+// might need processing. Since this is called before a push,
+// the topmost literal is no longer subject to operators like *
+// (Otherwise ab* would turn into (ab)*.)
+// If r >= 0 and there's a node left over, maybeConcat uses it
+// to push r with the given flags.
+// maybeConcat reports whether r was pushed.
+func (p *parser) maybeConcat(r rune, flags Flags) bool {
+ n := len(p.stack)
+ if n < 2 {
+ return false
+ }
+
+ re1 := p.stack[n-1]
+ re2 := p.stack[n-2]
+ if re1.Op != OpLiteral || re2.Op != OpLiteral || re1.Flags&FoldCase != re2.Flags&FoldCase {
+ return false
+ }
+
+ // Push re1 into re2.
+ re2.Rune = append(re2.Rune, re1.Rune...)
+
+ // Reuse re1 if possible.
+ if r >= 0 {
+ re1.Rune = re1.Rune0[:1]
+ re1.Rune[0] = r
+ re1.Flags = flags
+ return true
+ }
+
+ p.stack = p.stack[:n-1]
+ p.reuse(re1)
+ return false // did not push r
+}
+
+// literal pushes a literal regexp for the rune r on the stack.
+func (p *parser) literal(r rune) {
+ re := p.newRegexp(OpLiteral)
+ re.Flags = p.flags
+ if p.flags&FoldCase != 0 {
+ r = minFoldRune(r)
+ }
+ re.Rune0[0] = r
+ re.Rune = re.Rune0[:1]
+ p.push(re)
+}
+
+// minFoldRune returns the minimum rune fold-equivalent to r.
+func minFoldRune(r rune) rune {
+ if r < minFold || r > maxFold {
+ return r
+ }
+ min := r
+ r0 := r
+ for r = unicode.SimpleFold(r); r != r0; r = unicode.SimpleFold(r) {
+ if min > r {
+ min = r
+ }
+ }
+ return min
+}
+
+// op pushes a regexp with the given op onto the stack
+// and returns that regexp.
+func (p *parser) op(op Op) *Regexp {
+ re := p.newRegexp(op)
+ re.Flags = p.flags
+ return p.push(re)
+}
+
+// repeat replaces the top stack element with itself repeated according to op, min, max.
+// before is the regexp suffix starting at the repetition operator.
+// after is the regexp suffix following after the repetition operator.
+// repeat returns an updated 'after' and an error, if any.
+func (p *parser) repeat(op Op, min, max int, before, after, lastRepeat string) (string, error) {
+ flags := p.flags
+ if p.flags&PerlX != 0 {
+ if len(after) > 0 && after[0] == '?' {
+ after = after[1:]
+ flags ^= NonGreedy
+ }
+ if lastRepeat != "" {
+ // In Perl it is not allowed to stack repetition operators:
+ // a** is a syntax error, not a doubled star, and a++ means
+ // something else entirely, which we don't support!
+ return "", &Error{ErrInvalidRepeatOp, lastRepeat[:len(lastRepeat)-len(after)]}
+ }
+ }
+ n := len(p.stack)
+ if n == 0 {
+ return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}
+ }
+ sub := p.stack[n-1]
+ if sub.Op >= opPseudo {
+ return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}
+ }
+
+ re := p.newRegexp(op)
+ re.Min = min
+ re.Max = max
+ re.Flags = flags
+ re.Sub = re.Sub0[:1]
+ re.Sub[0] = sub
+ p.stack[n-1] = re
+ p.checkLimits(re)
+
+ if op == OpRepeat && (min >= 2 || max >= 2) && !repeatIsValid(re, 1000) {
+ return "", &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]}
+ }
+
+ return after, nil
+}
+
+// repeatIsValid reports whether the repetition re is valid.
+// Valid means that the combination of the top-level repetition
+// and any inner repetitions does not exceed n copies of the
+// innermost thing.
+// This function rewalks the regexp tree and is called for every repetition,
+// so we have to worry about inducing quadratic behavior in the parser.
+// We avoid this by only calling repeatIsValid when min or max >= 2.
+// In that case the depth of any >= 2 nesting can only get to 9 without
+// triggering a parse error, so each subtree can only be rewalked 9 times.
+func repeatIsValid(re *Regexp, n int) bool {
+ if re.Op == OpRepeat {
+ m := re.Max
+ if m == 0 {
+ return true
+ }
+ if m < 0 {
+ m = re.Min
+ }
+ if m > n {
+ return false
+ }
+ if m > 0 {
+ n /= m
+ }
+ }
+ for _, sub := range re.Sub {
+ if !repeatIsValid(sub, n) {
+ return false
+ }
+ }
+ return true
+}
+
+// concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation.
+func (p *parser) concat() *Regexp {
+ p.maybeConcat(-1, 0)
+
+ // Scan down to find pseudo-operator | or (.
+ i := len(p.stack)
+ for i > 0 && p.stack[i-1].Op < opPseudo {
+ i--
+ }
+ subs := p.stack[i:]
+ p.stack = p.stack[:i]
+
+ // Empty concatenation is special case.
+ if len(subs) == 0 {
+ return p.push(p.newRegexp(OpEmptyMatch))
+ }
+
+ return p.push(p.collapse(subs, OpConcat))
+}
+
+// alternate replaces the top of the stack (above the topmost '(') with its alternation.
+func (p *parser) alternate() *Regexp {
+ // Scan down to find pseudo-operator (.
+ // There are no | above (.
+ i := len(p.stack)
+ for i > 0 && p.stack[i-1].Op < opPseudo {
+ i--
+ }
+ subs := p.stack[i:]
+ p.stack = p.stack[:i]
+
+ // Make sure top class is clean.
+ // All the others already are (see swapVerticalBar).
+ if len(subs) > 0 {
+ cleanAlt(subs[len(subs)-1])
+ }
+
+ // Empty alternate is special case
+ // (shouldn't happen but easy to handle).
+ if len(subs) == 0 {
+ return p.push(p.newRegexp(OpNoMatch))
+ }
+
+ return p.push(p.collapse(subs, OpAlternate))
+}
+
+// cleanAlt cleans re for eventual inclusion in an alternation.
+func cleanAlt(re *Regexp) {
+ switch re.Op {
+ case OpCharClass:
+ re.Rune = cleanClass(&re.Rune)
+ if len(re.Rune) == 2 && re.Rune[0] == 0 && re.Rune[1] == unicode.MaxRune {
+ re.Rune = nil
+ re.Op = OpAnyChar
+ return
+ }
+ if len(re.Rune) == 4 && re.Rune[0] == 0 && re.Rune[1] == '\n'-1 && re.Rune[2] == '\n'+1 && re.Rune[3] == unicode.MaxRune {
+ re.Rune = nil
+ re.Op = OpAnyCharNotNL
+ return
+ }
+ if cap(re.Rune)-len(re.Rune) > 100 {
+ // re.Rune will not grow any more.
+ // Make a copy or inline to reclaim storage.
+ re.Rune = append(re.Rune0[:0], re.Rune...)
+ }
+ }
+}
+
+// collapse returns the result of applying op to sub.
+// If sub contains op nodes, they all get hoisted up
+// so that there is never a concat of a concat or an
+// alternate of an alternate.
+func (p *parser) collapse(subs []*Regexp, op Op) *Regexp {
+ if len(subs) == 1 {
+ return subs[0]
+ }
+ re := p.newRegexp(op)
+ re.Sub = re.Sub0[:0]
+ for _, sub := range subs {
+ if sub.Op == op {
+ re.Sub = append(re.Sub, sub.Sub...)
+ p.reuse(sub)
+ } else {
+ re.Sub = append(re.Sub, sub)
+ }
+ }
+ if op == OpAlternate {
+ re.Sub = p.factor(re.Sub)
+ if len(re.Sub) == 1 {
+ old := re
+ re = re.Sub[0]
+ p.reuse(old)
+ }
+ }
+ return re
+}
+
+// factor factors common prefixes from the alternation list sub.
+// It returns a replacement list that reuses the same storage and
+// frees (passes to p.reuse) any removed *Regexps.
+//
+// For example,
+//
+// ABC|ABD|AEF|BCX|BCY
+//
+// simplifies by literal prefix extraction to
+//
+// A(B(C|D)|EF)|BC(X|Y)
+//
+// which simplifies by character class introduction to
+//
+// A(B[CD]|EF)|BC[XY]
+func (p *parser) factor(sub []*Regexp) []*Regexp {
+ if len(sub) < 2 {
+ return sub
+ }
+
+ // Round 1: Factor out common literal prefixes.
+ var str []rune
+ var strflags Flags
+ start := 0
+ out := sub[:0]
+ for i := 0; i <= len(sub); i++ {
+ // Invariant: the Regexps that were in sub[0:start] have been
+ // used or marked for reuse, and the slice space has been reused
+ // for out (len(out) <= start).
+ //
+ // Invariant: sub[start:i] consists of regexps that all begin
+ // with str as modified by strflags.
+ var istr []rune
+ var iflags Flags
+ if i < len(sub) {
+ istr, iflags = p.leadingString(sub[i])
+ if iflags == strflags {
+ same := 0
+ for same < len(str) && same < len(istr) && str[same] == istr[same] {
+ same++
+ }
+ if same > 0 {
+ // Matches at least one rune in current range.
+ // Keep going around.
+ str = str[:same]
+ continue
+ }
+ }
+ }
+
+ // Found end of a run with common leading literal string:
+ // sub[start:i] all begin with str[0:len(str)], but sub[i]
+ // does not even begin with str[0].
+ //
+ // Factor out common string and append factored expression to out.
+ if i == start {
+ // Nothing to do - run of length 0.
+ } else if i == start+1 {
+ // Just one: don't bother factoring.
+ out = append(out, sub[start])
+ } else {
+ // Construct factored form: prefix(suffix1|suffix2|...)
+ prefix := p.newRegexp(OpLiteral)
+ prefix.Flags = strflags
+ prefix.Rune = append(prefix.Rune[:0], str...)
+
+ for j := start; j < i; j++ {
+ sub[j] = p.removeLeadingString(sub[j], len(str))
+ p.checkLimits(sub[j])
+ }
+ suffix := p.collapse(sub[start:i], OpAlternate) // recurse
+
+ re := p.newRegexp(OpConcat)
+ re.Sub = append(re.Sub[:0], prefix, suffix)
+ out = append(out, re)
+ }
+
+ // Prepare for next iteration.
+ start = i
+ str = istr
+ strflags = iflags
+ }
+ sub = out
+
+ // Round 2: Factor out common simple prefixes,
+ // just the first piece of each concatenation.
+ // This will be good enough a lot of the time.
+ //
+ // Complex subexpressions (e.g. involving quantifiers)
+ // are not safe to factor because that collapses their
+ // distinct paths through the automaton, which affects
+ // correctness in some cases.
+ start = 0
+ out = sub[:0]
+ var first *Regexp
+ for i := 0; i <= len(sub); i++ {
+ // Invariant: the Regexps that were in sub[0:start] have been
+ // used or marked for reuse, and the slice space has been reused
+ // for out (len(out) <= start).
+ //
+ // Invariant: sub[start:i] consists of regexps that all begin with ifirst.
+ var ifirst *Regexp
+ if i < len(sub) {
+ ifirst = p.leadingRegexp(sub[i])
+ if first != nil && first.Equal(ifirst) &&
+ // first must be a character class OR a fixed repeat of a character class.
+ (isCharClass(first) || (first.Op == OpRepeat && first.Min == first.Max && isCharClass(first.Sub[0]))) {
+ continue
+ }
+ }
+
+ // Found end of a run with common leading regexp:
+ // sub[start:i] all begin with first but sub[i] does not.
+ //
+ // Factor out common regexp and append factored expression to out.
+ if i == start {
+ // Nothing to do - run of length 0.
+ } else if i == start+1 {
+ // Just one: don't bother factoring.
+ out = append(out, sub[start])
+ } else {
+ // Construct factored form: prefix(suffix1|suffix2|...)
+ prefix := first
+ for j := start; j < i; j++ {
+ reuse := j != start // prefix came from sub[start]
+ sub[j] = p.removeLeadingRegexp(sub[j], reuse)
+ p.checkLimits(sub[j])
+ }
+ suffix := p.collapse(sub[start:i], OpAlternate) // recurse
+
+ re := p.newRegexp(OpConcat)
+ re.Sub = append(re.Sub[:0], prefix, suffix)
+ out = append(out, re)
+ }
+
+ // Prepare for next iteration.
+ start = i
+ first = ifirst
+ }
+ sub = out
+
+ // Round 3: Collapse runs of single literals into character classes.
+ start = 0
+ out = sub[:0]
+ for i := 0; i <= len(sub); i++ {
+ // Invariant: the Regexps that were in sub[0:start] have been
+ // used or marked for reuse, and the slice space has been reused
+ // for out (len(out) <= start).
+ //
+ // Invariant: sub[start:i] consists of regexps that are either
+ // literal runes or character classes.
+ if i < len(sub) && isCharClass(sub[i]) {
+ continue
+ }
+
+ // sub[i] is not a char or char class;
+ // emit char class for sub[start:i]...
+ if i == start {
+ // Nothing to do - run of length 0.
+ } else if i == start+1 {
+ out = append(out, sub[start])
+ } else {
+ // Make new char class.
+ // Start with most complex regexp in sub[start].
+ max := start
+ for j := start + 1; j < i; j++ {
+ if sub[max].Op < sub[j].Op || sub[max].Op == sub[j].Op && len(sub[max].Rune) < len(sub[j].Rune) {
+ max = j
+ }
+ }
+ sub[start], sub[max] = sub[max], sub[start]
+
+ for j := start + 1; j < i; j++ {
+ mergeCharClass(sub[start], sub[j])
+ p.reuse(sub[j])
+ }
+ cleanAlt(sub[start])
+ out = append(out, sub[start])
+ }
+
+ // ... and then emit sub[i].
+ if i < len(sub) {
+ out = append(out, sub[i])
+ }
+ start = i + 1
+ }
+ sub = out
+
+ // Round 4: Collapse runs of empty matches into a single empty match.
+ start = 0
+ out = sub[:0]
+ for i := range sub {
+ if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch {
+ continue
+ }
+ out = append(out, sub[i])
+ }
+ sub = out
+
+ return sub
+}
+
+// leadingString returns the leading literal string that re begins with.
+// The string refers to storage in re or its children.
+func (p *parser) leadingString(re *Regexp) ([]rune, Flags) {
+ if re.Op == OpConcat && len(re.Sub) > 0 {
+ re = re.Sub[0]
+ }
+ if re.Op != OpLiteral {
+ return nil, 0
+ }
+ return re.Rune, re.Flags & FoldCase
+}
+
+// removeLeadingString removes the first n leading runes
+// from the beginning of re. It returns the replacement for re.
+func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp {
+ if re.Op == OpConcat && len(re.Sub) > 0 {
+ // Removing a leading string in a concatenation
+ // might simplify the concatenation.
+ sub := re.Sub[0]
+ sub = p.removeLeadingString(sub, n)
+ re.Sub[0] = sub
+ if sub.Op == OpEmptyMatch {
+ p.reuse(sub)
+ switch len(re.Sub) {
+ case 0, 1:
+ // Impossible but handle.
+ re.Op = OpEmptyMatch
+ re.Sub = nil
+ case 2:
+ old := re
+ re = re.Sub[1]
+ p.reuse(old)
+ default:
+ copy(re.Sub, re.Sub[1:])
+ re.Sub = re.Sub[:len(re.Sub)-1]
+ }
+ }
+ return re
+ }
+
+ if re.Op == OpLiteral {
+ re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])]
+ if len(re.Rune) == 0 {
+ re.Op = OpEmptyMatch
+ }
+ }
+ return re
+}
+
+// leadingRegexp returns the leading regexp that re begins with.
+// The regexp refers to storage in re or its children.
+func (p *parser) leadingRegexp(re *Regexp) *Regexp {
+ if re.Op == OpEmptyMatch {
+ return nil
+ }
+ if re.Op == OpConcat && len(re.Sub) > 0 {
+ sub := re.Sub[0]
+ if sub.Op == OpEmptyMatch {
+ return nil
+ }
+ return sub
+ }
+ return re
+}
+
+// removeLeadingRegexp removes the leading regexp in re.
+// It returns the replacement for re.
+// If reuse is true, it passes the removed regexp (if no longer needed) to p.reuse.
+func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp {
+ if re.Op == OpConcat && len(re.Sub) > 0 {
+ if reuse {
+ p.reuse(re.Sub[0])
+ }
+ re.Sub = re.Sub[:copy(re.Sub, re.Sub[1:])]
+ switch len(re.Sub) {
+ case 0:
+ re.Op = OpEmptyMatch
+ re.Sub = nil
+ case 1:
+ old := re
+ re = re.Sub[0]
+ p.reuse(old)
+ }
+ return re
+ }
+ if reuse {
+ p.reuse(re)
+ }
+ return p.newRegexp(OpEmptyMatch)
+}
+
+func literalRegexp(s string, flags Flags) *Regexp {
+ re := &Regexp{Op: OpLiteral}
+ re.Flags = flags
+ re.Rune = re.Rune0[:0] // use local storage for small strings
+ for _, c := range s {
+ if len(re.Rune) >= cap(re.Rune) {
+ // string is too long to fit in Rune0. let Go handle it
+ re.Rune = []rune(s)
+ break
+ }
+ re.Rune = append(re.Rune, c)
+ }
+ return re
+}
+
+// Parsing.
+
+// Parse parses a regular expression string s, controlled by the specified
+// Flags, and returns a regular expression parse tree. The syntax is
+// described in the top-level comment.
+func Parse(s string, flags Flags) (*Regexp, error) {
+ return parse(s, flags)
+}
+
+func parse(s string, flags Flags) (_ *Regexp, err error) {
+ defer func() {
+ switch r := recover(); r {
+ default:
+ panic(r)
+ case nil:
+ // ok
+ case ErrLarge: // too big
+ err = &Error{Code: ErrLarge, Expr: s}
+ case ErrNestingDepth:
+ err = &Error{Code: ErrNestingDepth, Expr: s}
+ }
+ }()
+
+ if flags&Literal != 0 {
+ // Trivial parser for literal string.
+ if err := checkUTF8(s); err != nil {
+ return nil, err
+ }
+ return literalRegexp(s, flags), nil
+ }
+
+ // Otherwise, must do real work.
+ var (
+ p parser
+ c rune
+ op Op
+ lastRepeat string
+ )
+ p.flags = flags
+ p.wholeRegexp = s
+ t := s
+ for t != "" {
+ repeat := ""
+ BigSwitch:
+ switch t[0] {
+ default:
+ if c, t, err = nextRune(t); err != nil {
+ return nil, err
+ }
+ p.literal(c)
+
+ case '(':
+ if p.flags&PerlX != 0 && len(t) >= 2 && t[1] == '?' {
+ // Flag changes and non-capturing groups.
+ if t, err = p.parsePerlFlags(t); err != nil {
+ return nil, err
+ }
+ break
+ }
+ p.numCap++
+ p.op(opLeftParen).Cap = p.numCap
+ t = t[1:]
+ case '|':
+ if err = p.parseVerticalBar(); err != nil {
+ return nil, err
+ }
+ t = t[1:]
+ case ')':
+ if err = p.parseRightParen(); err != nil {
+ return nil, err
+ }
+ t = t[1:]
+ case '^':
+ if p.flags&OneLine != 0 {
+ p.op(OpBeginText)
+ } else {
+ p.op(OpBeginLine)
+ }
+ t = t[1:]
+ case '$':
+ if p.flags&OneLine != 0 {
+ p.op(OpEndText).Flags |= WasDollar
+ } else {
+ p.op(OpEndLine)
+ }
+ t = t[1:]
+ case '.':
+ if p.flags&DotNL != 0 {
+ p.op(OpAnyChar)
+ } else {
+ p.op(OpAnyCharNotNL)
+ }
+ t = t[1:]
+ case '[':
+ if t, err = p.parseClass(t); err != nil {
+ return nil, err
+ }
+ case '*', '+', '?':
+ before := t
+ switch t[0] {
+ case '*':
+ op = OpStar
+ case '+':
+ op = OpPlus
+ case '?':
+ op = OpQuest
+ }
+ after := t[1:]
+ if after, err = p.repeat(op, 0, 0, before, after, lastRepeat); err != nil {
+ return nil, err
+ }
+ repeat = before
+ t = after
+ case '{':
+ op = OpRepeat
+ before := t
+ min, max, after, ok := p.parseRepeat(t)
+ if !ok {
+ // If the repeat cannot be parsed, { is a literal.
+ p.literal('{')
+ t = t[1:]
+ break
+ }
+ if min < 0 || min > 1000 || max > 1000 || max >= 0 && min > max {
+ // Numbers were too big, or max is present and min > max.
+ return nil, &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]}
+ }
+ if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil {
+ return nil, err
+ }
+ repeat = before
+ t = after
+ case '\\':
+ if p.flags&PerlX != 0 && len(t) >= 2 {
+ switch t[1] {
+ case 'A':
+ p.op(OpBeginText)
+ t = t[2:]
+ break BigSwitch
+ case 'b':
+ p.op(OpWordBoundary)
+ t = t[2:]
+ break BigSwitch
+ case 'B':
+ p.op(OpNoWordBoundary)
+ t = t[2:]
+ break BigSwitch
+ case 'C':
+ // any byte; not supported
+ return nil, &Error{ErrInvalidEscape, t[:2]}
+ case 'Q':
+ // \Q ... \E: the ... is always literals
+ var lit string
+ lit, t, _ = strings.Cut(t[2:], `\E`)
+ for lit != "" {
+ c, rest, err := nextRune(lit)
+ if err != nil {
+ return nil, err
+ }
+ p.literal(c)
+ lit = rest
+ }
+ break BigSwitch
+ case 'z':
+ p.op(OpEndText)
+ t = t[2:]
+ break BigSwitch
+ }
+ }
+
+ re := p.newRegexp(OpCharClass)
+ re.Flags = p.flags
+
+ // Look for Unicode character group like \p{Han}
+ if len(t) >= 2 && (t[1] == 'p' || t[1] == 'P') {
+ r, rest, err := p.parseUnicodeClass(t, re.Rune0[:0])
+ if err != nil {
+ return nil, err
+ }
+ if r != nil {
+ re.Rune = r
+ t = rest
+ p.push(re)
+ break BigSwitch
+ }
+ }
+
+ // Perl character class escape.
+ if r, rest := p.parsePerlClassEscape(t, re.Rune0[:0]); r != nil {
+ re.Rune = r
+ t = rest
+ p.push(re)
+ break BigSwitch
+ }
+ p.reuse(re)
+
+ // Ordinary single-character escape.
+ if c, t, err = p.parseEscape(t); err != nil {
+ return nil, err
+ }
+ p.literal(c)
+ }
+ lastRepeat = repeat
+ }
+
+ p.concat()
+ if p.swapVerticalBar() {
+ // pop vertical bar
+ p.stack = p.stack[:len(p.stack)-1]
+ }
+ p.alternate()
+
+ n := len(p.stack)
+ if n != 1 {
+ return nil, &Error{ErrMissingParen, s}
+ }
+ return p.stack[0], nil
+}
+
+// parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}.
+// If s is not of that form, it returns ok == false.
+// If s has the right form but the values are too big, it returns min == -1, ok == true.
+func (p *parser) parseRepeat(s string) (min, max int, rest string, ok bool) {
+ if s == "" || s[0] != '{' {
+ return
+ }
+ s = s[1:]
+ var ok1 bool
+ if min, s, ok1 = p.parseInt(s); !ok1 {
+ return
+ }
+ if s == "" {
+ return
+ }
+ if s[0] != ',' {
+ max = min
+ } else {
+ s = s[1:]
+ if s == "" {
+ return
+ }
+ if s[0] == '}' {
+ max = -1
+ } else if max, s, ok1 = p.parseInt(s); !ok1 {
+ return
+ } else if max < 0 {
+ // parseInt found too big a number
+ min = -1
+ }
+ }
+ if s == "" || s[0] != '}' {
+ return
+ }
+ rest = s[1:]
+ ok = true
+ return
+}
+
+// parsePerlFlags parses a Perl flag setting or non-capturing group or both,
+// like (?i) or (?: or (?i:. It removes the prefix from s and updates the parse state.
+// The caller must have ensured that s begins with "(?".
+func (p *parser) parsePerlFlags(s string) (rest string, err error) {
+ t := s
+
+ // Check for named captures, first introduced in Python's regexp library.
+ // As usual, there are three slightly different syntaxes:
+ //
+ // (?P<name>expr) the original, introduced by Python
+ // (?<name>expr) the .NET alteration, adopted by Perl 5.10
+ // (?'name'expr) another .NET alteration, adopted by Perl 5.10
+ //
+ // Perl 5.10 gave in and implemented the Python version too,
+ // but they claim that the last two are the preferred forms.
+ // PCRE and languages based on it (specifically, PHP and Ruby)
+ // support all three as well. EcmaScript 4 uses only the Python form.
+ //
+ // In both the open source world (via Code Search) and the
+ // Google source tree, (?P<expr>name) is the dominant form,
+ // so that's the one we implement. One is enough.
+ if len(t) > 4 && t[2] == 'P' && t[3] == '<' {
+ // Pull out name.
+ end := strings.IndexRune(t, '>')
+ if end < 0 {
+ if err = checkUTF8(t); err != nil {
+ return "", err
+ }
+ return "", &Error{ErrInvalidNamedCapture, s}
+ }
+
+ capture := t[:end+1] // "(?P<name>"
+ name := t[4:end] // "name"
+ if err = checkUTF8(name); err != nil {
+ return "", err
+ }
+ if !isValidCaptureName(name) {
+ return "", &Error{ErrInvalidNamedCapture, capture}
+ }
+
+ // Like ordinary capture, but named.
+ p.numCap++
+ re := p.op(opLeftParen)
+ re.Cap = p.numCap
+ re.Name = name
+ return t[end+1:], nil
+ }
+
+ // Non-capturing group. Might also twiddle Perl flags.
+ var c rune
+ t = t[2:] // skip (?
+ flags := p.flags
+ sign := +1
+ sawFlag := false
+Loop:
+ for t != "" {
+ if c, t, err = nextRune(t); err != nil {
+ return "", err
+ }
+ switch c {
+ default:
+ break Loop
+
+ // Flags.
+ case 'i':
+ flags |= FoldCase
+ sawFlag = true
+ case 'm':
+ flags &^= OneLine
+ sawFlag = true
+ case 's':
+ flags |= DotNL
+ sawFlag = true
+ case 'U':
+ flags |= NonGreedy
+ sawFlag = true
+
+ // Switch to negation.
+ case '-':
+ if sign < 0 {
+ break Loop
+ }
+ sign = -1
+ // Invert flags so that | above turn into &^ and vice versa.
+ // We'll invert flags again before using it below.
+ flags = ^flags
+ sawFlag = false
+
+ // End of flags, starting group or not.
+ case ':', ')':
+ if sign < 0 {
+ if !sawFlag {
+ break Loop
+ }
+ flags = ^flags
+ }
+ if c == ':' {
+ // Open new group
+ p.op(opLeftParen)
+ }
+ p.flags = flags
+ return t, nil
+ }
+ }
+
+ return "", &Error{ErrInvalidPerlOp, s[:len(s)-len(t)]}
+}
+
+// isValidCaptureName reports whether name
+// is a valid capture name: [A-Za-z0-9_]+.
+// PCRE limits names to 32 bytes.
+// Python rejects names starting with digits.
+// We don't enforce either of those.
+func isValidCaptureName(name string) bool {
+ if name == "" {
+ return false
+ }
+ for _, c := range name {
+ if c != '_' && !isalnum(c) {
+ return false
+ }
+ }
+ return true
+}
+
+// parseInt parses a decimal integer.
+func (p *parser) parseInt(s string) (n int, rest string, ok bool) {
+ if s == "" || s[0] < '0' || '9' < s[0] {
+ return
+ }
+ // Disallow leading zeros.
+ if len(s) >= 2 && s[0] == '0' && '0' <= s[1] && s[1] <= '9' {
+ return
+ }
+ t := s
+ for s != "" && '0' <= s[0] && s[0] <= '9' {
+ s = s[1:]
+ }
+ rest = s
+ ok = true
+ // Have digits, compute value.
+ t = t[:len(t)-len(s)]
+ for i := 0; i < len(t); i++ {
+ // Avoid overflow.
+ if n >= 1e8 {
+ n = -1
+ break
+ }
+ n = n*10 + int(t[i]) - '0'
+ }
+ return
+}
+
+// can this be represented as a character class?
+// single-rune literal string, char class, ., and .|\n.
+func isCharClass(re *Regexp) bool {
+ return re.Op == OpLiteral && len(re.Rune) == 1 ||
+ re.Op == OpCharClass ||
+ re.Op == OpAnyCharNotNL ||
+ re.Op == OpAnyChar
+}
+
+// does re match r?
+func matchRune(re *Regexp, r rune) bool {
+ switch re.Op {
+ case OpLiteral:
+ return len(re.Rune) == 1 && re.Rune[0] == r
+ case OpCharClass:
+ for i := 0; i < len(re.Rune); i += 2 {
+ if re.Rune[i] <= r && r <= re.Rune[i+1] {
+ return true
+ }
+ }
+ return false
+ case OpAnyCharNotNL:
+ return r != '\n'
+ case OpAnyChar:
+ return true
+ }
+ return false
+}
+
+// parseVerticalBar handles a | in the input.
+func (p *parser) parseVerticalBar() error {
+ p.concat()
+
+ // The concatenation we just parsed is on top of the stack.
+ // If it sits above an opVerticalBar, swap it below
+ // (things below an opVerticalBar become an alternation).
+ // Otherwise, push a new vertical bar.
+ if !p.swapVerticalBar() {
+ p.op(opVerticalBar)
+ }
+
+ return nil
+}
+
+// mergeCharClass makes dst = dst|src.
+// The caller must ensure that dst.Op >= src.Op,
+// to reduce the amount of copying.
+func mergeCharClass(dst, src *Regexp) {
+ switch dst.Op {
+ case OpAnyChar:
+ // src doesn't add anything.
+ case OpAnyCharNotNL:
+ // src might add \n
+ if matchRune(src, '\n') {
+ dst.Op = OpAnyChar
+ }
+ case OpCharClass:
+ // src is simpler, so either literal or char class
+ if src.Op == OpLiteral {
+ dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
+ } else {
+ dst.Rune = appendClass(dst.Rune, src.Rune)
+ }
+ case OpLiteral:
+ // both literal
+ if src.Rune[0] == dst.Rune[0] && src.Flags == dst.Flags {
+ break
+ }
+ dst.Op = OpCharClass
+ dst.Rune = appendLiteral(dst.Rune[:0], dst.Rune[0], dst.Flags)
+ dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
+ }
+}
+
+// If the top of the stack is an element followed by an opVerticalBar
+// swapVerticalBar swaps the two and returns true.
+// Otherwise it returns false.
+func (p *parser) swapVerticalBar() bool {
+ // If above and below vertical bar are literal or char class,
+ // can merge into a single char class.
+ n := len(p.stack)
+ if n >= 3 && p.stack[n-2].Op == opVerticalBar && isCharClass(p.stack[n-1]) && isCharClass(p.stack[n-3]) {
+ re1 := p.stack[n-1]
+ re3 := p.stack[n-3]
+ // Make re3 the more complex of the two.
+ if re1.Op > re3.Op {
+ re1, re3 = re3, re1
+ p.stack[n-3] = re3
+ }
+ mergeCharClass(re3, re1)
+ p.reuse(re1)
+ p.stack = p.stack[:n-1]
+ return true
+ }
+
+ if n >= 2 {
+ re1 := p.stack[n-1]
+ re2 := p.stack[n-2]
+ if re2.Op == opVerticalBar {
+ if n >= 3 {
+ // Now out of reach.
+ // Clean opportunistically.
+ cleanAlt(p.stack[n-3])
+ }
+ p.stack[n-2] = re1
+ p.stack[n-1] = re2
+ return true
+ }
+ }
+ return false
+}
+
+// parseRightParen handles a ) in the input.
+func (p *parser) parseRightParen() error {
+ p.concat()
+ if p.swapVerticalBar() {
+ // pop vertical bar
+ p.stack = p.stack[:len(p.stack)-1]
+ }
+ p.alternate()
+
+ n := len(p.stack)
+ if n < 2 {
+ return &Error{ErrUnexpectedParen, p.wholeRegexp}
+ }
+ re1 := p.stack[n-1]
+ re2 := p.stack[n-2]
+ p.stack = p.stack[:n-2]
+ if re2.Op != opLeftParen {
+ return &Error{ErrUnexpectedParen, p.wholeRegexp}
+ }
+ // Restore flags at time of paren.
+ p.flags = re2.Flags
+ if re2.Cap == 0 {
+ // Just for grouping.
+ p.push(re1)
+ } else {
+ re2.Op = OpCapture
+ re2.Sub = re2.Sub0[:1]
+ re2.Sub[0] = re1
+ p.push(re2)
+ }
+ return nil
+}
+
+// parseEscape parses an escape sequence at the beginning of s
+// and returns the rune.
+func (p *parser) parseEscape(s string) (r rune, rest string, err error) {
+ t := s[1:]
+ if t == "" {
+ return 0, "", &Error{ErrTrailingBackslash, ""}
+ }
+ c, t, err := nextRune(t)
+ if err != nil {
+ return 0, "", err
+ }
+
+Switch:
+ switch c {
+ default:
+ if c < utf8.RuneSelf && !isalnum(c) {
+ // Escaped non-word characters are always themselves.
+ // PCRE is not quite so rigorous: it accepts things like
+ // \q, but we don't. We once rejected \_, but too many
+ // programs and people insist on using it, so allow \_.
+ return c, t, nil
+ }
+
+ // Octal escapes.
+ case '1', '2', '3', '4', '5', '6', '7':
+ // Single non-zero digit is a backreference; not supported
+ if t == "" || t[0] < '0' || t[0] > '7' {
+ break
+ }
+ fallthrough
+ case '0':
+ // Consume up to three octal digits; already have one.
+ r = c - '0'
+ for i := 1; i < 3; i++ {
+ if t == "" || t[0] < '0' || t[0] > '7' {
+ break
+ }
+ r = r*8 + rune(t[0]) - '0'
+ t = t[1:]
+ }
+ return r, t, nil
+
+ // Hexadecimal escapes.
+ case 'x':
+ if t == "" {
+ break
+ }
+ if c, t, err = nextRune(t); err != nil {
+ return 0, "", err
+ }
+ if c == '{' {
+ // Any number of digits in braces.
+ // Perl accepts any text at all; it ignores all text
+ // after the first non-hex digit. We require only hex digits,
+ // and at least one.
+ nhex := 0
+ r = 0
+ for {
+ if t == "" {
+ break Switch
+ }
+ if c, t, err = nextRune(t); err != nil {
+ return 0, "", err
+ }
+ if c == '}' {
+ break
+ }
+ v := unhex(c)
+ if v < 0 {
+ break Switch
+ }
+ r = r*16 + v
+ if r > unicode.MaxRune {
+ break Switch
+ }
+ nhex++
+ }
+ if nhex == 0 {
+ break Switch
+ }
+ return r, t, nil
+ }
+
+ // Easy case: two hex digits.
+ x := unhex(c)
+ if c, t, err = nextRune(t); err != nil {
+ return 0, "", err
+ }
+ y := unhex(c)
+ if x < 0 || y < 0 {
+ break
+ }
+ return x*16 + y, t, nil
+
+ // C escapes. There is no case 'b', to avoid misparsing
+ // the Perl word-boundary \b as the C backspace \b
+ // when in POSIX mode. In Perl, /\b/ means word-boundary
+ // but /[\b]/ means backspace. We don't support that.
+ // If you want a backspace, embed a literal backspace
+ // character or use \x08.
+ case 'a':
+ return '\a', t, err
+ case 'f':
+ return '\f', t, err
+ case 'n':
+ return '\n', t, err
+ case 'r':
+ return '\r', t, err
+ case 't':
+ return '\t', t, err
+ case 'v':
+ return '\v', t, err
+ }
+ return 0, "", &Error{ErrInvalidEscape, s[:len(s)-len(t)]}
+}
+
+// parseClassChar parses a character class character at the beginning of s
+// and returns it.
+func (p *parser) parseClassChar(s, wholeClass string) (r rune, rest string, err error) {
+ if s == "" {
+ return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass}
+ }
+
+ // Allow regular escape sequences even though
+ // many need not be escaped in this context.
+ if s[0] == '\\' {
+ return p.parseEscape(s)
+ }
+
+ return nextRune(s)
+}
+
+type charGroup struct {
+ sign int
+ class []rune
+}
+
+// parsePerlClassEscape parses a leading Perl character class escape like \d
+// from the beginning of s. If one is present, it appends the characters to r
+// and returns the new slice r and the remainder of the string.
+func (p *parser) parsePerlClassEscape(s string, r []rune) (out []rune, rest string) {
+ if p.flags&PerlX == 0 || len(s) < 2 || s[0] != '\\' {
+ return
+ }
+ g := perlGroup[s[0:2]]
+ if g.sign == 0 {
+ return
+ }
+ return p.appendGroup(r, g), s[2:]
+}
+
+// parseNamedClass parses a leading POSIX named character class like [:alnum:]
+// from the beginning of s. If one is present, it appends the characters to r
+// and returns the new slice r and the remainder of the string.
+func (p *parser) parseNamedClass(s string, r []rune) (out []rune, rest string, err error) {
+ if len(s) < 2 || s[0] != '[' || s[1] != ':' {
+ return
+ }
+
+ i := strings.Index(s[2:], ":]")
+ if i < 0 {
+ return
+ }
+ i += 2
+ name, s := s[0:i+2], s[i+2:]
+ g := posixGroup[name]
+ if g.sign == 0 {
+ return nil, "", &Error{ErrInvalidCharRange, name}
+ }
+ return p.appendGroup(r, g), s, nil
+}
+
+func (p *parser) appendGroup(r []rune, g charGroup) []rune {
+ if p.flags&FoldCase == 0 {
+ if g.sign < 0 {
+ r = appendNegatedClass(r, g.class)
+ } else {
+ r = appendClass(r, g.class)
+ }
+ } else {
+ tmp := p.tmpClass[:0]
+ tmp = appendFoldedClass(tmp, g.class)
+ p.tmpClass = tmp
+ tmp = cleanClass(&p.tmpClass)
+ if g.sign < 0 {
+ r = appendNegatedClass(r, tmp)
+ } else {
+ r = appendClass(r, tmp)
+ }
+ }
+ return r
+}
+
+var anyTable = &unicode.RangeTable{
+ R16: []unicode.Range16{{Lo: 0, Hi: 1<<16 - 1, Stride: 1}},
+ R32: []unicode.Range32{{Lo: 1 << 16, Hi: unicode.MaxRune, Stride: 1}},
+}
+
+// unicodeTable returns the unicode.RangeTable identified by name
+// and the table of additional fold-equivalent code points.
+func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) {
+ // Special case: "Any" means any.
+ if name == "Any" {
+ return anyTable, anyTable
+ }
+ if t := unicode.Categories[name]; t != nil {
+ return t, unicode.FoldCategory[name]
+ }
+ if t := unicode.Scripts[name]; t != nil {
+ return t, unicode.FoldScript[name]
+ }
+ return nil, nil
+}
+
+// parseUnicodeClass parses a leading Unicode character class like \p{Han}
+// from the beginning of s. If one is present, it appends the characters to r
+// and returns the new slice r and the remainder of the string.
+func (p *parser) parseUnicodeClass(s string, r []rune) (out []rune, rest string, err error) {
+ if p.flags&UnicodeGroups == 0 || len(s) < 2 || s[0] != '\\' || s[1] != 'p' && s[1] != 'P' {
+ return
+ }
+
+ // Committed to parse or return error.
+ sign := +1
+ if s[1] == 'P' {
+ sign = -1
+ }
+ t := s[2:]
+ c, t, err := nextRune(t)
+ if err != nil {
+ return
+ }
+ var seq, name string
+ if c != '{' {
+ // Single-letter name.
+ seq = s[:len(s)-len(t)]
+ name = seq[2:]
+ } else {
+ // Name is in braces.
+ end := strings.IndexRune(s, '}')
+ if end < 0 {
+ if err = checkUTF8(s); err != nil {
+ return
+ }
+ return nil, "", &Error{ErrInvalidCharRange, s}
+ }
+ seq, t = s[:end+1], s[end+1:]
+ name = s[3:end]
+ if err = checkUTF8(name); err != nil {
+ return
+ }
+ }
+
+ // Group can have leading negation too. \p{^Han} == \P{Han}, \P{^Han} == \p{Han}.
+ if name != "" && name[0] == '^' {
+ sign = -sign
+ name = name[1:]
+ }
+
+ tab, fold := unicodeTable(name)
+ if tab == nil {
+ return nil, "", &Error{ErrInvalidCharRange, seq}
+ }
+
+ if p.flags&FoldCase == 0 || fold == nil {
+ if sign > 0 {
+ r = appendTable(r, tab)
+ } else {
+ r = appendNegatedTable(r, tab)
+ }
+ } else {
+ // Merge and clean tab and fold in a temporary buffer.
+ // This is necessary for the negative case and just tidy
+ // for the positive case.
+ tmp := p.tmpClass[:0]
+ tmp = appendTable(tmp, tab)
+ tmp = appendTable(tmp, fold)
+ p.tmpClass = tmp
+ tmp = cleanClass(&p.tmpClass)
+ if sign > 0 {
+ r = appendClass(r, tmp)
+ } else {
+ r = appendNegatedClass(r, tmp)
+ }
+ }
+ return r, t, nil
+}
+
+// parseClass parses a character class at the beginning of s
+// and pushes it onto the parse stack.
+func (p *parser) parseClass(s string) (rest string, err error) {
+ t := s[1:] // chop [
+ re := p.newRegexp(OpCharClass)
+ re.Flags = p.flags
+ re.Rune = re.Rune0[:0]
+
+ sign := +1
+ if t != "" && t[0] == '^' {
+ sign = -1
+ t = t[1:]
+
+ // If character class does not match \n, add it here,
+ // so that negation later will do the right thing.
+ if p.flags&ClassNL == 0 {
+ re.Rune = append(re.Rune, '\n', '\n')
+ }
+ }
+
+ class := re.Rune
+ first := true // ] and - are okay as first char in class
+ for t == "" || t[0] != ']' || first {
+ // POSIX: - is only okay unescaped as first or last in class.
+ // Perl: - is okay anywhere.
+ if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') {
+ _, size := utf8.DecodeRuneInString(t[1:])
+ return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]}
+ }
+ first = false
+
+ // Look for POSIX [:alnum:] etc.
+ if len(t) > 2 && t[0] == '[' && t[1] == ':' {
+ nclass, nt, err := p.parseNamedClass(t, class)
+ if err != nil {
+ return "", err
+ }
+ if nclass != nil {
+ class, t = nclass, nt
+ continue
+ }
+ }
+
+ // Look for Unicode character group like \p{Han}.
+ nclass, nt, err := p.parseUnicodeClass(t, class)
+ if err != nil {
+ return "", err
+ }
+ if nclass != nil {
+ class, t = nclass, nt
+ continue
+ }
+
+ // Look for Perl character class symbols (extension).
+ if nclass, nt := p.parsePerlClassEscape(t, class); nclass != nil {
+ class, t = nclass, nt
+ continue
+ }
+
+ // Single character or simple range.
+ rng := t
+ var lo, hi rune
+ if lo, t, err = p.parseClassChar(t, s); err != nil {
+ return "", err
+ }
+ hi = lo
+ // [a-] means (a|-) so check for final ].
+ if len(t) >= 2 && t[0] == '-' && t[1] != ']' {
+ t = t[1:]
+ if hi, t, err = p.parseClassChar(t, s); err != nil {
+ return "", err
+ }
+ if hi < lo {
+ rng = rng[:len(rng)-len(t)]
+ return "", &Error{Code: ErrInvalidCharRange, Expr: rng}
+ }
+ }
+ if p.flags&FoldCase == 0 {
+ class = appendRange(class, lo, hi)
+ } else {
+ class = appendFoldedRange(class, lo, hi)
+ }
+ }
+ t = t[1:] // chop ]
+
+ // Use &re.Rune instead of &class to avoid allocation.
+ re.Rune = class
+ class = cleanClass(&re.Rune)
+ if sign < 0 {
+ class = negateClass(class)
+ }
+ re.Rune = class
+ p.push(re)
+ return t, nil
+}
+
+// cleanClass sorts the ranges (pairs of elements of r),
+// merges them, and eliminates duplicates.
+func cleanClass(rp *[]rune) []rune {
+
+ // Sort by lo increasing, hi decreasing to break ties.
+ sort.Sort(ranges{rp})
+
+ r := *rp
+ if len(r) < 2 {
+ return r
+ }
+
+ // Merge abutting, overlapping.
+ w := 2 // write index
+ for i := 2; i < len(r); i += 2 {
+ lo, hi := r[i], r[i+1]
+ if lo <= r[w-1]+1 {
+ // merge with previous range
+ if hi > r[w-1] {
+ r[w-1] = hi
+ }
+ continue
+ }
+ // new disjoint range
+ r[w] = lo
+ r[w+1] = hi
+ w += 2
+ }
+
+ return r[:w]
+}
+
+// appendLiteral returns the result of appending the literal x to the class r.
+func appendLiteral(r []rune, x rune, flags Flags) []rune {
+ if flags&FoldCase != 0 {
+ return appendFoldedRange(r, x, x)
+ }
+ return appendRange(r, x, x)
+}
+
+// appendRange returns the result of appending the range lo-hi to the class r.
+func appendRange(r []rune, lo, hi rune) []rune {
+ // Expand last range or next to last range if it overlaps or abuts.
+ // Checking two ranges helps when appending case-folded
+ // alphabets, so that one range can be expanding A-Z and the
+ // other expanding a-z.
+ n := len(r)
+ for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4
+ if n >= i {
+ rlo, rhi := r[n-i], r[n-i+1]
+ if lo <= rhi+1 && rlo <= hi+1 {
+ if lo < rlo {
+ r[n-i] = lo
+ }
+ if hi > rhi {
+ r[n-i+1] = hi
+ }
+ return r
+ }
+ }
+ }
+
+ return append(r, lo, hi)
+}
+
+const (
+ // minimum and maximum runes involved in folding.
+ // checked during test.
+ minFold = 0x0041
+ maxFold = 0x1e943
+)
+
+// appendFoldedRange returns the result of appending the range lo-hi
+// and its case folding-equivalent runes to the class r.
+func appendFoldedRange(r []rune, lo, hi rune) []rune {
+ // Optimizations.
+ if lo <= minFold && hi >= maxFold {
+ // Range is full: folding can't add more.
+ return appendRange(r, lo, hi)
+ }
+ if hi < minFold || lo > maxFold {
+ // Range is outside folding possibilities.
+ return appendRange(r, lo, hi)
+ }
+ if lo < minFold {
+ // [lo, minFold-1] needs no folding.
+ r = appendRange(r, lo, minFold-1)
+ lo = minFold
+ }
+ if hi > maxFold {
+ // [maxFold+1, hi] needs no folding.
+ r = appendRange(r, maxFold+1, hi)
+ hi = maxFold
+ }
+
+ // Brute force. Depend on appendRange to coalesce ranges on the fly.
+ for c := lo; c <= hi; c++ {
+ r = appendRange(r, c, c)
+ f := unicode.SimpleFold(c)
+ for f != c {
+ r = appendRange(r, f, f)
+ f = unicode.SimpleFold(f)
+ }
+ }
+ return r
+}
+
+// appendClass returns the result of appending the class x to the class r.
+// It assume x is clean.
+func appendClass(r []rune, x []rune) []rune {
+ for i := 0; i < len(x); i += 2 {
+ r = appendRange(r, x[i], x[i+1])
+ }
+ return r
+}
+
+// appendFoldedClass returns the result of appending the case folding of the class x to the class r.
+func appendFoldedClass(r []rune, x []rune) []rune {
+ for i := 0; i < len(x); i += 2 {
+ r = appendFoldedRange(r, x[i], x[i+1])
+ }
+ return r
+}
+
+// appendNegatedClass returns the result of appending the negation of the class x to the class r.
+// It assumes x is clean.
+func appendNegatedClass(r []rune, x []rune) []rune {
+ nextLo := '\u0000'
+ for i := 0; i < len(x); i += 2 {
+ lo, hi := x[i], x[i+1]
+ if nextLo <= lo-1 {
+ r = appendRange(r, nextLo, lo-1)
+ }
+ nextLo = hi + 1
+ }
+ if nextLo <= unicode.MaxRune {
+ r = appendRange(r, nextLo, unicode.MaxRune)
+ }
+ return r
+}
+
+// appendTable returns the result of appending x to the class r.
+func appendTable(r []rune, x *unicode.RangeTable) []rune {
+ for _, xr := range x.R16 {
+ lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
+ if stride == 1 {
+ r = appendRange(r, lo, hi)
+ continue
+ }
+ for c := lo; c <= hi; c += stride {
+ r = appendRange(r, c, c)
+ }
+ }
+ for _, xr := range x.R32 {
+ lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
+ if stride == 1 {
+ r = appendRange(r, lo, hi)
+ continue
+ }
+ for c := lo; c <= hi; c += stride {
+ r = appendRange(r, c, c)
+ }
+ }
+ return r
+}
+
+// appendNegatedTable returns the result of appending the negation of x to the class r.
+func appendNegatedTable(r []rune, x *unicode.RangeTable) []rune {
+ nextLo := '\u0000' // lo end of next class to add
+ for _, xr := range x.R16 {
+ lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
+ if stride == 1 {
+ if nextLo <= lo-1 {
+ r = appendRange(r, nextLo, lo-1)
+ }
+ nextLo = hi + 1
+ continue
+ }
+ for c := lo; c <= hi; c += stride {
+ if nextLo <= c-1 {
+ r = appendRange(r, nextLo, c-1)
+ }
+ nextLo = c + 1
+ }
+ }
+ for _, xr := range x.R32 {
+ lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
+ if stride == 1 {
+ if nextLo <= lo-1 {
+ r = appendRange(r, nextLo, lo-1)
+ }
+ nextLo = hi + 1
+ continue
+ }
+ for c := lo; c <= hi; c += stride {
+ if nextLo <= c-1 {
+ r = appendRange(r, nextLo, c-1)
+ }
+ nextLo = c + 1
+ }
+ }
+ if nextLo <= unicode.MaxRune {
+ r = appendRange(r, nextLo, unicode.MaxRune)
+ }
+ return r
+}
+
+// negateClass overwrites r and returns r's negation.
+// It assumes the class r is already clean.
+func negateClass(r []rune) []rune {
+ nextLo := '\u0000' // lo end of next class to add
+ w := 0 // write index
+ for i := 0; i < len(r); i += 2 {
+ lo, hi := r[i], r[i+1]
+ if nextLo <= lo-1 {
+ r[w] = nextLo
+ r[w+1] = lo - 1
+ w += 2
+ }
+ nextLo = hi + 1
+ }
+ r = r[:w]
+ if nextLo <= unicode.MaxRune {
+ // It's possible for the negation to have one more
+ // range - this one - than the original class, so use append.
+ r = append(r, nextLo, unicode.MaxRune)
+ }
+ return r
+}
+
+// ranges implements sort.Interface on a []rune.
+// The choice of receiver type definition is strange
+// but avoids an allocation since we already have
+// a *[]rune.
+type ranges struct {
+ p *[]rune
+}
+
+func (ra ranges) Less(i, j int) bool {
+ p := *ra.p
+ i *= 2
+ j *= 2
+ return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1]
+}
+
+func (ra ranges) Len() int {
+ return len(*ra.p) / 2
+}
+
+func (ra ranges) Swap(i, j int) {
+ p := *ra.p
+ i *= 2
+ j *= 2
+ p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1]
+}
+
+func checkUTF8(s string) error {
+ for s != "" {
+ rune, size := utf8.DecodeRuneInString(s)
+ if rune == utf8.RuneError && size == 1 {
+ return &Error{Code: ErrInvalidUTF8, Expr: s}
+ }
+ s = s[size:]
+ }
+ return nil
+}
+
+func nextRune(s string) (c rune, t string, err error) {
+ c, size := utf8.DecodeRuneInString(s)
+ if c == utf8.RuneError && size == 1 {
+ return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s}
+ }
+ return c, s[size:], nil
+}
+
+func isalnum(c rune) bool {
+ return '0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
+}
+
+func unhex(c rune) rune {
+ if '0' <= c && c <= '9' {
+ return c - '0'
+ }
+ if 'a' <= c && c <= 'f' {
+ return c - 'a' + 10
+ }
+ if 'A' <= c && c <= 'F' {
+ return c - 'A' + 10
+ }
+ return -1
+}
diff --git a/src/regexp/syntax/parse_test.go b/src/regexp/syntax/parse_test.go
new file mode 100644
index 0000000..67e3c56
--- /dev/null
+++ b/src/regexp/syntax/parse_test.go
@@ -0,0 +1,586 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syntax
+
+import (
+ "fmt"
+ "strings"
+ "testing"
+ "unicode"
+)
+
+type parseTest struct {
+ Regexp string
+ Dump string
+}
+
+var parseTests = []parseTest{
+ // Base cases
+ {`a`, `lit{a}`},
+ {`a.`, `cat{lit{a}dot{}}`},
+ {`a.b`, `cat{lit{a}dot{}lit{b}}`},
+ {`ab`, `str{ab}`},
+ {`a.b.c`, `cat{lit{a}dot{}lit{b}dot{}lit{c}}`},
+ {`abc`, `str{abc}`},
+ {`a|^`, `alt{lit{a}bol{}}`},
+ {`a|b`, `cc{0x61-0x62}`},
+ {`(a)`, `cap{lit{a}}`},
+ {`(a)|b`, `alt{cap{lit{a}}lit{b}}`},
+ {`a*`, `star{lit{a}}`},
+ {`a+`, `plus{lit{a}}`},
+ {`a?`, `que{lit{a}}`},
+ {`a{2}`, `rep{2,2 lit{a}}`},
+ {`a{2,3}`, `rep{2,3 lit{a}}`},
+ {`a{2,}`, `rep{2,-1 lit{a}}`},
+ {`a*?`, `nstar{lit{a}}`},
+ {`a+?`, `nplus{lit{a}}`},
+ {`a??`, `nque{lit{a}}`},
+ {`a{2}?`, `nrep{2,2 lit{a}}`},
+ {`a{2,3}?`, `nrep{2,3 lit{a}}`},
+ {`a{2,}?`, `nrep{2,-1 lit{a}}`},
+ // Malformed { } are treated as literals.
+ {`x{1001`, `str{x{1001}`},
+ {`x{9876543210`, `str{x{9876543210}`},
+ {`x{9876543210,`, `str{x{9876543210,}`},
+ {`x{2,1`, `str{x{2,1}`},
+ {`x{1,9876543210`, `str{x{1,9876543210}`},
+ {``, `emp{}`},
+ {`|`, `emp{}`}, // alt{emp{}emp{}} but got factored
+ {`|x|`, `alt{emp{}lit{x}emp{}}`},
+ {`.`, `dot{}`},
+ {`^`, `bol{}`},
+ {`$`, `eol{}`},
+ {`\|`, `lit{|}`},
+ {`\(`, `lit{(}`},
+ {`\)`, `lit{)}`},
+ {`\*`, `lit{*}`},
+ {`\+`, `lit{+}`},
+ {`\?`, `lit{?}`},
+ {`{`, `lit{{}`},
+ {`}`, `lit{}}`},
+ {`\.`, `lit{.}`},
+ {`\^`, `lit{^}`},
+ {`\$`, `lit{$}`},
+ {`\\`, `lit{\}`},
+ {`[ace]`, `cc{0x61 0x63 0x65}`},
+ {`[abc]`, `cc{0x61-0x63}`},
+ {`[a-z]`, `cc{0x61-0x7a}`},
+ {`[a]`, `lit{a}`},
+ {`\-`, `lit{-}`},
+ {`-`, `lit{-}`},
+ {`\_`, `lit{_}`},
+ {`abc`, `str{abc}`},
+ {`abc|def`, `alt{str{abc}str{def}}`},
+ {`abc|def|ghi`, `alt{str{abc}str{def}str{ghi}}`},
+
+ // Posix and Perl extensions
+ {`[[:lower:]]`, `cc{0x61-0x7a}`},
+ {`[a-z]`, `cc{0x61-0x7a}`},
+ {`[^[:lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`},
+ {`[[:^lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`},
+ {`(?i)[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
+ {`(?i)[a-z]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
+ {`(?i)[^[:lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`},
+ {`(?i)[[:^lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`},
+ {`\d`, `cc{0x30-0x39}`},
+ {`\D`, `cc{0x0-0x2f 0x3a-0x10ffff}`},
+ {`\s`, `cc{0x9-0xa 0xc-0xd 0x20}`},
+ {`\S`, `cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}`},
+ {`\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}`},
+ {`\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}`},
+ {`(?i)\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}`},
+ {`(?i)\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`},
+ {`[^\\]`, `cc{0x0-0x5b 0x5d-0x10ffff}`},
+ // { `\C`, `byte{}` }, // probably never
+
+ // Unicode, negatives, and a double negative.
+ {`\p{Braille}`, `cc{0x2800-0x28ff}`},
+ {`\P{Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
+ {`\p{^Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
+ {`\P{^Braille}`, `cc{0x2800-0x28ff}`},
+ {`\pZ`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`},
+ {`[\p{Braille}]`, `cc{0x2800-0x28ff}`},
+ {`[\P{Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
+ {`[\p{^Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
+ {`[\P{^Braille}]`, `cc{0x2800-0x28ff}`},
+ {`[\pZ]`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`},
+ {`\p{Lu}`, mkCharClass(unicode.IsUpper)},
+ {`[\p{Lu}]`, mkCharClass(unicode.IsUpper)},
+ {`(?i)[\p{Lu}]`, mkCharClass(isUpperFold)},
+ {`\p{Any}`, `dot{}`},
+ {`\p{^Any}`, `cc{}`},
+
+ // Hex, octal.
+ {`[\012-\234]\141`, `cat{cc{0xa-0x9c}lit{a}}`},
+ {`[\x{41}-\x7a]\x61`, `cat{cc{0x41-0x7a}lit{a}}`},
+
+ // More interesting regular expressions.
+ {`a{,2}`, `str{a{,2}}`},
+ {`\.\^\$\\`, `str{.^$\}`},
+ {`[a-zABC]`, `cc{0x41-0x43 0x61-0x7a}`},
+ {`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`},
+ {`[α-ε☺]`, `cc{0x3b1-0x3b5 0x263a}`}, // utf-8
+ {`a*{`, `cat{star{lit{a}}lit{{}}`},
+
+ // Test precedences
+ {`(?:ab)*`, `star{str{ab}}`},
+ {`(ab)*`, `star{cap{str{ab}}}`},
+ {`ab|cd`, `alt{str{ab}str{cd}}`},
+ {`a(b|c)d`, `cat{lit{a}cap{cc{0x62-0x63}}lit{d}}`},
+
+ // Test flattening.
+ {`(?:a)`, `lit{a}`},
+ {`(?:ab)(?:cd)`, `str{abcd}`},
+ {`(?:a+b+)(?:c+d+)`, `cat{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`},
+ {`(?:a+|b+)|(?:c+|d+)`, `alt{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`},
+ {`(?:a|b)|(?:c|d)`, `cc{0x61-0x64}`},
+ {`a|.`, `dot{}`},
+ {`.|a`, `dot{}`},
+ {`(?:[abc]|A|Z|hello|world)`, `alt{cc{0x41 0x5a 0x61-0x63}str{hello}str{world}}`},
+ {`(?:[abc]|A|Z)`, `cc{0x41 0x5a 0x61-0x63}`},
+
+ // Test Perl quoted literals
+ {`\Q+|*?{[\E`, `str{+|*?{[}`},
+ {`\Q+\E+`, `plus{lit{+}}`},
+ {`\Qab\E+`, `cat{lit{a}plus{lit{b}}}`},
+ {`\Q\\E`, `lit{\}`},
+ {`\Q\\\E`, `str{\\}`},
+
+ // Test Perl \A and \z
+ {`(?m)^`, `bol{}`},
+ {`(?m)$`, `eol{}`},
+ {`(?-m)^`, `bot{}`},
+ {`(?-m)$`, `eot{}`},
+ {`(?m)\A`, `bot{}`},
+ {`(?m)\z`, `eot{\z}`},
+ {`(?-m)\A`, `bot{}`},
+ {`(?-m)\z`, `eot{\z}`},
+
+ // Test named captures
+ {`(?P<name>a)`, `cap{name:lit{a}}`},
+
+ // Case-folded literals
+ {`[Aa]`, `litfold{A}`},
+ {`[\x{100}\x{101}]`, `litfold{Ā}`},
+ {`[Δδ]`, `litfold{Δ}`},
+
+ // Strings
+ {`abcde`, `str{abcde}`},
+ {`[Aa][Bb]cd`, `cat{strfold{AB}str{cd}}`},
+
+ // Factoring.
+ {`abc|abd|aef|bcx|bcy`, `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}cat{str{bc}cc{0x78-0x79}}}`},
+ {`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}lit{y}}cat{plus{lit{x}}lit{z}}cat{plus{lit{y}}lit{w}}}}`},
+
+ // Bug fixes.
+ {`(?:.)`, `dot{}`},
+ {`(?:x|(?:xa))`, `cat{lit{x}alt{emp{}lit{a}}}`},
+ {`(?:.|(?:.a))`, `cat{dot{}alt{emp{}lit{a}}}`},
+ {`(?:A(?:A|a))`, `cat{lit{A}litfold{A}}`},
+ {`(?:A|a)`, `litfold{A}`},
+ {`A|(?:A|a)`, `litfold{A}`},
+ {`(?s).`, `dot{}`},
+ {`(?-s).`, `dnl{}`},
+ {`(?:(?:^).)`, `cat{bol{}dot{}}`},
+ {`(?-s)(?:(?:^).)`, `cat{bol{}dnl{}}`},
+ {`[\s\S]a`, `cat{cc{0x0-0x10ffff}lit{a}}`},
+
+ // RE2 prefix_tests
+ {`abc|abd`, `cat{str{ab}cc{0x63-0x64}}`},
+ {`a(?:b)c|abd`, `cat{str{ab}cc{0x63-0x64}}`},
+ {`abc|abd|aef|bcx|bcy`,
+ `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}` +
+ `cat{str{bc}cc{0x78-0x79}}}`},
+ {`abc|x|abd`, `alt{str{abc}lit{x}str{abd}}`},
+ {`(?i)abc|ABD`, `cat{strfold{AB}cc{0x43-0x44 0x63-0x64}}`},
+ {`[ab]c|[ab]d`, `cat{cc{0x61-0x62}cc{0x63-0x64}}`},
+ {`.c|.d`, `cat{dot{}cc{0x63-0x64}}`},
+ {`x{2}|x{2}[0-9]`,
+ `cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}`},
+ {`x{2}y|x{2}[0-9]y`,
+ `cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}`},
+ {`a.*?c|a.*?b`,
+ `cat{lit{a}alt{cat{nstar{dot{}}lit{c}}cat{nstar{dot{}}lit{b}}}}`},
+
+ // Valid repetitions.
+ {`((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}))`, ``},
+ {`((((((((((x{1}){2}){2}){2}){2}){2}){2}){2}){2}){2})`, ``},
+
+ // Valid nesting.
+ {strings.Repeat("(", 999) + strings.Repeat(")", 999), ``},
+ {strings.Repeat("(?:", 999) + strings.Repeat(")*", 999), ``},
+ {"(" + strings.Repeat("|", 12345) + ")", ``}, // not nested at all
+}
+
+const testFlags = MatchNL | PerlX | UnicodeGroups
+
+func TestParseSimple(t *testing.T) {
+ testParseDump(t, parseTests, testFlags)
+}
+
+var foldcaseTests = []parseTest{
+ {`AbCdE`, `strfold{ABCDE}`},
+ {`[Aa]`, `litfold{A}`},
+ {`a`, `litfold{A}`},
+
+ // 0x17F is an old English long s (looks like an f) and folds to s.
+ // 0x212A is the Kelvin symbol and folds to k.
+ {`A[F-g]`, `cat{litfold{A}cc{0x41-0x7a 0x17f 0x212a}}`}, // [Aa][A-z...]
+ {`[[:upper:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
+ {`[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
+}
+
+func TestParseFoldCase(t *testing.T) {
+ testParseDump(t, foldcaseTests, FoldCase)
+}
+
+var literalTests = []parseTest{
+ {"(|)^$.[*+?]{5,10},\\", "str{(|)^$.[*+?]{5,10},\\}"},
+}
+
+func TestParseLiteral(t *testing.T) {
+ testParseDump(t, literalTests, Literal)
+}
+
+var matchnlTests = []parseTest{
+ {`.`, `dot{}`},
+ {"\n", "lit{\n}"},
+ {`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`},
+ {`[a\n]`, `cc{0xa 0x61}`},
+}
+
+func TestParseMatchNL(t *testing.T) {
+ testParseDump(t, matchnlTests, MatchNL)
+}
+
+var nomatchnlTests = []parseTest{
+ {`.`, `dnl{}`},
+ {"\n", "lit{\n}"},
+ {`[^a]`, `cc{0x0-0x9 0xb-0x60 0x62-0x10ffff}`},
+ {`[a\n]`, `cc{0xa 0x61}`},
+}
+
+func TestParseNoMatchNL(t *testing.T) {
+ testParseDump(t, nomatchnlTests, 0)
+}
+
+// Test Parse -> Dump.
+func testParseDump(t *testing.T, tests []parseTest, flags Flags) {
+ for _, tt := range tests {
+ re, err := Parse(tt.Regexp, flags)
+ if err != nil {
+ t.Errorf("Parse(%#q): %v", tt.Regexp, err)
+ continue
+ }
+ if tt.Dump == "" {
+ // It parsed. That's all we care about.
+ continue
+ }
+ d := dump(re)
+ if d != tt.Dump {
+ t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump)
+ }
+ }
+}
+
+// dump prints a string representation of the regexp showing
+// the structure explicitly.
+func dump(re *Regexp) string {
+ var b strings.Builder
+ dumpRegexp(&b, re)
+ return b.String()
+}
+
+var opNames = []string{
+ OpNoMatch: "no",
+ OpEmptyMatch: "emp",
+ OpLiteral: "lit",
+ OpCharClass: "cc",
+ OpAnyCharNotNL: "dnl",
+ OpAnyChar: "dot",
+ OpBeginLine: "bol",
+ OpEndLine: "eol",
+ OpBeginText: "bot",
+ OpEndText: "eot",
+ OpWordBoundary: "wb",
+ OpNoWordBoundary: "nwb",
+ OpCapture: "cap",
+ OpStar: "star",
+ OpPlus: "plus",
+ OpQuest: "que",
+ OpRepeat: "rep",
+ OpConcat: "cat",
+ OpAlternate: "alt",
+}
+
+// dumpRegexp writes an encoding of the syntax tree for the regexp re to b.
+// It is used during testing to distinguish between parses that might print
+// the same using re's String method.
+func dumpRegexp(b *strings.Builder, re *Regexp) {
+ if int(re.Op) >= len(opNames) || opNames[re.Op] == "" {
+ fmt.Fprintf(b, "op%d", re.Op)
+ } else {
+ switch re.Op {
+ default:
+ b.WriteString(opNames[re.Op])
+ case OpStar, OpPlus, OpQuest, OpRepeat:
+ if re.Flags&NonGreedy != 0 {
+ b.WriteByte('n')
+ }
+ b.WriteString(opNames[re.Op])
+ case OpLiteral:
+ if len(re.Rune) > 1 {
+ b.WriteString("str")
+ } else {
+ b.WriteString("lit")
+ }
+ if re.Flags&FoldCase != 0 {
+ for _, r := range re.Rune {
+ if unicode.SimpleFold(r) != r {
+ b.WriteString("fold")
+ break
+ }
+ }
+ }
+ }
+ }
+ b.WriteByte('{')
+ switch re.Op {
+ case OpEndText:
+ if re.Flags&WasDollar == 0 {
+ b.WriteString(`\z`)
+ }
+ case OpLiteral:
+ for _, r := range re.Rune {
+ b.WriteRune(r)
+ }
+ case OpConcat, OpAlternate:
+ for _, sub := range re.Sub {
+ dumpRegexp(b, sub)
+ }
+ case OpStar, OpPlus, OpQuest:
+ dumpRegexp(b, re.Sub[0])
+ case OpRepeat:
+ fmt.Fprintf(b, "%d,%d ", re.Min, re.Max)
+ dumpRegexp(b, re.Sub[0])
+ case OpCapture:
+ if re.Name != "" {
+ b.WriteString(re.Name)
+ b.WriteByte(':')
+ }
+ dumpRegexp(b, re.Sub[0])
+ case OpCharClass:
+ sep := ""
+ for i := 0; i < len(re.Rune); i += 2 {
+ b.WriteString(sep)
+ sep = " "
+ lo, hi := re.Rune[i], re.Rune[i+1]
+ if lo == hi {
+ fmt.Fprintf(b, "%#x", lo)
+ } else {
+ fmt.Fprintf(b, "%#x-%#x", lo, hi)
+ }
+ }
+ }
+ b.WriteByte('}')
+}
+
+func mkCharClass(f func(rune) bool) string {
+ re := &Regexp{Op: OpCharClass}
+ lo := rune(-1)
+ for i := rune(0); i <= unicode.MaxRune; i++ {
+ if f(i) {
+ if lo < 0 {
+ lo = i
+ }
+ } else {
+ if lo >= 0 {
+ re.Rune = append(re.Rune, lo, i-1)
+ lo = -1
+ }
+ }
+ }
+ if lo >= 0 {
+ re.Rune = append(re.Rune, lo, unicode.MaxRune)
+ }
+ return dump(re)
+}
+
+func isUpperFold(r rune) bool {
+ if unicode.IsUpper(r) {
+ return true
+ }
+ c := unicode.SimpleFold(r)
+ for c != r {
+ if unicode.IsUpper(c) {
+ return true
+ }
+ c = unicode.SimpleFold(c)
+ }
+ return false
+}
+
+func TestFoldConstants(t *testing.T) {
+ last := rune(-1)
+ for i := rune(0); i <= unicode.MaxRune; i++ {
+ if unicode.SimpleFold(i) == i {
+ continue
+ }
+ if last == -1 && minFold != i {
+ t.Errorf("minFold=%#U should be %#U", minFold, i)
+ }
+ last = i
+ }
+ if maxFold != last {
+ t.Errorf("maxFold=%#U should be %#U", maxFold, last)
+ }
+}
+
+func TestAppendRangeCollapse(t *testing.T) {
+ // AppendRange should collapse each of the new ranges
+ // into the earlier ones (it looks back two ranges), so that
+ // the slice never grows very large.
+ // Note that we are not calling cleanClass.
+ var r []rune
+ for i := rune('A'); i <= 'Z'; i++ {
+ r = appendRange(r, i, i)
+ r = appendRange(r, i+'a'-'A', i+'a'-'A')
+ }
+ if string(r) != "AZaz" {
+ t.Errorf("appendRange interlaced A-Z a-z = %s, want AZaz", string(r))
+ }
+}
+
+var invalidRegexps = []string{
+ `(`,
+ `)`,
+ `(a`,
+ `a)`,
+ `(a))`,
+ `(a|b|`,
+ `a|b|)`,
+ `(a|b|))`,
+ `(a|b`,
+ `a|b)`,
+ `(a|b))`,
+ `[a-z`,
+ `([a-z)`,
+ `[a-z)`,
+ `([a-z]))`,
+ `x{1001}`,
+ `x{9876543210}`,
+ `x{2,1}`,
+ `x{1,9876543210}`,
+ "\xff", // Invalid UTF-8
+ "[\xff]",
+ "[\\\xff]",
+ "\\\xff",
+ `(?P<name>a`,
+ `(?P<name>`,
+ `(?P<name`,
+ `(?P<x y>a)`,
+ `(?P<>a)`,
+ `[a-Z]`,
+ `(?i)[a-Z]`,
+ `\Q\E*`,
+ `a{100000}`, // too much repetition
+ `a{100000,}`, // too much repetition
+ "((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}){2})", // too much repetition
+ strings.Repeat("(", 1000) + strings.Repeat(")", 1000), // too deep
+ strings.Repeat("(?:", 1000) + strings.Repeat(")*", 1000), // too deep
+ "(" + strings.Repeat("(xx?)", 1000) + "){1000}", // too long
+ strings.Repeat("(xx?){1000}", 1000), // too long
+ strings.Repeat(`\pL`, 27000), // too many runes
+}
+
+var onlyPerl = []string{
+ `[a-b-c]`,
+ `\Qabc\E`,
+ `\Q*+?{[\E`,
+ `\Q\\E`,
+ `\Q\\\E`,
+ `\Q\\\\E`,
+ `\Q\\\\\E`,
+ `(?:a)`,
+ `(?P<name>a)`,
+}
+
+var onlyPOSIX = []string{
+ "a++",
+ "a**",
+ "a?*",
+ "a+*",
+ "a{1}*",
+ ".{1}{2}.{3}",
+}
+
+func TestParseInvalidRegexps(t *testing.T) {
+ for _, regexp := range invalidRegexps {
+ if re, err := Parse(regexp, Perl); err == nil {
+ t.Errorf("Parse(%#q, Perl) = %s, should have failed", regexp, dump(re))
+ }
+ if re, err := Parse(regexp, POSIX); err == nil {
+ t.Errorf("Parse(%#q, POSIX) = %s, should have failed", regexp, dump(re))
+ }
+ }
+ for _, regexp := range onlyPerl {
+ if _, err := Parse(regexp, Perl); err != nil {
+ t.Errorf("Parse(%#q, Perl): %v", regexp, err)
+ }
+ if re, err := Parse(regexp, POSIX); err == nil {
+ t.Errorf("Parse(%#q, POSIX) = %s, should have failed", regexp, dump(re))
+ }
+ }
+ for _, regexp := range onlyPOSIX {
+ if re, err := Parse(regexp, Perl); err == nil {
+ t.Errorf("Parse(%#q, Perl) = %s, should have failed", regexp, dump(re))
+ }
+ if _, err := Parse(regexp, POSIX); err != nil {
+ t.Errorf("Parse(%#q, POSIX): %v", regexp, err)
+ }
+ }
+}
+
+func TestToStringEquivalentParse(t *testing.T) {
+ for _, tt := range parseTests {
+ re, err := Parse(tt.Regexp, testFlags)
+ if err != nil {
+ t.Errorf("Parse(%#q): %v", tt.Regexp, err)
+ continue
+ }
+ if tt.Dump == "" {
+ // It parsed. That's all we care about.
+ continue
+ }
+ d := dump(re)
+ if d != tt.Dump {
+ t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump)
+ continue
+ }
+
+ s := re.String()
+ if s != tt.Regexp {
+ // If ToString didn't return the original regexp,
+ // it must have found one with fewer parens.
+ // Unfortunately we can't check the length here, because
+ // ToString produces "\\{" for a literal brace,
+ // but "{" is a shorter equivalent in some contexts.
+ nre, err := Parse(s, testFlags)
+ if err != nil {
+ t.Errorf("Parse(%#q.String() = %#q): %v", tt.Regexp, s, err)
+ continue
+ }
+ nd := dump(nre)
+ if d != nd {
+ t.Errorf("Parse(%#q) -> %#q; %#q vs %#q", tt.Regexp, s, d, nd)
+ }
+
+ ns := nre.String()
+ if s != ns {
+ t.Errorf("Parse(%#q) -> %#q -> %#q", tt.Regexp, s, ns)
+ }
+ }
+ }
+}
diff --git a/src/regexp/syntax/perl_groups.go b/src/regexp/syntax/perl_groups.go
new file mode 100644
index 0000000..effe4e6
--- /dev/null
+++ b/src/regexp/syntax/perl_groups.go
@@ -0,0 +1,134 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
+// make_perl_groups.pl >perl_groups.go
+
+package syntax
+
+var code1 = []rune{ /* \d */
+ 0x30, 0x39,
+}
+
+var code2 = []rune{ /* \s */
+ 0x9, 0xa,
+ 0xc, 0xd,
+ 0x20, 0x20,
+}
+
+var code3 = []rune{ /* \w */
+ 0x30, 0x39,
+ 0x41, 0x5a,
+ 0x5f, 0x5f,
+ 0x61, 0x7a,
+}
+
+var perlGroup = map[string]charGroup{
+ `\d`: {+1, code1},
+ `\D`: {-1, code1},
+ `\s`: {+1, code2},
+ `\S`: {-1, code2},
+ `\w`: {+1, code3},
+ `\W`: {-1, code3},
+}
+var code4 = []rune{ /* [:alnum:] */
+ 0x30, 0x39,
+ 0x41, 0x5a,
+ 0x61, 0x7a,
+}
+
+var code5 = []rune{ /* [:alpha:] */
+ 0x41, 0x5a,
+ 0x61, 0x7a,
+}
+
+var code6 = []rune{ /* [:ascii:] */
+ 0x0, 0x7f,
+}
+
+var code7 = []rune{ /* [:blank:] */
+ 0x9, 0x9,
+ 0x20, 0x20,
+}
+
+var code8 = []rune{ /* [:cntrl:] */
+ 0x0, 0x1f,
+ 0x7f, 0x7f,
+}
+
+var code9 = []rune{ /* [:digit:] */
+ 0x30, 0x39,
+}
+
+var code10 = []rune{ /* [:graph:] */
+ 0x21, 0x7e,
+}
+
+var code11 = []rune{ /* [:lower:] */
+ 0x61, 0x7a,
+}
+
+var code12 = []rune{ /* [:print:] */
+ 0x20, 0x7e,
+}
+
+var code13 = []rune{ /* [:punct:] */
+ 0x21, 0x2f,
+ 0x3a, 0x40,
+ 0x5b, 0x60,
+ 0x7b, 0x7e,
+}
+
+var code14 = []rune{ /* [:space:] */
+ 0x9, 0xd,
+ 0x20, 0x20,
+}
+
+var code15 = []rune{ /* [:upper:] */
+ 0x41, 0x5a,
+}
+
+var code16 = []rune{ /* [:word:] */
+ 0x30, 0x39,
+ 0x41, 0x5a,
+ 0x5f, 0x5f,
+ 0x61, 0x7a,
+}
+
+var code17 = []rune{ /* [:xdigit:] */
+ 0x30, 0x39,
+ 0x41, 0x46,
+ 0x61, 0x66,
+}
+
+var posixGroup = map[string]charGroup{
+ `[:alnum:]`: {+1, code4},
+ `[:^alnum:]`: {-1, code4},
+ `[:alpha:]`: {+1, code5},
+ `[:^alpha:]`: {-1, code5},
+ `[:ascii:]`: {+1, code6},
+ `[:^ascii:]`: {-1, code6},
+ `[:blank:]`: {+1, code7},
+ `[:^blank:]`: {-1, code7},
+ `[:cntrl:]`: {+1, code8},
+ `[:^cntrl:]`: {-1, code8},
+ `[:digit:]`: {+1, code9},
+ `[:^digit:]`: {-1, code9},
+ `[:graph:]`: {+1, code10},
+ `[:^graph:]`: {-1, code10},
+ `[:lower:]`: {+1, code11},
+ `[:^lower:]`: {-1, code11},
+ `[:print:]`: {+1, code12},
+ `[:^print:]`: {-1, code12},
+ `[:punct:]`: {+1, code13},
+ `[:^punct:]`: {-1, code13},
+ `[:space:]`: {+1, code14},
+ `[:^space:]`: {-1, code14},
+ `[:upper:]`: {+1, code15},
+ `[:^upper:]`: {-1, code15},
+ `[:word:]`: {+1, code16},
+ `[:^word:]`: {-1, code16},
+ `[:xdigit:]`: {+1, code17},
+ `[:^xdigit:]`: {-1, code17},
+}
diff --git a/src/regexp/syntax/prog.go b/src/regexp/syntax/prog.go
new file mode 100644
index 0000000..896cdc4
--- /dev/null
+++ b/src/regexp/syntax/prog.go
@@ -0,0 +1,347 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syntax
+
+import (
+ "strconv"
+ "strings"
+ "unicode"
+ "unicode/utf8"
+)
+
+// Compiled program.
+// May not belong in this package, but convenient for now.
+
+// A Prog is a compiled regular expression program.
+type Prog struct {
+ Inst []Inst
+ Start int // index of start instruction
+ NumCap int // number of InstCapture insts in re
+}
+
+// An InstOp is an instruction opcode.
+type InstOp uint8
+
+const (
+ InstAlt InstOp = iota
+ InstAltMatch
+ InstCapture
+ InstEmptyWidth
+ InstMatch
+ InstFail
+ InstNop
+ InstRune
+ InstRune1
+ InstRuneAny
+ InstRuneAnyNotNL
+)
+
+var instOpNames = []string{
+ "InstAlt",
+ "InstAltMatch",
+ "InstCapture",
+ "InstEmptyWidth",
+ "InstMatch",
+ "InstFail",
+ "InstNop",
+ "InstRune",
+ "InstRune1",
+ "InstRuneAny",
+ "InstRuneAnyNotNL",
+}
+
+func (i InstOp) String() string {
+ if uint(i) >= uint(len(instOpNames)) {
+ return ""
+ }
+ return instOpNames[i]
+}
+
+// An EmptyOp specifies a kind or mixture of zero-width assertions.
+type EmptyOp uint8
+
+const (
+ EmptyBeginLine EmptyOp = 1 << iota
+ EmptyEndLine
+ EmptyBeginText
+ EmptyEndText
+ EmptyWordBoundary
+ EmptyNoWordBoundary
+)
+
+// EmptyOpContext returns the zero-width assertions
+// satisfied at the position between the runes r1 and r2.
+// Passing r1 == -1 indicates that the position is
+// at the beginning of the text.
+// Passing r2 == -1 indicates that the position is
+// at the end of the text.
+func EmptyOpContext(r1, r2 rune) EmptyOp {
+ var op EmptyOp = EmptyNoWordBoundary
+ var boundary byte
+ switch {
+ case IsWordChar(r1):
+ boundary = 1
+ case r1 == '\n':
+ op |= EmptyBeginLine
+ case r1 < 0:
+ op |= EmptyBeginText | EmptyBeginLine
+ }
+ switch {
+ case IsWordChar(r2):
+ boundary ^= 1
+ case r2 == '\n':
+ op |= EmptyEndLine
+ case r2 < 0:
+ op |= EmptyEndText | EmptyEndLine
+ }
+ if boundary != 0 { // IsWordChar(r1) != IsWordChar(r2)
+ op ^= (EmptyWordBoundary | EmptyNoWordBoundary)
+ }
+ return op
+}
+
+// IsWordChar reports whether r is considered a “word character”
+// during the evaluation of the \b and \B zero-width assertions.
+// These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
+func IsWordChar(r rune) bool {
+ return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_'
+}
+
+// An Inst is a single instruction in a regular expression program.
+type Inst struct {
+ Op InstOp
+ Out uint32 // all but InstMatch, InstFail
+ Arg uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth
+ Rune []rune
+}
+
+func (p *Prog) String() string {
+ var b strings.Builder
+ dumpProg(&b, p)
+ return b.String()
+}
+
+// skipNop follows any no-op or capturing instructions.
+func (p *Prog) skipNop(pc uint32) *Inst {
+ i := &p.Inst[pc]
+ for i.Op == InstNop || i.Op == InstCapture {
+ i = &p.Inst[i.Out]
+ }
+ return i
+}
+
+// op returns i.Op but merges all the Rune special cases into InstRune
+func (i *Inst) op() InstOp {
+ op := i.Op
+ switch op {
+ case InstRune1, InstRuneAny, InstRuneAnyNotNL:
+ op = InstRune
+ }
+ return op
+}
+
+// Prefix returns a literal string that all matches for the
+// regexp must start with. Complete is true if the prefix
+// is the entire match.
+func (p *Prog) Prefix() (prefix string, complete bool) {
+ i := p.skipNop(uint32(p.Start))
+
+ // Avoid allocation of buffer if prefix is empty.
+ if i.op() != InstRune || len(i.Rune) != 1 {
+ return "", i.Op == InstMatch
+ }
+
+ // Have prefix; gather characters.
+ var buf strings.Builder
+ for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 && i.Rune[0] != utf8.RuneError {
+ buf.WriteRune(i.Rune[0])
+ i = p.skipNop(i.Out)
+ }
+ return buf.String(), i.Op == InstMatch
+}
+
+// StartCond returns the leading empty-width conditions that must
+// be true in any match. It returns ^EmptyOp(0) if no matches are possible.
+func (p *Prog) StartCond() EmptyOp {
+ var flag EmptyOp
+ pc := uint32(p.Start)
+ i := &p.Inst[pc]
+Loop:
+ for {
+ switch i.Op {
+ case InstEmptyWidth:
+ flag |= EmptyOp(i.Arg)
+ case InstFail:
+ return ^EmptyOp(0)
+ case InstCapture, InstNop:
+ // skip
+ default:
+ break Loop
+ }
+ pc = i.Out
+ i = &p.Inst[pc]
+ }
+ return flag
+}
+
+const noMatch = -1
+
+// MatchRune reports whether the instruction matches (and consumes) r.
+// It should only be called when i.Op == InstRune.
+func (i *Inst) MatchRune(r rune) bool {
+ return i.MatchRunePos(r) != noMatch
+}
+
+// MatchRunePos checks whether the instruction matches (and consumes) r.
+// If so, MatchRunePos returns the index of the matching rune pair
+// (or, when len(i.Rune) == 1, rune singleton).
+// If not, MatchRunePos returns -1.
+// MatchRunePos should only be called when i.Op == InstRune.
+func (i *Inst) MatchRunePos(r rune) int {
+ rune := i.Rune
+
+ switch len(rune) {
+ case 0:
+ return noMatch
+
+ case 1:
+ // Special case: single-rune slice is from literal string, not char class.
+ r0 := rune[0]
+ if r == r0 {
+ return 0
+ }
+ if Flags(i.Arg)&FoldCase != 0 {
+ for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
+ if r == r1 {
+ return 0
+ }
+ }
+ }
+ return noMatch
+
+ case 2:
+ if r >= rune[0] && r <= rune[1] {
+ return 0
+ }
+ return noMatch
+
+ case 4, 6, 8:
+ // Linear search for a few pairs.
+ // Should handle ASCII well.
+ for j := 0; j < len(rune); j += 2 {
+ if r < rune[j] {
+ return noMatch
+ }
+ if r <= rune[j+1] {
+ return j / 2
+ }
+ }
+ return noMatch
+ }
+
+ // Otherwise binary search.
+ lo := 0
+ hi := len(rune) / 2
+ for lo < hi {
+ m := lo + (hi-lo)/2
+ if c := rune[2*m]; c <= r {
+ if r <= rune[2*m+1] {
+ return m
+ }
+ lo = m + 1
+ } else {
+ hi = m
+ }
+ }
+ return noMatch
+}
+
+// MatchEmptyWidth reports whether the instruction matches
+// an empty string between the runes before and after.
+// It should only be called when i.Op == InstEmptyWidth.
+func (i *Inst) MatchEmptyWidth(before rune, after rune) bool {
+ switch EmptyOp(i.Arg) {
+ case EmptyBeginLine:
+ return before == '\n' || before == -1
+ case EmptyEndLine:
+ return after == '\n' || after == -1
+ case EmptyBeginText:
+ return before == -1
+ case EmptyEndText:
+ return after == -1
+ case EmptyWordBoundary:
+ return IsWordChar(before) != IsWordChar(after)
+ case EmptyNoWordBoundary:
+ return IsWordChar(before) == IsWordChar(after)
+ }
+ panic("unknown empty width arg")
+}
+
+func (i *Inst) String() string {
+ var b strings.Builder
+ dumpInst(&b, i)
+ return b.String()
+}
+
+func bw(b *strings.Builder, args ...string) {
+ for _, s := range args {
+ b.WriteString(s)
+ }
+}
+
+func dumpProg(b *strings.Builder, p *Prog) {
+ for j := range p.Inst {
+ i := &p.Inst[j]
+ pc := strconv.Itoa(j)
+ if len(pc) < 3 {
+ b.WriteString(" "[len(pc):])
+ }
+ if j == p.Start {
+ pc += "*"
+ }
+ bw(b, pc, "\t")
+ dumpInst(b, i)
+ bw(b, "\n")
+ }
+}
+
+func u32(i uint32) string {
+ return strconv.FormatUint(uint64(i), 10)
+}
+
+func dumpInst(b *strings.Builder, i *Inst) {
+ switch i.Op {
+ case InstAlt:
+ bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg))
+ case InstAltMatch:
+ bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg))
+ case InstCapture:
+ bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out))
+ case InstEmptyWidth:
+ bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out))
+ case InstMatch:
+ bw(b, "match")
+ case InstFail:
+ bw(b, "fail")
+ case InstNop:
+ bw(b, "nop -> ", u32(i.Out))
+ case InstRune:
+ if i.Rune == nil {
+ // shouldn't happen
+ bw(b, "rune <nil>")
+ }
+ bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune)))
+ if Flags(i.Arg)&FoldCase != 0 {
+ bw(b, "/i")
+ }
+ bw(b, " -> ", u32(i.Out))
+ case InstRune1:
+ bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out))
+ case InstRuneAny:
+ bw(b, "any -> ", u32(i.Out))
+ case InstRuneAnyNotNL:
+ bw(b, "anynotnl -> ", u32(i.Out))
+ }
+}
diff --git a/src/regexp/syntax/prog_test.go b/src/regexp/syntax/prog_test.go
new file mode 100644
index 0000000..5603aea
--- /dev/null
+++ b/src/regexp/syntax/prog_test.go
@@ -0,0 +1,129 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syntax
+
+import "testing"
+
+var compileTests = []struct {
+ Regexp string
+ Prog string
+}{
+ {"a", ` 0 fail
+ 1* rune1 "a" -> 2
+ 2 match
+`},
+ {"[A-M][n-z]", ` 0 fail
+ 1* rune "AM" -> 2
+ 2 rune "nz" -> 3
+ 3 match
+`},
+ {"", ` 0 fail
+ 1* nop -> 2
+ 2 match
+`},
+ {"a?", ` 0 fail
+ 1 rune1 "a" -> 3
+ 2* alt -> 1, 3
+ 3 match
+`},
+ {"a??", ` 0 fail
+ 1 rune1 "a" -> 3
+ 2* alt -> 3, 1
+ 3 match
+`},
+ {"a+", ` 0 fail
+ 1* rune1 "a" -> 2
+ 2 alt -> 1, 3
+ 3 match
+`},
+ {"a+?", ` 0 fail
+ 1* rune1 "a" -> 2
+ 2 alt -> 3, 1
+ 3 match
+`},
+ {"a*", ` 0 fail
+ 1 rune1 "a" -> 2
+ 2* alt -> 1, 3
+ 3 match
+`},
+ {"a*?", ` 0 fail
+ 1 rune1 "a" -> 2
+ 2* alt -> 3, 1
+ 3 match
+`},
+ {"a+b+", ` 0 fail
+ 1* rune1 "a" -> 2
+ 2 alt -> 1, 3
+ 3 rune1 "b" -> 4
+ 4 alt -> 3, 5
+ 5 match
+`},
+ {"(a+)(b+)", ` 0 fail
+ 1* cap 2 -> 2
+ 2 rune1 "a" -> 3
+ 3 alt -> 2, 4
+ 4 cap 3 -> 5
+ 5 cap 4 -> 6
+ 6 rune1 "b" -> 7
+ 7 alt -> 6, 8
+ 8 cap 5 -> 9
+ 9 match
+`},
+ {"a+|b+", ` 0 fail
+ 1 rune1 "a" -> 2
+ 2 alt -> 1, 6
+ 3 rune1 "b" -> 4
+ 4 alt -> 3, 6
+ 5* alt -> 1, 3
+ 6 match
+`},
+ {"A[Aa]", ` 0 fail
+ 1* rune1 "A" -> 2
+ 2 rune "A"/i -> 3
+ 3 match
+`},
+ {"(?:(?:^).)", ` 0 fail
+ 1* empty 4 -> 2
+ 2 anynotnl -> 3
+ 3 match
+`},
+ {"(?:|a)+", ` 0 fail
+ 1 nop -> 4
+ 2 rune1 "a" -> 4
+ 3* alt -> 1, 2
+ 4 alt -> 3, 5
+ 5 match
+`},
+ {"(?:|a)*", ` 0 fail
+ 1 nop -> 4
+ 2 rune1 "a" -> 4
+ 3 alt -> 1, 2
+ 4 alt -> 3, 6
+ 5* alt -> 3, 6
+ 6 match
+`},
+}
+
+func TestCompile(t *testing.T) {
+ for _, tt := range compileTests {
+ re, _ := Parse(tt.Regexp, Perl)
+ p, _ := Compile(re)
+ s := p.String()
+ if s != tt.Prog {
+ t.Errorf("compiled %#q:\n--- have\n%s---\n--- want\n%s---", tt.Regexp, s, tt.Prog)
+ }
+ }
+}
+
+func BenchmarkEmptyOpContext(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ var r1 rune = -1
+ for _, r2 := range "foo, bar, baz\nsome input text.\n" {
+ EmptyOpContext(r1, r2)
+ r1 = r2
+ }
+ EmptyOpContext(r1, -1)
+ }
+}
diff --git a/src/regexp/syntax/regexp.go b/src/regexp/syntax/regexp.go
new file mode 100644
index 0000000..3a4d2d2
--- /dev/null
+++ b/src/regexp/syntax/regexp.go
@@ -0,0 +1,320 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syntax
+
+// Note to implementers:
+// In this package, re is always a *Regexp and r is always a rune.
+
+import (
+ "strconv"
+ "strings"
+ "unicode"
+)
+
+// A Regexp is a node in a regular expression syntax tree.
+type Regexp struct {
+ Op Op // operator
+ Flags Flags
+ Sub []*Regexp // subexpressions, if any
+ Sub0 [1]*Regexp // storage for short Sub
+ Rune []rune // matched runes, for OpLiteral, OpCharClass
+ Rune0 [2]rune // storage for short Rune
+ Min, Max int // min, max for OpRepeat
+ Cap int // capturing index, for OpCapture
+ Name string // capturing name, for OpCapture
+}
+
+//go:generate stringer -type Op -trimprefix Op
+
+// An Op is a single regular expression operator.
+type Op uint8
+
+// Operators are listed in precedence order, tightest binding to weakest.
+// Character class operators are listed simplest to most complex
+// (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar).
+
+const (
+ OpNoMatch Op = 1 + iota // matches no strings
+ OpEmptyMatch // matches empty string
+ OpLiteral // matches Runes sequence
+ OpCharClass // matches Runes interpreted as range pair list
+ OpAnyCharNotNL // matches any character except newline
+ OpAnyChar // matches any character
+ OpBeginLine // matches empty string at beginning of line
+ OpEndLine // matches empty string at end of line
+ OpBeginText // matches empty string at beginning of text
+ OpEndText // matches empty string at end of text
+ OpWordBoundary // matches word boundary `\b`
+ OpNoWordBoundary // matches word non-boundary `\B`
+ OpCapture // capturing subexpression with index Cap, optional name Name
+ OpStar // matches Sub[0] zero or more times
+ OpPlus // matches Sub[0] one or more times
+ OpQuest // matches Sub[0] zero or one times
+ OpRepeat // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit)
+ OpConcat // matches concatenation of Subs
+ OpAlternate // matches alternation of Subs
+)
+
+const opPseudo Op = 128 // where pseudo-ops start
+
+// Equal reports whether x and y have identical structure.
+func (x *Regexp) Equal(y *Regexp) bool {
+ if x == nil || y == nil {
+ return x == y
+ }
+ if x.Op != y.Op {
+ return false
+ }
+ switch x.Op {
+ case OpEndText:
+ // The parse flags remember whether this is \z or \Z.
+ if x.Flags&WasDollar != y.Flags&WasDollar {
+ return false
+ }
+
+ case OpLiteral, OpCharClass:
+ if len(x.Rune) != len(y.Rune) {
+ return false
+ }
+ for i, r := range x.Rune {
+ if r != y.Rune[i] {
+ return false
+ }
+ }
+
+ case OpAlternate, OpConcat:
+ if len(x.Sub) != len(y.Sub) {
+ return false
+ }
+ for i, sub := range x.Sub {
+ if !sub.Equal(y.Sub[i]) {
+ return false
+ }
+ }
+
+ case OpStar, OpPlus, OpQuest:
+ if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) {
+ return false
+ }
+
+ case OpRepeat:
+ if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) {
+ return false
+ }
+
+ case OpCapture:
+ if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) {
+ return false
+ }
+ }
+ return true
+}
+
+// writeRegexp writes the Perl syntax for the regular expression re to b.
+func writeRegexp(b *strings.Builder, re *Regexp) {
+ switch re.Op {
+ default:
+ b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
+ case OpNoMatch:
+ b.WriteString(`[^\x00-\x{10FFFF}]`)
+ case OpEmptyMatch:
+ b.WriteString(`(?:)`)
+ case OpLiteral:
+ if re.Flags&FoldCase != 0 {
+ b.WriteString(`(?i:`)
+ }
+ for _, r := range re.Rune {
+ escape(b, r, false)
+ }
+ if re.Flags&FoldCase != 0 {
+ b.WriteString(`)`)
+ }
+ case OpCharClass:
+ if len(re.Rune)%2 != 0 {
+ b.WriteString(`[invalid char class]`)
+ break
+ }
+ b.WriteRune('[')
+ if len(re.Rune) == 0 {
+ b.WriteString(`^\x00-\x{10FFFF}`)
+ } else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune && len(re.Rune) > 2 {
+ // Contains 0 and MaxRune. Probably a negated class.
+ // Print the gaps.
+ b.WriteRune('^')
+ for i := 1; i < len(re.Rune)-1; i += 2 {
+ lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
+ escape(b, lo, lo == '-')
+ if lo != hi {
+ b.WriteRune('-')
+ escape(b, hi, hi == '-')
+ }
+ }
+ } else {
+ for i := 0; i < len(re.Rune); i += 2 {
+ lo, hi := re.Rune[i], re.Rune[i+1]
+ escape(b, lo, lo == '-')
+ if lo != hi {
+ b.WriteRune('-')
+ escape(b, hi, hi == '-')
+ }
+ }
+ }
+ b.WriteRune(']')
+ case OpAnyCharNotNL:
+ b.WriteString(`(?-s:.)`)
+ case OpAnyChar:
+ b.WriteString(`(?s:.)`)
+ case OpBeginLine:
+ b.WriteString(`(?m:^)`)
+ case OpEndLine:
+ b.WriteString(`(?m:$)`)
+ case OpBeginText:
+ b.WriteString(`\A`)
+ case OpEndText:
+ if re.Flags&WasDollar != 0 {
+ b.WriteString(`(?-m:$)`)
+ } else {
+ b.WriteString(`\z`)
+ }
+ case OpWordBoundary:
+ b.WriteString(`\b`)
+ case OpNoWordBoundary:
+ b.WriteString(`\B`)
+ case OpCapture:
+ if re.Name != "" {
+ b.WriteString(`(?P<`)
+ b.WriteString(re.Name)
+ b.WriteRune('>')
+ } else {
+ b.WriteRune('(')
+ }
+ if re.Sub[0].Op != OpEmptyMatch {
+ writeRegexp(b, re.Sub[0])
+ }
+ b.WriteRune(')')
+ case OpStar, OpPlus, OpQuest, OpRepeat:
+ if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
+ b.WriteString(`(?:`)
+ writeRegexp(b, sub)
+ b.WriteString(`)`)
+ } else {
+ writeRegexp(b, sub)
+ }
+ switch re.Op {
+ case OpStar:
+ b.WriteRune('*')
+ case OpPlus:
+ b.WriteRune('+')
+ case OpQuest:
+ b.WriteRune('?')
+ case OpRepeat:
+ b.WriteRune('{')
+ b.WriteString(strconv.Itoa(re.Min))
+ if re.Max != re.Min {
+ b.WriteRune(',')
+ if re.Max >= 0 {
+ b.WriteString(strconv.Itoa(re.Max))
+ }
+ }
+ b.WriteRune('}')
+ }
+ if re.Flags&NonGreedy != 0 {
+ b.WriteRune('?')
+ }
+ case OpConcat:
+ for _, sub := range re.Sub {
+ if sub.Op == OpAlternate {
+ b.WriteString(`(?:`)
+ writeRegexp(b, sub)
+ b.WriteString(`)`)
+ } else {
+ writeRegexp(b, sub)
+ }
+ }
+ case OpAlternate:
+ for i, sub := range re.Sub {
+ if i > 0 {
+ b.WriteRune('|')
+ }
+ writeRegexp(b, sub)
+ }
+ }
+}
+
+func (re *Regexp) String() string {
+ var b strings.Builder
+ writeRegexp(&b, re)
+ return b.String()
+}
+
+const meta = `\.+*?()|[]{}^$`
+
+func escape(b *strings.Builder, r rune, force bool) {
+ if unicode.IsPrint(r) {
+ if strings.ContainsRune(meta, r) || force {
+ b.WriteRune('\\')
+ }
+ b.WriteRune(r)
+ return
+ }
+
+ switch r {
+ case '\a':
+ b.WriteString(`\a`)
+ case '\f':
+ b.WriteString(`\f`)
+ case '\n':
+ b.WriteString(`\n`)
+ case '\r':
+ b.WriteString(`\r`)
+ case '\t':
+ b.WriteString(`\t`)
+ case '\v':
+ b.WriteString(`\v`)
+ default:
+ if r < 0x100 {
+ b.WriteString(`\x`)
+ s := strconv.FormatInt(int64(r), 16)
+ if len(s) == 1 {
+ b.WriteRune('0')
+ }
+ b.WriteString(s)
+ break
+ }
+ b.WriteString(`\x{`)
+ b.WriteString(strconv.FormatInt(int64(r), 16))
+ b.WriteString(`}`)
+ }
+}
+
+// MaxCap walks the regexp to find the maximum capture index.
+func (re *Regexp) MaxCap() int {
+ m := 0
+ if re.Op == OpCapture {
+ m = re.Cap
+ }
+ for _, sub := range re.Sub {
+ if n := sub.MaxCap(); m < n {
+ m = n
+ }
+ }
+ return m
+}
+
+// CapNames walks the regexp to find the names of capturing groups.
+func (re *Regexp) CapNames() []string {
+ names := make([]string, re.MaxCap()+1)
+ re.capNames(names)
+ return names
+}
+
+func (re *Regexp) capNames(names []string) {
+ if re.Op == OpCapture {
+ names[re.Cap] = re.Name
+ }
+ for _, sub := range re.Sub {
+ sub.capNames(names)
+ }
+}
diff --git a/src/regexp/syntax/simplify.go b/src/regexp/syntax/simplify.go
new file mode 100644
index 0000000..e439325
--- /dev/null
+++ b/src/regexp/syntax/simplify.go
@@ -0,0 +1,151 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syntax
+
+// Simplify returns a regexp equivalent to re but without counted repetitions
+// and with various other simplifications, such as rewriting /(?:a+)+/ to /a+/.
+// The resulting regexp will execute correctly but its string representation
+// will not produce the same parse tree, because capturing parentheses
+// may have been duplicated or removed. For example, the simplified form
+// for /(x){1,2}/ is /(x)(x)?/ but both parentheses capture as $1.
+// The returned regexp may share structure with or be the original.
+func (re *Regexp) Simplify() *Regexp {
+ if re == nil {
+ return nil
+ }
+ switch re.Op {
+ case OpCapture, OpConcat, OpAlternate:
+ // Simplify children, building new Regexp if children change.
+ nre := re
+ for i, sub := range re.Sub {
+ nsub := sub.Simplify()
+ if nre == re && nsub != sub {
+ // Start a copy.
+ nre = new(Regexp)
+ *nre = *re
+ nre.Rune = nil
+ nre.Sub = append(nre.Sub0[:0], re.Sub[:i]...)
+ }
+ if nre != re {
+ nre.Sub = append(nre.Sub, nsub)
+ }
+ }
+ return nre
+
+ case OpStar, OpPlus, OpQuest:
+ sub := re.Sub[0].Simplify()
+ return simplify1(re.Op, re.Flags, sub, re)
+
+ case OpRepeat:
+ // Special special case: x{0} matches the empty string
+ // and doesn't even need to consider x.
+ if re.Min == 0 && re.Max == 0 {
+ return &Regexp{Op: OpEmptyMatch}
+ }
+
+ // The fun begins.
+ sub := re.Sub[0].Simplify()
+
+ // x{n,} means at least n matches of x.
+ if re.Max == -1 {
+ // Special case: x{0,} is x*.
+ if re.Min == 0 {
+ return simplify1(OpStar, re.Flags, sub, nil)
+ }
+
+ // Special case: x{1,} is x+.
+ if re.Min == 1 {
+ return simplify1(OpPlus, re.Flags, sub, nil)
+ }
+
+ // General case: x{4,} is xxxx+.
+ nre := &Regexp{Op: OpConcat}
+ nre.Sub = nre.Sub0[:0]
+ for i := 0; i < re.Min-1; i++ {
+ nre.Sub = append(nre.Sub, sub)
+ }
+ nre.Sub = append(nre.Sub, simplify1(OpPlus, re.Flags, sub, nil))
+ return nre
+ }
+
+ // Special case x{0} handled above.
+
+ // Special case: x{1} is just x.
+ if re.Min == 1 && re.Max == 1 {
+ return sub
+ }
+
+ // General case: x{n,m} means n copies of x and m copies of x?
+ // The machine will do less work if we nest the final m copies,
+ // so that x{2,5} = xx(x(x(x)?)?)?
+
+ // Build leading prefix: xx.
+ var prefix *Regexp
+ if re.Min > 0 {
+ prefix = &Regexp{Op: OpConcat}
+ prefix.Sub = prefix.Sub0[:0]
+ for i := 0; i < re.Min; i++ {
+ prefix.Sub = append(prefix.Sub, sub)
+ }
+ }
+
+ // Build and attach suffix: (x(x(x)?)?)?
+ if re.Max > re.Min {
+ suffix := simplify1(OpQuest, re.Flags, sub, nil)
+ for i := re.Min + 1; i < re.Max; i++ {
+ nre2 := &Regexp{Op: OpConcat}
+ nre2.Sub = append(nre2.Sub0[:0], sub, suffix)
+ suffix = simplify1(OpQuest, re.Flags, nre2, nil)
+ }
+ if prefix == nil {
+ return suffix
+ }
+ prefix.Sub = append(prefix.Sub, suffix)
+ }
+ if prefix != nil {
+ return prefix
+ }
+
+ // Some degenerate case like min > max or min < max < 0.
+ // Handle as impossible match.
+ return &Regexp{Op: OpNoMatch}
+ }
+
+ return re
+}
+
+// simplify1 implements Simplify for the unary OpStar,
+// OpPlus, and OpQuest operators. It returns the simple regexp
+// equivalent to
+//
+// Regexp{Op: op, Flags: flags, Sub: {sub}}
+//
+// under the assumption that sub is already simple, and
+// without first allocating that structure. If the regexp
+// to be returned turns out to be equivalent to re, simplify1
+// returns re instead.
+//
+// simplify1 is factored out of Simplify because the implementation
+// for other operators generates these unary expressions.
+// Letting them call simplify1 makes sure the expressions they
+// generate are simple.
+func simplify1(op Op, flags Flags, sub, re *Regexp) *Regexp {
+ // Special case: repeat the empty string as much as
+ // you want, but it's still the empty string.
+ if sub.Op == OpEmptyMatch {
+ return sub
+ }
+ // The operators are idempotent if the flags match.
+ if op == sub.Op && flags&NonGreedy == sub.Flags&NonGreedy {
+ return sub
+ }
+ if re != nil && re.Op == op && re.Flags&NonGreedy == flags&NonGreedy && sub == re.Sub[0] {
+ return re
+ }
+
+ re = &Regexp{Op: op, Flags: flags}
+ re.Sub = append(re.Sub0[:0], sub)
+ return re
+}
diff --git a/src/regexp/syntax/simplify_test.go b/src/regexp/syntax/simplify_test.go
new file mode 100644
index 0000000..9877db3
--- /dev/null
+++ b/src/regexp/syntax/simplify_test.go
@@ -0,0 +1,151 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syntax
+
+import "testing"
+
+var simplifyTests = []struct {
+ Regexp string
+ Simple string
+}{
+ // Already-simple constructs
+ {`a`, `a`},
+ {`ab`, `ab`},
+ {`a|b`, `[a-b]`},
+ {`ab|cd`, `ab|cd`},
+ {`(ab)*`, `(ab)*`},
+ {`(ab)+`, `(ab)+`},
+ {`(ab)?`, `(ab)?`},
+ {`.`, `(?s:.)`},
+ {`^`, `(?m:^)`},
+ {`$`, `(?m:$)`},
+ {`[ac]`, `[ac]`},
+ {`[^ac]`, `[^ac]`},
+
+ // Posix character classes
+ {`[[:alnum:]]`, `[0-9A-Za-z]`},
+ {`[[:alpha:]]`, `[A-Za-z]`},
+ {`[[:blank:]]`, `[\t ]`},
+ {`[[:cntrl:]]`, `[\x00-\x1f\x7f]`},
+ {`[[:digit:]]`, `[0-9]`},
+ {`[[:graph:]]`, `[!-~]`},
+ {`[[:lower:]]`, `[a-z]`},
+ {`[[:print:]]`, `[ -~]`},
+ {`[[:punct:]]`, "[!-/:-@\\[-`\\{-~]"},
+ {`[[:space:]]`, `[\t-\r ]`},
+ {`[[:upper:]]`, `[A-Z]`},
+ {`[[:xdigit:]]`, `[0-9A-Fa-f]`},
+
+ // Perl character classes
+ {`\d`, `[0-9]`},
+ {`\s`, `[\t-\n\f-\r ]`},
+ {`\w`, `[0-9A-Z_a-z]`},
+ {`\D`, `[^0-9]`},
+ {`\S`, `[^\t-\n\f-\r ]`},
+ {`\W`, `[^0-9A-Z_a-z]`},
+ {`[\d]`, `[0-9]`},
+ {`[\s]`, `[\t-\n\f-\r ]`},
+ {`[\w]`, `[0-9A-Z_a-z]`},
+ {`[\D]`, `[^0-9]`},
+ {`[\S]`, `[^\t-\n\f-\r ]`},
+ {`[\W]`, `[^0-9A-Z_a-z]`},
+
+ // Posix repetitions
+ {`a{1}`, `a`},
+ {`a{2}`, `aa`},
+ {`a{5}`, `aaaaa`},
+ {`a{0,1}`, `a?`},
+ // The next three are illegible because Simplify inserts (?:)
+ // parens instead of () parens to avoid creating extra
+ // captured subexpressions. The comments show a version with fewer parens.
+ {`(a){0,2}`, `(?:(a)(a)?)?`}, // (aa?)?
+ {`(a){0,4}`, `(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // (a(a(aa?)?)?)?
+ {`(a){2,6}`, `(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // aa(a(a(aa?)?)?)?
+ {`a{0,2}`, `(?:aa?)?`}, // (aa?)?
+ {`a{0,4}`, `(?:a(?:a(?:aa?)?)?)?`}, // (a(a(aa?)?)?)?
+ {`a{2,6}`, `aa(?:a(?:a(?:aa?)?)?)?`}, // aa(a(a(aa?)?)?)?
+ {`a{0,}`, `a*`},
+ {`a{1,}`, `a+`},
+ {`a{2,}`, `aa+`},
+ {`a{5,}`, `aaaaa+`},
+
+ // Test that operators simplify their arguments.
+ {`(?:a{1,}){1,}`, `a+`},
+ {`(a{1,}b{1,})`, `(a+b+)`},
+ {`a{1,}|b{1,}`, `a+|b+`},
+ {`(?:a{1,})*`, `(?:a+)*`},
+ {`(?:a{1,})+`, `a+`},
+ {`(?:a{1,})?`, `(?:a+)?`},
+ {``, `(?:)`},
+ {`a{0}`, `(?:)`},
+
+ // Character class simplification
+ {`[ab]`, `[a-b]`},
+ {`[a-za-za-z]`, `[a-z]`},
+ {`[A-Za-zA-Za-z]`, `[A-Za-z]`},
+ {`[ABCDEFGH]`, `[A-H]`},
+ {`[AB-CD-EF-GH]`, `[A-H]`},
+ {`[W-ZP-XE-R]`, `[E-Z]`},
+ {`[a-ee-gg-m]`, `[a-m]`},
+ {`[a-ea-ha-m]`, `[a-m]`},
+ {`[a-ma-ha-e]`, `[a-m]`},
+ {`[a-zA-Z0-9 -~]`, `[ -~]`},
+
+ // Empty character classes
+ {`[^[:cntrl:][:^cntrl:]]`, `[^\x00-\x{10FFFF}]`},
+
+ // Full character classes
+ {`[[:cntrl:][:^cntrl:]]`, `(?s:.)`},
+
+ // Unicode case folding.
+ {`(?i)A`, `(?i:A)`},
+ {`(?i)a`, `(?i:A)`},
+ {`(?i)[A]`, `(?i:A)`},
+ {`(?i)[a]`, `(?i:A)`},
+ {`(?i)K`, `(?i:K)`},
+ {`(?i)k`, `(?i:K)`},
+ {`(?i)\x{212a}`, "(?i:K)"},
+ {`(?i)[K]`, "[Kk\u212A]"},
+ {`(?i)[k]`, "[Kk\u212A]"},
+ {`(?i)[\x{212a}]`, "[Kk\u212A]"},
+ {`(?i)[a-z]`, "[A-Za-z\u017F\u212A]"},
+ {`(?i)[\x00-\x{FFFD}]`, "[\\x00-\uFFFD]"},
+ {`(?i)[\x00-\x{10FFFF}]`, `(?s:.)`},
+
+ // Empty string as a regular expression.
+ // The empty string must be preserved inside parens in order
+ // to make submatches work right, so these tests are less
+ // interesting than they might otherwise be. String inserts
+ // explicit (?:) in place of non-parenthesized empty strings,
+ // to make them easier to spot for other parsers.
+ {`(a|b|)`, `([a-b]|(?:))`},
+ {`(|)`, `()`},
+ {`a()`, `a()`},
+ {`(()|())`, `(()|())`},
+ {`(a|)`, `(a|(?:))`},
+ {`ab()cd()`, `ab()cd()`},
+ {`()`, `()`},
+ {`()*`, `()*`},
+ {`()+`, `()+`},
+ {`()?`, `()?`},
+ {`(){0}`, `(?:)`},
+ {`(){1}`, `()`},
+ {`(){1,}`, `()+`},
+ {`(){0,2}`, `(?:()()?)?`},
+}
+
+func TestSimplify(t *testing.T) {
+ for _, tt := range simplifyTests {
+ re, err := Parse(tt.Regexp, MatchNL|Perl&^OneLine)
+ if err != nil {
+ t.Errorf("Parse(%#q) = error %v", tt.Regexp, err)
+ continue
+ }
+ s := re.Simplify().String()
+ if s != tt.Simple {
+ t.Errorf("Simplify(%#q) = %#q, want %#q", tt.Regexp, s, tt.Simple)
+ }
+ }
+}