Adding upstream version 1.20.14.upstream/1.20.14 upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-16 19:23:18 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-16 19:23:18 +0000
commit: 43a123c1ae6613b3efeed291fa552ecd909d3acf (patch)
tree: fd92518b7024bc74031f78a1cf9e454b65e73665 /src/regexp/syntax
parent: Initial commit. (diff)
download: golang-1.20-43a123c1ae6613b3efeed291fa552ecd909d3acf.tar.xz
golang-1.20-43a123c1ae6613b3efeed291fa552ecd909d3acf.zip
12 files changed, 4509 insertions, 0 deletions
diff --git a/src/regexp/syntax/compile.go b/src/regexp/syntax/compile.go
new file mode 100644
index 0000000..c9f9fa0
--- /dev/null
+++ b/src/regexp/syntax/compile.go
@@ -0,0 +1,296 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syntax
+
+import "unicode"
+
+// A patchList is a list of instruction pointers that need to be filled in (patched).
+// Because the pointers haven't been filled in yet, we can reuse their storage
+// to hold the list. It's kind of sleazy, but works well in practice.
+// See https://swtch.com/~rsc/regexp/regexp1.html for inspiration.
+//
+// These aren't really pointers: they're integers, so we can reinterpret them
+// this way without using package unsafe. A value l.head denotes
+// p.inst[l.head>>1].Out (l.head&1==0) or .Arg (l.head&1==1).
+// head == 0 denotes the empty list, okay because we start every program
+// with a fail instruction, so we'll never want to point at its output link.
+type patchList struct {
+	head, tail uint32
+}
+
+func makePatchList(n uint32) patchList {
+	return patchList{n, n}
+}
+
+func (l patchList) patch(p *Prog, val uint32) {
+	head := l.head
+	for head != 0 {
+		i := &p.Inst[head>>1]
+		if head&1 == 0 {
+			head = i.Out
+			i.Out = val
+		} else {
+			head = i.Arg
+			i.Arg = val
+		}
+	}
+}
+
+func (l1 patchList) append(p *Prog, l2 patchList) patchList {
+	if l1.head == 0 {
+		return l2
+	}
+	if l2.head == 0 {
+		return l1
+	}
+
+	i := &p.Inst[l1.tail>>1]
+	if l1.tail&1 == 0 {
+		i.Out = l2.head
+	} else {
+		i.Arg = l2.head
+	}
+	return patchList{l1.head, l2.tail}
+}
+
+// A frag represents a compiled program fragment.
+type frag struct {
+	i        uint32    // index of first instruction
+	out      patchList // where to record end instruction
+	nullable bool      // whether fragment can match empty string
+}
+
+type compiler struct {
+	p *Prog
+}
+
+// Compile compiles the regexp into a program to be executed.
+// The regexp should have been simplified already (returned from re.Simplify).
+func Compile(re *Regexp) (*Prog, error) {
+	var c compiler
+	c.init()
+	f := c.compile(re)
+	f.out.patch(c.p, c.inst(InstMatch).i)
+	c.p.Start = int(f.i)
+	return c.p, nil
+}
+
+func (c *compiler) init() {
+	c.p = new(Prog)
+	c.p.NumCap = 2 // implicit ( and ) for whole match $0
+	c.inst(InstFail)
+}
+
+var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
+var anyRune = []rune{0, unicode.MaxRune}
+
+func (c *compiler) compile(re *Regexp) frag {
+	switch re.Op {
+	case OpNoMatch:
+		return c.fail()
+	case OpEmptyMatch:
+		return c.nop()
+	case OpLiteral:
+		if len(re.Rune) == 0 {
+			return c.nop()
+		}
+		var f frag
+		for j := range re.Rune {
+			f1 := c.rune(re.Rune[j:j+1], re.Flags)
+			if j == 0 {
+				f = f1
+			} else {
+				f = c.cat(f, f1)
+			}
+		}
+		return f
+	case OpCharClass:
+		return c.rune(re.Rune, re.Flags)
+	case OpAnyCharNotNL:
+		return c.rune(anyRuneNotNL, 0)
+	case OpAnyChar:
+		return c.rune(anyRune, 0)
+	case OpBeginLine:
+		return c.empty(EmptyBeginLine)
+	case OpEndLine:
+		return c.empty(EmptyEndLine)
+	case OpBeginText:
+		return c.empty(EmptyBeginText)
+	case OpEndText:
+		return c.empty(EmptyEndText)
+	case OpWordBoundary:
+		return c.empty(EmptyWordBoundary)
+	case OpNoWordBoundary:
+		return c.empty(EmptyNoWordBoundary)
+	case OpCapture:
+		bra := c.cap(uint32(re.Cap << 1))
+		sub := c.compile(re.Sub[0])
+		ket := c.cap(uint32(re.Cap<<1 | 1))
+		return c.cat(c.cat(bra, sub), ket)
+	case OpStar:
+		return c.star(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
+	case OpPlus:
+		return c.plus(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
+	case OpQuest:
+		return c.quest(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
+	case OpConcat:
+		if len(re.Sub) == 0 {
+			return c.nop()
+		}
+		var f frag
+		for i, sub := range re.Sub {
+			if i == 0 {
+				f = c.compile(sub)
+			} else {
+				f = c.cat(f, c.compile(sub))
+			}
+		}
+		return f
+	case OpAlternate:
+		var f frag
+		for _, sub := range re.Sub {
+			f = c.alt(f, c.compile(sub))
+		}
+		return f
+	}
+	panic("regexp: unhandled case in compile")
+}
+
+func (c *compiler) inst(op InstOp) frag {
+	// TODO: impose length limit
+	f := frag{i: uint32(len(c.p.Inst)), nullable: true}
+	c.p.Inst = append(c.p.Inst, Inst{Op: op})
+	return f
+}
+
+func (c *compiler) nop() frag {
+	f := c.inst(InstNop)
+	f.out = makePatchList(f.i << 1)
+	return f
+}
+
+func (c *compiler) fail() frag {
+	return frag{}
+}
+
+func (c *compiler) cap(arg uint32) frag {
+	f := c.inst(InstCapture)
+	f.out = makePatchList(f.i << 1)
+	c.p.Inst[f.i].Arg = arg
+
+	if c.p.NumCap < int(arg)+1 {
+		c.p.NumCap = int(arg) + 1
+	}
+	return f
+}
+
+func (c *compiler) cat(f1, f2 frag) frag {
+	// concat of failure is failure
+	if f1.i == 0 || f2.i == 0 {
+		return frag{}
+	}
+
+	// TODO: elide nop
+
+	f1.out.patch(c.p, f2.i)
+	return frag{f1.i, f2.out, f1.nullable && f2.nullable}
+}
+
+func (c *compiler) alt(f1, f2 frag) frag {
+	// alt of failure is other
+	if f1.i == 0 {
+		return f2
+	}
+	if f2.i == 0 {
+		return f1
+	}
+
+	f := c.inst(InstAlt)
+	i := &c.p.Inst[f.i]
+	i.Out = f1.i
+	i.Arg = f2.i
+	f.out = f1.out.append(c.p, f2.out)
+	f.nullable = f1.nullable || f2.nullable
+	return f
+}
+
+func (c *compiler) quest(f1 frag, nongreedy bool) frag {
+	f := c.inst(InstAlt)
+	i := &c.p.Inst[f.i]
+	if nongreedy {
+		i.Arg = f1.i
+		f.out = makePatchList(f.i << 1)
+	} else {
+		i.Out = f1.i
+		f.out = makePatchList(f.i<<1 | 1)
+	}
+	f.out = f.out.append(c.p, f1.out)
+	return f
+}
+
+// loop returns the fragment for the main loop of a plus or star.
+// For plus, it can be used after changing the entry to f1.i.
+// For star, it can be used directly when f1 can't match an empty string.
+// (When f1 can match an empty string, f1* must be implemented as (f1+)?
+// to get the priority match order correct.)
+func (c *compiler) loop(f1 frag, nongreedy bool) frag {
+	f := c.inst(InstAlt)
+	i := &c.p.Inst[f.i]
+	if nongreedy {
+		i.Arg = f1.i
+		f.out = makePatchList(f.i << 1)
+	} else {
+		i.Out = f1.i
+		f.out = makePatchList(f.i<<1 | 1)
+	}
+	f1.out.patch(c.p, f.i)
+	return f
+}
+
+func (c *compiler) star(f1 frag, nongreedy bool) frag {
+	if f1.nullable {
+		// Use (f1+)? to get priority match order correct.
+		// See golang.org/issue/46123.
+		return c.quest(c.plus(f1, nongreedy), nongreedy)
+	}
+	return c.loop(f1, nongreedy)
+}
+
+func (c *compiler) plus(f1 frag, nongreedy bool) frag {
+	return frag{f1.i, c.loop(f1, nongreedy).out, f1.nullable}
+}
+
+func (c *compiler) empty(op EmptyOp) frag {
+	f := c.inst(InstEmptyWidth)
+	c.p.Inst[f.i].Arg = uint32(op)
+	f.out = makePatchList(f.i << 1)
+	return f
+}
+
+func (c *compiler) rune(r []rune, flags Flags) frag {
+	f := c.inst(InstRune)
+	f.nullable = false
+	i := &c.p.Inst[f.i]
+	i.Rune = r
+	flags &= FoldCase // only relevant flag is FoldCase
+	if len(r) != 1 || unicode.SimpleFold(r[0]) == r[0] {
+		// and sometimes not even that
+		flags &^= FoldCase
+	}
+	i.Arg = uint32(flags)
+	f.out = makePatchList(f.i << 1)
+
+	// Special cases for exec machine.
+	switch {
+	case flags&FoldCase == 0 && (len(r) == 1 || len(r) == 2 && r[0] == r[1]):
+		i.Op = InstRune1
+	case len(r) == 2 && r[0] == 0 && r[1] == unicode.MaxRune:
+		i.Op = InstRuneAny
+	case len(r) == 4 && r[0] == 0 && r[1] == '\n'-1 && r[2] == '\n'+1 && r[3] == unicode.MaxRune:
+		i.Op = InstRuneAnyNotNL
+	}
+
+	return f
+}
diff --git a/src/regexp/syntax/doc.go b/src/regexp/syntax/doc.go
new file mode 100644
index 0000000..f6a4b43
--- /dev/null
+++ b/src/regexp/syntax/doc.go
@@ -0,0 +1,141 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// DO NOT EDIT. This file is generated by mksyntaxgo from the RE2 distribution.
+
+/*
+Package syntax parses regular expressions into parse trees and compiles
+parse trees into programs. Most clients of regular expressions will use the
+facilities of package regexp (such as Compile and Match) instead of this package.
+
+# Syntax
+
+The regular expression syntax understood by this package when parsing with the Perl flag is as follows.
+Parts of the syntax can be disabled by passing alternate flags to Parse.
+
+Single characters:
+
+	.              any character, possibly including newline (flag s=true)
+	[xyz]          character class
+	[^xyz]         negated character class
+	\d             Perl character class
+	\D             negated Perl character class
+	[[:alpha:]]    ASCII character class
+	[[:^alpha:]]   negated ASCII character class
+	\pN            Unicode character class (one-letter name)
+	\p{Greek}      Unicode character class
+	\PN            negated Unicode character class (one-letter name)
+	\P{Greek}      negated Unicode character class
+
+Composites:
+
+	xy             x followed by y
+	x|y            x or y (prefer x)
+
+Repetitions:
+
+	x*             zero or more x, prefer more
+	x+             one or more x, prefer more
+	x?             zero or one x, prefer one
+	x{n,m}         n or n+1 or ... or m x, prefer more
+	x{n,}          n or more x, prefer more
+	x{n}           exactly n x
+	x*?            zero or more x, prefer fewer
+	x+?            one or more x, prefer fewer
+	x??            zero or one x, prefer zero
+	x{n,m}?        n or n+1 or ... or m x, prefer fewer
+	x{n,}?         n or more x, prefer fewer
+	x{n}?          exactly n x
+
+Implementation restriction: The counting forms x{n,m}, x{n,}, and x{n}
+reject forms that create a minimum or maximum repetition count above 1000.
+Unlimited repetitions are not subject to this restriction.
+
+Grouping:
+
+	(re)           numbered capturing group (submatch)
+	(?P<name>re)   named & numbered capturing group (submatch)
+	(?:re)         non-capturing group
+	(?flags)       set flags within current group; non-capturing
+	(?flags:re)    set flags during re; non-capturing
+
+	Flag syntax is xyz (set) or -xyz (clear) or xy-z (set xy, clear z). The flags are:
+
+	i              case-insensitive (default false)
+	m              multi-line mode: ^ and $ match begin/end line in addition to begin/end text (default false)
+	s              let . match \n (default false)
+	U              ungreedy: swap meaning of x* and x*?, x+ and x+?, etc (default false)
+
+Empty strings:
+
+	^              at beginning of text or line (flag m=true)
+	$              at end of text (like \z not \Z) or line (flag m=true)
+	\A             at beginning of text
+	\b             at ASCII word boundary (\w on one side and \W, \A, or \z on the other)
+	\B             not at ASCII word boundary
+	\z             at end of text
+
+Escape sequences:
+
+	\a             bell (== \007)
+	\f             form feed (== \014)
+	\t             horizontal tab (== \011)
+	\n             newline (== \012)
+	\r             carriage return (== \015)
+	\v             vertical tab character (== \013)
+	\*             literal *, for any punctuation character *
+	\123           octal character code (up to three digits)
+	\x7F           hex character code (exactly two digits)
+	\x{10FFFF}     hex character code
+	\Q...\E        literal text ... even if ... has punctuation
+
+Character class elements:
+
+	x              single character
+	A-Z            character range (inclusive)
+	\d             Perl character class
+	[:foo:]        ASCII character class foo
+	\p{Foo}        Unicode character class Foo
+	\pF            Unicode character class F (one-letter name)
+
+Named character classes as character class elements:
+
+	[\d]           digits (== \d)
+	[^\d]          not digits (== \D)
+	[\D]           not digits (== \D)
+	[^\D]          not not digits (== \d)
+	[[:name:]]     named ASCII class inside character class (== [:name:])
+	[^[:name:]]    named ASCII class inside negated character class (== [:^name:])
+	[\p{Name}]     named Unicode property inside character class (== \p{Name})
+	[^\p{Name}]    named Unicode property inside negated character class (== \P{Name})
+
+Perl character classes (all ASCII-only):
+
+	\d             digits (== [0-9])
+	\D             not digits (== [^0-9])
+	\s             whitespace (== [\t\n\f\r ])
+	\S             not whitespace (== [^\t\n\f\r ])
+	\w             word characters (== [0-9A-Za-z_])
+	\W             not word characters (== [^0-9A-Za-z_])
+
+ASCII character classes:
+
+	[[:alnum:]]    alphanumeric (== [0-9A-Za-z])
+	[[:alpha:]]    alphabetic (== [A-Za-z])
+	[[:ascii:]]    ASCII (== [\x00-\x7F])
+	[[:blank:]]    blank (== [\t ])
+	[[:cntrl:]]    control (== [\x00-\x1F\x7F])
+	[[:digit:]]    digits (== [0-9])
+	[[:graph:]]    graphical (== [!-~] == [A-Za-z0-9!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~])
+	[[:lower:]]    lower case (== [a-z])
+	[[:print:]]    printable (== [ -~] == [ [:graph:]])
+	[[:punct:]]    punctuation (== [!-/:-@[-`{-~])
+	[[:space:]]    whitespace (== [\t\n\v\f\r ])
+	[[:upper:]]    upper case (== [A-Z])
+	[[:word:]]     word characters (== [0-9A-Za-z_])
+	[[:xdigit:]]   hex digit (== [0-9A-Fa-f])
+
+Unicode character classes are those in unicode.Categories and unicode.Scripts.
+*/
+package syntax
diff --git a/src/regexp/syntax/make_perl_groups.pl b/src/regexp/syntax/make_perl_groups.pl
new file mode 100755
index 0000000..80a2c9a
--- /dev/null
+++ b/src/regexp/syntax/make_perl_groups.pl
@@ -0,0 +1,113 @@
+#!/usr/bin/perl
+# Copyright 2008 The Go Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Modified version of RE2's make_perl_groups.pl.
+
+# Generate table entries giving character ranges
+# for POSIX/Perl character classes.  Rather than
+# figure out what the definition is, it is easier to ask
+# Perl about each letter from 0-128 and write down
+# its answer.
+
+@posixclasses = (
+	"[:alnum:]",
+	"[:alpha:]",
+	"[:ascii:]",
+	"[:blank:]",
+	"[:cntrl:]",
+	"[:digit:]",
+	"[:graph:]",
+	"[:lower:]",
+	"[:print:]",
+	"[:punct:]",
+	"[:space:]",
+	"[:upper:]",
+	"[:word:]",
+	"[:xdigit:]",
+);
+
+@perlclasses = (
+	"\\d",
+	"\\s",
+	"\\w",
+);
+
+%overrides = (
+	# Prior to Perl 5.18, \s did not match vertical tab.
+	# RE2 preserves that original behaviour.
+	"\\s:11" => 0,
+);
+
+sub ComputeClass($) {
+  my @ranges;
+  my ($class) = @_;
+  my $regexp = "[$class]";
+  my $start = -1;
+  for (my $i=0; $i<=129; $i++) {
+    if ($i == 129) { $i = 256; }
+    if ($i <= 128 && ($overrides{"$class:$i"} // chr($i) =~ $regexp)) {
+      if ($start < 0) {
+        $start = $i;
+      }
+    } else {
+      if ($start >= 0) {
+        push @ranges, [$start, $i-1];
+      }
+      $start = -1;
+    }
+  }
+  return @ranges;
+}
+
+sub PrintClass($$@) {
+  my ($cname, $name, @ranges) = @_;
+  print "var code$cname = []rune{  /* $name */\n";
+  for (my $i=0; $i<@ranges; $i++) {
+    my @a = @{$ranges[$i]};
+    printf "\t0x%x, 0x%x,\n", $a[0], $a[1];
+  }
+  print "}\n\n";
+  my $n = @ranges;
+  $negname = $name;
+  if ($negname =~ /:/) {
+    $negname =~ s/:/:^/;
+  } else {
+    $negname =~ y/a-z/A-Z/;
+  }
+  return "\t`$name`: {+1, code$cname},\n" .
+  	"\t`$negname`: {-1, code$cname},\n";
+}
+
+my $gen = 0;
+
+sub PrintClasses($@) {
+  my ($cname, @classes) = @_;
+  my @entries;
+  foreach my $cl (@classes) {
+    my @ranges = ComputeClass($cl);
+    push @entries, PrintClass(++$gen, $cl, @ranges);
+  }
+  print "var ${cname}Group = map[string]charGroup{\n";
+  foreach my $e (@entries) {
+    print $e;
+  }
+  print "}\n";
+  my $count = @entries;
+}
+
+print <<EOF;
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
+// make_perl_groups.pl >perl_groups.go
+
+package syntax
+
+EOF
+
+PrintClasses("perl", @perlclasses);
+PrintClasses("posix", @posixclasses);
diff --git a/src/regexp/syntax/op_string.go b/src/regexp/syntax/op_string.go
new file mode 100644
index 0000000..3952b2b
--- /dev/null
+++ b/src/regexp/syntax/op_string.go
@@ -0,0 +1,26 @@
+// Code generated by "stringer -type Op -trimprefix Op"; DO NOT EDIT.
+
+package syntax
+
+import "strconv"
+
+const (
+	_Op_name_0 = "NoMatchEmptyMatchLiteralCharClassAnyCharNotNLAnyCharBeginLineEndLineBeginTextEndTextWordBoundaryNoWordBoundaryCaptureStarPlusQuestRepeatConcatAlternate"
+	_Op_name_1 = "opPseudo"
+)
+
+var (
+	_Op_index_0 = [...]uint8{0, 7, 17, 24, 33, 45, 52, 61, 68, 77, 84, 96, 110, 117, 121, 125, 130, 136, 142, 151}
+)
+
+func (i Op) String() string {
+	switch {
+	case 1 <= i && i <= 19:
+		i -= 1
+		return _Op_name_0[_Op_index_0[i]:_Op_index_0[i+1]]
+	case i == 128:
+		return _Op_name_1
+	default:
+		return "Op(" + strconv.FormatInt(int64(i), 10) + ")"
+	}
+}
diff --git a/src/regexp/syntax/parse.go b/src/regexp/syntax/parse.go
new file mode 100644
index 0000000..accee9a
--- /dev/null
+++ b/src/regexp/syntax/parse.go
@@ -0,0 +1,2115 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syntax
+
+import (
+	"sort"
+	"strings"
+	"unicode"
+	"unicode/utf8"
+)
+
+// An Error describes a failure to parse a regular expression
+// and gives the offending expression.
+type Error struct {
+	Code ErrorCode
+	Expr string
+}
+
+func (e *Error) Error() string {
+	return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`"
+}
+
+// An ErrorCode describes a failure to parse a regular expression.
+type ErrorCode string
+
+const (
+	// Unexpected error
+	ErrInternalError ErrorCode = "regexp/syntax: internal error"
+
+	// Parse errors
+	ErrInvalidCharClass      ErrorCode = "invalid character class"
+	ErrInvalidCharRange      ErrorCode = "invalid character class range"
+	ErrInvalidEscape         ErrorCode = "invalid escape sequence"
+	ErrInvalidNamedCapture   ErrorCode = "invalid named capture"
+	ErrInvalidPerlOp         ErrorCode = "invalid or unsupported Perl syntax"
+	ErrInvalidRepeatOp       ErrorCode = "invalid nested repetition operator"
+	ErrInvalidRepeatSize     ErrorCode = "invalid repeat count"
+	ErrInvalidUTF8           ErrorCode = "invalid UTF-8"
+	ErrMissingBracket        ErrorCode = "missing closing ]"
+	ErrMissingParen          ErrorCode = "missing closing )"
+	ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator"
+	ErrTrailingBackslash     ErrorCode = "trailing backslash at end of expression"
+	ErrUnexpectedParen       ErrorCode = "unexpected )"
+	ErrNestingDepth          ErrorCode = "expression nests too deeply"
+	ErrLarge                 ErrorCode = "expression too large"
+)
+
+func (e ErrorCode) String() string {
+	return string(e)
+}
+
+// Flags control the behavior of the parser and record information about regexp context.
+type Flags uint16
+
+const (
+	FoldCase      Flags = 1 << iota // case-insensitive match
+	Literal                         // treat pattern as literal string
+	ClassNL                         // allow character classes like [^a-z] and [[:space:]] to match newline
+	DotNL                           // allow . to match newline
+	OneLine                         // treat ^ and $ as only matching at beginning and end of text
+	NonGreedy                       // make repetition operators default to non-greedy
+	PerlX                           // allow Perl extensions
+	UnicodeGroups                   // allow \p{Han}, \P{Han} for Unicode group and negation
+	WasDollar                       // regexp OpEndText was $, not \z
+	Simple                          // regexp contains no counted repetition
+
+	MatchNL = ClassNL | DotNL
+
+	Perl        = ClassNL | OneLine | PerlX | UnicodeGroups // as close to Perl as possible
+	POSIX Flags = 0                                         // POSIX syntax
+)
+
+// Pseudo-ops for parsing stack.
+const (
+	opLeftParen = opPseudo + iota
+	opVerticalBar
+)
+
+// maxHeight is the maximum height of a regexp parse tree.
+// It is somewhat arbitrarily chosen, but the idea is to be large enough
+// that no one will actually hit in real use but at the same time small enough
+// that recursion on the Regexp tree will not hit the 1GB Go stack limit.
+// The maximum amount of stack for a single recursive frame is probably
+// closer to 1kB, so this could potentially be raised, but it seems unlikely
+// that people have regexps nested even this deeply.
+// We ran a test on Google's C++ code base and turned up only
+// a single use case with depth > 100; it had depth 128.
+// Using depth 1000 should be plenty of margin.
+// As an optimization, we don't even bother calculating heights
+// until we've allocated at least maxHeight Regexp structures.
+const maxHeight = 1000
+
+// maxSize is the maximum size of a compiled regexp in Insts.
+// It too is somewhat arbitrarily chosen, but the idea is to be large enough
+// to allow significant regexps while at the same time small enough that
+// the compiled form will not take up too much memory.
+// 128 MB is enough for a 3.3 million Inst structures, which roughly
+// corresponds to a 3.3 MB regexp.
+const (
+	maxSize  = 128 << 20 / instSize
+	instSize = 5 * 8 // byte, 2 uint32, slice is 5 64-bit words
+)
+
+// maxRunes is the maximum number of runes allowed in a regexp tree
+// counting the runes in all the nodes.
+// Ignoring character classes p.numRunes is always less than the length of the regexp.
+// Character classes can make it much larger: each \pL adds 1292 runes.
+// 128 MB is enough for 32M runes, which is over 26k \pL instances.
+// Note that repetitions do not make copies of the rune slices,
+// so \pL{1000} is only one rune slice, not 1000.
+// We could keep a cache of character classes we've seen,
+// so that all the \pL we see use the same rune list,
+// but that doesn't remove the problem entirely:
+// consider something like [\pL01234][\pL01235][\pL01236]...[\pL^&*()].
+// And because the Rune slice is exposed directly in the Regexp,
+// there is not an opportunity to change the representation to allow
+// partial sharing between different character classes.
+// So the limit is the best we can do.
+const (
+	maxRunes = 128 << 20 / runeSize
+	runeSize = 4 // rune is int32
+)
+
+type parser struct {
+	flags       Flags     // parse mode flags
+	stack       []*Regexp // stack of parsed expressions
+	free        *Regexp
+	numCap      int // number of capturing groups seen
+	wholeRegexp string
+	tmpClass    []rune            // temporary char class work space
+	numRegexp   int               // number of regexps allocated
+	numRunes    int               // number of runes in char classes
+	repeats     int64             // product of all repetitions seen
+	height      map[*Regexp]int   // regexp height, for height limit check
+	size        map[*Regexp]int64 // regexp compiled size, for size limit check
+}
+
+func (p *parser) newRegexp(op Op) *Regexp {
+	re := p.free
+	if re != nil {
+		p.free = re.Sub0[0]
+		*re = Regexp{}
+	} else {
+		re = new(Regexp)
+		p.numRegexp++
+	}
+	re.Op = op
+	return re
+}
+
+func (p *parser) reuse(re *Regexp) {
+	if p.height != nil {
+		delete(p.height, re)
+	}
+	re.Sub0[0] = p.free
+	p.free = re
+}
+
+func (p *parser) checkLimits(re *Regexp) {
+	if p.numRunes > maxRunes {
+		panic(ErrLarge)
+	}
+	p.checkSize(re)
+	p.checkHeight(re)
+}
+
+func (p *parser) checkSize(re *Regexp) {
+	if p.size == nil {
+		// We haven't started tracking size yet.
+		// Do a relatively cheap check to see if we need to start.
+		// Maintain the product of all the repeats we've seen
+		// and don't track if the total number of regexp nodes
+		// we've seen times the repeat product is in budget.
+		if p.repeats == 0 {
+			p.repeats = 1
+		}
+		if re.Op == OpRepeat {
+			n := re.Max
+			if n == -1 {
+				n = re.Min
+			}
+			if n <= 0 {
+				n = 1
+			}
+			if int64(n) > maxSize/p.repeats {
+				p.repeats = maxSize
+			} else {
+				p.repeats *= int64(n)
+			}
+		}
+		if int64(p.numRegexp) < maxSize/p.repeats {
+			return
+		}
+
+		// We need to start tracking size.
+		// Make the map and belatedly populate it
+		// with info about everything we've constructed so far.
+		p.size = make(map[*Regexp]int64)
+		for _, re := range p.stack {
+			p.checkSize(re)
+		}
+	}
+
+	if p.calcSize(re, true) > maxSize {
+		panic(ErrLarge)
+	}
+}
+
+func (p *parser) calcSize(re *Regexp, force bool) int64 {
+	if !force {
+		if size, ok := p.size[re]; ok {
+			return size
+		}
+	}
+
+	var size int64
+	switch re.Op {
+	case OpLiteral:
+		size = int64(len(re.Rune))
+	case OpCapture, OpStar:
+		// star can be 1+ or 2+; assume 2 pessimistically
+		size = 2 + p.calcSize(re.Sub[0], false)
+	case OpPlus, OpQuest:
+		size = 1 + p.calcSize(re.Sub[0], false)
+	case OpConcat:
+		for _, sub := range re.Sub {
+			size += p.calcSize(sub, false)
+		}
+	case OpAlternate:
+		for _, sub := range re.Sub {
+			size += p.calcSize(sub, false)
+		}
+		if len(re.Sub) > 1 {
+			size += int64(len(re.Sub)) - 1
+		}
+	case OpRepeat:
+		sub := p.calcSize(re.Sub[0], false)
+		if re.Max == -1 {
+			if re.Min == 0 {
+				size = 2 + sub // x*
+			} else {
+				size = 1 + int64(re.Min)*sub // xxx+
+			}
+			break
+		}
+		// x{2,5} = xx(x(x(x)?)?)?
+		size = int64(re.Max)*sub + int64(re.Max-re.Min)
+	}
+
+	if size < 1 {
+		size = 1
+	}
+	p.size[re] = size
+	return size
+}
+
+func (p *parser) checkHeight(re *Regexp) {
+	if p.numRegexp < maxHeight {
+		return
+	}
+	if p.height == nil {
+		p.height = make(map[*Regexp]int)
+		for _, re := range p.stack {
+			p.checkHeight(re)
+		}
+	}
+	if p.calcHeight(re, true) > maxHeight {
+		panic(ErrNestingDepth)
+	}
+}
+
+func (p *parser) calcHeight(re *Regexp, force bool) int {
+	if !force {
+		if h, ok := p.height[re]; ok {
+			return h
+		}
+	}
+	h := 1
+	for _, sub := range re.Sub {
+		hsub := p.calcHeight(sub, false)
+		if h < 1+hsub {
+			h = 1 + hsub
+		}
+	}
+	p.height[re] = h
+	return h
+}
+
+// Parse stack manipulation.
+
+// push pushes the regexp re onto the parse stack and returns the regexp.
+func (p *parser) push(re *Regexp) *Regexp {
+	p.numRunes += len(re.Rune)
+	if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] {
+		// Single rune.
+		if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) {
+			return nil
+		}
+		re.Op = OpLiteral
+		re.Rune = re.Rune[:1]
+		re.Flags = p.flags &^ FoldCase
+	} else if re.Op == OpCharClass && len(re.Rune) == 4 &&
+		re.Rune[0] == re.Rune[1] && re.Rune[2] == re.Rune[3] &&
+		unicode.SimpleFold(re.Rune[0]) == re.Rune[2] &&
+		unicode.SimpleFold(re.Rune[2]) == re.Rune[0] ||
+		re.Op == OpCharClass && len(re.Rune) == 2 &&
+			re.Rune[0]+1 == re.Rune[1] &&
+			unicode.SimpleFold(re.Rune[0]) == re.Rune[1] &&
+			unicode.SimpleFold(re.Rune[1]) == re.Rune[0] {
+		// Case-insensitive rune like [Aa] or [Δδ].
+		if p.maybeConcat(re.Rune[0], p.flags|FoldCase) {
+			return nil
+		}
+
+		// Rewrite as (case-insensitive) literal.
+		re.Op = OpLiteral
+		re.Rune = re.Rune[:1]
+		re.Flags = p.flags | FoldCase
+	} else {
+		// Incremental concatenation.
+		p.maybeConcat(-1, 0)
+	}
+
+	p.stack = append(p.stack, re)
+	p.checkLimits(re)
+	return re
+}
+
+// maybeConcat implements incremental concatenation
+// of literal runes into string nodes. The parser calls this
+// before each push, so only the top fragment of the stack
+// might need processing. Since this is called before a push,
+// the topmost literal is no longer subject to operators like *
+// (Otherwise ab* would turn into (ab)*.)
+// If r >= 0 and there's a node left over, maybeConcat uses it
+// to push r with the given flags.
+// maybeConcat reports whether r was pushed.
+func (p *parser) maybeConcat(r rune, flags Flags) bool {
+	n := len(p.stack)
+	if n < 2 {
+		return false
+	}
+
+	re1 := p.stack[n-1]
+	re2 := p.stack[n-2]
+	if re1.Op != OpLiteral || re2.Op != OpLiteral || re1.Flags&FoldCase != re2.Flags&FoldCase {
+		return false
+	}
+
+	// Push re1 into re2.
+	re2.Rune = append(re2.Rune, re1.Rune...)
+
+	// Reuse re1 if possible.
+	if r >= 0 {
+		re1.Rune = re1.Rune0[:1]
+		re1.Rune[0] = r
+		re1.Flags = flags
+		return true
+	}
+
+	p.stack = p.stack[:n-1]
+	p.reuse(re1)
+	return false // did not push r
+}
+
+// literal pushes a literal regexp for the rune r on the stack.
+func (p *parser) literal(r rune) {
+	re := p.newRegexp(OpLiteral)
+	re.Flags = p.flags
+	if p.flags&FoldCase != 0 {
+		r = minFoldRune(r)
+	}
+	re.Rune0[0] = r
+	re.Rune = re.Rune0[:1]
+	p.push(re)
+}
+
+// minFoldRune returns the minimum rune fold-equivalent to r.
+func minFoldRune(r rune) rune {
+	if r < minFold || r > maxFold {
+		return r
+	}
+	min := r
+	r0 := r
+	for r = unicode.SimpleFold(r); r != r0; r = unicode.SimpleFold(r) {
+		if min > r {
+			min = r
+		}
+	}
+	return min
+}
+
+// op pushes a regexp with the given op onto the stack
+// and returns that regexp.
+func (p *parser) op(op Op) *Regexp {
+	re := p.newRegexp(op)
+	re.Flags = p.flags
+	return p.push(re)
+}
+
+// repeat replaces the top stack element with itself repeated according to op, min, max.
+// before is the regexp suffix starting at the repetition operator.
+// after is the regexp suffix following after the repetition operator.
+// repeat returns an updated 'after' and an error, if any.
+func (p *parser) repeat(op Op, min, max int, before, after, lastRepeat string) (string, error) {
+	flags := p.flags
+	if p.flags&PerlX != 0 {
+		if len(after) > 0 && after[0] == '?' {
+			after = after[1:]
+			flags ^= NonGreedy
+		}
+		if lastRepeat != "" {
+			// In Perl it is not allowed to stack repetition operators:
+			// a** is a syntax error, not a doubled star, and a++ means
+			// something else entirely, which we don't support!
+			return "", &Error{ErrInvalidRepeatOp, lastRepeat[:len(lastRepeat)-len(after)]}
+		}
+	}
+	n := len(p.stack)
+	if n == 0 {
+		return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}
+	}
+	sub := p.stack[n-1]
+	if sub.Op >= opPseudo {
+		return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}
+	}
+
+	re := p.newRegexp(op)
+	re.Min = min
+	re.Max = max
+	re.Flags = flags
+	re.Sub = re.Sub0[:1]
+	re.Sub[0] = sub
+	p.stack[n-1] = re
+	p.checkLimits(re)
+
+	if op == OpRepeat && (min >= 2 || max >= 2) && !repeatIsValid(re, 1000) {
+		return "", &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]}
+	}
+
+	return after, nil
+}
+
+// repeatIsValid reports whether the repetition re is valid.
+// Valid means that the combination of the top-level repetition
+// and any inner repetitions does not exceed n copies of the
+// innermost thing.
+// This function rewalks the regexp tree and is called for every repetition,
+// so we have to worry about inducing quadratic behavior in the parser.
+// We avoid this by only calling repeatIsValid when min or max >= 2.
+// In that case the depth of any >= 2 nesting can only get to 9 without
+// triggering a parse error, so each subtree can only be rewalked 9 times.
+func repeatIsValid(re *Regexp, n int) bool {
+	if re.Op == OpRepeat {
+		m := re.Max
+		if m == 0 {
+			return true
+		}
+		if m < 0 {
+			m = re.Min
+		}
+		if m > n {
+			return false
+		}
+		if m > 0 {
+			n /= m
+		}
+	}
+	for _, sub := range re.Sub {
+		if !repeatIsValid(sub, n) {
+			return false
+		}
+	}
+	return true
+}
+
+// concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation.
+func (p *parser) concat() *Regexp {
+	p.maybeConcat(-1, 0)
+
+	// Scan down to find pseudo-operator | or (.
+	i := len(p.stack)
+	for i > 0 && p.stack[i-1].Op < opPseudo {
+		i--
+	}
+	subs := p.stack[i:]
+	p.stack = p.stack[:i]
+
+	// Empty concatenation is special case.
+	if len(subs) == 0 {
+		return p.push(p.newRegexp(OpEmptyMatch))
+	}
+
+	return p.push(p.collapse(subs, OpConcat))
+}
+
+// alternate replaces the top of the stack (above the topmost '(') with its alternation.
+func (p *parser) alternate() *Regexp {
+	// Scan down to find pseudo-operator (.
+	// There are no | above (.
+	i := len(p.stack)
+	for i > 0 && p.stack[i-1].Op < opPseudo {
+		i--
+	}
+	subs := p.stack[i:]
+	p.stack = p.stack[:i]
+
+	// Make sure top class is clean.
+	// All the others already are (see swapVerticalBar).
+	if len(subs) > 0 {
+		cleanAlt(subs[len(subs)-1])
+	}
+
+	// Empty alternate is special case
+	// (shouldn't happen but easy to handle).
+	if len(subs) == 0 {
+		return p.push(p.newRegexp(OpNoMatch))
+	}
+
+	return p.push(p.collapse(subs, OpAlternate))
+}
+
+// cleanAlt cleans re for eventual inclusion in an alternation.
+func cleanAlt(re *Regexp) {
+	switch re.Op {
+	case OpCharClass:
+		re.Rune = cleanClass(&re.Rune)
+		if len(re.Rune) == 2 && re.Rune[0] == 0 && re.Rune[1] == unicode.MaxRune {
+			re.Rune = nil
+			re.Op = OpAnyChar
+			return
+		}
+		if len(re.Rune) == 4 && re.Rune[0] == 0 && re.Rune[1] == '\n'-1 && re.Rune[2] == '\n'+1 && re.Rune[3] == unicode.MaxRune {
+			re.Rune = nil
+			re.Op = OpAnyCharNotNL
+			return
+		}
+		if cap(re.Rune)-len(re.Rune) > 100 {
+			// re.Rune will not grow any more.
+			// Make a copy or inline to reclaim storage.
+			re.Rune = append(re.Rune0[:0], re.Rune...)
+		}
+	}
+}
+
+// collapse returns the result of applying op to sub.
+// If sub contains op nodes, they all get hoisted up
+// so that there is never a concat of a concat or an
+// alternate of an alternate.
+func (p *parser) collapse(subs []*Regexp, op Op) *Regexp {
+	if len(subs) == 1 {
+		return subs[0]
+	}
+	re := p.newRegexp(op)
+	re.Sub = re.Sub0[:0]
+	for _, sub := range subs {
+		if sub.Op == op {
+			re.Sub = append(re.Sub, sub.Sub...)
+			p.reuse(sub)
+		} else {
+			re.Sub = append(re.Sub, sub)
+		}
+	}
+	if op == OpAlternate {
+		re.Sub = p.factor(re.Sub)
+		if len(re.Sub) == 1 {
+			old := re
+			re = re.Sub[0]
+			p.reuse(old)
+		}
+	}
+	return re
+}
+
+// factor factors common prefixes from the alternation list sub.
+// It returns a replacement list that reuses the same storage and
+// frees (passes to p.reuse) any removed *Regexps.
+//
+// For example,
+//
+//	ABC|ABD|AEF|BCX|BCY
+//
+// simplifies by literal prefix extraction to
+//
+//	A(B(C|D)|EF)|BC(X|Y)
+//
+// which simplifies by character class introduction to
+//
+//	A(B[CD]|EF)|BC[XY]
+func (p *parser) factor(sub []*Regexp) []*Regexp {
+	if len(sub) < 2 {
+		return sub
+	}
+
+	// Round 1: Factor out common literal prefixes.
+	var str []rune
+	var strflags Flags
+	start := 0
+	out := sub[:0]
+	for i := 0; i <= len(sub); i++ {
+		// Invariant: the Regexps that were in sub[0:start] have been
+		// used or marked for reuse, and the slice space has been reused
+		// for out (len(out) <= start).
+		//
+		// Invariant: sub[start:i] consists of regexps that all begin
+		// with str as modified by strflags.
+		var istr []rune
+		var iflags Flags
+		if i < len(sub) {
+			istr, iflags = p.leadingString(sub[i])
+			if iflags == strflags {
+				same := 0
+				for same < len(str) && same < len(istr) && str[same] == istr[same] {
+					same++
+				}
+				if same > 0 {
+					// Matches at least one rune in current range.
+					// Keep going around.
+					str = str[:same]
+					continue
+				}
+			}
+		}
+
+		// Found end of a run with common leading literal string:
+		// sub[start:i] all begin with str[0:len(str)], but sub[i]
+		// does not even begin with str[0].
+		//
+		// Factor out common string and append factored expression to out.
+		if i == start {
+			// Nothing to do - run of length 0.
+		} else if i == start+1 {
+			// Just one: don't bother factoring.
+			out = append(out, sub[start])
+		} else {
+			// Construct factored form: prefix(suffix1|suffix2|...)
+			prefix := p.newRegexp(OpLiteral)
+			prefix.Flags = strflags
+			prefix.Rune = append(prefix.Rune[:0], str...)
+
+			for j := start; j < i; j++ {
+				sub[j] = p.removeLeadingString(sub[j], len(str))
+				p.checkLimits(sub[j])
+			}
+			suffix := p.collapse(sub[start:i], OpAlternate) // recurse
+
+			re := p.newRegexp(OpConcat)
+			re.Sub = append(re.Sub[:0], prefix, suffix)
+			out = append(out, re)
+		}
+
+		// Prepare for next iteration.
+		start = i
+		str = istr
+		strflags = iflags
+	}
+	sub = out
+
+	// Round 2: Factor out common simple prefixes,
+	// just the first piece of each concatenation.
+	// This will be good enough a lot of the time.
+	//
+	// Complex subexpressions (e.g. involving quantifiers)
+	// are not safe to factor because that collapses their
+	// distinct paths through the automaton, which affects
+	// correctness in some cases.
+	start = 0
+	out = sub[:0]
+	var first *Regexp
+	for i := 0; i <= len(sub); i++ {
+		// Invariant: the Regexps that were in sub[0:start] have been
+		// used or marked for reuse, and the slice space has been reused
+		// for out (len(out) <= start).
+		//
+		// Invariant: sub[start:i] consists of regexps that all begin with ifirst.
+		var ifirst *Regexp
+		if i < len(sub) {
+			ifirst = p.leadingRegexp(sub[i])
+			if first != nil && first.Equal(ifirst) &&
+				// first must be a character class OR a fixed repeat of a character class.
+				(isCharClass(first) || (first.Op == OpRepeat && first.Min == first.Max && isCharClass(first.Sub[0]))) {
+				continue
+			}
+		}
+
+		// Found end of a run with common leading regexp:
+		// sub[start:i] all begin with first but sub[i] does not.
+		//
+		// Factor out common regexp and append factored expression to out.
+		if i == start {
+			// Nothing to do - run of length 0.
+		} else if i == start+1 {
+			// Just one: don't bother factoring.
+			out = append(out, sub[start])
+		} else {
+			// Construct factored form: prefix(suffix1|suffix2|...)
+			prefix := first
+			for j := start; j < i; j++ {
+				reuse := j != start // prefix came from sub[start]
+				sub[j] = p.removeLeadingRegexp(sub[j], reuse)
+				p.checkLimits(sub[j])
+			}
+			suffix := p.collapse(sub[start:i], OpAlternate) // recurse
+
+			re := p.newRegexp(OpConcat)
+			re.Sub = append(re.Sub[:0], prefix, suffix)
+			out = append(out, re)
+		}
+
+		// Prepare for next iteration.
+		start = i
+		first = ifirst
+	}
+	sub = out
+
+	// Round 3: Collapse runs of single literals into character classes.
+	start = 0
+	out = sub[:0]
+	for i := 0; i <= len(sub); i++ {
+		// Invariant: the Regexps that were in sub[0:start] have been
+		// used or marked for reuse, and the slice space has been reused
+		// for out (len(out) <= start).
+		//
+		// Invariant: sub[start:i] consists of regexps that are either
+		// literal runes or character classes.
+		if i < len(sub) && isCharClass(sub[i]) {
+			continue
+		}
+
+		// sub[i] is not a char or char class;
+		// emit char class for sub[start:i]...
+		if i == start {
+			// Nothing to do - run of length 0.
+		} else if i == start+1 {
+			out = append(out, sub[start])
+		} else {
+			// Make new char class.
+			// Start with most complex regexp in sub[start].
+			max := start
+			for j := start + 1; j < i; j++ {
+				if sub[max].Op < sub[j].Op || sub[max].Op == sub[j].Op && len(sub[max].Rune) < len(sub[j].Rune) {
+					max = j
+				}
+			}
+			sub[start], sub[max] = sub[max], sub[start]
+
+			for j := start + 1; j < i; j++ {
+				mergeCharClass(sub[start], sub[j])
+				p.reuse(sub[j])
+			}
+			cleanAlt(sub[start])
+			out = append(out, sub[start])
+		}
+
+		// ... and then emit sub[i].
+		if i < len(sub) {
+			out = append(out, sub[i])
+		}
+		start = i + 1
+	}
+	sub = out
+
+	// Round 4: Collapse runs of empty matches into a single empty match.
+	start = 0
+	out = sub[:0]
+	for i := range sub {
+		if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch {
+			continue
+		}
+		out = append(out, sub[i])
+	}
+	sub = out
+
+	return sub
+}
+
+// leadingString returns the leading literal string that re begins with.
+// The string refers to storage in re or its children.
+func (p *parser) leadingString(re *Regexp) ([]rune, Flags) {
+	if re.Op == OpConcat && len(re.Sub) > 0 {
+		re = re.Sub[0]
+	}
+	if re.Op != OpLiteral {
+		return nil, 0
+	}
+	return re.Rune, re.Flags & FoldCase
+}
+
+// removeLeadingString removes the first n leading runes
+// from the beginning of re. It returns the replacement for re.
+func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp {
+	if re.Op == OpConcat && len(re.Sub) > 0 {
+		// Removing a leading string in a concatenation
+		// might simplify the concatenation.
+		sub := re.Sub[0]
+		sub = p.removeLeadingString(sub, n)
+		re.Sub[0] = sub
+		if sub.Op == OpEmptyMatch {
+			p.reuse(sub)
+			switch len(re.Sub) {
+			case 0, 1:
+				// Impossible but handle.
+				re.Op = OpEmptyMatch
+				re.Sub = nil
+			case 2:
+				old := re
+				re = re.Sub[1]
+				p.reuse(old)
+			default:
+				copy(re.Sub, re.Sub[1:])
+				re.Sub = re.Sub[:len(re.Sub)-1]
+			}
+		}
+		return re
+	}
+
+	if re.Op == OpLiteral {
+		re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])]
+		if len(re.Rune) == 0 {
+			re.Op = OpEmptyMatch
+		}
+	}
+	return re
+}
+
+// leadingRegexp returns the leading regexp that re begins with.
+// The regexp refers to storage in re or its children.
+func (p *parser) leadingRegexp(re *Regexp) *Regexp {
+	if re.Op == OpEmptyMatch {
+		return nil
+	}
+	if re.Op == OpConcat && len(re.Sub) > 0 {
+		sub := re.Sub[0]
+		if sub.Op == OpEmptyMatch {
+			return nil
+		}
+		return sub
+	}
+	return re
+}
+
+// removeLeadingRegexp removes the leading regexp in re.
+// It returns the replacement for re.
+// If reuse is true, it passes the removed regexp (if no longer needed) to p.reuse.
+func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp {
+	if re.Op == OpConcat && len(re.Sub) > 0 {
+		if reuse {
+			p.reuse(re.Sub[0])
+		}
+		re.Sub = re.Sub[:copy(re.Sub, re.Sub[1:])]
+		switch len(re.Sub) {
+		case 0:
+			re.Op = OpEmptyMatch
+			re.Sub = nil
+		case 1:
+			old := re
+			re = re.Sub[0]
+			p.reuse(old)
+		}
+		return re
+	}
+	if reuse {
+		p.reuse(re)
+	}
+	return p.newRegexp(OpEmptyMatch)
+}
+
+func literalRegexp(s string, flags Flags) *Regexp {
+	re := &Regexp{Op: OpLiteral}
+	re.Flags = flags
+	re.Rune = re.Rune0[:0] // use local storage for small strings
+	for _, c := range s {
+		if len(re.Rune) >= cap(re.Rune) {
+			// string is too long to fit in Rune0.  let Go handle it
+			re.Rune = []rune(s)
+			break
+		}
+		re.Rune = append(re.Rune, c)
+	}
+	return re
+}
+
+// Parsing.
+
+// Parse parses a regular expression string s, controlled by the specified
+// Flags, and returns a regular expression parse tree. The syntax is
+// described in the top-level comment.
+func Parse(s string, flags Flags) (*Regexp, error) {
+	return parse(s, flags)
+}
+
+func parse(s string, flags Flags) (_ *Regexp, err error) {
+	defer func() {
+		switch r := recover(); r {
+		default:
+			panic(r)
+		case nil:
+			// ok
+		case ErrLarge: // too big
+			err = &Error{Code: ErrLarge, Expr: s}
+		case ErrNestingDepth:
+			err = &Error{Code: ErrNestingDepth, Expr: s}
+		}
+	}()
+
+	if flags&Literal != 0 {
+		// Trivial parser for literal string.
+		if err := checkUTF8(s); err != nil {
+			return nil, err
+		}
+		return literalRegexp(s, flags), nil
+	}
+
+	// Otherwise, must do real work.
+	var (
+		p          parser
+		c          rune
+		op         Op
+		lastRepeat string
+	)
+	p.flags = flags
+	p.wholeRegexp = s
+	t := s
+	for t != "" {
+		repeat := ""
+	BigSwitch:
+		switch t[0] {
+		default:
+			if c, t, err = nextRune(t); err != nil {
+				return nil, err
+			}
+			p.literal(c)
+
+		case '(':
+			if p.flags&PerlX != 0 && len(t) >= 2 && t[1] == '?' {
+				// Flag changes and non-capturing groups.
+				if t, err = p.parsePerlFlags(t); err != nil {
+					return nil, err
+				}
+				break
+			}
+			p.numCap++
+			p.op(opLeftParen).Cap = p.numCap
+			t = t[1:]
+		case '|':
+			if err = p.parseVerticalBar(); err != nil {
+				return nil, err
+			}
+			t = t[1:]
+		case ')':
+			if err = p.parseRightParen(); err != nil {
+				return nil, err
+			}
+			t = t[1:]
+		case '^':
+			if p.flags&OneLine != 0 {
+				p.op(OpBeginText)
+			} else {
+				p.op(OpBeginLine)
+			}
+			t = t[1:]
+		case '$':
+			if p.flags&OneLine != 0 {
+				p.op(OpEndText).Flags |= WasDollar
+			} else {
+				p.op(OpEndLine)
+			}
+			t = t[1:]
+		case '.':
+			if p.flags&DotNL != 0 {
+				p.op(OpAnyChar)
+			} else {
+				p.op(OpAnyCharNotNL)
+			}
+			t = t[1:]
+		case '[':
+			if t, err = p.parseClass(t); err != nil {
+				return nil, err
+			}
+		case '*', '+', '?':
+			before := t
+			switch t[0] {
+			case '*':
+				op = OpStar
+			case '+':
+				op = OpPlus
+			case '?':
+				op = OpQuest
+			}
+			after := t[1:]
+			if after, err = p.repeat(op, 0, 0, before, after, lastRepeat); err != nil {
+				return nil, err
+			}
+			repeat = before
+			t = after
+		case '{':
+			op = OpRepeat
+			before := t
+			min, max, after, ok := p.parseRepeat(t)
+			if !ok {
+				// If the repeat cannot be parsed, { is a literal.
+				p.literal('{')
+				t = t[1:]
+				break
+			}
+			if min < 0 || min > 1000 || max > 1000 || max >= 0 && min > max {
+				// Numbers were too big, or max is present and min > max.
+				return nil, &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]}
+			}
+			if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil {
+				return nil, err
+			}
+			repeat = before
+			t = after
+		case '\\':
+			if p.flags&PerlX != 0 && len(t) >= 2 {
+				switch t[1] {
+				case 'A':
+					p.op(OpBeginText)
+					t = t[2:]
+					break BigSwitch
+				case 'b':
+					p.op(OpWordBoundary)
+					t = t[2:]
+					break BigSwitch
+				case 'B':
+					p.op(OpNoWordBoundary)
+					t = t[2:]
+					break BigSwitch
+				case 'C':
+					// any byte; not supported
+					return nil, &Error{ErrInvalidEscape, t[:2]}
+				case 'Q':
+					// \Q ... \E: the ... is always literals
+					var lit string
+					lit, t, _ = strings.Cut(t[2:], `\E`)
+					for lit != "" {
+						c, rest, err := nextRune(lit)
+						if err != nil {
+							return nil, err
+						}
+						p.literal(c)
+						lit = rest
+					}
+					break BigSwitch
+				case 'z':
+					p.op(OpEndText)
+					t = t[2:]
+					break BigSwitch
+				}
+			}
+
+			re := p.newRegexp(OpCharClass)
+			re.Flags = p.flags
+
+			// Look for Unicode character group like \p{Han}
+			if len(t) >= 2 && (t[1] == 'p' || t[1] == 'P') {
+				r, rest, err := p.parseUnicodeClass(t, re.Rune0[:0])
+				if err != nil {
+					return nil, err
+				}
+				if r != nil {
+					re.Rune = r
+					t = rest
+					p.push(re)
+					break BigSwitch
+				}
+			}
+
+			// Perl character class escape.
+			if r, rest := p.parsePerlClassEscape(t, re.Rune0[:0]); r != nil {
+				re.Rune = r
+				t = rest
+				p.push(re)
+				break BigSwitch
+			}
+			p.reuse(re)
+
+			// Ordinary single-character escape.
+			if c, t, err = p.parseEscape(t); err != nil {
+				return nil, err
+			}
+			p.literal(c)
+		}
+		lastRepeat = repeat
+	}
+
+	p.concat()
+	if p.swapVerticalBar() {
+		// pop vertical bar
+		p.stack = p.stack[:len(p.stack)-1]
+	}
+	p.alternate()
+
+	n := len(p.stack)
+	if n != 1 {
+		return nil, &Error{ErrMissingParen, s}
+	}
+	return p.stack[0], nil
+}
+
+// parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}.
+// If s is not of that form, it returns ok == false.
+// If s has the right form but the values are too big, it returns min == -1, ok == true.
+func (p *parser) parseRepeat(s string) (min, max int, rest string, ok bool) {
+	if s == "" || s[0] != '{' {
+		return
+	}
+	s = s[1:]
+	var ok1 bool
+	if min, s, ok1 = p.parseInt(s); !ok1 {
+		return
+	}
+	if s == "" {
+		return
+	}
+	if s[0] != ',' {
+		max = min
+	} else {
+		s = s[1:]
+		if s == "" {
+			return
+		}
+		if s[0] == '}' {
+			max = -1
+		} else if max, s, ok1 = p.parseInt(s); !ok1 {
+			return
+		} else if max < 0 {
+			// parseInt found too big a number
+			min = -1
+		}
+	}
+	if s == "" || s[0] != '}' {
+		return
+	}
+	rest = s[1:]
+	ok = true
+	return
+}
+
+// parsePerlFlags parses a Perl flag setting or non-capturing group or both,
+// like (?i) or (?: or (?i:.  It removes the prefix from s and updates the parse state.
+// The caller must have ensured that s begins with "(?".
+func (p *parser) parsePerlFlags(s string) (rest string, err error) {
+	t := s
+
+	// Check for named captures, first introduced in Python's regexp library.
+	// As usual, there are three slightly different syntaxes:
+	//
+	//   (?P<name>expr)   the original, introduced by Python
+	//   (?<name>expr)    the .NET alteration, adopted by Perl 5.10
+	//   (?'name'expr)    another .NET alteration, adopted by Perl 5.10
+	//
+	// Perl 5.10 gave in and implemented the Python version too,
+	// but they claim that the last two are the preferred forms.
+	// PCRE and languages based on it (specifically, PHP and Ruby)
+	// support all three as well. EcmaScript 4 uses only the Python form.
+	//
+	// In both the open source world (via Code Search) and the
+	// Google source tree, (?P<expr>name) is the dominant form,
+	// so that's the one we implement. One is enough.
+	if len(t) > 4 && t[2] == 'P' && t[3] == '<' {
+		// Pull out name.
+		end := strings.IndexRune(t, '>')
+		if end < 0 {
+			if err = checkUTF8(t); err != nil {
+				return "", err
+			}
+			return "", &Error{ErrInvalidNamedCapture, s}
+		}
+
+		capture := t[:end+1] // "(?P<name>"
+		name := t[4:end]     // "name"
+		if err = checkUTF8(name); err != nil {
+			return "", err
+		}
+		if !isValidCaptureName(name) {
+			return "", &Error{ErrInvalidNamedCapture, capture}
+		}
+
+		// Like ordinary capture, but named.
+		p.numCap++
+		re := p.op(opLeftParen)
+		re.Cap = p.numCap
+		re.Name = name
+		return t[end+1:], nil
+	}
+
+	// Non-capturing group. Might also twiddle Perl flags.
+	var c rune
+	t = t[2:] // skip (?
+	flags := p.flags
+	sign := +1
+	sawFlag := false
+Loop:
+	for t != "" {
+		if c, t, err = nextRune(t); err != nil {
+			return "", err
+		}
+		switch c {
+		default:
+			break Loop
+
+		// Flags.
+		case 'i':
+			flags |= FoldCase
+			sawFlag = true
+		case 'm':
+			flags &^= OneLine
+			sawFlag = true
+		case 's':
+			flags |= DotNL
+			sawFlag = true
+		case 'U':
+			flags |= NonGreedy
+			sawFlag = true
+
+		// Switch to negation.
+		case '-':
+			if sign < 0 {
+				break Loop
+			}
+			sign = -1
+			// Invert flags so that | above turn into &^ and vice versa.
+			// We'll invert flags again before using it below.
+			flags = ^flags
+			sawFlag = false
+
+		// End of flags, starting group or not.
+		case ':', ')':
+			if sign < 0 {
+				if !sawFlag {
+					break Loop
+				}
+				flags = ^flags
+			}
+			if c == ':' {
+				// Open new group
+				p.op(opLeftParen)
+			}
+			p.flags = flags
+			return t, nil
+		}
+	}
+
+	return "", &Error{ErrInvalidPerlOp, s[:len(s)-len(t)]}
+}
+
+// isValidCaptureName reports whether name
+// is a valid capture name: [A-Za-z0-9_]+.
+// PCRE limits names to 32 bytes.
+// Python rejects names starting with digits.
+// We don't enforce either of those.
+func isValidCaptureName(name string) bool {
+	if name == "" {
+		return false
+	}
+	for _, c := range name {
+		if c != '_' && !isalnum(c) {
+			return false
+		}
+	}
+	return true
+}
+
+// parseInt parses a decimal integer.
+func (p *parser) parseInt(s string) (n int, rest string, ok bool) {
+	if s == "" || s[0] < '0' || '9' < s[0] {
+		return
+	}
+	// Disallow leading zeros.
+	if len(s) >= 2 && s[0] == '0' && '0' <= s[1] && s[1] <= '9' {
+		return
+	}
+	t := s
+	for s != "" && '0' <= s[0] && s[0] <= '9' {
+		s = s[1:]
+	}
+	rest = s
+	ok = true
+	// Have digits, compute value.
+	t = t[:len(t)-len(s)]
+	for i := 0; i < len(t); i++ {
+		// Avoid overflow.
+		if n >= 1e8 {
+			n = -1
+			break
+		}
+		n = n*10 + int(t[i]) - '0'
+	}
+	return
+}
+
+// can this be represented as a character class?
+// single-rune literal string, char class, ., and .|\n.
+func isCharClass(re *Regexp) bool {
+	return re.Op == OpLiteral && len(re.Rune) == 1 ||
+		re.Op == OpCharClass ||
+		re.Op == OpAnyCharNotNL ||
+		re.Op == OpAnyChar
+}
+
+// does re match r?
+func matchRune(re *Regexp, r rune) bool {
+	switch re.Op {
+	case OpLiteral:
+		return len(re.Rune) == 1 && re.Rune[0] == r
+	case OpCharClass:
+		for i := 0; i < len(re.Rune); i += 2 {
+			if re.Rune[i] <= r && r <= re.Rune[i+1] {
+				return true
+			}
+		}
+		return false
+	case OpAnyCharNotNL:
+		return r != '\n'
+	case OpAnyChar:
+		return true
+	}
+	return false
+}
+
+// parseVerticalBar handles a | in the input.
+func (p *parser) parseVerticalBar() error {
+	p.concat()
+
+	// The concatenation we just parsed is on top of the stack.
+	// If it sits above an opVerticalBar, swap it below
+	// (things below an opVerticalBar become an alternation).
+	// Otherwise, push a new vertical bar.
+	if !p.swapVerticalBar() {
+		p.op(opVerticalBar)
+	}
+
+	return nil
+}
+
+// mergeCharClass makes dst = dst|src.
+// The caller must ensure that dst.Op >= src.Op,
+// to reduce the amount of copying.
+func mergeCharClass(dst, src *Regexp) {
+	switch dst.Op {
+	case OpAnyChar:
+		// src doesn't add anything.
+	case OpAnyCharNotNL:
+		// src might add \n
+		if matchRune(src, '\n') {
+			dst.Op = OpAnyChar
+		}
+	case OpCharClass:
+		// src is simpler, so either literal or char class
+		if src.Op == OpLiteral {
+			dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
+		} else {
+			dst.Rune = appendClass(dst.Rune, src.Rune)
+		}
+	case OpLiteral:
+		// both literal
+		if src.Rune[0] == dst.Rune[0] && src.Flags == dst.Flags {
+			break
+		}
+		dst.Op = OpCharClass
+		dst.Rune = appendLiteral(dst.Rune[:0], dst.Rune[0], dst.Flags)
+		dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
+	}
+}
+
+// If the top of the stack is an element followed by an opVerticalBar
+// swapVerticalBar swaps the two and returns true.
+// Otherwise it returns false.
+func (p *parser) swapVerticalBar() bool {
+	// If above and below vertical bar are literal or char class,
+	// can merge into a single char class.
+	n := len(p.stack)
+	if n >= 3 && p.stack[n-2].Op == opVerticalBar && isCharClass(p.stack[n-1]) && isCharClass(p.stack[n-3]) {
+		re1 := p.stack[n-1]
+		re3 := p.stack[n-3]
+		// Make re3 the more complex of the two.
+		if re1.Op > re3.Op {
+			re1, re3 = re3, re1
+			p.stack[n-3] = re3
+		}
+		mergeCharClass(re3, re1)
+		p.reuse(re1)
+		p.stack = p.stack[:n-1]
+		return true
+	}
+
+	if n >= 2 {
+		re1 := p.stack[n-1]
+		re2 := p.stack[n-2]
+		if re2.Op == opVerticalBar {
+			if n >= 3 {
+				// Now out of reach.
+				// Clean opportunistically.
+				cleanAlt(p.stack[n-3])
+			}
+			p.stack[n-2] = re1
+			p.stack[n-1] = re2
+			return true
+		}
+	}
+	return false
+}
+
+// parseRightParen handles a ) in the input.
+func (p *parser) parseRightParen() error {
+	p.concat()
+	if p.swapVerticalBar() {
+		// pop vertical bar
+		p.stack = p.stack[:len(p.stack)-1]
+	}
+	p.alternate()
+
+	n := len(p.stack)
+	if n < 2 {
+		return &Error{ErrUnexpectedParen, p.wholeRegexp}
+	}
+	re1 := p.stack[n-1]
+	re2 := p.stack[n-2]
+	p.stack = p.stack[:n-2]
+	if re2.Op != opLeftParen {
+		return &Error{ErrUnexpectedParen, p.wholeRegexp}
+	}
+	// Restore flags at time of paren.
+	p.flags = re2.Flags
+	if re2.Cap == 0 {
+		// Just for grouping.
+		p.push(re1)
+	} else {
+		re2.Op = OpCapture
+		re2.Sub = re2.Sub0[:1]
+		re2.Sub[0] = re1
+		p.push(re2)
+	}
+	return nil
+}
+
+// parseEscape parses an escape sequence at the beginning of s
+// and returns the rune.
+func (p *parser) parseEscape(s string) (r rune, rest string, err error) {
+	t := s[1:]
+	if t == "" {
+		return 0, "", &Error{ErrTrailingBackslash, ""}
+	}
+	c, t, err := nextRune(t)
+	if err != nil {
+		return 0, "", err
+	}
+
+Switch:
+	switch c {
+	default:
+		if c < utf8.RuneSelf && !isalnum(c) {
+			// Escaped non-word characters are always themselves.
+			// PCRE is not quite so rigorous: it accepts things like
+			// \q, but we don't. We once rejected \_, but too many
+			// programs and people insist on using it, so allow \_.
+			return c, t, nil
+		}
+
+	// Octal escapes.
+	case '1', '2', '3', '4', '5', '6', '7':
+		// Single non-zero digit is a backreference; not supported
+		if t == "" || t[0] < '0' || t[0] > '7' {
+			break
+		}
+		fallthrough
+	case '0':
+		// Consume up to three octal digits; already have one.
+		r = c - '0'
+		for i := 1; i < 3; i++ {
+			if t == "" || t[0] < '0' || t[0] > '7' {
+				break
+			}
+			r = r*8 + rune(t[0]) - '0'
+			t = t[1:]
+		}
+		return r, t, nil
+
+	// Hexadecimal escapes.
+	case 'x':
+		if t == "" {
+			break
+		}
+		if c, t, err = nextRune(t); err != nil {
+			return 0, "", err
+		}
+		if c == '{' {
+			// Any number of digits in braces.
+			// Perl accepts any text at all; it ignores all text
+			// after the first non-hex digit. We require only hex digits,
+			// and at least one.
+			nhex := 0
+			r = 0
+			for {
+				if t == "" {
+					break Switch
+				}
+				if c, t, err = nextRune(t); err != nil {
+					return 0, "", err
+				}
+				if c == '}' {
+					break
+				}
+				v := unhex(c)
+				if v < 0 {
+					break Switch
+				}
+				r = r*16 + v
+				if r > unicode.MaxRune {
+					break Switch
+				}
+				nhex++
+			}
+			if nhex == 0 {
+				break Switch
+			}
+			return r, t, nil
+		}
+
+		// Easy case: two hex digits.
+		x := unhex(c)
+		if c, t, err = nextRune(t); err != nil {
+			return 0, "", err
+		}
+		y := unhex(c)
+		if x < 0 || y < 0 {
+			break
+		}
+		return x*16 + y, t, nil
+
+	// C escapes. There is no case 'b', to avoid misparsing
+	// the Perl word-boundary \b as the C backspace \b
+	// when in POSIX mode. In Perl, /\b/ means word-boundary
+	// but /[\b]/ means backspace. We don't support that.
+	// If you want a backspace, embed a literal backspace
+	// character or use \x08.
+	case 'a':
+		return '\a', t, err
+	case 'f':
+		return '\f', t, err
+	case 'n':
+		return '\n', t, err
+	case 'r':
+		return '\r', t, err
+	case 't':
+		return '\t', t, err
+	case 'v':
+		return '\v', t, err
+	}
+	return 0, "", &Error{ErrInvalidEscape, s[:len(s)-len(t)]}
+}
+
+// parseClassChar parses a character class character at the beginning of s
+// and returns it.
+func (p *parser) parseClassChar(s, wholeClass string) (r rune, rest string, err error) {
+	if s == "" {
+		return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass}
+	}
+
+	// Allow regular escape sequences even though
+	// many need not be escaped in this context.
+	if s[0] == '\\' {
+		return p.parseEscape(s)
+	}
+
+	return nextRune(s)
+}
+
+type charGroup struct {
+	sign  int
+	class []rune
+}
+
+// parsePerlClassEscape parses a leading Perl character class escape like \d
+// from the beginning of s. If one is present, it appends the characters to r
+// and returns the new slice r and the remainder of the string.
+func (p *parser) parsePerlClassEscape(s string, r []rune) (out []rune, rest string) {
+	if p.flags&PerlX == 0 || len(s) < 2 || s[0] != '\\' {
+		return
+	}
+	g := perlGroup[s[0:2]]
+	if g.sign == 0 {
+		return
+	}
+	return p.appendGroup(r, g), s[2:]
+}
+
+// parseNamedClass parses a leading POSIX named character class like [:alnum:]
+// from the beginning of s. If one is present, it appends the characters to r
+// and returns the new slice r and the remainder of the string.
+func (p *parser) parseNamedClass(s string, r []rune) (out []rune, rest string, err error) {
+	if len(s) < 2 || s[0] != '[' || s[1] != ':' {
+		return
+	}
+
+	i := strings.Index(s[2:], ":]")
+	if i < 0 {
+		return
+	}
+	i += 2
+	name, s := s[0:i+2], s[i+2:]
+	g := posixGroup[name]
+	if g.sign == 0 {
+		return nil, "", &Error{ErrInvalidCharRange, name}
+	}
+	return p.appendGroup(r, g), s, nil
+}
+
+func (p *parser) appendGroup(r []rune, g charGroup) []rune {
+	if p.flags&FoldCase == 0 {
+		if g.sign < 0 {
+			r = appendNegatedClass(r, g.class)
+		} else {
+			r = appendClass(r, g.class)
+		}
+	} else {
+		tmp := p.tmpClass[:0]
+		tmp = appendFoldedClass(tmp, g.class)
+		p.tmpClass = tmp
+		tmp = cleanClass(&p.tmpClass)
+		if g.sign < 0 {
+			r = appendNegatedClass(r, tmp)
+		} else {
+			r = appendClass(r, tmp)
+		}
+	}
+	return r
+}
+
+var anyTable = &unicode.RangeTable{
+	R16: []unicode.Range16{{Lo: 0, Hi: 1<<16 - 1, Stride: 1}},
+	R32: []unicode.Range32{{Lo: 1 << 16, Hi: unicode.MaxRune, Stride: 1}},
+}
+
+// unicodeTable returns the unicode.RangeTable identified by name
+// and the table of additional fold-equivalent code points.
+func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) {
+	// Special case: "Any" means any.
+	if name == "Any" {
+		return anyTable, anyTable
+	}
+	if t := unicode.Categories[name]; t != nil {
+		return t, unicode.FoldCategory[name]
+	}
+	if t := unicode.Scripts[name]; t != nil {
+		return t, unicode.FoldScript[name]
+	}
+	return nil, nil
+}
+
+// parseUnicodeClass parses a leading Unicode character class like \p{Han}
+// from the beginning of s. If one is present, it appends the characters to r
+// and returns the new slice r and the remainder of the string.
+func (p *parser) parseUnicodeClass(s string, r []rune) (out []rune, rest string, err error) {
+	if p.flags&UnicodeGroups == 0 || len(s) < 2 || s[0] != '\\' || s[1] != 'p' && s[1] != 'P' {
+		return
+	}
+
+	// Committed to parse or return error.
+	sign := +1
+	if s[1] == 'P' {
+		sign = -1
+	}
+	t := s[2:]
+	c, t, err := nextRune(t)
+	if err != nil {
+		return
+	}
+	var seq, name string
+	if c != '{' {
+		// Single-letter name.
+		seq = s[:len(s)-len(t)]
+		name = seq[2:]
+	} else {
+		// Name is in braces.
+		end := strings.IndexRune(s, '}')
+		if end < 0 {
+			if err = checkUTF8(s); err != nil {
+				return
+			}
+			return nil, "", &Error{ErrInvalidCharRange, s}
+		}
+		seq, t = s[:end+1], s[end+1:]
+		name = s[3:end]
+		if err = checkUTF8(name); err != nil {
+			return
+		}
+	}
+
+	// Group can have leading negation too.  \p{^Han} == \P{Han}, \P{^Han} == \p{Han}.
+	if name != "" && name[0] == '^' {
+		sign = -sign
+		name = name[1:]
+	}
+
+	tab, fold := unicodeTable(name)
+	if tab == nil {
+		return nil, "", &Error{ErrInvalidCharRange, seq}
+	}
+
+	if p.flags&FoldCase == 0 || fold == nil {
+		if sign > 0 {
+			r = appendTable(r, tab)
+		} else {
+			r = appendNegatedTable(r, tab)
+		}
+	} else {
+		// Merge and clean tab and fold in a temporary buffer.
+		// This is necessary for the negative case and just tidy
+		// for the positive case.
+		tmp := p.tmpClass[:0]
+		tmp = appendTable(tmp, tab)
+		tmp = appendTable(tmp, fold)
+		p.tmpClass = tmp
+		tmp = cleanClass(&p.tmpClass)
+		if sign > 0 {
+			r = appendClass(r, tmp)
+		} else {
+			r = appendNegatedClass(r, tmp)
+		}
+	}
+	return r, t, nil
+}
+
+// parseClass parses a character class at the beginning of s
+// and pushes it onto the parse stack.
+func (p *parser) parseClass(s string) (rest string, err error) {
+	t := s[1:] // chop [
+	re := p.newRegexp(OpCharClass)
+	re.Flags = p.flags
+	re.Rune = re.Rune0[:0]
+
+	sign := +1
+	if t != "" && t[0] == '^' {
+		sign = -1
+		t = t[1:]
+
+		// If character class does not match \n, add it here,
+		// so that negation later will do the right thing.
+		if p.flags&ClassNL == 0 {
+			re.Rune = append(re.Rune, '\n', '\n')
+		}
+	}
+
+	class := re.Rune
+	first := true // ] and - are okay as first char in class
+	for t == "" || t[0] != ']' || first {
+		// POSIX: - is only okay unescaped as first or last in class.
+		// Perl: - is okay anywhere.
+		if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') {
+			_, size := utf8.DecodeRuneInString(t[1:])
+			return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]}
+		}
+		first = false
+
+		// Look for POSIX [:alnum:] etc.
+		if len(t) > 2 && t[0] == '[' && t[1] == ':' {
+			nclass, nt, err := p.parseNamedClass(t, class)
+			if err != nil {
+				return "", err
+			}
+			if nclass != nil {
+				class, t = nclass, nt
+				continue
+			}
+		}
+
+		// Look for Unicode character group like \p{Han}.
+		nclass, nt, err := p.parseUnicodeClass(t, class)
+		if err != nil {
+			return "", err
+		}
+		if nclass != nil {
+			class, t = nclass, nt
+			continue
+		}
+
+		// Look for Perl character class symbols (extension).
+		if nclass, nt := p.parsePerlClassEscape(t, class); nclass != nil {
+			class, t = nclass, nt
+			continue
+		}
+
+		// Single character or simple range.
+		rng := t
+		var lo, hi rune
+		if lo, t, err = p.parseClassChar(t, s); err != nil {
+			return "", err
+		}
+		hi = lo
+		// [a-] means (a|-) so check for final ].
+		if len(t) >= 2 && t[0] == '-' && t[1] != ']' {
+			t = t[1:]
+			if hi, t, err = p.parseClassChar(t, s); err != nil {
+				return "", err
+			}
+			if hi < lo {
+				rng = rng[:len(rng)-len(t)]
+				return "", &Error{Code: ErrInvalidCharRange, Expr: rng}
+			}
+		}
+		if p.flags&FoldCase == 0 {
+			class = appendRange(class, lo, hi)
+		} else {
+			class = appendFoldedRange(class, lo, hi)
+		}
+	}
+	t = t[1:] // chop ]
+
+	// Use &re.Rune instead of &class to avoid allocation.
+	re.Rune = class
+	class = cleanClass(&re.Rune)
+	if sign < 0 {
+		class = negateClass(class)
+	}
+	re.Rune = class
+	p.push(re)
+	return t, nil
+}
+
+// cleanClass sorts the ranges (pairs of elements of r),
+// merges them, and eliminates duplicates.
+func cleanClass(rp *[]rune) []rune {
+
+	// Sort by lo increasing, hi decreasing to break ties.
+	sort.Sort(ranges{rp})
+
+	r := *rp
+	if len(r) < 2 {
+		return r
+	}
+
+	// Merge abutting, overlapping.
+	w := 2 // write index
+	for i := 2; i < len(r); i += 2 {
+		lo, hi := r[i], r[i+1]
+		if lo <= r[w-1]+1 {
+			// merge with previous range
+			if hi > r[w-1] {
+				r[w-1] = hi
+			}
+			continue
+		}
+		// new disjoint range
+		r[w] = lo
+		r[w+1] = hi
+		w += 2
+	}
+
+	return r[:w]
+}
+
+// appendLiteral returns the result of appending the literal x to the class r.
+func appendLiteral(r []rune, x rune, flags Flags) []rune {
+	if flags&FoldCase != 0 {
+		return appendFoldedRange(r, x, x)
+	}
+	return appendRange(r, x, x)
+}
+
+// appendRange returns the result of appending the range lo-hi to the class r.
+func appendRange(r []rune, lo, hi rune) []rune {
+	// Expand last range or next to last range if it overlaps or abuts.
+	// Checking two ranges helps when appending case-folded
+	// alphabets, so that one range can be expanding A-Z and the
+	// other expanding a-z.
+	n := len(r)
+	for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4
+		if n >= i {
+			rlo, rhi := r[n-i], r[n-i+1]
+			if lo <= rhi+1 && rlo <= hi+1 {
+				if lo < rlo {
+					r[n-i] = lo
+				}
+				if hi > rhi {
+					r[n-i+1] = hi
+				}
+				return r
+			}
+		}
+	}
+
+	return append(r, lo, hi)
+}
+
+const (
+	// minimum and maximum runes involved in folding.
+	// checked during test.
+	minFold = 0x0041
+	maxFold = 0x1e943
+)
+
+// appendFoldedRange returns the result of appending the range lo-hi
+// and its case folding-equivalent runes to the class r.
+func appendFoldedRange(r []rune, lo, hi rune) []rune {
+	// Optimizations.
+	if lo <= minFold && hi >= maxFold {
+		// Range is full: folding can't add more.
+		return appendRange(r, lo, hi)
+	}
+	if hi < minFold || lo > maxFold {
+		// Range is outside folding possibilities.
+		return appendRange(r, lo, hi)
+	}
+	if lo < minFold {
+		// [lo, minFold-1] needs no folding.
+		r = appendRange(r, lo, minFold-1)
+		lo = minFold
+	}
+	if hi > maxFold {
+		// [maxFold+1, hi] needs no folding.
+		r = appendRange(r, maxFold+1, hi)
+		hi = maxFold
+	}
+
+	// Brute force. Depend on appendRange to coalesce ranges on the fly.
+	for c := lo; c <= hi; c++ {
+		r = appendRange(r, c, c)
+		f := unicode.SimpleFold(c)
+		for f != c {
+			r = appendRange(r, f, f)
+			f = unicode.SimpleFold(f)
+		}
+	}
+	return r
+}
+
+// appendClass returns the result of appending the class x to the class r.
+// It assume x is clean.
+func appendClass(r []rune, x []rune) []rune {
+	for i := 0; i < len(x); i += 2 {
+		r = appendRange(r, x[i], x[i+1])
+	}
+	return r
+}
+
+// appendFoldedClass returns the result of appending the case folding of the class x to the class r.
+func appendFoldedClass(r []rune, x []rune) []rune {
+	for i := 0; i < len(x); i += 2 {
+		r = appendFoldedRange(r, x[i], x[i+1])
+	}
+	return r
+}
+
+// appendNegatedClass returns the result of appending the negation of the class x to the class r.
+// It assumes x is clean.
+func appendNegatedClass(r []rune, x []rune) []rune {
+	nextLo := '\u0000'
+	for i := 0; i < len(x); i += 2 {
+		lo, hi := x[i], x[i+1]
+		if nextLo <= lo-1 {
+			r = appendRange(r, nextLo, lo-1)
+		}
+		nextLo = hi + 1
+	}
+	if nextLo <= unicode.MaxRune {
+		r = appendRange(r, nextLo, unicode.MaxRune)
+	}
+	return r
+}
+
+// appendTable returns the result of appending x to the class r.
+func appendTable(r []rune, x *unicode.RangeTable) []rune {
+	for _, xr := range x.R16 {
+		lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
+		if stride == 1 {
+			r = appendRange(r, lo, hi)
+			continue
+		}
+		for c := lo; c <= hi; c += stride {
+			r = appendRange(r, c, c)
+		}
+	}
+	for _, xr := range x.R32 {
+		lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
+		if stride == 1 {
+			r = appendRange(r, lo, hi)
+			continue
+		}
+		for c := lo; c <= hi; c += stride {
+			r = appendRange(r, c, c)
+		}
+	}
+	return r
+}
+
+// appendNegatedTable returns the result of appending the negation of x to the class r.
+func appendNegatedTable(r []rune, x *unicode.RangeTable) []rune {
+	nextLo := '\u0000' // lo end of next class to add
+	for _, xr := range x.R16 {
+		lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
+		if stride == 1 {
+			if nextLo <= lo-1 {
+				r = appendRange(r, nextLo, lo-1)
+			}
+			nextLo = hi + 1
+			continue
+		}
+		for c := lo; c <= hi; c += stride {
+			if nextLo <= c-1 {
+				r = appendRange(r, nextLo, c-1)
+			}
+			nextLo = c + 1
+		}
+	}
+	for _, xr := range x.R32 {
+		lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
+		if stride == 1 {
+			if nextLo <= lo-1 {
+				r = appendRange(r, nextLo, lo-1)
+			}
+			nextLo = hi + 1
+			continue
+		}
+		for c := lo; c <= hi; c += stride {
+			if nextLo <= c-1 {
+				r = appendRange(r, nextLo, c-1)
+			}
+			nextLo = c + 1
+		}
+	}
+	if nextLo <= unicode.MaxRune {
+		r = appendRange(r, nextLo, unicode.MaxRune)
+	}
+	return r
+}
+
+// negateClass overwrites r and returns r's negation.
+// It assumes the class r is already clean.
+func negateClass(r []rune) []rune {
+	nextLo := '\u0000' // lo end of next class to add
+	w := 0             // write index
+	for i := 0; i < len(r); i += 2 {
+		lo, hi := r[i], r[i+1]
+		if nextLo <= lo-1 {
+			r[w] = nextLo
+			r[w+1] = lo - 1
+			w += 2
+		}
+		nextLo = hi + 1
+	}
+	r = r[:w]
+	if nextLo <= unicode.MaxRune {
+		// It's possible for the negation to have one more
+		// range - this one - than the original class, so use append.
+		r = append(r, nextLo, unicode.MaxRune)
+	}
+	return r
+}
+
+// ranges implements sort.Interface on a []rune.
+// The choice of receiver type definition is strange
+// but avoids an allocation since we already have
+// a *[]rune.
+type ranges struct {
+	p *[]rune
+}
+
+func (ra ranges) Less(i, j int) bool {
+	p := *ra.p
+	i *= 2
+	j *= 2
+	return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1]
+}
+
+func (ra ranges) Len() int {
+	return len(*ra.p) / 2
+}
+
+func (ra ranges) Swap(i, j int) {
+	p := *ra.p
+	i *= 2
+	j *= 2
+	p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1]
+}
+
+func checkUTF8(s string) error {
+	for s != "" {
+		rune, size := utf8.DecodeRuneInString(s)
+		if rune == utf8.RuneError && size == 1 {
+			return &Error{Code: ErrInvalidUTF8, Expr: s}
+		}
+		s = s[size:]
+	}
+	return nil
+}
+
+func nextRune(s string) (c rune, t string, err error) {
+	c, size := utf8.DecodeRuneInString(s)
+	if c == utf8.RuneError && size == 1 {
+		return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s}
+	}
+	return c, s[size:], nil
+}
+
+func isalnum(c rune) bool {
+	return '0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
+}
+
+func unhex(c rune) rune {
+	if '0' <= c && c <= '9' {
+		return c - '0'
+	}
+	if 'a' <= c && c <= 'f' {
+		return c - 'a' + 10
+	}
+	if 'A' <= c && c <= 'F' {
+		return c - 'A' + 10
+	}
+	return -1
+}
diff --git a/src/regexp/syntax/parse_test.go b/src/regexp/syntax/parse_test.go
new file mode 100644
index 0000000..67e3c56
--- /dev/null
+++ b/src/regexp/syntax/parse_test.go
@@ -0,0 +1,586 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syntax
+
+import (
+	"fmt"
+	"strings"
+	"testing"
+	"unicode"
+)
+
+type parseTest struct {
+	Regexp string
+	Dump   string
+}
+
+var parseTests = []parseTest{
+	// Base cases
+	{`a`, `lit{a}`},
+	{`a.`, `cat{lit{a}dot{}}`},
+	{`a.b`, `cat{lit{a}dot{}lit{b}}`},
+	{`ab`, `str{ab}`},
+	{`a.b.c`, `cat{lit{a}dot{}lit{b}dot{}lit{c}}`},
+	{`abc`, `str{abc}`},
+	{`a|^`, `alt{lit{a}bol{}}`},
+	{`a|b`, `cc{0x61-0x62}`},
+	{`(a)`, `cap{lit{a}}`},
+	{`(a)|b`, `alt{cap{lit{a}}lit{b}}`},
+	{`a*`, `star{lit{a}}`},
+	{`a+`, `plus{lit{a}}`},
+	{`a?`, `que{lit{a}}`},
+	{`a{2}`, `rep{2,2 lit{a}}`},
+	{`a{2,3}`, `rep{2,3 lit{a}}`},
+	{`a{2,}`, `rep{2,-1 lit{a}}`},
+	{`a*?`, `nstar{lit{a}}`},
+	{`a+?`, `nplus{lit{a}}`},
+	{`a??`, `nque{lit{a}}`},
+	{`a{2}?`, `nrep{2,2 lit{a}}`},
+	{`a{2,3}?`, `nrep{2,3 lit{a}}`},
+	{`a{2,}?`, `nrep{2,-1 lit{a}}`},
+	// Malformed { } are treated as literals.
+	{`x{1001`, `str{x{1001}`},
+	{`x{9876543210`, `str{x{9876543210}`},
+	{`x{9876543210,`, `str{x{9876543210,}`},
+	{`x{2,1`, `str{x{2,1}`},
+	{`x{1,9876543210`, `str{x{1,9876543210}`},
+	{``, `emp{}`},
+	{`|`, `emp{}`}, // alt{emp{}emp{}} but got factored
+	{`|x|`, `alt{emp{}lit{x}emp{}}`},
+	{`.`, `dot{}`},
+	{`^`, `bol{}`},
+	{`$`, `eol{}`},
+	{`\|`, `lit{|}`},
+	{`\(`, `lit{(}`},
+	{`\)`, `lit{)}`},
+	{`\*`, `lit{*}`},
+	{`\+`, `lit{+}`},
+	{`\?`, `lit{?}`},
+	{`{`, `lit{{}`},
+	{`}`, `lit{}}`},
+	{`\.`, `lit{.}`},
+	{`\^`, `lit{^}`},
+	{`\$`, `lit{$}`},
+	{`\\`, `lit{\}`},
+	{`[ace]`, `cc{0x61 0x63 0x65}`},
+	{`[abc]`, `cc{0x61-0x63}`},
+	{`[a-z]`, `cc{0x61-0x7a}`},
+	{`[a]`, `lit{a}`},
+	{`\-`, `lit{-}`},
+	{`-`, `lit{-}`},
+	{`\_`, `lit{_}`},
+	{`abc`, `str{abc}`},
+	{`abc|def`, `alt{str{abc}str{def}}`},
+	{`abc|def|ghi`, `alt{str{abc}str{def}str{ghi}}`},
+
+	// Posix and Perl extensions
+	{`[[:lower:]]`, `cc{0x61-0x7a}`},
+	{`[a-z]`, `cc{0x61-0x7a}`},
+	{`[^[:lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`},
+	{`[[:^lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`},
+	{`(?i)[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
+	{`(?i)[a-z]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
+	{`(?i)[^[:lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`},
+	{`(?i)[[:^lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`},
+	{`\d`, `cc{0x30-0x39}`},
+	{`\D`, `cc{0x0-0x2f 0x3a-0x10ffff}`},
+	{`\s`, `cc{0x9-0xa 0xc-0xd 0x20}`},
+	{`\S`, `cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}`},
+	{`\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}`},
+	{`\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}`},
+	{`(?i)\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}`},
+	{`(?i)\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`},
+	{`[^\\]`, `cc{0x0-0x5b 0x5d-0x10ffff}`},
+	//	{ `\C`, `byte{}` },  // probably never
+
+	// Unicode, negatives, and a double negative.
+	{`\p{Braille}`, `cc{0x2800-0x28ff}`},
+	{`\P{Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
+	{`\p{^Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
+	{`\P{^Braille}`, `cc{0x2800-0x28ff}`},
+	{`\pZ`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`},
+	{`[\p{Braille}]`, `cc{0x2800-0x28ff}`},
+	{`[\P{Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
+	{`[\p{^Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
+	{`[\P{^Braille}]`, `cc{0x2800-0x28ff}`},
+	{`[\pZ]`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`},
+	{`\p{Lu}`, mkCharClass(unicode.IsUpper)},
+	{`[\p{Lu}]`, mkCharClass(unicode.IsUpper)},
+	{`(?i)[\p{Lu}]`, mkCharClass(isUpperFold)},
+	{`\p{Any}`, `dot{}`},
+	{`\p{^Any}`, `cc{}`},
+
+	// Hex, octal.
+	{`[\012-\234]\141`, `cat{cc{0xa-0x9c}lit{a}}`},
+	{`[\x{41}-\x7a]\x61`, `cat{cc{0x41-0x7a}lit{a}}`},
+
+	// More interesting regular expressions.
+	{`a{,2}`, `str{a{,2}}`},
+	{`\.\^\$\\`, `str{.^$\}`},
+	{`[a-zABC]`, `cc{0x41-0x43 0x61-0x7a}`},
+	{`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`},
+	{`[α-ε☺]`, `cc{0x3b1-0x3b5 0x263a}`}, // utf-8
+	{`a*{`, `cat{star{lit{a}}lit{{}}`},
+
+	// Test precedences
+	{`(?:ab)*`, `star{str{ab}}`},
+	{`(ab)*`, `star{cap{str{ab}}}`},
+	{`ab|cd`, `alt{str{ab}str{cd}}`},
+	{`a(b|c)d`, `cat{lit{a}cap{cc{0x62-0x63}}lit{d}}`},
+
+	// Test flattening.
+	{`(?:a)`, `lit{a}`},
+	{`(?:ab)(?:cd)`, `str{abcd}`},
+	{`(?:a+b+)(?:c+d+)`, `cat{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`},
+	{`(?:a+|b+)|(?:c+|d+)`, `alt{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`},
+	{`(?:a|b)|(?:c|d)`, `cc{0x61-0x64}`},
+	{`a|.`, `dot{}`},
+	{`.|a`, `dot{}`},
+	{`(?:[abc]|A|Z|hello|world)`, `alt{cc{0x41 0x5a 0x61-0x63}str{hello}str{world}}`},
+	{`(?:[abc]|A|Z)`, `cc{0x41 0x5a 0x61-0x63}`},
+
+	// Test Perl quoted literals
+	{`\Q+|*?{[\E`, `str{+|*?{[}`},
+	{`\Q+\E+`, `plus{lit{+}}`},
+	{`\Qab\E+`, `cat{lit{a}plus{lit{b}}}`},
+	{`\Q\\E`, `lit{\}`},
+	{`\Q\\\E`, `str{\\}`},
+
+	// Test Perl \A and \z
+	{`(?m)^`, `bol{}`},
+	{`(?m)$`, `eol{}`},
+	{`(?-m)^`, `bot{}`},
+	{`(?-m)$`, `eot{}`},
+	{`(?m)\A`, `bot{}`},
+	{`(?m)\z`, `eot{\z}`},
+	{`(?-m)\A`, `bot{}`},
+	{`(?-m)\z`, `eot{\z}`},
+
+	// Test named captures
+	{`(?P<name>a)`, `cap{name:lit{a}}`},
+
+	// Case-folded literals
+	{`[Aa]`, `litfold{A}`},
+	{`[\x{100}\x{101}]`, `litfold{Ā}`},
+	{`[Δδ]`, `litfold{Δ}`},
+
+	// Strings
+	{`abcde`, `str{abcde}`},
+	{`[Aa][Bb]cd`, `cat{strfold{AB}str{cd}}`},
+
+	// Factoring.
+	{`abc|abd|aef|bcx|bcy`, `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}cat{str{bc}cc{0x78-0x79}}}`},
+	{`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}lit{y}}cat{plus{lit{x}}lit{z}}cat{plus{lit{y}}lit{w}}}}`},
+
+	// Bug fixes.
+	{`(?:.)`, `dot{}`},
+	{`(?:x|(?:xa))`, `cat{lit{x}alt{emp{}lit{a}}}`},
+	{`(?:.|(?:.a))`, `cat{dot{}alt{emp{}lit{a}}}`},
+	{`(?:A(?:A|a))`, `cat{lit{A}litfold{A}}`},
+	{`(?:A|a)`, `litfold{A}`},
+	{`A|(?:A|a)`, `litfold{A}`},
+	{`(?s).`, `dot{}`},
+	{`(?-s).`, `dnl{}`},
+	{`(?:(?:^).)`, `cat{bol{}dot{}}`},
+	{`(?-s)(?:(?:^).)`, `cat{bol{}dnl{}}`},
+	{`[\s\S]a`, `cat{cc{0x0-0x10ffff}lit{a}}`},
+
+	// RE2 prefix_tests
+	{`abc|abd`, `cat{str{ab}cc{0x63-0x64}}`},
+	{`a(?:b)c|abd`, `cat{str{ab}cc{0x63-0x64}}`},
+	{`abc|abd|aef|bcx|bcy`,
+		`alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}` +
+			`cat{str{bc}cc{0x78-0x79}}}`},
+	{`abc|x|abd`, `alt{str{abc}lit{x}str{abd}}`},
+	{`(?i)abc|ABD`, `cat{strfold{AB}cc{0x43-0x44 0x63-0x64}}`},
+	{`[ab]c|[ab]d`, `cat{cc{0x61-0x62}cc{0x63-0x64}}`},
+	{`.c|.d`, `cat{dot{}cc{0x63-0x64}}`},
+	{`x{2}|x{2}[0-9]`,
+		`cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}`},
+	{`x{2}y|x{2}[0-9]y`,
+		`cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}`},
+	{`a.*?c|a.*?b`,
+		`cat{lit{a}alt{cat{nstar{dot{}}lit{c}}cat{nstar{dot{}}lit{b}}}}`},
+
+	// Valid repetitions.
+	{`((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}))`, ``},
+	{`((((((((((x{1}){2}){2}){2}){2}){2}){2}){2}){2}){2})`, ``},
+
+	// Valid nesting.
+	{strings.Repeat("(", 999) + strings.Repeat(")", 999), ``},
+	{strings.Repeat("(?:", 999) + strings.Repeat(")*", 999), ``},
+	{"(" + strings.Repeat("|", 12345) + ")", ``}, // not nested at all
+}
+
+const testFlags = MatchNL | PerlX | UnicodeGroups
+
+func TestParseSimple(t *testing.T) {
+	testParseDump(t, parseTests, testFlags)
+}
+
+var foldcaseTests = []parseTest{
+	{`AbCdE`, `strfold{ABCDE}`},
+	{`[Aa]`, `litfold{A}`},
+	{`a`, `litfold{A}`},
+
+	// 0x17F is an old English long s (looks like an f) and folds to s.
+	// 0x212A is the Kelvin symbol and folds to k.
+	{`A[F-g]`, `cat{litfold{A}cc{0x41-0x7a 0x17f 0x212a}}`}, // [Aa][A-z...]
+	{`[[:upper:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
+	{`[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
+}
+
+func TestParseFoldCase(t *testing.T) {
+	testParseDump(t, foldcaseTests, FoldCase)
+}
+
+var literalTests = []parseTest{
+	{"(|)^$.[*+?]{5,10},\\", "str{(|)^$.[*+?]{5,10},\\}"},
+}
+
+func TestParseLiteral(t *testing.T) {
+	testParseDump(t, literalTests, Literal)
+}
+
+var matchnlTests = []parseTest{
+	{`.`, `dot{}`},
+	{"\n", "lit{\n}"},
+	{`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`},
+	{`[a\n]`, `cc{0xa 0x61}`},
+}
+
+func TestParseMatchNL(t *testing.T) {
+	testParseDump(t, matchnlTests, MatchNL)
+}
+
+var nomatchnlTests = []parseTest{
+	{`.`, `dnl{}`},
+	{"\n", "lit{\n}"},
+	{`[^a]`, `cc{0x0-0x9 0xb-0x60 0x62-0x10ffff}`},
+	{`[a\n]`, `cc{0xa 0x61}`},
+}
+
+func TestParseNoMatchNL(t *testing.T) {
+	testParseDump(t, nomatchnlTests, 0)
+}
+
+// Test Parse -> Dump.
+func testParseDump(t *testing.T, tests []parseTest, flags Flags) {
+	for _, tt := range tests {
+		re, err := Parse(tt.Regexp, flags)
+		if err != nil {
+			t.Errorf("Parse(%#q): %v", tt.Regexp, err)
+			continue
+		}
+		if tt.Dump == "" {
+			// It parsed. That's all we care about.
+			continue
+		}
+		d := dump(re)
+		if d != tt.Dump {
+			t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump)
+		}
+	}
+}
+
+// dump prints a string representation of the regexp showing
+// the structure explicitly.
+func dump(re *Regexp) string {
+	var b strings.Builder
+	dumpRegexp(&b, re)
+	return b.String()
+}
+
+var opNames = []string{
+	OpNoMatch:        "no",
+	OpEmptyMatch:     "emp",
+	OpLiteral:        "lit",
+	OpCharClass:      "cc",
+	OpAnyCharNotNL:   "dnl",
+	OpAnyChar:        "dot",
+	OpBeginLine:      "bol",
+	OpEndLine:        "eol",
+	OpBeginText:      "bot",
+	OpEndText:        "eot",
+	OpWordBoundary:   "wb",
+	OpNoWordBoundary: "nwb",
+	OpCapture:        "cap",
+	OpStar:           "star",
+	OpPlus:           "plus",
+	OpQuest:          "que",
+	OpRepeat:         "rep",
+	OpConcat:         "cat",
+	OpAlternate:      "alt",
+}
+
+// dumpRegexp writes an encoding of the syntax tree for the regexp re to b.
+// It is used during testing to distinguish between parses that might print
+// the same using re's String method.
+func dumpRegexp(b *strings.Builder, re *Regexp) {
+	if int(re.Op) >= len(opNames) || opNames[re.Op] == "" {
+		fmt.Fprintf(b, "op%d", re.Op)
+	} else {
+		switch re.Op {
+		default:
+			b.WriteString(opNames[re.Op])
+		case OpStar, OpPlus, OpQuest, OpRepeat:
+			if re.Flags&NonGreedy != 0 {
+				b.WriteByte('n')
+			}
+			b.WriteString(opNames[re.Op])
+		case OpLiteral:
+			if len(re.Rune) > 1 {
+				b.WriteString("str")
+			} else {
+				b.WriteString("lit")
+			}
+			if re.Flags&FoldCase != 0 {
+				for _, r := range re.Rune {
+					if unicode.SimpleFold(r) != r {
+						b.WriteString("fold")
+						break
+					}
+				}
+			}
+		}
+	}
+	b.WriteByte('{')
+	switch re.Op {
+	case OpEndText:
+		if re.Flags&WasDollar == 0 {
+			b.WriteString(`\z`)
+		}
+	case OpLiteral:
+		for _, r := range re.Rune {
+			b.WriteRune(r)
+		}
+	case OpConcat, OpAlternate:
+		for _, sub := range re.Sub {
+			dumpRegexp(b, sub)
+		}
+	case OpStar, OpPlus, OpQuest:
+		dumpRegexp(b, re.Sub[0])
+	case OpRepeat:
+		fmt.Fprintf(b, "%d,%d ", re.Min, re.Max)
+		dumpRegexp(b, re.Sub[0])
+	case OpCapture:
+		if re.Name != "" {
+			b.WriteString(re.Name)
+			b.WriteByte(':')
+		}
+		dumpRegexp(b, re.Sub[0])
+	case OpCharClass:
+		sep := ""
+		for i := 0; i < len(re.Rune); i += 2 {
+			b.WriteString(sep)
+			sep = " "
+			lo, hi := re.Rune[i], re.Rune[i+1]
+			if lo == hi {
+				fmt.Fprintf(b, "%#x", lo)
+			} else {
+				fmt.Fprintf(b, "%#x-%#x", lo, hi)
+			}
+		}
+	}
+	b.WriteByte('}')
+}
+
+func mkCharClass(f func(rune) bool) string {
+	re := &Regexp{Op: OpCharClass}
+	lo := rune(-1)
+	for i := rune(0); i <= unicode.MaxRune; i++ {
+		if f(i) {
+			if lo < 0 {
+				lo = i
+			}
+		} else {
+			if lo >= 0 {
+				re.Rune = append(re.Rune, lo, i-1)
+				lo = -1
+			}
+		}
+	}
+	if lo >= 0 {
+		re.Rune = append(re.Rune, lo, unicode.MaxRune)
+	}
+	return dump(re)
+}
+
+func isUpperFold(r rune) bool {
+	if unicode.IsUpper(r) {
+		return true
+	}
+	c := unicode.SimpleFold(r)
+	for c != r {
+		if unicode.IsUpper(c) {
+			return true
+		}
+		c = unicode.SimpleFold(c)
+	}
+	return false
+}
+
+func TestFoldConstants(t *testing.T) {
+	last := rune(-1)
+	for i := rune(0); i <= unicode.MaxRune; i++ {
+		if unicode.SimpleFold(i) == i {
+			continue
+		}
+		if last == -1 && minFold != i {
+			t.Errorf("minFold=%#U should be %#U", minFold, i)
+		}
+		last = i
+	}
+	if maxFold != last {
+		t.Errorf("maxFold=%#U should be %#U", maxFold, last)
+	}
+}
+
+func TestAppendRangeCollapse(t *testing.T) {
+	// AppendRange should collapse each of the new ranges
+	// into the earlier ones (it looks back two ranges), so that
+	// the slice never grows very large.
+	// Note that we are not calling cleanClass.
+	var r []rune
+	for i := rune('A'); i <= 'Z'; i++ {
+		r = appendRange(r, i, i)
+		r = appendRange(r, i+'a'-'A', i+'a'-'A')
+	}
+	if string(r) != "AZaz" {
+		t.Errorf("appendRange interlaced A-Z a-z = %s, want AZaz", string(r))
+	}
+}
+
+var invalidRegexps = []string{
+	`(`,
+	`)`,
+	`(a`,
+	`a)`,
+	`(a))`,
+	`(a|b|`,
+	`a|b|)`,
+	`(a|b|))`,
+	`(a|b`,
+	`a|b)`,
+	`(a|b))`,
+	`[a-z`,
+	`([a-z)`,
+	`[a-z)`,
+	`([a-z]))`,
+	`x{1001}`,
+	`x{9876543210}`,
+	`x{2,1}`,
+	`x{1,9876543210}`,
+	"\xff", // Invalid UTF-8
+	"[\xff]",
+	"[\\\xff]",
+	"\\\xff",
+	`(?P<name>a`,
+	`(?P<name>`,
+	`(?P<name`,
+	`(?P<x y>a)`,
+	`(?P<>a)`,
+	`[a-Z]`,
+	`(?i)[a-Z]`,
+	`\Q\E*`,
+	`a{100000}`,  // too much repetition
+	`a{100000,}`, // too much repetition
+	"((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}){2})",    // too much repetition
+	strings.Repeat("(", 1000) + strings.Repeat(")", 1000),    // too deep
+	strings.Repeat("(?:", 1000) + strings.Repeat(")*", 1000), // too deep
+	"(" + strings.Repeat("(xx?)", 1000) + "){1000}",          // too long
+	strings.Repeat("(xx?){1000}", 1000),                      // too long
+	strings.Repeat(`\pL`, 27000),                             // too many runes
+}
+
+var onlyPerl = []string{
+	`[a-b-c]`,
+	`\Qabc\E`,
+	`\Q*+?{[\E`,
+	`\Q\\E`,
+	`\Q\\\E`,
+	`\Q\\\\E`,
+	`\Q\\\\\E`,
+	`(?:a)`,
+	`(?P<name>a)`,
+}
+
+var onlyPOSIX = []string{
+	"a++",
+	"a**",
+	"a?*",
+	"a+*",
+	"a{1}*",
+	".{1}{2}.{3}",
+}
+
+func TestParseInvalidRegexps(t *testing.T) {
+	for _, regexp := range invalidRegexps {
+		if re, err := Parse(regexp, Perl); err == nil {
+			t.Errorf("Parse(%#q, Perl) = %s, should have failed", regexp, dump(re))
+		}
+		if re, err := Parse(regexp, POSIX); err == nil {
+			t.Errorf("Parse(%#q, POSIX) = %s, should have failed", regexp, dump(re))
+		}
+	}
+	for _, regexp := range onlyPerl {
+		if _, err := Parse(regexp, Perl); err != nil {
+			t.Errorf("Parse(%#q, Perl): %v", regexp, err)
+		}
+		if re, err := Parse(regexp, POSIX); err == nil {
+			t.Errorf("Parse(%#q, POSIX) = %s, should have failed", regexp, dump(re))
+		}
+	}
+	for _, regexp := range onlyPOSIX {
+		if re, err := Parse(regexp, Perl); err == nil {
+			t.Errorf("Parse(%#q, Perl) = %s, should have failed", regexp, dump(re))
+		}
+		if _, err := Parse(regexp, POSIX); err != nil {
+			t.Errorf("Parse(%#q, POSIX): %v", regexp, err)
+		}
+	}
+}
+
+func TestToStringEquivalentParse(t *testing.T) {
+	for _, tt := range parseTests {
+		re, err := Parse(tt.Regexp, testFlags)
+		if err != nil {
+			t.Errorf("Parse(%#q): %v", tt.Regexp, err)
+			continue
+		}
+		if tt.Dump == "" {
+			// It parsed. That's all we care about.
+			continue
+		}
+		d := dump(re)
+		if d != tt.Dump {
+			t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump)
+			continue
+		}
+
+		s := re.String()
+		if s != tt.Regexp {
+			// If ToString didn't return the original regexp,
+			// it must have found one with fewer parens.
+			// Unfortunately we can't check the length here, because
+			// ToString produces "\\{" for a literal brace,
+			// but "{" is a shorter equivalent in some contexts.
+			nre, err := Parse(s, testFlags)
+			if err != nil {
+				t.Errorf("Parse(%#q.String() = %#q): %v", tt.Regexp, s, err)
+				continue
+			}
+			nd := dump(nre)
+			if d != nd {
+				t.Errorf("Parse(%#q) -> %#q; %#q vs %#q", tt.Regexp, s, d, nd)
+			}
+
+			ns := nre.String()
+			if s != ns {
+				t.Errorf("Parse(%#q) -> %#q -> %#q", tt.Regexp, s, ns)
+			}
+		}
+	}
+}
diff --git a/src/regexp/syntax/perl_groups.go b/src/regexp/syntax/perl_groups.go
new file mode 100644
index 0000000..effe4e6
--- /dev/null
+++ b/src/regexp/syntax/perl_groups.go
@@ -0,0 +1,134 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
+// make_perl_groups.pl >perl_groups.go
+
+package syntax
+
+var code1 = []rune{ /* \d */
+	0x30, 0x39,
+}
+
+var code2 = []rune{ /* \s */
+	0x9, 0xa,
+	0xc, 0xd,
+	0x20, 0x20,
+}
+
+var code3 = []rune{ /* \w */
+	0x30, 0x39,
+	0x41, 0x5a,
+	0x5f, 0x5f,
+	0x61, 0x7a,
+}
+
+var perlGroup = map[string]charGroup{
+	`\d`: {+1, code1},
+	`\D`: {-1, code1},
+	`\s`: {+1, code2},
+	`\S`: {-1, code2},
+	`\w`: {+1, code3},
+	`\W`: {-1, code3},
+}
+var code4 = []rune{ /* [:alnum:] */
+	0x30, 0x39,
+	0x41, 0x5a,
+	0x61, 0x7a,
+}
+
+var code5 = []rune{ /* [:alpha:] */
+	0x41, 0x5a,
+	0x61, 0x7a,
+}
+
+var code6 = []rune{ /* [:ascii:] */
+	0x0, 0x7f,
+}
+
+var code7 = []rune{ /* [:blank:] */
+	0x9, 0x9,
+	0x20, 0x20,
+}
+
+var code8 = []rune{ /* [:cntrl:] */
+	0x0, 0x1f,
+	0x7f, 0x7f,
+}
+
+var code9 = []rune{ /* [:digit:] */
+	0x30, 0x39,
+}
+
+var code10 = []rune{ /* [:graph:] */
+	0x21, 0x7e,
+}
+
+var code11 = []rune{ /* [:lower:] */
+	0x61, 0x7a,
+}
+
+var code12 = []rune{ /* [:print:] */
+	0x20, 0x7e,
+}
+
+var code13 = []rune{ /* [:punct:] */
+	0x21, 0x2f,
+	0x3a, 0x40,
+	0x5b, 0x60,
+	0x7b, 0x7e,
+}
+
+var code14 = []rune{ /* [:space:] */
+	0x9, 0xd,
+	0x20, 0x20,
+}
+
+var code15 = []rune{ /* [:upper:] */
+	0x41, 0x5a,
+}
+
+var code16 = []rune{ /* [:word:] */
+	0x30, 0x39,
+	0x41, 0x5a,
+	0x5f, 0x5f,
+	0x61, 0x7a,
+}
+
+var code17 = []rune{ /* [:xdigit:] */
+	0x30, 0x39,
+	0x41, 0x46,
+	0x61, 0x66,
+}
+
+var posixGroup = map[string]charGroup{
+	`[:alnum:]`:   {+1, code4},
+	`[:^alnum:]`:  {-1, code4},
+	`[:alpha:]`:   {+1, code5},
+	`[:^alpha:]`:  {-1, code5},
+	`[:ascii:]`:   {+1, code6},
+	`[:^ascii:]`:  {-1, code6},
+	`[:blank:]`:   {+1, code7},
+	`[:^blank:]`:  {-1, code7},
+	`[:cntrl:]`:   {+1, code8},
+	`[:^cntrl:]`:  {-1, code8},
+	`[:digit:]`:   {+1, code9},
+	`[:^digit:]`:  {-1, code9},
+	`[:graph:]`:   {+1, code10},
+	`[:^graph:]`:  {-1, code10},
+	`[:lower:]`:   {+1, code11},
+	`[:^lower:]`:  {-1, code11},
+	`[:print:]`:   {+1, code12},
+	`[:^print:]`:  {-1, code12},
+	`[:punct:]`:   {+1, code13},
+	`[:^punct:]`:  {-1, code13},
+	`[:space:]`:   {+1, code14},
+	`[:^space:]`:  {-1, code14},
+	`[:upper:]`:   {+1, code15},
+	`[:^upper:]`:  {-1, code15},
+	`[:word:]`:    {+1, code16},
+	`[:^word:]`:   {-1, code16},
+	`[:xdigit:]`:  {+1, code17},
+	`[:^xdigit:]`: {-1, code17},
+}
diff --git a/src/regexp/syntax/prog.go b/src/regexp/syntax/prog.go
new file mode 100644
index 0000000..896cdc4
--- /dev/null
+++ b/src/regexp/syntax/prog.go
@@ -0,0 +1,347 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syntax
+
+import (
+	"strconv"
+	"strings"
+	"unicode"
+	"unicode/utf8"
+)
+
+// Compiled program.
+// May not belong in this package, but convenient for now.
+
+// A Prog is a compiled regular expression program.
+type Prog struct {
+	Inst   []Inst
+	Start  int // index of start instruction
+	NumCap int // number of InstCapture insts in re
+}
+
+// An InstOp is an instruction opcode.
+type InstOp uint8
+
+const (
+	InstAlt InstOp = iota
+	InstAltMatch
+	InstCapture
+	InstEmptyWidth
+	InstMatch
+	InstFail
+	InstNop
+	InstRune
+	InstRune1
+	InstRuneAny
+	InstRuneAnyNotNL
+)
+
+var instOpNames = []string{
+	"InstAlt",
+	"InstAltMatch",
+	"InstCapture",
+	"InstEmptyWidth",
+	"InstMatch",
+	"InstFail",
+	"InstNop",
+	"InstRune",
+	"InstRune1",
+	"InstRuneAny",
+	"InstRuneAnyNotNL",
+}
+
+func (i InstOp) String() string {
+	if uint(i) >= uint(len(instOpNames)) {
+		return ""
+	}
+	return instOpNames[i]
+}
+
+// An EmptyOp specifies a kind or mixture of zero-width assertions.
+type EmptyOp uint8
+
+const (
+	EmptyBeginLine EmptyOp = 1 << iota
+	EmptyEndLine
+	EmptyBeginText
+	EmptyEndText
+	EmptyWordBoundary
+	EmptyNoWordBoundary
+)
+
+// EmptyOpContext returns the zero-width assertions
+// satisfied at the position between the runes r1 and r2.
+// Passing r1 == -1 indicates that the position is
+// at the beginning of the text.
+// Passing r2 == -1 indicates that the position is
+// at the end of the text.
+func EmptyOpContext(r1, r2 rune) EmptyOp {
+	var op EmptyOp = EmptyNoWordBoundary
+	var boundary byte
+	switch {
+	case IsWordChar(r1):
+		boundary = 1
+	case r1 == '\n':
+		op |= EmptyBeginLine
+	case r1 < 0:
+		op |= EmptyBeginText | EmptyBeginLine
+	}
+	switch {
+	case IsWordChar(r2):
+		boundary ^= 1
+	case r2 == '\n':
+		op |= EmptyEndLine
+	case r2 < 0:
+		op |= EmptyEndText | EmptyEndLine
+	}
+	if boundary != 0 { // IsWordChar(r1) != IsWordChar(r2)
+		op ^= (EmptyWordBoundary | EmptyNoWordBoundary)
+	}
+	return op
+}
+
+// IsWordChar reports whether r is considered a “word character”
+// during the evaluation of the \b and \B zero-width assertions.
+// These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
+func IsWordChar(r rune) bool {
+	return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_'
+}
+
+// An Inst is a single instruction in a regular expression program.
+type Inst struct {
+	Op   InstOp
+	Out  uint32 // all but InstMatch, InstFail
+	Arg  uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth
+	Rune []rune
+}
+
+func (p *Prog) String() string {
+	var b strings.Builder
+	dumpProg(&b, p)
+	return b.String()
+}
+
+// skipNop follows any no-op or capturing instructions.
+func (p *Prog) skipNop(pc uint32) *Inst {
+	i := &p.Inst[pc]
+	for i.Op == InstNop || i.Op == InstCapture {
+		i = &p.Inst[i.Out]
+	}
+	return i
+}
+
+// op returns i.Op but merges all the Rune special cases into InstRune
+func (i *Inst) op() InstOp {
+	op := i.Op
+	switch op {
+	case InstRune1, InstRuneAny, InstRuneAnyNotNL:
+		op = InstRune
+	}
+	return op
+}
+
+// Prefix returns a literal string that all matches for the
+// regexp must start with. Complete is true if the prefix
+// is the entire match.
+func (p *Prog) Prefix() (prefix string, complete bool) {
+	i := p.skipNop(uint32(p.Start))
+
+	// Avoid allocation of buffer if prefix is empty.
+	if i.op() != InstRune || len(i.Rune) != 1 {
+		return "", i.Op == InstMatch
+	}
+
+	// Have prefix; gather characters.
+	var buf strings.Builder
+	for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 && i.Rune[0] != utf8.RuneError {
+		buf.WriteRune(i.Rune[0])
+		i = p.skipNop(i.Out)
+	}
+	return buf.String(), i.Op == InstMatch
+}
+
+// StartCond returns the leading empty-width conditions that must
+// be true in any match. It returns ^EmptyOp(0) if no matches are possible.
+func (p *Prog) StartCond() EmptyOp {
+	var flag EmptyOp
+	pc := uint32(p.Start)
+	i := &p.Inst[pc]
+Loop:
+	for {
+		switch i.Op {
+		case InstEmptyWidth:
+			flag |= EmptyOp(i.Arg)
+		case InstFail:
+			return ^EmptyOp(0)
+		case InstCapture, InstNop:
+			// skip
+		default:
+			break Loop
+		}
+		pc = i.Out
+		i = &p.Inst[pc]
+	}
+	return flag
+}
+
+const noMatch = -1
+
+// MatchRune reports whether the instruction matches (and consumes) r.
+// It should only be called when i.Op == InstRune.
+func (i *Inst) MatchRune(r rune) bool {
+	return i.MatchRunePos(r) != noMatch
+}
+
+// MatchRunePos checks whether the instruction matches (and consumes) r.
+// If so, MatchRunePos returns the index of the matching rune pair
+// (or, when len(i.Rune) == 1, rune singleton).
+// If not, MatchRunePos returns -1.
+// MatchRunePos should only be called when i.Op == InstRune.
+func (i *Inst) MatchRunePos(r rune) int {
+	rune := i.Rune
+
+	switch len(rune) {
+	case 0:
+		return noMatch
+
+	case 1:
+		// Special case: single-rune slice is from literal string, not char class.
+		r0 := rune[0]
+		if r == r0 {
+			return 0
+		}
+		if Flags(i.Arg)&FoldCase != 0 {
+			for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
+				if r == r1 {
+					return 0
+				}
+			}
+		}
+		return noMatch
+
+	case 2:
+		if r >= rune[0] && r <= rune[1] {
+			return 0
+		}
+		return noMatch
+
+	case 4, 6, 8:
+		// Linear search for a few pairs.
+		// Should handle ASCII well.
+		for j := 0; j < len(rune); j += 2 {
+			if r < rune[j] {
+				return noMatch
+			}
+			if r <= rune[j+1] {
+				return j / 2
+			}
+		}
+		return noMatch
+	}
+
+	// Otherwise binary search.
+	lo := 0
+	hi := len(rune) / 2
+	for lo < hi {
+		m := lo + (hi-lo)/2
+		if c := rune[2*m]; c <= r {
+			if r <= rune[2*m+1] {
+				return m
+			}
+			lo = m + 1
+		} else {
+			hi = m
+		}
+	}
+	return noMatch
+}
+
+// MatchEmptyWidth reports whether the instruction matches
+// an empty string between the runes before and after.
+// It should only be called when i.Op == InstEmptyWidth.
+func (i *Inst) MatchEmptyWidth(before rune, after rune) bool {
+	switch EmptyOp(i.Arg) {
+	case EmptyBeginLine:
+		return before == '\n' || before == -1
+	case EmptyEndLine:
+		return after == '\n' || after == -1
+	case EmptyBeginText:
+		return before == -1
+	case EmptyEndText:
+		return after == -1
+	case EmptyWordBoundary:
+		return IsWordChar(before) != IsWordChar(after)
+	case EmptyNoWordBoundary:
+		return IsWordChar(before) == IsWordChar(after)
+	}
+	panic("unknown empty width arg")
+}
+
+func (i *Inst) String() string {
+	var b strings.Builder
+	dumpInst(&b, i)
+	return b.String()
+}
+
+func bw(b *strings.Builder, args ...string) {
+	for _, s := range args {
+		b.WriteString(s)
+	}
+}
+
+func dumpProg(b *strings.Builder, p *Prog) {
+	for j := range p.Inst {
+		i := &p.Inst[j]
+		pc := strconv.Itoa(j)
+		if len(pc) < 3 {
+			b.WriteString("   "[len(pc):])
+		}
+		if j == p.Start {
+			pc += "*"
+		}
+		bw(b, pc, "\t")
+		dumpInst(b, i)
+		bw(b, "\n")
+	}
+}
+
+func u32(i uint32) string {
+	return strconv.FormatUint(uint64(i), 10)
+}
+
+func dumpInst(b *strings.Builder, i *Inst) {
+	switch i.Op {
+	case InstAlt:
+		bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg))
+	case InstAltMatch:
+		bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg))
+	case InstCapture:
+		bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out))
+	case InstEmptyWidth:
+		bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out))
+	case InstMatch:
+		bw(b, "match")
+	case InstFail:
+		bw(b, "fail")
+	case InstNop:
+		bw(b, "nop -> ", u32(i.Out))
+	case InstRune:
+		if i.Rune == nil {
+			// shouldn't happen
+			bw(b, "rune <nil>")
+		}
+		bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune)))
+		if Flags(i.Arg)&FoldCase != 0 {
+			bw(b, "/i")
+		}
+		bw(b, " -> ", u32(i.Out))
+	case InstRune1:
+		bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out))
+	case InstRuneAny:
+		bw(b, "any -> ", u32(i.Out))
+	case InstRuneAnyNotNL:
+		bw(b, "anynotnl -> ", u32(i.Out))
+	}
+}
diff --git a/src/regexp/syntax/prog_test.go b/src/regexp/syntax/prog_test.go
new file mode 100644
index 0000000..5603aea
--- /dev/null
+++ b/src/regexp/syntax/prog_test.go
@@ -0,0 +1,129 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syntax
+
+import "testing"
+
+var compileTests = []struct {
+	Regexp string
+	Prog   string
+}{
+	{"a", `  0	fail
+  1*	rune1 "a" -> 2
+  2	match
+`},
+	{"[A-M][n-z]", `  0	fail
+  1*	rune "AM" -> 2
+  2	rune "nz" -> 3
+  3	match
+`},
+	{"", `  0	fail
+  1*	nop -> 2
+  2	match
+`},
+	{"a?", `  0	fail
+  1	rune1 "a" -> 3
+  2*	alt -> 1, 3
+  3	match
+`},
+	{"a??", `  0	fail
+  1	rune1 "a" -> 3
+  2*	alt -> 3, 1
+  3	match
+`},
+	{"a+", `  0	fail
+  1*	rune1 "a" -> 2
+  2	alt -> 1, 3
+  3	match
+`},
+	{"a+?", `  0	fail
+  1*	rune1 "a" -> 2
+  2	alt -> 3, 1
+  3	match
+`},
+	{"a*", `  0	fail
+  1	rune1 "a" -> 2
+  2*	alt -> 1, 3
+  3	match
+`},
+	{"a*?", `  0	fail
+  1	rune1 "a" -> 2
+  2*	alt -> 3, 1
+  3	match
+`},
+	{"a+b+", `  0	fail
+  1*	rune1 "a" -> 2
+  2	alt -> 1, 3
+  3	rune1 "b" -> 4
+  4	alt -> 3, 5
+  5	match
+`},
+	{"(a+)(b+)", `  0	fail
+  1*	cap 2 -> 2
+  2	rune1 "a" -> 3
+  3	alt -> 2, 4
+  4	cap 3 -> 5
+  5	cap 4 -> 6
+  6	rune1 "b" -> 7
+  7	alt -> 6, 8
+  8	cap 5 -> 9
+  9	match
+`},
+	{"a+|b+", `  0	fail
+  1	rune1 "a" -> 2
+  2	alt -> 1, 6
+  3	rune1 "b" -> 4
+  4	alt -> 3, 6
+  5*	alt -> 1, 3
+  6	match
+`},
+	{"A[Aa]", `  0	fail
+  1*	rune1 "A" -> 2
+  2	rune "A"/i -> 3
+  3	match
+`},
+	{"(?:(?:^).)", `  0	fail
+  1*	empty 4 -> 2
+  2	anynotnl -> 3
+  3	match
+`},
+	{"(?:|a)+", `  0	fail
+  1	nop -> 4
+  2	rune1 "a" -> 4
+  3*	alt -> 1, 2
+  4	alt -> 3, 5
+  5	match
+`},
+	{"(?:|a)*", `  0	fail
+  1	nop -> 4
+  2	rune1 "a" -> 4
+  3	alt -> 1, 2
+  4	alt -> 3, 6
+  5*	alt -> 3, 6
+  6	match
+`},
+}
+
+func TestCompile(t *testing.T) {
+	for _, tt := range compileTests {
+		re, _ := Parse(tt.Regexp, Perl)
+		p, _ := Compile(re)
+		s := p.String()
+		if s != tt.Prog {
+			t.Errorf("compiled %#q:\n--- have\n%s---\n--- want\n%s---", tt.Regexp, s, tt.Prog)
+		}
+	}
+}
+
+func BenchmarkEmptyOpContext(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		var r1 rune = -1
+		for _, r2 := range "foo, bar, baz\nsome input text.\n" {
+			EmptyOpContext(r1, r2)
+			r1 = r2
+		}
+		EmptyOpContext(r1, -1)
+	}
+}
diff --git a/src/regexp/syntax/regexp.go b/src/regexp/syntax/regexp.go
new file mode 100644
index 0000000..3a4d2d2
--- /dev/null
+++ b/src/regexp/syntax/regexp.go
@@ -0,0 +1,320 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syntax
+
+// Note to implementers:
+// In this package, re is always a *Regexp and r is always a rune.
+
+import (
+	"strconv"
+	"strings"
+	"unicode"
+)
+
+// A Regexp is a node in a regular expression syntax tree.
+type Regexp struct {
+	Op       Op // operator
+	Flags    Flags
+	Sub      []*Regexp  // subexpressions, if any
+	Sub0     [1]*Regexp // storage for short Sub
+	Rune     []rune     // matched runes, for OpLiteral, OpCharClass
+	Rune0    [2]rune    // storage for short Rune
+	Min, Max int        // min, max for OpRepeat
+	Cap      int        // capturing index, for OpCapture
+	Name     string     // capturing name, for OpCapture
+}
+
+//go:generate stringer -type Op -trimprefix Op
+
+// An Op is a single regular expression operator.
+type Op uint8
+
+// Operators are listed in precedence order, tightest binding to weakest.
+// Character class operators are listed simplest to most complex
+// (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar).
+
+const (
+	OpNoMatch        Op = 1 + iota // matches no strings
+	OpEmptyMatch                   // matches empty string
+	OpLiteral                      // matches Runes sequence
+	OpCharClass                    // matches Runes interpreted as range pair list
+	OpAnyCharNotNL                 // matches any character except newline
+	OpAnyChar                      // matches any character
+	OpBeginLine                    // matches empty string at beginning of line
+	OpEndLine                      // matches empty string at end of line
+	OpBeginText                    // matches empty string at beginning of text
+	OpEndText                      // matches empty string at end of text
+	OpWordBoundary                 // matches word boundary `\b`
+	OpNoWordBoundary               // matches word non-boundary `\B`
+	OpCapture                      // capturing subexpression with index Cap, optional name Name
+	OpStar                         // matches Sub[0] zero or more times
+	OpPlus                         // matches Sub[0] one or more times
+	OpQuest                        // matches Sub[0] zero or one times
+	OpRepeat                       // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit)
+	OpConcat                       // matches concatenation of Subs
+	OpAlternate                    // matches alternation of Subs
+)
+
+const opPseudo Op = 128 // where pseudo-ops start
+
+// Equal reports whether x and y have identical structure.
+func (x *Regexp) Equal(y *Regexp) bool {
+	if x == nil || y == nil {
+		return x == y
+	}
+	if x.Op != y.Op {
+		return false
+	}
+	switch x.Op {
+	case OpEndText:
+		// The parse flags remember whether this is \z or \Z.
+		if x.Flags&WasDollar != y.Flags&WasDollar {
+			return false
+		}
+
+	case OpLiteral, OpCharClass:
+		if len(x.Rune) != len(y.Rune) {
+			return false
+		}
+		for i, r := range x.Rune {
+			if r != y.Rune[i] {
+				return false
+			}
+		}
+
+	case OpAlternate, OpConcat:
+		if len(x.Sub) != len(y.Sub) {
+			return false
+		}
+		for i, sub := range x.Sub {
+			if !sub.Equal(y.Sub[i]) {
+				return false
+			}
+		}
+
+	case OpStar, OpPlus, OpQuest:
+		if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) {
+			return false
+		}
+
+	case OpRepeat:
+		if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) {
+			return false
+		}
+
+	case OpCapture:
+		if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) {
+			return false
+		}
+	}
+	return true
+}
+
+// writeRegexp writes the Perl syntax for the regular expression re to b.
+func writeRegexp(b *strings.Builder, re *Regexp) {
+	switch re.Op {
+	default:
+		b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
+	case OpNoMatch:
+		b.WriteString(`[^\x00-\x{10FFFF}]`)
+	case OpEmptyMatch:
+		b.WriteString(`(?:)`)
+	case OpLiteral:
+		if re.Flags&FoldCase != 0 {
+			b.WriteString(`(?i:`)
+		}
+		for _, r := range re.Rune {
+			escape(b, r, false)
+		}
+		if re.Flags&FoldCase != 0 {
+			b.WriteString(`)`)
+		}
+	case OpCharClass:
+		if len(re.Rune)%2 != 0 {
+			b.WriteString(`[invalid char class]`)
+			break
+		}
+		b.WriteRune('[')
+		if len(re.Rune) == 0 {
+			b.WriteString(`^\x00-\x{10FFFF}`)
+		} else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune && len(re.Rune) > 2 {
+			// Contains 0 and MaxRune. Probably a negated class.
+			// Print the gaps.
+			b.WriteRune('^')
+			for i := 1; i < len(re.Rune)-1; i += 2 {
+				lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
+				escape(b, lo, lo == '-')
+				if lo != hi {
+					b.WriteRune('-')
+					escape(b, hi, hi == '-')
+				}
+			}
+		} else {
+			for i := 0; i < len(re.Rune); i += 2 {
+				lo, hi := re.Rune[i], re.Rune[i+1]
+				escape(b, lo, lo == '-')
+				if lo != hi {
+					b.WriteRune('-')
+					escape(b, hi, hi == '-')
+				}
+			}
+		}
+		b.WriteRune(']')
+	case OpAnyCharNotNL:
+		b.WriteString(`(?-s:.)`)
+	case OpAnyChar:
+		b.WriteString(`(?s:.)`)
+	case OpBeginLine:
+		b.WriteString(`(?m:^)`)
+	case OpEndLine:
+		b.WriteString(`(?m:$)`)
+	case OpBeginText:
+		b.WriteString(`\A`)
+	case OpEndText:
+		if re.Flags&WasDollar != 0 {
+			b.WriteString(`(?-m:$)`)
+		} else {
+			b.WriteString(`\z`)
+		}
+	case OpWordBoundary:
+		b.WriteString(`\b`)
+	case OpNoWordBoundary:
+		b.WriteString(`\B`)
+	case OpCapture:
+		if re.Name != "" {
+			b.WriteString(`(?P<`)
+			b.WriteString(re.Name)
+			b.WriteRune('>')
+		} else {
+			b.WriteRune('(')
+		}
+		if re.Sub[0].Op != OpEmptyMatch {
+			writeRegexp(b, re.Sub[0])
+		}
+		b.WriteRune(')')
+	case OpStar, OpPlus, OpQuest, OpRepeat:
+		if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
+			b.WriteString(`(?:`)
+			writeRegexp(b, sub)
+			b.WriteString(`)`)
+		} else {
+			writeRegexp(b, sub)
+		}
+		switch re.Op {
+		case OpStar:
+			b.WriteRune('*')
+		case OpPlus:
+			b.WriteRune('+')
+		case OpQuest:
+			b.WriteRune('?')
+		case OpRepeat:
+			b.WriteRune('{')
+			b.WriteString(strconv.Itoa(re.Min))
+			if re.Max != re.Min {
+				b.WriteRune(',')
+				if re.Max >= 0 {
+					b.WriteString(strconv.Itoa(re.Max))
+				}
+			}
+			b.WriteRune('}')
+		}
+		if re.Flags&NonGreedy != 0 {
+			b.WriteRune('?')
+		}
+	case OpConcat:
+		for _, sub := range re.Sub {
+			if sub.Op == OpAlternate {
+				b.WriteString(`(?:`)
+				writeRegexp(b, sub)
+				b.WriteString(`)`)
+			} else {
+				writeRegexp(b, sub)
+			}
+		}
+	case OpAlternate:
+		for i, sub := range re.Sub {
+			if i > 0 {
+				b.WriteRune('|')
+			}
+			writeRegexp(b, sub)
+		}
+	}
+}
+
+func (re *Regexp) String() string {
+	var b strings.Builder
+	writeRegexp(&b, re)
+	return b.String()
+}
+
+const meta = `\.+*?()|[]{}^$`
+
+func escape(b *strings.Builder, r rune, force bool) {
+	if unicode.IsPrint(r) {
+		if strings.ContainsRune(meta, r) || force {
+			b.WriteRune('\\')
+		}
+		b.WriteRune(r)
+		return
+	}
+
+	switch r {
+	case '\a':
+		b.WriteString(`\a`)
+	case '\f':
+		b.WriteString(`\f`)
+	case '\n':
+		b.WriteString(`\n`)
+	case '\r':
+		b.WriteString(`\r`)
+	case '\t':
+		b.WriteString(`\t`)
+	case '\v':
+		b.WriteString(`\v`)
+	default:
+		if r < 0x100 {
+			b.WriteString(`\x`)
+			s := strconv.FormatInt(int64(r), 16)
+			if len(s) == 1 {
+				b.WriteRune('0')
+			}
+			b.WriteString(s)
+			break
+		}
+		b.WriteString(`\x{`)
+		b.WriteString(strconv.FormatInt(int64(r), 16))
+		b.WriteString(`}`)
+	}
+}
+
+// MaxCap walks the regexp to find the maximum capture index.
+func (re *Regexp) MaxCap() int {
+	m := 0
+	if re.Op == OpCapture {
+		m = re.Cap
+	}
+	for _, sub := range re.Sub {
+		if n := sub.MaxCap(); m < n {
+			m = n
+		}
+	}
+	return m
+}
+
+// CapNames walks the regexp to find the names of capturing groups.
+func (re *Regexp) CapNames() []string {
+	names := make([]string, re.MaxCap()+1)
+	re.capNames(names)
+	return names
+}
+
+func (re *Regexp) capNames(names []string) {
+	if re.Op == OpCapture {
+		names[re.Cap] = re.Name
+	}
+	for _, sub := range re.Sub {
+		sub.capNames(names)
+	}
+}
diff --git a/src/regexp/syntax/simplify.go b/src/regexp/syntax/simplify.go
new file mode 100644
index 0000000..e439325
--- /dev/null
+++ b/src/regexp/syntax/simplify.go
@@ -0,0 +1,151 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syntax
+
+// Simplify returns a regexp equivalent to re but without counted repetitions
+// and with various other simplifications, such as rewriting /(?:a+)+/ to /a+/.
+// The resulting regexp will execute correctly but its string representation
+// will not produce the same parse tree, because capturing parentheses
+// may have been duplicated or removed. For example, the simplified form
+// for /(x){1,2}/ is /(x)(x)?/ but both parentheses capture as $1.
+// The returned regexp may share structure with or be the original.
+func (re *Regexp) Simplify() *Regexp {
+	if re == nil {
+		return nil
+	}
+	switch re.Op {
+	case OpCapture, OpConcat, OpAlternate:
+		// Simplify children, building new Regexp if children change.
+		nre := re
+		for i, sub := range re.Sub {
+			nsub := sub.Simplify()
+			if nre == re && nsub != sub {
+				// Start a copy.
+				nre = new(Regexp)
+				*nre = *re
+				nre.Rune = nil
+				nre.Sub = append(nre.Sub0[:0], re.Sub[:i]...)
+			}
+			if nre != re {
+				nre.Sub = append(nre.Sub, nsub)
+			}
+		}
+		return nre
+
+	case OpStar, OpPlus, OpQuest:
+		sub := re.Sub[0].Simplify()
+		return simplify1(re.Op, re.Flags, sub, re)
+
+	case OpRepeat:
+		// Special special case: x{0} matches the empty string
+		// and doesn't even need to consider x.
+		if re.Min == 0 && re.Max == 0 {
+			return &Regexp{Op: OpEmptyMatch}
+		}
+
+		// The fun begins.
+		sub := re.Sub[0].Simplify()
+
+		// x{n,} means at least n matches of x.
+		if re.Max == -1 {
+			// Special case: x{0,} is x*.
+			if re.Min == 0 {
+				return simplify1(OpStar, re.Flags, sub, nil)
+			}
+
+			// Special case: x{1,} is x+.
+			if re.Min == 1 {
+				return simplify1(OpPlus, re.Flags, sub, nil)
+			}
+
+			// General case: x{4,} is xxxx+.
+			nre := &Regexp{Op: OpConcat}
+			nre.Sub = nre.Sub0[:0]
+			for i := 0; i < re.Min-1; i++ {
+				nre.Sub = append(nre.Sub, sub)
+			}
+			nre.Sub = append(nre.Sub, simplify1(OpPlus, re.Flags, sub, nil))
+			return nre
+		}
+
+		// Special case x{0} handled above.
+
+		// Special case: x{1} is just x.
+		if re.Min == 1 && re.Max == 1 {
+			return sub
+		}
+
+		// General case: x{n,m} means n copies of x and m copies of x?
+		// The machine will do less work if we nest the final m copies,
+		// so that x{2,5} = xx(x(x(x)?)?)?
+
+		// Build leading prefix: xx.
+		var prefix *Regexp
+		if re.Min > 0 {
+			prefix = &Regexp{Op: OpConcat}
+			prefix.Sub = prefix.Sub0[:0]
+			for i := 0; i < re.Min; i++ {
+				prefix.Sub = append(prefix.Sub, sub)
+			}
+		}
+
+		// Build and attach suffix: (x(x(x)?)?)?
+		if re.Max > re.Min {
+			suffix := simplify1(OpQuest, re.Flags, sub, nil)
+			for i := re.Min + 1; i < re.Max; i++ {
+				nre2 := &Regexp{Op: OpConcat}
+				nre2.Sub = append(nre2.Sub0[:0], sub, suffix)
+				suffix = simplify1(OpQuest, re.Flags, nre2, nil)
+			}
+			if prefix == nil {
+				return suffix
+			}
+			prefix.Sub = append(prefix.Sub, suffix)
+		}
+		if prefix != nil {
+			return prefix
+		}
+
+		// Some degenerate case like min > max or min < max < 0.
+		// Handle as impossible match.
+		return &Regexp{Op: OpNoMatch}
+	}
+
+	return re
+}
+
+// simplify1 implements Simplify for the unary OpStar,
+// OpPlus, and OpQuest operators. It returns the simple regexp
+// equivalent to
+//
+//	Regexp{Op: op, Flags: flags, Sub: {sub}}
+//
+// under the assumption that sub is already simple, and
+// without first allocating that structure. If the regexp
+// to be returned turns out to be equivalent to re, simplify1
+// returns re instead.
+//
+// simplify1 is factored out of Simplify because the implementation
+// for other operators generates these unary expressions.
+// Letting them call simplify1 makes sure the expressions they
+// generate are simple.
+func simplify1(op Op, flags Flags, sub, re *Regexp) *Regexp {
+	// Special case: repeat the empty string as much as
+	// you want, but it's still the empty string.
+	if sub.Op == OpEmptyMatch {
+		return sub
+	}
+	// The operators are idempotent if the flags match.
+	if op == sub.Op && flags&NonGreedy == sub.Flags&NonGreedy {
+		return sub
+	}
+	if re != nil && re.Op == op && re.Flags&NonGreedy == flags&NonGreedy && sub == re.Sub[0] {
+		return re
+	}
+
+	re = &Regexp{Op: op, Flags: flags}
+	re.Sub = append(re.Sub0[:0], sub)
+	return re
+}
diff --git a/src/regexp/syntax/simplify_test.go b/src/regexp/syntax/simplify_test.go
new file mode 100644
index 0000000..9877db3
--- /dev/null
+++ b/src/regexp/syntax/simplify_test.go
@@ -0,0 +1,151 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syntax
+
+import "testing"
+
+var simplifyTests = []struct {
+	Regexp string
+	Simple string
+}{
+	// Already-simple constructs
+	{`a`, `a`},
+	{`ab`, `ab`},
+	{`a|b`, `[a-b]`},
+	{`ab|cd`, `ab|cd`},
+	{`(ab)*`, `(ab)*`},
+	{`(ab)+`, `(ab)+`},
+	{`(ab)?`, `(ab)?`},
+	{`.`, `(?s:.)`},
+	{`^`, `(?m:^)`},
+	{`$`, `(?m:$)`},
+	{`[ac]`, `[ac]`},
+	{`[^ac]`, `[^ac]`},
+
+	// Posix character classes
+	{`[[:alnum:]]`, `[0-9A-Za-z]`},
+	{`[[:alpha:]]`, `[A-Za-z]`},
+	{`[[:blank:]]`, `[\t ]`},
+	{`[[:cntrl:]]`, `[\x00-\x1f\x7f]`},
+	{`[[:digit:]]`, `[0-9]`},
+	{`[[:graph:]]`, `[!-~]`},
+	{`[[:lower:]]`, `[a-z]`},
+	{`[[:print:]]`, `[ -~]`},
+	{`[[:punct:]]`, "[!-/:-@\\[-`\\{-~]"},
+	{`[[:space:]]`, `[\t-\r ]`},
+	{`[[:upper:]]`, `[A-Z]`},
+	{`[[:xdigit:]]`, `[0-9A-Fa-f]`},
+
+	// Perl character classes
+	{`\d`, `[0-9]`},
+	{`\s`, `[\t-\n\f-\r ]`},
+	{`\w`, `[0-9A-Z_a-z]`},
+	{`\D`, `[^0-9]`},
+	{`\S`, `[^\t-\n\f-\r ]`},
+	{`\W`, `[^0-9A-Z_a-z]`},
+	{`[\d]`, `[0-9]`},
+	{`[\s]`, `[\t-\n\f-\r ]`},
+	{`[\w]`, `[0-9A-Z_a-z]`},
+	{`[\D]`, `[^0-9]`},
+	{`[\S]`, `[^\t-\n\f-\r ]`},
+	{`[\W]`, `[^0-9A-Z_a-z]`},
+
+	// Posix repetitions
+	{`a{1}`, `a`},
+	{`a{2}`, `aa`},
+	{`a{5}`, `aaaaa`},
+	{`a{0,1}`, `a?`},
+	// The next three are illegible because Simplify inserts (?:)
+	// parens instead of () parens to avoid creating extra
+	// captured subexpressions. The comments show a version with fewer parens.
+	{`(a){0,2}`, `(?:(a)(a)?)?`},                       //       (aa?)?
+	{`(a){0,4}`, `(?:(a)(?:(a)(?:(a)(a)?)?)?)?`},       //   (a(a(aa?)?)?)?
+	{`(a){2,6}`, `(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // aa(a(a(aa?)?)?)?
+	{`a{0,2}`, `(?:aa?)?`},                             //       (aa?)?
+	{`a{0,4}`, `(?:a(?:a(?:aa?)?)?)?`},                 //   (a(a(aa?)?)?)?
+	{`a{2,6}`, `aa(?:a(?:a(?:aa?)?)?)?`},               // aa(a(a(aa?)?)?)?
+	{`a{0,}`, `a*`},
+	{`a{1,}`, `a+`},
+	{`a{2,}`, `aa+`},
+	{`a{5,}`, `aaaaa+`},
+
+	// Test that operators simplify their arguments.
+	{`(?:a{1,}){1,}`, `a+`},
+	{`(a{1,}b{1,})`, `(a+b+)`},
+	{`a{1,}|b{1,}`, `a+|b+`},
+	{`(?:a{1,})*`, `(?:a+)*`},
+	{`(?:a{1,})+`, `a+`},
+	{`(?:a{1,})?`, `(?:a+)?`},
+	{``, `(?:)`},
+	{`a{0}`, `(?:)`},
+
+	// Character class simplification
+	{`[ab]`, `[a-b]`},
+	{`[a-za-za-z]`, `[a-z]`},
+	{`[A-Za-zA-Za-z]`, `[A-Za-z]`},
+	{`[ABCDEFGH]`, `[A-H]`},
+	{`[AB-CD-EF-GH]`, `[A-H]`},
+	{`[W-ZP-XE-R]`, `[E-Z]`},
+	{`[a-ee-gg-m]`, `[a-m]`},
+	{`[a-ea-ha-m]`, `[a-m]`},
+	{`[a-ma-ha-e]`, `[a-m]`},
+	{`[a-zA-Z0-9 -~]`, `[ -~]`},
+
+	// Empty character classes
+	{`[^[:cntrl:][:^cntrl:]]`, `[^\x00-\x{10FFFF}]`},
+
+	// Full character classes
+	{`[[:cntrl:][:^cntrl:]]`, `(?s:.)`},
+
+	// Unicode case folding.
+	{`(?i)A`, `(?i:A)`},
+	{`(?i)a`, `(?i:A)`},
+	{`(?i)[A]`, `(?i:A)`},
+	{`(?i)[a]`, `(?i:A)`},
+	{`(?i)K`, `(?i:K)`},
+	{`(?i)k`, `(?i:K)`},
+	{`(?i)\x{212a}`, "(?i:K)"},
+	{`(?i)[K]`, "[Kk\u212A]"},
+	{`(?i)[k]`, "[Kk\u212A]"},
+	{`(?i)[\x{212a}]`, "[Kk\u212A]"},
+	{`(?i)[a-z]`, "[A-Za-z\u017F\u212A]"},
+	{`(?i)[\x00-\x{FFFD}]`, "[\\x00-\uFFFD]"},
+	{`(?i)[\x00-\x{10FFFF}]`, `(?s:.)`},
+
+	// Empty string as a regular expression.
+	// The empty string must be preserved inside parens in order
+	// to make submatches work right, so these tests are less
+	// interesting than they might otherwise be. String inserts
+	// explicit (?:) in place of non-parenthesized empty strings,
+	// to make them easier to spot for other parsers.
+	{`(a|b|)`, `([a-b]|(?:))`},
+	{`(|)`, `()`},
+	{`a()`, `a()`},
+	{`(()|())`, `(()|())`},
+	{`(a|)`, `(a|(?:))`},
+	{`ab()cd()`, `ab()cd()`},
+	{`()`, `()`},
+	{`()*`, `()*`},
+	{`()+`, `()+`},
+	{`()?`, `()?`},
+	{`(){0}`, `(?:)`},
+	{`(){1}`, `()`},
+	{`(){1,}`, `()+`},
+	{`(){0,2}`, `(?:()()?)?`},
+}
+
+func TestSimplify(t *testing.T) {
+	for _, tt := range simplifyTests {
+		re, err := Parse(tt.Regexp, MatchNL|Perl&^OneLine)
+		if err != nil {
+			t.Errorf("Parse(%#q) = error %v", tt.Regexp, err)
+			continue
+		}
+		s := re.Simplify().String()
+		if s != tt.Simple {
+			t.Errorf("Simplify(%#q) = %#q, want %#q", tt.Regexp, s, tt.Simple)
+		}
+	}
+}
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-16 19:23:18 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-16 19:23:18 +0000
commit	43a123c1ae6613b3efeed291fa552ecd909d3acf (patch)
tree	fd92518b7024bc74031f78a1cf9e454b65e73665 /src/regexp/syntax
parent	Initial commit. (diff)
download	golang-1.20-43a123c1ae6613b3efeed291fa552ecd909d3acf.tar.xz golang-1.20-43a123c1ae6613b3efeed291fa552ecd909d3acf.zip