author    Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-16 19:23:18 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-16 19:23:18 +0000
commit    43a123c1ae6613b3efeed291fa552ecd909d3acf
tree      fd92518b7024bc74031f78a1cf9e454b65e73665 /src/regexp
parent    Initial commit.
Adding upstream version 1.20.14.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/regexp')
29 files changed, 16656 insertions, 0 deletions
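The diff below vendors the complete Go regexp package: the NFA simulation in exec.go, the bounded backtracker in backtrack.go, the one-pass engine, and their tests and examples. As a quick orientation only — this sketch is not part of the upstream diff, and the pattern and input strings are invented for illustration (they loosely mirror example_test.go further down) — the public API the package provides is used like this:

package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Compile reports malformed patterns as an error; MustCompile panics instead.
	re, err := regexp.Compile(`(?P<key>\w+):\s+(?P<value>\w+)`)
	if err != nil {
		fmt.Println("bad pattern:", err)
		return
	}
	fmt.Println(re.MatchString("option1: value1"))        // true
	fmt.Println(re.FindStringSubmatch("option1: value1")) // [option1: value1 option1 value1]
	// Named-group expansion in replacements, as exercised by the ReplaceTest cases in all_test.go.
	fmt.Println(re.ReplaceAllString("option1: value1", "$key=$value")) // option1=value1
}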
diff --git a/src/regexp/all_test.go b/src/regexp/all_test.go new file mode 100644 index 0000000..52de3fe --- /dev/null +++ b/src/regexp/all_test.go @@ -0,0 +1,949 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package regexp + +import ( + "reflect" + "regexp/syntax" + "strings" + "testing" + "unicode/utf8" +) + +var goodRe = []string{ + ``, + `.`, + `^.$`, + `a`, + `a*`, + `a+`, + `a?`, + `a|b`, + `a*|b*`, + `(a*|b)(c*|d)`, + `[a-z]`, + `[a-abc-c\-\]\[]`, + `[a-z]+`, + `[abc]`, + `[^1234]`, + `[^\n]`, + `\!\\`, +} + +type stringError struct { + re string + err string +} + +var badRe = []stringError{ + {`*`, "missing argument to repetition operator: `*`"}, + {`+`, "missing argument to repetition operator: `+`"}, + {`?`, "missing argument to repetition operator: `?`"}, + {`(abc`, "missing closing ): `(abc`"}, + {`abc)`, "unexpected ): `abc)`"}, + {`x[a-z`, "missing closing ]: `[a-z`"}, + {`[z-a]`, "invalid character class range: `z-a`"}, + {`abc\`, "trailing backslash at end of expression"}, + {`a**`, "invalid nested repetition operator: `**`"}, + {`a*+`, "invalid nested repetition operator: `*+`"}, + {`\x`, "invalid escape sequence: `\\x`"}, + {strings.Repeat(`\pL`, 27000), "expression too large"}, +} + +func compileTest(t *testing.T, expr string, error string) *Regexp { + re, err := Compile(expr) + if error == "" && err != nil { + t.Error("compiling `", expr, "`; unexpected error: ", err.Error()) + } + if error != "" && err == nil { + t.Error("compiling `", expr, "`; missing error") + } else if error != "" && !strings.Contains(err.Error(), error) { + t.Error("compiling `", expr, "`; wrong error: ", err.Error(), "; want ", error) + } + return re +} + +func TestGoodCompile(t *testing.T) { + for i := 0; i < len(goodRe); i++ { + compileTest(t, goodRe[i], "") + } +} + +func TestBadCompile(t *testing.T) { + for i := 0; i < len(badRe); i++ { + compileTest(t, badRe[i].re, badRe[i].err) + } +} + +func matchTest(t *testing.T, test *FindTest) { + re := compileTest(t, test.pat, "") + if re == nil { + return + } + m := re.MatchString(test.text) + if m != (len(test.matches) > 0) { + t.Errorf("MatchString failure on %s: %t should be %t", test, m, len(test.matches) > 0) + } + // now try bytes + m = re.Match([]byte(test.text)) + if m != (len(test.matches) > 0) { + t.Errorf("Match failure on %s: %t should be %t", test, m, len(test.matches) > 0) + } +} + +func TestMatch(t *testing.T) { + for _, test := range findTests { + matchTest(t, &test) + } +} + +func matchFunctionTest(t *testing.T, test *FindTest) { + m, err := MatchString(test.pat, test.text) + if err == nil { + return + } + if m != (len(test.matches) > 0) { + t.Errorf("Match failure on %s: %t should be %t", test, m, len(test.matches) > 0) + } +} + +func TestMatchFunction(t *testing.T) { + for _, test := range findTests { + matchFunctionTest(t, &test) + } +} + +func copyMatchTest(t *testing.T, test *FindTest) { + re := compileTest(t, test.pat, "") + if re == nil { + return + } + m1 := re.MatchString(test.text) + m2 := re.Copy().MatchString(test.text) + if m1 != m2 { + t.Errorf("Copied Regexp match failure on %s: original gave %t; copy gave %t; should be %t", + test, m1, m2, len(test.matches) > 0) + } +} + +func TestCopyMatch(t *testing.T) { + for _, test := range findTests { + copyMatchTest(t, &test) + } +} + +type ReplaceTest struct { + pattern, replacement, input, output string +} + +var replaceTests = []ReplaceTest{ + // Test 
empty input and/or replacement, with pattern that matches the empty string. + {"", "", "", ""}, + {"", "x", "", "x"}, + {"", "", "abc", "abc"}, + {"", "x", "abc", "xaxbxcx"}, + + // Test empty input and/or replacement, with pattern that does not match the empty string. + {"b", "", "", ""}, + {"b", "x", "", ""}, + {"b", "", "abc", "ac"}, + {"b", "x", "abc", "axc"}, + {"y", "", "", ""}, + {"y", "x", "", ""}, + {"y", "", "abc", "abc"}, + {"y", "x", "abc", "abc"}, + + // Multibyte characters -- verify that we don't try to match in the middle + // of a character. + {"[a-c]*", "x", "\u65e5", "x\u65e5x"}, + {"[^\u65e5]", "x", "abc\u65e5def", "xxx\u65e5xxx"}, + + // Start and end of a string. + {"^[a-c]*", "x", "abcdabc", "xdabc"}, + {"[a-c]*$", "x", "abcdabc", "abcdx"}, + {"^[a-c]*$", "x", "abcdabc", "abcdabc"}, + {"^[a-c]*", "x", "abc", "x"}, + {"[a-c]*$", "x", "abc", "x"}, + {"^[a-c]*$", "x", "abc", "x"}, + {"^[a-c]*", "x", "dabce", "xdabce"}, + {"[a-c]*$", "x", "dabce", "dabcex"}, + {"^[a-c]*$", "x", "dabce", "dabce"}, + {"^[a-c]*", "x", "", "x"}, + {"[a-c]*$", "x", "", "x"}, + {"^[a-c]*$", "x", "", "x"}, + + {"^[a-c]+", "x", "abcdabc", "xdabc"}, + {"[a-c]+$", "x", "abcdabc", "abcdx"}, + {"^[a-c]+$", "x", "abcdabc", "abcdabc"}, + {"^[a-c]+", "x", "abc", "x"}, + {"[a-c]+$", "x", "abc", "x"}, + {"^[a-c]+$", "x", "abc", "x"}, + {"^[a-c]+", "x", "dabce", "dabce"}, + {"[a-c]+$", "x", "dabce", "dabce"}, + {"^[a-c]+$", "x", "dabce", "dabce"}, + {"^[a-c]+", "x", "", ""}, + {"[a-c]+$", "x", "", ""}, + {"^[a-c]+$", "x", "", ""}, + + // Other cases. + {"abc", "def", "abcdefg", "defdefg"}, + {"bc", "BC", "abcbcdcdedef", "aBCBCdcdedef"}, + {"abc", "", "abcdabc", "d"}, + {"x", "xXx", "xxxXxxx", "xXxxXxxXxXxXxxXxxXx"}, + {"abc", "d", "", ""}, + {"abc", "d", "abc", "d"}, + {".+", "x", "abc", "x"}, + {"[a-c]*", "x", "def", "xdxexfx"}, + {"[a-c]+", "x", "abcbcdcdedef", "xdxdedef"}, + {"[a-c]*", "x", "abcbcdcdedef", "xdxdxexdxexfx"}, + + // Substitutions + {"a+", "($0)", "banana", "b(a)n(a)n(a)"}, + {"a+", "(${0})", "banana", "b(a)n(a)n(a)"}, + {"a+", "(${0})$0", "banana", "b(a)an(a)an(a)a"}, + {"a+", "(${0})$0", "banana", "b(a)an(a)an(a)a"}, + {"hello, (.+)", "goodbye, ${1}", "hello, world", "goodbye, world"}, + {"hello, (.+)", "goodbye, $1x", "hello, world", "goodbye, "}, + {"hello, (.+)", "goodbye, ${1}x", "hello, world", "goodbye, worldx"}, + {"hello, (.+)", "<$0><$1><$2><$3>", "hello, world", "<hello, world><world><><>"}, + {"hello, (?P<noun>.+)", "goodbye, $noun!", "hello, world", "goodbye, world!"}, + {"hello, (?P<noun>.+)", "goodbye, ${noun}", "hello, world", "goodbye, world"}, + {"(?P<x>hi)|(?P<x>bye)", "$x$x$x", "hi", "hihihi"}, + {"(?P<x>hi)|(?P<x>bye)", "$x$x$x", "bye", "byebyebye"}, + {"(?P<x>hi)|(?P<x>bye)", "$xyz", "hi", ""}, + {"(?P<x>hi)|(?P<x>bye)", "${x}yz", "hi", "hiyz"}, + {"(?P<x>hi)|(?P<x>bye)", "hello $$x", "hi", "hello $x"}, + {"a+", "${oops", "aaa", "${oops"}, + {"a+", "$$", "aaa", "$"}, + {"a+", "$", "aaa", "$"}, + + // Substitution when subexpression isn't found + {"(x)?", "$1", "123", "123"}, + {"abc", "$1", "123", "123"}, + + // Substitutions involving a (x){0} + {"(a)(b){0}(c)", ".$1|$3.", "xacxacx", "x.a|c.x.a|c.x"}, + {"(a)(((b))){0}c", ".$1.", "xacxacx", "x.a.x.a.x"}, + {"((a(b){0}){3}){5}(h)", "y caramb$2", "say aaaaaaaaaaaaaaaah", "say ay caramba"}, + {"((a(b){0}){3}){5}h", "y caramb$2", "say aaaaaaaaaaaaaaaah", "say ay caramba"}, +} + +var replaceLiteralTests = []ReplaceTest{ + // Substitutions + {"a+", "($0)", "banana", "b($0)n($0)n($0)"}, + {"a+", "(${0})", "banana", 
"b(${0})n(${0})n(${0})"}, + {"a+", "(${0})$0", "banana", "b(${0})$0n(${0})$0n(${0})$0"}, + {"a+", "(${0})$0", "banana", "b(${0})$0n(${0})$0n(${0})$0"}, + {"hello, (.+)", "goodbye, ${1}", "hello, world", "goodbye, ${1}"}, + {"hello, (?P<noun>.+)", "goodbye, $noun!", "hello, world", "goodbye, $noun!"}, + {"hello, (?P<noun>.+)", "goodbye, ${noun}", "hello, world", "goodbye, ${noun}"}, + {"(?P<x>hi)|(?P<x>bye)", "$x$x$x", "hi", "$x$x$x"}, + {"(?P<x>hi)|(?P<x>bye)", "$x$x$x", "bye", "$x$x$x"}, + {"(?P<x>hi)|(?P<x>bye)", "$xyz", "hi", "$xyz"}, + {"(?P<x>hi)|(?P<x>bye)", "${x}yz", "hi", "${x}yz"}, + {"(?P<x>hi)|(?P<x>bye)", "hello $$x", "hi", "hello $$x"}, + {"a+", "${oops", "aaa", "${oops"}, + {"a+", "$$", "aaa", "$$"}, + {"a+", "$", "aaa", "$"}, +} + +type ReplaceFuncTest struct { + pattern string + replacement func(string) string + input, output string +} + +var replaceFuncTests = []ReplaceFuncTest{ + {"[a-c]", func(s string) string { return "x" + s + "y" }, "defabcdef", "defxayxbyxcydef"}, + {"[a-c]+", func(s string) string { return "x" + s + "y" }, "defabcdef", "defxabcydef"}, + {"[a-c]*", func(s string) string { return "x" + s + "y" }, "defabcdef", "xydxyexyfxabcydxyexyfxy"}, +} + +func TestReplaceAll(t *testing.T) { + for _, tc := range replaceTests { + re, err := Compile(tc.pattern) + if err != nil { + t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err) + continue + } + actual := re.ReplaceAllString(tc.input, tc.replacement) + if actual != tc.output { + t.Errorf("%q.ReplaceAllString(%q,%q) = %q; want %q", + tc.pattern, tc.input, tc.replacement, actual, tc.output) + } + // now try bytes + actual = string(re.ReplaceAll([]byte(tc.input), []byte(tc.replacement))) + if actual != tc.output { + t.Errorf("%q.ReplaceAll(%q,%q) = %q; want %q", + tc.pattern, tc.input, tc.replacement, actual, tc.output) + } + } +} + +func TestReplaceAllLiteral(t *testing.T) { + // Run ReplaceAll tests that do not have $ expansions. + for _, tc := range replaceTests { + if strings.Contains(tc.replacement, "$") { + continue + } + re, err := Compile(tc.pattern) + if err != nil { + t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err) + continue + } + actual := re.ReplaceAllLiteralString(tc.input, tc.replacement) + if actual != tc.output { + t.Errorf("%q.ReplaceAllLiteralString(%q,%q) = %q; want %q", + tc.pattern, tc.input, tc.replacement, actual, tc.output) + } + // now try bytes + actual = string(re.ReplaceAllLiteral([]byte(tc.input), []byte(tc.replacement))) + if actual != tc.output { + t.Errorf("%q.ReplaceAllLiteral(%q,%q) = %q; want %q", + tc.pattern, tc.input, tc.replacement, actual, tc.output) + } + } + + // Run literal-specific tests. 
+ for _, tc := range replaceLiteralTests { + re, err := Compile(tc.pattern) + if err != nil { + t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err) + continue + } + actual := re.ReplaceAllLiteralString(tc.input, tc.replacement) + if actual != tc.output { + t.Errorf("%q.ReplaceAllLiteralString(%q,%q) = %q; want %q", + tc.pattern, tc.input, tc.replacement, actual, tc.output) + } + // now try bytes + actual = string(re.ReplaceAllLiteral([]byte(tc.input), []byte(tc.replacement))) + if actual != tc.output { + t.Errorf("%q.ReplaceAllLiteral(%q,%q) = %q; want %q", + tc.pattern, tc.input, tc.replacement, actual, tc.output) + } + } +} + +func TestReplaceAllFunc(t *testing.T) { + for _, tc := range replaceFuncTests { + re, err := Compile(tc.pattern) + if err != nil { + t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err) + continue + } + actual := re.ReplaceAllStringFunc(tc.input, tc.replacement) + if actual != tc.output { + t.Errorf("%q.ReplaceFunc(%q,fn) = %q; want %q", + tc.pattern, tc.input, actual, tc.output) + } + // now try bytes + actual = string(re.ReplaceAllFunc([]byte(tc.input), func(s []byte) []byte { return []byte(tc.replacement(string(s))) })) + if actual != tc.output { + t.Errorf("%q.ReplaceFunc(%q,fn) = %q; want %q", + tc.pattern, tc.input, actual, tc.output) + } + } +} + +type MetaTest struct { + pattern, output, literal string + isLiteral bool +} + +var metaTests = []MetaTest{ + {``, ``, ``, true}, + {`foo`, `foo`, `foo`, true}, + {`日本語+`, `日本語\+`, `日本語`, false}, + {`foo\.\$`, `foo\\\.\\\$`, `foo.$`, true}, // has meta but no operator + {`foo.\$`, `foo\.\\\$`, `foo`, false}, // has escaped operators and real operators + {`!@#$%^&*()_+-=[{]}\|,<.>/?~`, `!@#\$%\^&\*\(\)_\+-=\[\{\]\}\\\|,<\.>/\?~`, `!@#`, false}, +} + +var literalPrefixTests = []MetaTest{ + // See golang.org/issue/11175. + // output is unused. + {`^0^0$`, ``, `0`, false}, + {`^0^`, ``, ``, false}, + {`^0$`, ``, `0`, true}, + {`$0^`, ``, ``, false}, + {`$0$`, ``, ``, false}, + {`^^0$$`, ``, ``, false}, + {`^$^$`, ``, ``, false}, + {`$$0^^`, ``, ``, false}, + {`a\x{fffd}b`, ``, `a`, false}, + {`\x{fffd}b`, ``, ``, false}, + {"\ufffd", ``, ``, false}, +} + +func TestQuoteMeta(t *testing.T) { + for _, tc := range metaTests { + // Verify that QuoteMeta returns the expected string. + quoted := QuoteMeta(tc.pattern) + if quoted != tc.output { + t.Errorf("QuoteMeta(`%s`) = `%s`; want `%s`", + tc.pattern, quoted, tc.output) + continue + } + + // Verify that the quoted string is in fact treated as expected + // by Compile -- i.e. that it matches the original, unquoted string. + if tc.pattern != "" { + re, err := Compile(quoted) + if err != nil { + t.Errorf("Unexpected error compiling QuoteMeta(`%s`): %v", tc.pattern, err) + continue + } + src := "abc" + tc.pattern + "def" + repl := "xyz" + replaced := re.ReplaceAllString(src, repl) + expected := "abcxyzdef" + if replaced != expected { + t.Errorf("QuoteMeta(`%s`).Replace(`%s`,`%s`) = `%s`; want `%s`", + tc.pattern, src, repl, replaced, expected) + } + } + } +} + +func TestLiteralPrefix(t *testing.T) { + for _, tc := range append(metaTests, literalPrefixTests...) { + // Literal method needs to scan the pattern. 
+ re := MustCompile(tc.pattern) + str, complete := re.LiteralPrefix() + if complete != tc.isLiteral { + t.Errorf("LiteralPrefix(`%s`) = %t; want %t", tc.pattern, complete, tc.isLiteral) + } + if str != tc.literal { + t.Errorf("LiteralPrefix(`%s`) = `%s`; want `%s`", tc.pattern, str, tc.literal) + } + } +} + +type subexpIndex struct { + name string + index int +} + +type subexpCase struct { + input string + num int + names []string + indices []subexpIndex +} + +var emptySubexpIndices = []subexpIndex{{"", -1}, {"missing", -1}} + +var subexpCases = []subexpCase{ + {``, 0, nil, emptySubexpIndices}, + {`.*`, 0, nil, emptySubexpIndices}, + {`abba`, 0, nil, emptySubexpIndices}, + {`ab(b)a`, 1, []string{"", ""}, emptySubexpIndices}, + {`ab(.*)a`, 1, []string{"", ""}, emptySubexpIndices}, + {`(.*)ab(.*)a`, 2, []string{"", "", ""}, emptySubexpIndices}, + {`(.*)(ab)(.*)a`, 3, []string{"", "", "", ""}, emptySubexpIndices}, + {`(.*)((a)b)(.*)a`, 4, []string{"", "", "", "", ""}, emptySubexpIndices}, + {`(.*)(\(ab)(.*)a`, 3, []string{"", "", "", ""}, emptySubexpIndices}, + {`(.*)(\(a\)b)(.*)a`, 3, []string{"", "", "", ""}, emptySubexpIndices}, + {`(?P<foo>.*)(?P<bar>(a)b)(?P<foo>.*)a`, 4, []string{"", "foo", "bar", "", "foo"}, []subexpIndex{{"", -1}, {"missing", -1}, {"foo", 1}, {"bar", 2}}}, +} + +func TestSubexp(t *testing.T) { + for _, c := range subexpCases { + re := MustCompile(c.input) + n := re.NumSubexp() + if n != c.num { + t.Errorf("%q: NumSubexp = %d, want %d", c.input, n, c.num) + continue + } + names := re.SubexpNames() + if len(names) != 1+n { + t.Errorf("%q: len(SubexpNames) = %d, want %d", c.input, len(names), n) + continue + } + if c.names != nil { + for i := 0; i < 1+n; i++ { + if names[i] != c.names[i] { + t.Errorf("%q: SubexpNames[%d] = %q, want %q", c.input, i, names[i], c.names[i]) + } + } + } + for _, subexp := range c.indices { + index := re.SubexpIndex(subexp.name) + if index != subexp.index { + t.Errorf("%q: SubexpIndex(%q) = %d, want %d", c.input, subexp.name, index, subexp.index) + } + } + } +} + +var splitTests = []struct { + s string + r string + n int + out []string +}{ + {"foo:and:bar", ":", -1, []string{"foo", "and", "bar"}}, + {"foo:and:bar", ":", 1, []string{"foo:and:bar"}}, + {"foo:and:bar", ":", 2, []string{"foo", "and:bar"}}, + {"foo:and:bar", "foo", -1, []string{"", ":and:bar"}}, + {"foo:and:bar", "bar", -1, []string{"foo:and:", ""}}, + {"foo:and:bar", "baz", -1, []string{"foo:and:bar"}}, + {"baabaab", "a", -1, []string{"b", "", "b", "", "b"}}, + {"baabaab", "a*", -1, []string{"b", "b", "b"}}, + {"baabaab", "ba*", -1, []string{"", "", "", ""}}, + {"foobar", "f*b*", -1, []string{"", "o", "o", "a", "r"}}, + {"foobar", "f+.*b+", -1, []string{"", "ar"}}, + {"foobooboar", "o{2}", -1, []string{"f", "b", "boar"}}, + {"a,b,c,d,e,f", ",", 3, []string{"a", "b", "c,d,e,f"}}, + {"a,b,c,d,e,f", ",", 0, nil}, + {",", ",", -1, []string{"", ""}}, + {",,,", ",", -1, []string{"", "", "", ""}}, + {"", ",", -1, []string{""}}, + {"", ".*", -1, []string{""}}, + {"", ".+", -1, []string{""}}, + {"", "", -1, []string{}}, + {"foobar", "", -1, []string{"f", "o", "o", "b", "a", "r"}}, + {"abaabaccadaaae", "a*", 5, []string{"", "b", "b", "c", "cadaaae"}}, + {":x:y:z:", ":", -1, []string{"", "x", "y", "z", ""}}, +} + +func TestSplit(t *testing.T) { + for i, test := range splitTests { + re, err := Compile(test.r) + if err != nil { + t.Errorf("#%d: %q: compile error: %s", i, test.r, err.Error()) + continue + } + + split := re.Split(test.s, test.n) + if !reflect.DeepEqual(split, test.out) { + 
t.Errorf("#%d: %q: got %q; want %q", i, test.r, split, test.out) + } + + if QuoteMeta(test.r) == test.r { + strsplit := strings.SplitN(test.s, test.r, test.n) + if !reflect.DeepEqual(split, strsplit) { + t.Errorf("#%d: Split(%q, %q, %d): regexp vs strings mismatch\nregexp=%q\nstrings=%q", i, test.s, test.r, test.n, split, strsplit) + } + } + } +} + +// The following sequence of Match calls used to panic. See issue #12980. +func TestParseAndCompile(t *testing.T) { + expr := "a$" + s := "a\nb" + + for i, tc := range []struct { + reFlags syntax.Flags + expMatch bool + }{ + {syntax.Perl | syntax.OneLine, false}, + {syntax.Perl &^ syntax.OneLine, true}, + } { + parsed, err := syntax.Parse(expr, tc.reFlags) + if err != nil { + t.Fatalf("%d: parse: %v", i, err) + } + re, err := Compile(parsed.String()) + if err != nil { + t.Fatalf("%d: compile: %v", i, err) + } + if match := re.MatchString(s); match != tc.expMatch { + t.Errorf("%d: %q.MatchString(%q)=%t; expected=%t", i, re, s, match, tc.expMatch) + } + } +} + +// Check that one-pass cutoff does trigger. +func TestOnePassCutoff(t *testing.T) { + re, err := syntax.Parse(`^x{1,1000}y{1,1000}$`, syntax.Perl) + if err != nil { + t.Fatalf("parse: %v", err) + } + p, err := syntax.Compile(re.Simplify()) + if err != nil { + t.Fatalf("compile: %v", err) + } + if compileOnePass(p) != nil { + t.Fatalf("makeOnePass succeeded; wanted nil") + } +} + +// Check that the same machine can be used with the standard matcher +// and then the backtracker when there are no captures. +func TestSwitchBacktrack(t *testing.T) { + re := MustCompile(`a|b`) + long := make([]byte, maxBacktrackVector+1) + + // The following sequence of Match calls used to panic. See issue #10319. + re.Match(long) // triggers standard matcher + re.Match(long[:1]) // triggers backtracker +} + +func BenchmarkFind(b *testing.B) { + b.StopTimer() + re := MustCompile("a+b+") + wantSubs := "aaabb" + s := []byte("acbb" + wantSubs + "dd") + b.StartTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + subs := re.Find(s) + if string(subs) != wantSubs { + b.Fatalf("Find(%q) = %q; want %q", s, subs, wantSubs) + } + } +} + +func BenchmarkFindAllNoMatches(b *testing.B) { + re := MustCompile("a+b+") + s := []byte("acddee") + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + all := re.FindAll(s, -1) + if all != nil { + b.Fatalf("FindAll(%q) = %q; want nil", s, all) + } + } +} + +func BenchmarkFindString(b *testing.B) { + b.StopTimer() + re := MustCompile("a+b+") + wantSubs := "aaabb" + s := "acbb" + wantSubs + "dd" + b.StartTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + subs := re.FindString(s) + if subs != wantSubs { + b.Fatalf("FindString(%q) = %q; want %q", s, subs, wantSubs) + } + } +} + +func BenchmarkFindSubmatch(b *testing.B) { + b.StopTimer() + re := MustCompile("a(a+b+)b") + wantSubs := "aaabb" + s := []byte("acbb" + wantSubs + "dd") + b.StartTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + subs := re.FindSubmatch(s) + if string(subs[0]) != wantSubs { + b.Fatalf("FindSubmatch(%q)[0] = %q; want %q", s, subs[0], wantSubs) + } + if string(subs[1]) != "aab" { + b.Fatalf("FindSubmatch(%q)[1] = %q; want %q", s, subs[1], "aab") + } + } +} + +func BenchmarkFindStringSubmatch(b *testing.B) { + b.StopTimer() + re := MustCompile("a(a+b+)b") + wantSubs := "aaabb" + s := "acbb" + wantSubs + "dd" + b.StartTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + subs := re.FindStringSubmatch(s) + if subs[0] != wantSubs { + b.Fatalf("FindStringSubmatch(%q)[0] = %q; want %q", s, 
subs[0], wantSubs) + } + if subs[1] != "aab" { + b.Fatalf("FindStringSubmatch(%q)[1] = %q; want %q", s, subs[1], "aab") + } + } +} + +func BenchmarkLiteral(b *testing.B) { + x := strings.Repeat("x", 50) + "y" + b.StopTimer() + re := MustCompile("y") + b.StartTimer() + for i := 0; i < b.N; i++ { + if !re.MatchString(x) { + b.Fatalf("no match!") + } + } +} + +func BenchmarkNotLiteral(b *testing.B) { + x := strings.Repeat("x", 50) + "y" + b.StopTimer() + re := MustCompile(".y") + b.StartTimer() + for i := 0; i < b.N; i++ { + if !re.MatchString(x) { + b.Fatalf("no match!") + } + } +} + +func BenchmarkMatchClass(b *testing.B) { + b.StopTimer() + x := strings.Repeat("xxxx", 20) + "w" + re := MustCompile("[abcdw]") + b.StartTimer() + for i := 0; i < b.N; i++ { + if !re.MatchString(x) { + b.Fatalf("no match!") + } + } +} + +func BenchmarkMatchClass_InRange(b *testing.B) { + b.StopTimer() + // 'b' is between 'a' and 'c', so the charclass + // range checking is no help here. + x := strings.Repeat("bbbb", 20) + "c" + re := MustCompile("[ac]") + b.StartTimer() + for i := 0; i < b.N; i++ { + if !re.MatchString(x) { + b.Fatalf("no match!") + } + } +} + +func BenchmarkReplaceAll(b *testing.B) { + x := "abcdefghijklmnopqrstuvwxyz" + b.StopTimer() + re := MustCompile("[cjrw]") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.ReplaceAllString(x, "") + } +} + +func BenchmarkAnchoredLiteralShortNonMatch(b *testing.B) { + b.StopTimer() + x := []byte("abcdefghijklmnopqrstuvwxyz") + re := MustCompile("^zbc(d|e)") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.Match(x) + } +} + +func BenchmarkAnchoredLiteralLongNonMatch(b *testing.B) { + b.StopTimer() + x := []byte("abcdefghijklmnopqrstuvwxyz") + for i := 0; i < 15; i++ { + x = append(x, x...) + } + re := MustCompile("^zbc(d|e)") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.Match(x) + } +} + +func BenchmarkAnchoredShortMatch(b *testing.B) { + b.StopTimer() + x := []byte("abcdefghijklmnopqrstuvwxyz") + re := MustCompile("^.bc(d|e)") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.Match(x) + } +} + +func BenchmarkAnchoredLongMatch(b *testing.B) { + b.StopTimer() + x := []byte("abcdefghijklmnopqrstuvwxyz") + for i := 0; i < 15; i++ { + x = append(x, x...) 
+ } + re := MustCompile("^.bc(d|e)") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.Match(x) + } +} + +func BenchmarkOnePassShortA(b *testing.B) { + b.StopTimer() + x := []byte("abcddddddeeeededd") + re := MustCompile("^.bc(d|e)*$") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.Match(x) + } +} + +func BenchmarkNotOnePassShortA(b *testing.B) { + b.StopTimer() + x := []byte("abcddddddeeeededd") + re := MustCompile(".bc(d|e)*$") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.Match(x) + } +} + +func BenchmarkOnePassShortB(b *testing.B) { + b.StopTimer() + x := []byte("abcddddddeeeededd") + re := MustCompile("^.bc(?:d|e)*$") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.Match(x) + } +} + +func BenchmarkNotOnePassShortB(b *testing.B) { + b.StopTimer() + x := []byte("abcddddddeeeededd") + re := MustCompile(".bc(?:d|e)*$") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.Match(x) + } +} + +func BenchmarkOnePassLongPrefix(b *testing.B) { + b.StopTimer() + x := []byte("abcdefghijklmnopqrstuvwxyz") + re := MustCompile("^abcdefghijklmnopqrstuvwxyz.*$") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.Match(x) + } +} + +func BenchmarkOnePassLongNotPrefix(b *testing.B) { + b.StopTimer() + x := []byte("abcdefghijklmnopqrstuvwxyz") + re := MustCompile("^.bcdefghijklmnopqrstuvwxyz.*$") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.Match(x) + } +} + +func BenchmarkMatchParallelShared(b *testing.B) { + x := []byte("this is a long line that contains foo bar baz") + re := MustCompile("foo (ba+r)? baz") + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + re.Match(x) + } + }) +} + +func BenchmarkMatchParallelCopied(b *testing.B) { + x := []byte("this is a long line that contains foo bar baz") + re := MustCompile("foo (ba+r)? baz") + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + re := re.Copy() + for pb.Next() { + re.Match(x) + } + }) +} + +var sink string + +func BenchmarkQuoteMetaAll(b *testing.B) { + specials := make([]byte, 0) + for i := byte(0); i < utf8.RuneSelf; i++ { + if special(i) { + specials = append(specials, i) + } + } + s := string(specials) + b.SetBytes(int64(len(s))) + b.ResetTimer() + for i := 0; i < b.N; i++ { + sink = QuoteMeta(s) + } +} + +func BenchmarkQuoteMetaNone(b *testing.B) { + s := "abcdefghijklmnopqrstuvwxyz" + b.SetBytes(int64(len(s))) + b.ResetTimer() + for i := 0; i < b.N; i++ { + sink = QuoteMeta(s) + } +} + +var compileBenchData = []struct{ name, re string }{ + {"Onepass", `^a.[l-nA-Cg-j]?e$`}, + {"Medium", `^((a|b|[d-z0-9])*(日){4,5}.)+$`}, + {"Hard", strings.Repeat(`((abc)*|`, 50) + strings.Repeat(`)`, 50)}, +} + +func BenchmarkCompile(b *testing.B) { + for _, data := range compileBenchData { + b.Run(data.name, func(b *testing.B) { + b.ReportAllocs() + for i := 0; i < b.N; i++ { + if _, err := Compile(data.re); err != nil { + b.Fatal(err) + } + } + }) + } +} + +func TestDeepEqual(t *testing.T) { + re1 := MustCompile("a.*b.*c.*d") + re2 := MustCompile("a.*b.*c.*d") + if !reflect.DeepEqual(re1, re2) { // has always been true, since Go 1. 
+ t.Errorf("DeepEqual(re1, re2) = false, want true") + } + + re1.MatchString("abcdefghijklmn") + if !reflect.DeepEqual(re1, re2) { + t.Errorf("DeepEqual(re1, re2) = false, want true") + } + + re2.MatchString("abcdefghijklmn") + if !reflect.DeepEqual(re1, re2) { + t.Errorf("DeepEqual(re1, re2) = false, want true") + } + + re2.MatchString(strings.Repeat("abcdefghijklmn", 100)) + if !reflect.DeepEqual(re1, re2) { + t.Errorf("DeepEqual(re1, re2) = false, want true") + } +} + +var minInputLenTests = []struct { + Regexp string + min int +}{ + {``, 0}, + {`a`, 1}, + {`aa`, 2}, + {`(aa)a`, 3}, + {`(?:aa)a`, 3}, + {`a?a`, 1}, + {`(aaa)|(aa)`, 2}, + {`(aa)+a`, 3}, + {`(aa)*a`, 1}, + {`(aa){3,5}`, 6}, + {`[a-z]`, 1}, + {`日`, 3}, +} + +func TestMinInputLen(t *testing.T) { + for _, tt := range minInputLenTests { + re, _ := syntax.Parse(tt.Regexp, syntax.Perl) + m := minInputLen(re) + if m != tt.min { + t.Errorf("regexp %#q has minInputLen %d, should be %d", tt.Regexp, m, tt.min) + } + } +} diff --git a/src/regexp/backtrack.go b/src/regexp/backtrack.go new file mode 100644 index 0000000..0739f5f --- /dev/null +++ b/src/regexp/backtrack.go @@ -0,0 +1,367 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// backtrack is a regular expression search with submatch +// tracking for small regular expressions and texts. It allocates +// a bit vector with (length of input) * (length of prog) bits, +// to make sure it never explores the same (character position, instruction) +// state multiple times. This limits the search to run in time linear in +// the length of the test. +// +// backtrack is a fast replacement for the NFA code on small +// regexps when onepass cannot be used. + +package regexp + +import ( + "regexp/syntax" + "sync" +) + +// A job is an entry on the backtracker's job stack. It holds +// the instruction pc and the position in the input. +type job struct { + pc uint32 + arg bool + pos int +} + +const ( + visitedBits = 32 + maxBacktrackProg = 500 // len(prog.Inst) <= max + maxBacktrackVector = 256 * 1024 // bit vector size <= max (bits) +) + +// bitState holds state for the backtracker. +type bitState struct { + end int + cap []int + matchcap []int + jobs []job + visited []uint32 + + inputs inputs +} + +var bitStatePool sync.Pool + +func newBitState() *bitState { + b, ok := bitStatePool.Get().(*bitState) + if !ok { + b = new(bitState) + } + return b +} + +func freeBitState(b *bitState) { + b.inputs.clear() + bitStatePool.Put(b) +} + +// maxBitStateLen returns the maximum length of a string to search with +// the backtracker using prog. +func maxBitStateLen(prog *syntax.Prog) int { + if !shouldBacktrack(prog) { + return 0 + } + return maxBacktrackVector / len(prog.Inst) +} + +// shouldBacktrack reports whether the program is too +// long for the backtracker to run. +func shouldBacktrack(prog *syntax.Prog) bool { + return len(prog.Inst) <= maxBacktrackProg +} + +// reset resets the state of the backtracker. +// end is the end position in the input. +// ncap is the number of captures. 
+func (b *bitState) reset(prog *syntax.Prog, end int, ncap int) { + b.end = end + + if cap(b.jobs) == 0 { + b.jobs = make([]job, 0, 256) + } else { + b.jobs = b.jobs[:0] + } + + visitedSize := (len(prog.Inst)*(end+1) + visitedBits - 1) / visitedBits + if cap(b.visited) < visitedSize { + b.visited = make([]uint32, visitedSize, maxBacktrackVector/visitedBits) + } else { + b.visited = b.visited[:visitedSize] + for i := range b.visited { + b.visited[i] = 0 + } + } + + if cap(b.cap) < ncap { + b.cap = make([]int, ncap) + } else { + b.cap = b.cap[:ncap] + } + for i := range b.cap { + b.cap[i] = -1 + } + + if cap(b.matchcap) < ncap { + b.matchcap = make([]int, ncap) + } else { + b.matchcap = b.matchcap[:ncap] + } + for i := range b.matchcap { + b.matchcap[i] = -1 + } +} + +// shouldVisit reports whether the combination of (pc, pos) has not +// been visited yet. +func (b *bitState) shouldVisit(pc uint32, pos int) bool { + n := uint(int(pc)*(b.end+1) + pos) + if b.visited[n/visitedBits]&(1<<(n&(visitedBits-1))) != 0 { + return false + } + b.visited[n/visitedBits] |= 1 << (n & (visitedBits - 1)) + return true +} + +// push pushes (pc, pos, arg) onto the job stack if it should be +// visited. +func (b *bitState) push(re *Regexp, pc uint32, pos int, arg bool) { + // Only check shouldVisit when arg is false. + // When arg is true, we are continuing a previous visit. + if re.prog.Inst[pc].Op != syntax.InstFail && (arg || b.shouldVisit(pc, pos)) { + b.jobs = append(b.jobs, job{pc: pc, arg: arg, pos: pos}) + } +} + +// tryBacktrack runs a backtracking search starting at pos. +func (re *Regexp) tryBacktrack(b *bitState, i input, pc uint32, pos int) bool { + longest := re.longest + + b.push(re, pc, pos, false) + for len(b.jobs) > 0 { + l := len(b.jobs) - 1 + // Pop job off the stack. + pc := b.jobs[l].pc + pos := b.jobs[l].pos + arg := b.jobs[l].arg + b.jobs = b.jobs[:l] + + // Optimization: rather than push and pop, + // code that is going to Push and continue + // the loop simply updates ip, p, and arg + // and jumps to CheckAndLoop. We have to + // do the ShouldVisit check that Push + // would have, but we avoid the stack + // manipulation. + goto Skip + CheckAndLoop: + if !b.shouldVisit(pc, pos) { + continue + } + Skip: + + inst := &re.prog.Inst[pc] + + switch inst.Op { + default: + panic("bad inst") + case syntax.InstFail: + panic("unexpected InstFail") + case syntax.InstAlt: + // Cannot just + // b.push(inst.Out, pos, false) + // b.push(inst.Arg, pos, false) + // If during the processing of inst.Out, we encounter + // inst.Arg via another path, we want to process it then. + // Pushing it here will inhibit that. Instead, re-push + // inst with arg==true as a reminder to push inst.Arg out + // later. + if arg { + // Finished inst.Out; try inst.Arg. + arg = false + pc = inst.Arg + goto CheckAndLoop + } else { + b.push(re, pc, pos, true) + pc = inst.Out + goto CheckAndLoop + } + + case syntax.InstAltMatch: + // One opcode consumes runes; the other leads to match. + switch re.prog.Inst[inst.Out].Op { + case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: + // inst.Arg is the match. 
+ b.push(re, inst.Arg, pos, false) + pc = inst.Arg + pos = b.end + goto CheckAndLoop + } + // inst.Out is the match - non-greedy + b.push(re, inst.Out, b.end, false) + pc = inst.Out + goto CheckAndLoop + + case syntax.InstRune: + r, width := i.step(pos) + if !inst.MatchRune(r) { + continue + } + pos += width + pc = inst.Out + goto CheckAndLoop + + case syntax.InstRune1: + r, width := i.step(pos) + if r != inst.Rune[0] { + continue + } + pos += width + pc = inst.Out + goto CheckAndLoop + + case syntax.InstRuneAnyNotNL: + r, width := i.step(pos) + if r == '\n' || r == endOfText { + continue + } + pos += width + pc = inst.Out + goto CheckAndLoop + + case syntax.InstRuneAny: + r, width := i.step(pos) + if r == endOfText { + continue + } + pos += width + pc = inst.Out + goto CheckAndLoop + + case syntax.InstCapture: + if arg { + // Finished inst.Out; restore the old value. + b.cap[inst.Arg] = pos + continue + } else { + if inst.Arg < uint32(len(b.cap)) { + // Capture pos to register, but save old value. + b.push(re, pc, b.cap[inst.Arg], true) // come back when we're done. + b.cap[inst.Arg] = pos + } + pc = inst.Out + goto CheckAndLoop + } + + case syntax.InstEmptyWidth: + flag := i.context(pos) + if !flag.match(syntax.EmptyOp(inst.Arg)) { + continue + } + pc = inst.Out + goto CheckAndLoop + + case syntax.InstNop: + pc = inst.Out + goto CheckAndLoop + + case syntax.InstMatch: + // We found a match. If the caller doesn't care + // where the match is, no point going further. + if len(b.cap) == 0 { + return true + } + + // Record best match so far. + // Only need to check end point, because this entire + // call is only considering one start position. + if len(b.cap) > 1 { + b.cap[1] = pos + } + if old := b.matchcap[1]; old == -1 || (longest && pos > 0 && pos > old) { + copy(b.matchcap, b.cap) + } + + // If going for first match, we're done. + if !longest { + return true + } + + // If we used the entire text, no longer match is possible. + if pos == b.end { + return true + } + + // Otherwise, continue on in hope of a longer match. + continue + } + } + + return longest && len(b.matchcap) > 1 && b.matchcap[1] >= 0 +} + +// backtrack runs a backtracking search of prog on the input starting at pos. +func (re *Regexp) backtrack(ib []byte, is string, pos int, ncap int, dstCap []int) []int { + startCond := re.cond + if startCond == ^syntax.EmptyOp(0) { // impossible + return nil + } + if startCond&syntax.EmptyBeginText != 0 && pos != 0 { + // Anchored match, past beginning of text. + return nil + } + + b := newBitState() + i, end := b.inputs.init(nil, ib, is) + b.reset(re.prog, end, ncap) + + // Anchored search must start at the beginning of the input + if startCond&syntax.EmptyBeginText != 0 { + if len(b.cap) > 0 { + b.cap[0] = pos + } + if !re.tryBacktrack(b, i, uint32(re.prog.Start), pos) { + freeBitState(b) + return nil + } + } else { + + // Unanchored search, starting from each possible text position. + // Notice that we have to try the empty string at the end of + // the text, so the loop condition is pos <= end, not pos < end. + // This looks like it's quadratic in the size of the text, + // but we are not clearing visited between calls to TrySearch, + // so no work is duplicated and it ends up still being linear. + width := -1 + for ; pos <= end && width != 0; pos += width { + if len(re.prefix) > 0 { + // Match requires literal prefix; fast search for it. 
+ advance := i.index(re, pos) + if advance < 0 { + freeBitState(b) + return nil + } + pos += advance + } + + if len(b.cap) > 0 { + b.cap[0] = pos + } + if re.tryBacktrack(b, i, uint32(re.prog.Start), pos) { + // Match must be leftmost; done. + goto Match + } + _, width = i.step(pos) + } + freeBitState(b) + return nil + } + +Match: + dstCap = append(dstCap, b.matchcap...) + freeBitState(b) + return dstCap +} diff --git a/src/regexp/example_test.go b/src/regexp/example_test.go new file mode 100644 index 0000000..466b38b --- /dev/null +++ b/src/regexp/example_test.go @@ -0,0 +1,433 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package regexp_test + +import ( + "fmt" + "regexp" + "strings" +) + +func Example() { + // Compile the expression once, usually at init time. + // Use raw strings to avoid having to quote the backslashes. + var validID = regexp.MustCompile(`^[a-z]+\[[0-9]+\]$`) + + fmt.Println(validID.MatchString("adam[23]")) + fmt.Println(validID.MatchString("eve[7]")) + fmt.Println(validID.MatchString("Job[48]")) + fmt.Println(validID.MatchString("snakey")) + // Output: + // true + // true + // false + // false +} + +func ExampleMatch() { + matched, err := regexp.Match(`foo.*`, []byte(`seafood`)) + fmt.Println(matched, err) + matched, err = regexp.Match(`bar.*`, []byte(`seafood`)) + fmt.Println(matched, err) + matched, err = regexp.Match(`a(b`, []byte(`seafood`)) + fmt.Println(matched, err) + + // Output: + // true <nil> + // false <nil> + // false error parsing regexp: missing closing ): `a(b` +} + +func ExampleMatchString() { + matched, err := regexp.MatchString(`foo.*`, "seafood") + fmt.Println(matched, err) + matched, err = regexp.MatchString(`bar.*`, "seafood") + fmt.Println(matched, err) + matched, err = regexp.MatchString(`a(b`, "seafood") + fmt.Println(matched, err) + // Output: + // true <nil> + // false <nil> + // false error parsing regexp: missing closing ): `a(b` +} + +func ExampleQuoteMeta() { + fmt.Println(regexp.QuoteMeta(`Escaping symbols like: .+*?()|[]{}^$`)) + // Output: + // Escaping symbols like: \.\+\*\?\(\)\|\[\]\{\}\^\$ +} + +func ExampleRegexp_Find() { + re := regexp.MustCompile(`foo.?`) + fmt.Printf("%q\n", re.Find([]byte(`seafood fool`))) + + // Output: + // "food" +} + +func ExampleRegexp_FindAll() { + re := regexp.MustCompile(`foo.?`) + fmt.Printf("%q\n", re.FindAll([]byte(`seafood fool`), -1)) + + // Output: + // ["food" "fool"] +} + +func ExampleRegexp_FindAllSubmatch() { + re := regexp.MustCompile(`foo(.?)`) + fmt.Printf("%q\n", re.FindAllSubmatch([]byte(`seafood fool`), -1)) + + // Output: + // [["food" "d"] ["fool" "l"]] +} + +func ExampleRegexp_FindSubmatch() { + re := regexp.MustCompile(`foo(.?)`) + fmt.Printf("%q\n", re.FindSubmatch([]byte(`seafood fool`))) + + // Output: + // ["food" "d"] +} + +func ExampleRegexp_Match() { + re := regexp.MustCompile(`foo.?`) + fmt.Println(re.Match([]byte(`seafood fool`))) + fmt.Println(re.Match([]byte(`something else`))) + + // Output: + // true + // false +} + +func ExampleRegexp_FindString() { + re := regexp.MustCompile(`foo.?`) + fmt.Printf("%q\n", re.FindString("seafood fool")) + fmt.Printf("%q\n", re.FindString("meat")) + // Output: + // "food" + // "" +} + +func ExampleRegexp_FindStringIndex() { + re := regexp.MustCompile(`ab?`) + fmt.Println(re.FindStringIndex("tablett")) + fmt.Println(re.FindStringIndex("foo") == nil) + // Output: + // [1 3] + // true +} + +func 
ExampleRegexp_FindStringSubmatch() { + re := regexp.MustCompile(`a(x*)b(y|z)c`) + fmt.Printf("%q\n", re.FindStringSubmatch("-axxxbyc-")) + fmt.Printf("%q\n", re.FindStringSubmatch("-abzc-")) + // Output: + // ["axxxbyc" "xxx" "y"] + // ["abzc" "" "z"] +} + +func ExampleRegexp_FindAllString() { + re := regexp.MustCompile(`a.`) + fmt.Println(re.FindAllString("paranormal", -1)) + fmt.Println(re.FindAllString("paranormal", 2)) + fmt.Println(re.FindAllString("graal", -1)) + fmt.Println(re.FindAllString("none", -1)) + // Output: + // [ar an al] + // [ar an] + // [aa] + // [] +} + +func ExampleRegexp_FindAllStringSubmatch() { + re := regexp.MustCompile(`a(x*)b`) + fmt.Printf("%q\n", re.FindAllStringSubmatch("-ab-", -1)) + fmt.Printf("%q\n", re.FindAllStringSubmatch("-axxb-", -1)) + fmt.Printf("%q\n", re.FindAllStringSubmatch("-ab-axb-", -1)) + fmt.Printf("%q\n", re.FindAllStringSubmatch("-axxb-ab-", -1)) + // Output: + // [["ab" ""]] + // [["axxb" "xx"]] + // [["ab" ""] ["axb" "x"]] + // [["axxb" "xx"] ["ab" ""]] +} + +func ExampleRegexp_FindAllStringSubmatchIndex() { + re := regexp.MustCompile(`a(x*)b`) + // Indices: + // 01234567 012345678 + // -ab-axb- -axxb-ab- + fmt.Println(re.FindAllStringSubmatchIndex("-ab-", -1)) + fmt.Println(re.FindAllStringSubmatchIndex("-axxb-", -1)) + fmt.Println(re.FindAllStringSubmatchIndex("-ab-axb-", -1)) + fmt.Println(re.FindAllStringSubmatchIndex("-axxb-ab-", -1)) + fmt.Println(re.FindAllStringSubmatchIndex("-foo-", -1)) + // Output: + // [[1 3 2 2]] + // [[1 5 2 4]] + // [[1 3 2 2] [4 7 5 6]] + // [[1 5 2 4] [6 8 7 7]] + // [] +} + +func ExampleRegexp_FindSubmatchIndex() { + re := regexp.MustCompile(`a(x*)b`) + // Indices: + // 01234567 012345678 + // -ab-axb- -axxb-ab- + fmt.Println(re.FindSubmatchIndex([]byte("-ab-"))) + fmt.Println(re.FindSubmatchIndex([]byte("-axxb-"))) + fmt.Println(re.FindSubmatchIndex([]byte("-ab-axb-"))) + fmt.Println(re.FindSubmatchIndex([]byte("-axxb-ab-"))) + fmt.Println(re.FindSubmatchIndex([]byte("-foo-"))) + // Output: + // [1 3 2 2] + // [1 5 2 4] + // [1 3 2 2] + // [1 5 2 4] + // [] +} + +func ExampleRegexp_Longest() { + re := regexp.MustCompile(`a(|b)`) + fmt.Println(re.FindString("ab")) + re.Longest() + fmt.Println(re.FindString("ab")) + // Output: + // a + // ab +} + +func ExampleRegexp_MatchString() { + re := regexp.MustCompile(`(gopher){2}`) + fmt.Println(re.MatchString("gopher")) + fmt.Println(re.MatchString("gophergopher")) + fmt.Println(re.MatchString("gophergophergopher")) + // Output: + // false + // true + // true +} + +func ExampleRegexp_NumSubexp() { + re0 := regexp.MustCompile(`a.`) + fmt.Printf("%d\n", re0.NumSubexp()) + + re := regexp.MustCompile(`(.*)((a)b)(.*)a`) + fmt.Println(re.NumSubexp()) + // Output: + // 0 + // 4 +} + +func ExampleRegexp_ReplaceAll() { + re := regexp.MustCompile(`a(x*)b`) + fmt.Printf("%s\n", re.ReplaceAll([]byte("-ab-axxb-"), []byte("T"))) + fmt.Printf("%s\n", re.ReplaceAll([]byte("-ab-axxb-"), []byte("$1"))) + fmt.Printf("%s\n", re.ReplaceAll([]byte("-ab-axxb-"), []byte("$1W"))) + fmt.Printf("%s\n", re.ReplaceAll([]byte("-ab-axxb-"), []byte("${1}W"))) + // Output: + // -T-T- + // --xx- + // --- + // -W-xxW- +} + +func ExampleRegexp_ReplaceAllLiteralString() { + re := regexp.MustCompile(`a(x*)b`) + fmt.Println(re.ReplaceAllLiteralString("-ab-axxb-", "T")) + fmt.Println(re.ReplaceAllLiteralString("-ab-axxb-", "$1")) + fmt.Println(re.ReplaceAllLiteralString("-ab-axxb-", "${1}")) + // Output: + // -T-T- + // -$1-$1- + // -${1}-${1}- +} + +func ExampleRegexp_ReplaceAllString() { + re := 
regexp.MustCompile(`a(x*)b`) + fmt.Println(re.ReplaceAllString("-ab-axxb-", "T")) + fmt.Println(re.ReplaceAllString("-ab-axxb-", "$1")) + fmt.Println(re.ReplaceAllString("-ab-axxb-", "$1W")) + fmt.Println(re.ReplaceAllString("-ab-axxb-", "${1}W")) + // Output: + // -T-T- + // --xx- + // --- + // -W-xxW- +} + +func ExampleRegexp_ReplaceAllStringFunc() { + re := regexp.MustCompile(`[^aeiou]`) + fmt.Println(re.ReplaceAllStringFunc("seafood fool", strings.ToUpper)) + // Output: + // SeaFooD FooL +} + +func ExampleRegexp_SubexpNames() { + re := regexp.MustCompile(`(?P<first>[a-zA-Z]+) (?P<last>[a-zA-Z]+)`) + fmt.Println(re.MatchString("Alan Turing")) + fmt.Printf("%q\n", re.SubexpNames()) + reversed := fmt.Sprintf("${%s} ${%s}", re.SubexpNames()[2], re.SubexpNames()[1]) + fmt.Println(reversed) + fmt.Println(re.ReplaceAllString("Alan Turing", reversed)) + // Output: + // true + // ["" "first" "last"] + // ${last} ${first} + // Turing Alan +} + +func ExampleRegexp_SubexpIndex() { + re := regexp.MustCompile(`(?P<first>[a-zA-Z]+) (?P<last>[a-zA-Z]+)`) + fmt.Println(re.MatchString("Alan Turing")) + matches := re.FindStringSubmatch("Alan Turing") + lastIndex := re.SubexpIndex("last") + fmt.Printf("last => %d\n", lastIndex) + fmt.Println(matches[lastIndex]) + // Output: + // true + // last => 2 + // Turing +} + +func ExampleRegexp_Split() { + a := regexp.MustCompile(`a`) + fmt.Println(a.Split("banana", -1)) + fmt.Println(a.Split("banana", 0)) + fmt.Println(a.Split("banana", 1)) + fmt.Println(a.Split("banana", 2)) + zp := regexp.MustCompile(`z+`) + fmt.Println(zp.Split("pizza", -1)) + fmt.Println(zp.Split("pizza", 0)) + fmt.Println(zp.Split("pizza", 1)) + fmt.Println(zp.Split("pizza", 2)) + // Output: + // [b n n ] + // [] + // [banana] + // [b nana] + // [pi a] + // [] + // [pizza] + // [pi a] +} + +func ExampleRegexp_Expand() { + content := []byte(` + # comment line + option1: value1 + option2: value2 + + # another comment line + option3: value3 +`) + + // Regex pattern captures "key: value" pair from the content. + pattern := regexp.MustCompile(`(?m)(?P<key>\w+):\s+(?P<value>\w+)$`) + + // Template to convert "key: value" to "key=value" by + // referencing the values captured by the regex pattern. + template := []byte("$key=$value\n") + + result := []byte{} + + // For each match of the regex in the content. + for _, submatches := range pattern.FindAllSubmatchIndex(content, -1) { + // Apply the captured submatches to the template and append the output + // to the result. + result = pattern.Expand(result, template, content, submatches) + } + fmt.Println(string(result)) + // Output: + // option1=value1 + // option2=value2 + // option3=value3 +} + +func ExampleRegexp_ExpandString() { + content := ` + # comment line + option1: value1 + option2: value2 + + # another comment line + option3: value3 +` + + // Regex pattern captures "key: value" pair from the content. + pattern := regexp.MustCompile(`(?m)(?P<key>\w+):\s+(?P<value>\w+)$`) + + // Template to convert "key: value" to "key=value" by + // referencing the values captured by the regex pattern. + template := "$key=$value\n" + + result := []byte{} + + // For each match of the regex in the content. + for _, submatches := range pattern.FindAllStringSubmatchIndex(content, -1) { + // Apply the captured submatches to the template and append the output + // to the result. 
+ result = pattern.ExpandString(result, template, content, submatches) + } + fmt.Println(string(result)) + // Output: + // option1=value1 + // option2=value2 + // option3=value3 +} + +func ExampleRegexp_FindIndex() { + content := []byte(` + # comment line + option1: value1 + option2: value2 +`) + // Regex pattern captures "key: value" pair from the content. + pattern := regexp.MustCompile(`(?m)(?P<key>\w+):\s+(?P<value>\w+)$`) + + loc := pattern.FindIndex(content) + fmt.Println(loc) + fmt.Println(string(content[loc[0]:loc[1]])) + // Output: + // [18 33] + // option1: value1 +} + +func ExampleRegexp_FindAllSubmatchIndex() { + content := []byte(` + # comment line + option1: value1 + option2: value2 +`) + // Regex pattern captures "key: value" pair from the content. + pattern := regexp.MustCompile(`(?m)(?P<key>\w+):\s+(?P<value>\w+)$`) + allIndexes := pattern.FindAllSubmatchIndex(content, -1) + for _, loc := range allIndexes { + fmt.Println(loc) + fmt.Println(string(content[loc[0]:loc[1]])) + fmt.Println(string(content[loc[2]:loc[3]])) + fmt.Println(string(content[loc[4]:loc[5]])) + } + // Output: + // [18 33 18 25 27 33] + // option1: value1 + // option1 + // value1 + // [35 50 35 42 44 50] + // option2: value2 + // option2 + // value2 +} + +func ExampleRegexp_FindAllIndex() { + content := []byte("London") + re := regexp.MustCompile(`o.`) + fmt.Println(re.FindAllIndex(content, 1)) + fmt.Println(re.FindAllIndex(content, -1)) + // Output: + // [[1 3]] + // [[1 3] [4 6]] +} diff --git a/src/regexp/exec.go b/src/regexp/exec.go new file mode 100644 index 0000000..3fc4b68 --- /dev/null +++ b/src/regexp/exec.go @@ -0,0 +1,554 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package regexp + +import ( + "io" + "regexp/syntax" + "sync" +) + +// A queue is a 'sparse array' holding pending threads of execution. +// See https://research.swtch.com/2008/03/using-uninitialized-memory-for-fun-and.html +type queue struct { + sparse []uint32 + dense []entry +} + +// An entry is an entry on a queue. +// It holds both the instruction pc and the actual thread. +// Some queue entries are just place holders so that the machine +// knows it has considered that pc. Such entries have t == nil. +type entry struct { + pc uint32 + t *thread +} + +// A thread is the state of a single path through the machine: +// an instruction and a corresponding capture array. +// See https://swtch.com/~rsc/regexp/regexp2.html +type thread struct { + inst *syntax.Inst + cap []int +} + +// A machine holds all the state during an NFA simulation for p. +type machine struct { + re *Regexp // corresponding Regexp + p *syntax.Prog // compiled program + q0, q1 queue // two queues for runq, nextq + pool []*thread // pool of available threads + matched bool // whether a match was found + matchcap []int // capture information for the match + + inputs inputs +} + +type inputs struct { + // cached inputs, to avoid allocation + bytes inputBytes + string inputString + reader inputReader +} + +func (i *inputs) newBytes(b []byte) input { + i.bytes.str = b + return &i.bytes +} + +func (i *inputs) newString(s string) input { + i.string.str = s + return &i.string +} + +func (i *inputs) newReader(r io.RuneReader) input { + i.reader.r = r + i.reader.atEOT = false + i.reader.pos = 0 + return &i.reader +} + +func (i *inputs) clear() { + // We need to clear 1 of these. + // Avoid the expense of clearing the others (pointer write barrier). 
+ if i.bytes.str != nil { + i.bytes.str = nil + } else if i.reader.r != nil { + i.reader.r = nil + } else { + i.string.str = "" + } +} + +func (i *inputs) init(r io.RuneReader, b []byte, s string) (input, int) { + if r != nil { + return i.newReader(r), 0 + } + if b != nil { + return i.newBytes(b), len(b) + } + return i.newString(s), len(s) +} + +func (m *machine) init(ncap int) { + for _, t := range m.pool { + t.cap = t.cap[:ncap] + } + m.matchcap = m.matchcap[:ncap] +} + +// alloc allocates a new thread with the given instruction. +// It uses the free pool if possible. +func (m *machine) alloc(i *syntax.Inst) *thread { + var t *thread + if n := len(m.pool); n > 0 { + t = m.pool[n-1] + m.pool = m.pool[:n-1] + } else { + t = new(thread) + t.cap = make([]int, len(m.matchcap), cap(m.matchcap)) + } + t.inst = i + return t +} + +// A lazyFlag is a lazily-evaluated syntax.EmptyOp, +// for checking zero-width flags like ^ $ \A \z \B \b. +// It records the pair of relevant runes and does not +// determine the implied flags until absolutely necessary +// (most of the time, that means never). +type lazyFlag uint64 + +func newLazyFlag(r1, r2 rune) lazyFlag { + return lazyFlag(uint64(r1)<<32 | uint64(uint32(r2))) +} + +func (f lazyFlag) match(op syntax.EmptyOp) bool { + if op == 0 { + return true + } + r1 := rune(f >> 32) + if op&syntax.EmptyBeginLine != 0 { + if r1 != '\n' && r1 >= 0 { + return false + } + op &^= syntax.EmptyBeginLine + } + if op&syntax.EmptyBeginText != 0 { + if r1 >= 0 { + return false + } + op &^= syntax.EmptyBeginText + } + if op == 0 { + return true + } + r2 := rune(f) + if op&syntax.EmptyEndLine != 0 { + if r2 != '\n' && r2 >= 0 { + return false + } + op &^= syntax.EmptyEndLine + } + if op&syntax.EmptyEndText != 0 { + if r2 >= 0 { + return false + } + op &^= syntax.EmptyEndText + } + if op == 0 { + return true + } + if syntax.IsWordChar(r1) != syntax.IsWordChar(r2) { + op &^= syntax.EmptyWordBoundary + } else { + op &^= syntax.EmptyNoWordBoundary + } + return op == 0 +} + +// match runs the machine over the input starting at pos. +// It reports whether a match was found. +// If so, m.matchcap holds the submatch information. +func (m *machine) match(i input, pos int) bool { + startCond := m.re.cond + if startCond == ^syntax.EmptyOp(0) { // impossible + return false + } + m.matched = false + for i := range m.matchcap { + m.matchcap[i] = -1 + } + runq, nextq := &m.q0, &m.q1 + r, r1 := endOfText, endOfText + width, width1 := 0, 0 + r, width = i.step(pos) + if r != endOfText { + r1, width1 = i.step(pos + width) + } + var flag lazyFlag + if pos == 0 { + flag = newLazyFlag(-1, r) + } else { + flag = i.context(pos) + } + for { + if len(runq.dense) == 0 { + if startCond&syntax.EmptyBeginText != 0 && pos != 0 { + // Anchored match, past beginning of text. + break + } + if m.matched { + // Have match; finished exploring alternatives. + break + } + if len(m.re.prefix) > 0 && r1 != m.re.prefixRune && i.canCheckPrefix() { + // Match requires literal prefix; fast search for it. + advance := i.index(m.re, pos) + if advance < 0 { + break + } + pos += advance + r, width = i.step(pos) + r1, width1 = i.step(pos + width) + } + } + if !m.matched { + if len(m.matchcap) > 0 { + m.matchcap[0] = pos + } + m.add(runq, uint32(m.p.Start), pos, m.matchcap, &flag, nil) + } + flag = newLazyFlag(r, r1) + m.step(runq, nextq, pos, pos+width, r, &flag) + if width == 0 { + break + } + if len(m.matchcap) == 0 && m.matched { + // Found a match and not paying attention + // to where it is, so any match will do. 
+ break + } + pos += width + r, width = r1, width1 + if r != endOfText { + r1, width1 = i.step(pos + width) + } + runq, nextq = nextq, runq + } + m.clear(nextq) + return m.matched +} + +// clear frees all threads on the thread queue. +func (m *machine) clear(q *queue) { + for _, d := range q.dense { + if d.t != nil { + m.pool = append(m.pool, d.t) + } + } + q.dense = q.dense[:0] +} + +// step executes one step of the machine, running each of the threads +// on runq and appending new threads to nextq. +// The step processes the rune c (which may be endOfText), +// which starts at position pos and ends at nextPos. +// nextCond gives the setting for the empty-width flags after c. +func (m *machine) step(runq, nextq *queue, pos, nextPos int, c rune, nextCond *lazyFlag) { + longest := m.re.longest + for j := 0; j < len(runq.dense); j++ { + d := &runq.dense[j] + t := d.t + if t == nil { + continue + } + if longest && m.matched && len(t.cap) > 0 && m.matchcap[0] < t.cap[0] { + m.pool = append(m.pool, t) + continue + } + i := t.inst + add := false + switch i.Op { + default: + panic("bad inst") + + case syntax.InstMatch: + if len(t.cap) > 0 && (!longest || !m.matched || m.matchcap[1] < pos) { + t.cap[1] = pos + copy(m.matchcap, t.cap) + } + if !longest { + // First-match mode: cut off all lower-priority threads. + for _, d := range runq.dense[j+1:] { + if d.t != nil { + m.pool = append(m.pool, d.t) + } + } + runq.dense = runq.dense[:0] + } + m.matched = true + + case syntax.InstRune: + add = i.MatchRune(c) + case syntax.InstRune1: + add = c == i.Rune[0] + case syntax.InstRuneAny: + add = true + case syntax.InstRuneAnyNotNL: + add = c != '\n' + } + if add { + t = m.add(nextq, i.Out, nextPos, t.cap, nextCond, t) + } + if t != nil { + m.pool = append(m.pool, t) + } + } + runq.dense = runq.dense[:0] +} + +// add adds an entry to q for pc, unless the q already has such an entry. +// It also recursively adds an entry for all instructions reachable from pc by following +// empty-width conditions satisfied by cond. pos gives the current position +// in the input. 
+func (m *machine) add(q *queue, pc uint32, pos int, cap []int, cond *lazyFlag, t *thread) *thread { +Again: + if pc == 0 { + return t + } + if j := q.sparse[pc]; j < uint32(len(q.dense)) && q.dense[j].pc == pc { + return t + } + + j := len(q.dense) + q.dense = q.dense[:j+1] + d := &q.dense[j] + d.t = nil + d.pc = pc + q.sparse[pc] = uint32(j) + + i := &m.p.Inst[pc] + switch i.Op { + default: + panic("unhandled") + case syntax.InstFail: + // nothing + case syntax.InstAlt, syntax.InstAltMatch: + t = m.add(q, i.Out, pos, cap, cond, t) + pc = i.Arg + goto Again + case syntax.InstEmptyWidth: + if cond.match(syntax.EmptyOp(i.Arg)) { + pc = i.Out + goto Again + } + case syntax.InstNop: + pc = i.Out + goto Again + case syntax.InstCapture: + if int(i.Arg) < len(cap) { + opos := cap[i.Arg] + cap[i.Arg] = pos + m.add(q, i.Out, pos, cap, cond, nil) + cap[i.Arg] = opos + } else { + pc = i.Out + goto Again + } + case syntax.InstMatch, syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: + if t == nil { + t = m.alloc(i) + } else { + t.inst = i + } + if len(cap) > 0 && &t.cap[0] != &cap[0] { + copy(t.cap, cap) + } + d.t = t + t = nil + } + return t +} + +type onePassMachine struct { + inputs inputs + matchcap []int +} + +var onePassPool sync.Pool + +func newOnePassMachine() *onePassMachine { + m, ok := onePassPool.Get().(*onePassMachine) + if !ok { + m = new(onePassMachine) + } + return m +} + +func freeOnePassMachine(m *onePassMachine) { + m.inputs.clear() + onePassPool.Put(m) +} + +// doOnePass implements r.doExecute using the one-pass execution engine. +func (re *Regexp) doOnePass(ir io.RuneReader, ib []byte, is string, pos, ncap int, dstCap []int) []int { + startCond := re.cond + if startCond == ^syntax.EmptyOp(0) { // impossible + return nil + } + + m := newOnePassMachine() + if cap(m.matchcap) < ncap { + m.matchcap = make([]int, ncap) + } else { + m.matchcap = m.matchcap[:ncap] + } + + matched := false + for i := range m.matchcap { + m.matchcap[i] = -1 + } + + i, _ := m.inputs.init(ir, ib, is) + + r, r1 := endOfText, endOfText + width, width1 := 0, 0 + r, width = i.step(pos) + if r != endOfText { + r1, width1 = i.step(pos + width) + } + var flag lazyFlag + if pos == 0 { + flag = newLazyFlag(-1, r) + } else { + flag = i.context(pos) + } + pc := re.onepass.Start + inst := &re.onepass.Inst[pc] + // If there is a simple literal prefix, skip over it. + if pos == 0 && flag.match(syntax.EmptyOp(inst.Arg)) && + len(re.prefix) > 0 && i.canCheckPrefix() { + // Match requires literal prefix; fast search for it. 
+ if !i.hasPrefix(re) { + goto Return + } + pos += len(re.prefix) + r, width = i.step(pos) + r1, width1 = i.step(pos + width) + flag = i.context(pos) + pc = int(re.prefixEnd) + } + for { + inst = &re.onepass.Inst[pc] + pc = int(inst.Out) + switch inst.Op { + default: + panic("bad inst") + case syntax.InstMatch: + matched = true + if len(m.matchcap) > 0 { + m.matchcap[0] = 0 + m.matchcap[1] = pos + } + goto Return + case syntax.InstRune: + if !inst.MatchRune(r) { + goto Return + } + case syntax.InstRune1: + if r != inst.Rune[0] { + goto Return + } + case syntax.InstRuneAny: + // Nothing + case syntax.InstRuneAnyNotNL: + if r == '\n' { + goto Return + } + // peek at the input rune to see which branch of the Alt to take + case syntax.InstAlt, syntax.InstAltMatch: + pc = int(onePassNext(inst, r)) + continue + case syntax.InstFail: + goto Return + case syntax.InstNop: + continue + case syntax.InstEmptyWidth: + if !flag.match(syntax.EmptyOp(inst.Arg)) { + goto Return + } + continue + case syntax.InstCapture: + if int(inst.Arg) < len(m.matchcap) { + m.matchcap[inst.Arg] = pos + } + continue + } + if width == 0 { + break + } + flag = newLazyFlag(r, r1) + pos += width + r, width = r1, width1 + if r != endOfText { + r1, width1 = i.step(pos + width) + } + } + +Return: + if !matched { + freeOnePassMachine(m) + return nil + } + + dstCap = append(dstCap, m.matchcap...) + freeOnePassMachine(m) + return dstCap +} + +// doMatch reports whether either r, b or s match the regexp. +func (re *Regexp) doMatch(r io.RuneReader, b []byte, s string) bool { + return re.doExecute(r, b, s, 0, 0, nil) != nil +} + +// doExecute finds the leftmost match in the input, appends the position +// of its subexpressions to dstCap and returns dstCap. +// +// nil is returned if no matches are found and non-nil if matches are found. +func (re *Regexp) doExecute(r io.RuneReader, b []byte, s string, pos int, ncap int, dstCap []int) []int { + if dstCap == nil { + // Make sure 'return dstCap' is non-nil. + dstCap = arrayNoInts[:0:0] + } + + if r == nil && len(b)+len(s) < re.minInputLen { + return nil + } + + if re.onepass != nil { + return re.doOnePass(r, b, s, pos, ncap, dstCap) + } + if r == nil && len(b)+len(s) < re.maxBitStateLen { + return re.backtrack(b, s, pos, ncap, dstCap) + } + + m := re.get() + i, _ := m.inputs.init(r, b, s) + + m.init(ncap) + if !m.match(i, pos) { + re.put(m) + return nil + } + + dstCap = append(dstCap, m.matchcap...) + re.put(m) + return dstCap +} + +// arrayNoInts is returned by doExecute match if nil dstCap is passed +// to it with ncap=0. +var arrayNoInts [0]int diff --git a/src/regexp/exec2_test.go b/src/regexp/exec2_test.go new file mode 100644 index 0000000..b6dac4a --- /dev/null +++ b/src/regexp/exec2_test.go @@ -0,0 +1,20 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !race + +package regexp + +import ( + "testing" +) + +// This test is excluded when running under the race detector because +// it is a very expensive test and takes too long. +func TestRE2Exhaustive(t *testing.T) { + if testing.Short() { + t.Skip("skipping TestRE2Exhaustive during short test") + } + testRE2(t, "testdata/re2-exhaustive.txt.bz2") +} diff --git a/src/regexp/exec_test.go b/src/regexp/exec_test.go new file mode 100644 index 0000000..1694230 --- /dev/null +++ b/src/regexp/exec_test.go @@ -0,0 +1,747 @@ +// Copyright 2010 The Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package regexp + +import ( + "bufio" + "compress/bzip2" + "fmt" + "internal/testenv" + "io" + "os" + "path/filepath" + "regexp/syntax" + "strconv" + "strings" + "testing" + "unicode/utf8" +) + +// TestRE2 tests this package's regexp API against test cases +// considered during RE2's exhaustive tests, which run all possible +// regexps over a given set of atoms and operators, up to a given +// complexity, over all possible strings over a given alphabet, +// up to a given size. Rather than try to link with RE2, we read a +// log file containing the test cases and the expected matches. +// The log file, re2-exhaustive.txt, is generated by running 'make log' +// in the open source RE2 distribution https://github.com/google/re2/. +// +// The test file format is a sequence of stanzas like: +// +// strings +// "abc" +// "123x" +// regexps +// "[a-z]+" +// 0-3;0-3 +// -;- +// "([0-9])([0-9])([0-9])" +// -;- +// -;0-3 0-1 1-2 2-3 +// +// The stanza begins by defining a set of strings, quoted +// using Go double-quote syntax, one per line. Then the +// regexps section gives a sequence of regexps to run on +// the strings. In the block that follows a regexp, each line +// gives the semicolon-separated match results of running +// the regexp on the corresponding string. +// Each match result is either a single -, meaning no match, or a +// space-separated sequence of pairs giving the match and +// submatch indices. An unmatched subexpression formats +// its pair as a single - (not illustrated above). For now +// each regexp run produces two match results, one for a +// “full match” that restricts the regexp to matching the entire +// string or nothing, and one for a “partial match” that gives +// the leftmost first match found in the string. +// +// Lines beginning with # are comments. Lines beginning with +// a capital letter are test names printed during RE2's test suite +// and are echoed into t but otherwise ignored. +// +// At time of writing, re2-exhaustive.txt is 59 MB but compresses to 385 kB, +// so we store re2-exhaustive.txt.bz2 in the repository and decompress it on the fly. +func TestRE2Search(t *testing.T) { + testRE2(t, "testdata/re2-search.txt") +} + +func testRE2(t *testing.T, file string) { + f, err := os.Open(file) + if err != nil { + t.Fatal(err) + } + defer f.Close() + var txt io.Reader + if strings.HasSuffix(file, ".bz2") { + z := bzip2.NewReader(f) + txt = z + file = file[:len(file)-len(".bz2")] // for error messages + } else { + txt = f + } + lineno := 0 + scanner := bufio.NewScanner(txt) + var ( + str []string + input []string + inStrings bool + re *Regexp + refull *Regexp + nfail int + ncase int + ) + for lineno := 1; scanner.Scan(); lineno++ { + line := scanner.Text() + switch { + case line == "": + t.Fatalf("%s:%d: unexpected blank line", file, lineno) + case line[0] == '#': + continue + case 'A' <= line[0] && line[0] <= 'Z': + // Test name. + t.Logf("%s\n", line) + continue + case line == "strings": + str = str[:0] + inStrings = true + case line == "regexps": + inStrings = false + case line[0] == '"': + q, err := strconv.Unquote(line) + if err != nil { + // Fatal because we'll get out of sync. + t.Fatalf("%s:%d: unquote %s: %v", file, lineno, line, err) + } + if inStrings { + str = append(str, q) + continue + } + // Is a regexp. 
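+			// The "full match" variants below run a copy of the pattern
+			// wrapped as \A(?:pat)\z, so e.g. "[a-z]+" is also compiled as
+			// `\A(?:[a-z]+)\z`; each result line that follows must carry one
+			// semicolon-separated field per entry in the run table.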
+ if len(input) != 0 { + t.Fatalf("%s:%d: out of sync: have %d strings left before %#q", file, lineno, len(input), q) + } + re, err = tryCompile(q) + if err != nil { + if err.Error() == "error parsing regexp: invalid escape sequence: `\\C`" { + // We don't and likely never will support \C; keep going. + continue + } + t.Errorf("%s:%d: compile %#q: %v", file, lineno, q, err) + if nfail++; nfail >= 100 { + t.Fatalf("stopping after %d errors", nfail) + } + continue + } + full := `\A(?:` + q + `)\z` + refull, err = tryCompile(full) + if err != nil { + // Fatal because q worked, so this should always work. + t.Fatalf("%s:%d: compile full %#q: %v", file, lineno, full, err) + } + input = str + case line[0] == '-' || '0' <= line[0] && line[0] <= '9': + // A sequence of match results. + ncase++ + if re == nil { + // Failed to compile: skip results. + continue + } + if len(input) == 0 { + t.Fatalf("%s:%d: out of sync: no input remaining", file, lineno) + } + var text string + text, input = input[0], input[1:] + if !isSingleBytes(text) && strings.Contains(re.String(), `\B`) { + // RE2's \B considers every byte position, + // so it sees 'not word boundary' in the + // middle of UTF-8 sequences. This package + // only considers the positions between runes, + // so it disagrees. Skip those cases. + continue + } + res := strings.Split(line, ";") + if len(res) != len(run) { + t.Fatalf("%s:%d: have %d test results, want %d", file, lineno, len(res), len(run)) + } + for i := range res { + have, suffix := run[i](re, refull, text) + want := parseResult(t, file, lineno, res[i]) + if !same(have, want) { + t.Errorf("%s:%d: %#q%s.FindSubmatchIndex(%#q) = %v, want %v", file, lineno, re, suffix, text, have, want) + if nfail++; nfail >= 100 { + t.Fatalf("stopping after %d errors", nfail) + } + continue + } + b, suffix := match[i](re, refull, text) + if b != (want != nil) { + t.Errorf("%s:%d: %#q%s.MatchString(%#q) = %v, want %v", file, lineno, re, suffix, text, b, !b) + if nfail++; nfail >= 100 { + t.Fatalf("stopping after %d errors", nfail) + } + continue + } + } + + default: + t.Fatalf("%s:%d: out of sync: %s\n", file, lineno, line) + } + } + if err := scanner.Err(); err != nil { + t.Fatalf("%s:%d: %v", file, lineno, err) + } + if len(input) != 0 { + t.Fatalf("%s:%d: out of sync: have %d strings left at EOF", file, lineno, len(input)) + } + t.Logf("%d cases tested", ncase) +} + +var run = []func(*Regexp, *Regexp, string) ([]int, string){ + runFull, + runPartial, + runFullLongest, + runPartialLongest, +} + +func runFull(re, refull *Regexp, text string) ([]int, string) { + refull.longest = false + return refull.FindStringSubmatchIndex(text), "[full]" +} + +func runPartial(re, refull *Regexp, text string) ([]int, string) { + re.longest = false + return re.FindStringSubmatchIndex(text), "" +} + +func runFullLongest(re, refull *Regexp, text string) ([]int, string) { + refull.longest = true + return refull.FindStringSubmatchIndex(text), "[full,longest]" +} + +func runPartialLongest(re, refull *Regexp, text string) ([]int, string) { + re.longest = true + return re.FindStringSubmatchIndex(text), "[longest]" +} + +var match = []func(*Regexp, *Regexp, string) (bool, string){ + matchFull, + matchPartial, + matchFullLongest, + matchPartialLongest, +} + +func matchFull(re, refull *Regexp, text string) (bool, string) { + refull.longest = false + return refull.MatchString(text), "[full]" +} + +func matchPartial(re, refull *Regexp, text string) (bool, string) { + re.longest = false + return re.MatchString(text), "" +} + +func 
matchFullLongest(re, refull *Regexp, text string) (bool, string) { + refull.longest = true + return refull.MatchString(text), "[full,longest]" +} + +func matchPartialLongest(re, refull *Regexp, text string) (bool, string) { + re.longest = true + return re.MatchString(text), "[longest]" +} + +func isSingleBytes(s string) bool { + for _, c := range s { + if c >= utf8.RuneSelf { + return false + } + } + return true +} + +func tryCompile(s string) (re *Regexp, err error) { + // Protect against panic during Compile. + defer func() { + if r := recover(); r != nil { + err = fmt.Errorf("panic: %v", r) + } + }() + return Compile(s) +} + +func parseResult(t *testing.T, file string, lineno int, res string) []int { + // A single - indicates no match. + if res == "-" { + return nil + } + // Otherwise, a space-separated list of pairs. + n := 1 + for j := 0; j < len(res); j++ { + if res[j] == ' ' { + n++ + } + } + out := make([]int, 2*n) + i := 0 + n = 0 + for j := 0; j <= len(res); j++ { + if j == len(res) || res[j] == ' ' { + // Process a single pair. - means no submatch. + pair := res[i:j] + if pair == "-" { + out[n] = -1 + out[n+1] = -1 + } else { + loStr, hiStr, _ := strings.Cut(pair, "-") + lo, err1 := strconv.Atoi(loStr) + hi, err2 := strconv.Atoi(hiStr) + if err1 != nil || err2 != nil || lo > hi { + t.Fatalf("%s:%d: invalid pair %s", file, lineno, pair) + } + out[n] = lo + out[n+1] = hi + } + n += 2 + i = j + 1 + } + } + return out +} + +func same(x, y []int) bool { + if len(x) != len(y) { + return false + } + for i, xi := range x { + if xi != y[i] { + return false + } + } + return true +} + +// TestFowler runs this package's regexp API against the +// POSIX regular expression tests collected by Glenn Fowler +// at http://www2.research.att.com/~astopen/testregex/testregex.html. +func TestFowler(t *testing.T) { + files, err := filepath.Glob("testdata/*.dat") + if err != nil { + t.Fatal(err) + } + for _, file := range files { + t.Log(file) + testFowler(t, file) + } +} + +var notab = MustCompilePOSIX(`[^\t]+`) + +func testFowler(t *testing.T, file string) { + f, err := os.Open(file) + if err != nil { + t.Error(err) + return + } + defer f.Close() + b := bufio.NewReader(f) + lineno := 0 + lastRegexp := "" +Reading: + for { + lineno++ + line, err := b.ReadString('\n') + if err != nil { + if err != io.EOF { + t.Errorf("%s:%d: %v", file, lineno, err) + } + break Reading + } + + // http://www2.research.att.com/~astopen/man/man1/testregex.html + // + // INPUT FORMAT + // Input lines may be blank, a comment beginning with #, or a test + // specification. A specification is five fields separated by one + // or more tabs. NULL denotes the empty string and NIL denotes the + // 0 pointer. + if line[0] == '#' || line[0] == '\n' { + continue Reading + } + line = line[:len(line)-1] + field := notab.FindAllString(line, -1) + for i, f := range field { + if f == "NULL" { + field[i] = "" + } + if f == "NIL" { + t.Logf("%s:%d: skip: %s", file, lineno, line) + continue Reading + } + } + if len(field) == 0 { + continue Reading + } + + // Field 1: the regex(3) flags to apply, one character per REG_feature + // flag. The test is skipped if REG_feature is not supported by the + // implementation. If the first character is not [BEASKLP] then the + // specification is a global control line. One or more of [BEASKLP] may be + // specified; the test will be repeated for each mode. 
+ // + // B basic BRE (grep, ed, sed) + // E REG_EXTENDED ERE (egrep) + // A REG_AUGMENTED ARE (egrep with negation) + // S REG_SHELL SRE (sh glob) + // K REG_SHELL|REG_AUGMENTED KRE (ksh glob) + // L REG_LITERAL LRE (fgrep) + // + // a REG_LEFT|REG_RIGHT implicit ^...$ + // b REG_NOTBOL lhs does not match ^ + // c REG_COMMENT ignore space and #...\n + // d REG_SHELL_DOT explicit leading . match + // e REG_NOTEOL rhs does not match $ + // f REG_MULTIPLE multiple \n separated patterns + // g FNM_LEADING_DIR testfnmatch only -- match until / + // h REG_MULTIREF multiple digit backref + // i REG_ICASE ignore case + // j REG_SPAN . matches \n + // k REG_ESCAPE \ to escape [...] delimiter + // l REG_LEFT implicit ^... + // m REG_MINIMAL minimal match + // n REG_NEWLINE explicit \n match + // o REG_ENCLOSED (|&) magic inside [@|&](...) + // p REG_SHELL_PATH explicit / match + // q REG_DELIMITED delimited pattern + // r REG_RIGHT implicit ...$ + // s REG_SHELL_ESCAPED \ not special + // t REG_MUSTDELIM all delimiters must be specified + // u standard unspecified behavior -- errors not counted + // v REG_CLASS_ESCAPE \ special inside [...] + // w REG_NOSUB no subexpression match array + // x REG_LENIENT let some errors slide + // y REG_LEFT regexec() implicit ^... + // z REG_NULL NULL subexpressions ok + // $ expand C \c escapes in fields 2 and 3 + // / field 2 is a regsubcomp() expression + // = field 3 is a regdecomp() expression + // + // Field 1 control lines: + // + // C set LC_COLLATE and LC_CTYPE to locale in field 2 + // + // ?test ... output field 5 if passed and != EXPECTED, silent otherwise + // &test ... output field 5 if current and previous passed + // |test ... output field 5 if current passed and previous failed + // ; ... output field 2 if previous failed + // {test ... skip if failed until } + // } end of skip + // + // : comment comment copied as output NOTE + // :comment:test :comment: ignored + // N[OTE] comment comment copied as output NOTE + // T[EST] comment comment + // + // number use number for nmatch (20 by default) + flag := field[0] + switch flag[0] { + case '?', '&', '|', ';', '{', '}': + // Ignore all the control operators. + // Just run everything. + flag = flag[1:] + if flag == "" { + continue Reading + } + case ':': + var ok bool + if _, flag, ok = strings.Cut(flag[1:], ":"); !ok { + t.Logf("skip: %s", line) + continue Reading + } + case 'C', 'N', 'T', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': + t.Logf("skip: %s", line) + continue Reading + } + + // Can check field count now that we've handled the myriad comment formats. + if len(field) < 4 { + t.Errorf("%s:%d: too few fields: %s", file, lineno, line) + continue Reading + } + + // Expand C escapes (a.k.a. Go escapes). + if strings.Contains(flag, "$") { + f := `"` + field[1] + `"` + if field[1], err = strconv.Unquote(f); err != nil { + t.Errorf("%s:%d: cannot unquote %s", file, lineno, f) + } + f = `"` + field[2] + `"` + if field[2], err = strconv.Unquote(f); err != nil { + t.Errorf("%s:%d: cannot unquote %s", file, lineno, f) + } + } + + // Field 2: the regular expression pattern; SAME uses the pattern from + // the previous specification. + // + if field[1] == "SAME" { + field[1] = lastRegexp + } + lastRegexp = field[1] + + // Field 3: the string to match. + text := field[2] + + // Field 4: the test outcome... 
+ ok, shouldCompile, shouldMatch, pos := parseFowlerResult(field[3]) + if !ok { + t.Errorf("%s:%d: cannot parse result %#q", file, lineno, field[3]) + continue Reading + } + + // Field 5: optional comment appended to the report. + + Testing: + // Run test once for each specified capital letter mode that we support. + for _, c := range flag { + pattern := field[1] + syn := syntax.POSIX | syntax.ClassNL + switch c { + default: + continue Testing + case 'E': + // extended regexp (what we support) + case 'L': + // literal + pattern = QuoteMeta(pattern) + } + + for _, c := range flag { + switch c { + case 'i': + syn |= syntax.FoldCase + } + } + + re, err := compile(pattern, syn, true) + if err != nil { + if shouldCompile { + t.Errorf("%s:%d: %#q did not compile", file, lineno, pattern) + } + continue Testing + } + if !shouldCompile { + t.Errorf("%s:%d: %#q should not compile", file, lineno, pattern) + continue Testing + } + match := re.MatchString(text) + if match != shouldMatch { + t.Errorf("%s:%d: %#q.Match(%#q) = %v, want %v", file, lineno, pattern, text, match, shouldMatch) + continue Testing + } + have := re.FindStringSubmatchIndex(text) + if (len(have) > 0) != match { + t.Errorf("%s:%d: %#q.Match(%#q) = %v, but %#q.FindSubmatchIndex(%#q) = %v", file, lineno, pattern, text, match, pattern, text, have) + continue Testing + } + if len(have) > len(pos) { + have = have[:len(pos)] + } + if !same(have, pos) { + t.Errorf("%s:%d: %#q.FindSubmatchIndex(%#q) = %v, want %v", file, lineno, pattern, text, have, pos) + } + } + } +} + +func parseFowlerResult(s string) (ok, compiled, matched bool, pos []int) { + // Field 4: the test outcome. This is either one of the posix error + // codes (with REG_ omitted) or the match array, a list of (m,n) + // entries with m and n being first and last+1 positions in the + // field 3 string, or NULL if REG_NOSUB is in effect and success + // is expected. BADPAT is acceptable in place of any regcomp(3) + // error code. The match[] array is initialized to (-2,-2) before + // each test. All array elements from 0 to nmatch-1 must be specified + // in the outcome. Unspecified endpoints (offset -1) are denoted by ?. + // Unset endpoints (offset -2) are denoted by X. {x}(o:n) denotes a + // matched (?{...}) expression, where x is the text enclosed by {...}, + // o is the expression ordinal counting from 1, and n is the length of + // the unmatched portion of the subject string. If x starts with a + // number then that is the return value of re_execf(), otherwise 0 is + // returned. + switch { + case s == "": + // Match with no position information. + ok = true + compiled = true + matched = true + return + case s == "NOMATCH": + // Match failure. + ok = true + compiled = true + matched = false + return + case 'A' <= s[0] && s[0] <= 'Z': + // All the other error codes are compile errors. + ok = true + compiled = false + return + } + compiled = true + + var x []int + for s != "" { + var end byte = ')' + if len(x)%2 == 0 { + if s[0] != '(' { + ok = false + return + } + s = s[1:] + end = ',' + } + i := 0 + for i < len(s) && s[i] != end { + i++ + } + if i == 0 || i == len(s) { + ok = false + return + } + var v = -1 + var err error + if s[:i] != "?" 
{ + v, err = strconv.Atoi(s[:i]) + if err != nil { + ok = false + return + } + } + x = append(x, v) + s = s[i+1:] + } + if len(x)%2 != 0 { + ok = false + return + } + ok = true + matched = true + pos = x + return +} + +var text []byte + +func makeText(n int) []byte { + if len(text) >= n { + return text[:n] + } + text = make([]byte, n) + x := ^uint32(0) + for i := range text { + x += x + x ^= 1 + if int32(x) < 0 { + x ^= 0x88888eef + } + if x%31 == 0 { + text[i] = '\n' + } else { + text[i] = byte(x%(0x7E+1-0x20) + 0x20) + } + } + return text +} + +func BenchmarkMatch(b *testing.B) { + isRaceBuilder := strings.HasSuffix(testenv.Builder(), "-race") + + for _, data := range benchData { + r := MustCompile(data.re) + for _, size := range benchSizes { + if (isRaceBuilder || testing.Short()) && size.n > 1<<10 { + continue + } + t := makeText(size.n) + b.Run(data.name+"/"+size.name, func(b *testing.B) { + b.SetBytes(int64(size.n)) + for i := 0; i < b.N; i++ { + if r.Match(t) { + b.Fatal("match!") + } + } + }) + } + } +} + +func BenchmarkMatch_onepass_regex(b *testing.B) { + isRaceBuilder := strings.HasSuffix(testenv.Builder(), "-race") + r := MustCompile(`(?s)\A.*\z`) + if r.onepass == nil { + b.Fatalf("want onepass regex, but %q is not onepass", r) + } + for _, size := range benchSizes { + if (isRaceBuilder || testing.Short()) && size.n > 1<<10 { + continue + } + t := makeText(size.n) + b.Run(size.name, func(b *testing.B) { + b.SetBytes(int64(size.n)) + b.ReportAllocs() + for i := 0; i < b.N; i++ { + if !r.Match(t) { + b.Fatal("not match!") + } + } + }) + } +} + +var benchData = []struct{ name, re string }{ + {"Easy0", "ABCDEFGHIJKLMNOPQRSTUVWXYZ$"}, + {"Easy0i", "(?i)ABCDEFGHIJklmnopqrstuvwxyz$"}, + {"Easy1", "A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$"}, + {"Medium", "[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ$"}, + {"Hard", "[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ$"}, + {"Hard1", "ABCD|CDEF|EFGH|GHIJ|IJKL|KLMN|MNOP|OPQR|QRST|STUV|UVWX|WXYZ"}, +} + +var benchSizes = []struct { + name string + n int +}{ + {"16", 16}, + {"32", 32}, + {"1K", 1 << 10}, + {"32K", 32 << 10}, + {"1M", 1 << 20}, + {"32M", 32 << 20}, +} + +func TestLongest(t *testing.T) { + re, err := Compile(`a(|b)`) + if err != nil { + t.Fatal(err) + } + if g, w := re.FindString("ab"), "a"; g != w { + t.Errorf("first match was %q, want %q", g, w) + } + re.Longest() + if g, w := re.FindString("ab"), "ab"; g != w { + t.Errorf("longest match was %q, want %q", g, w) + } +} + +// TestProgramTooLongForBacktrack tests that a regex which is too long +// for the backtracker still executes properly. 
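+// (The backtracking engine is only used for sufficiently small compiled
+// programs; an alternation this long should exceed that limit, so doExecute
+// is expected to fall back to the full NFA engine instead.)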
+func TestProgramTooLongForBacktrack(t *testing.T) { + longRegex := MustCompile(`(one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|twentyone|twentytwo|twentythree|twentyfour|twentyfive|twentysix|twentyseven|twentyeight|twentynine|thirty|thirtyone|thirtytwo|thirtythree|thirtyfour|thirtyfive|thirtysix|thirtyseven|thirtyeight|thirtynine|forty|fortyone|fortytwo|fortythree|fortyfour|fortyfive|fortysix|fortyseven|fortyeight|fortynine|fifty|fiftyone|fiftytwo|fiftythree|fiftyfour|fiftyfive|fiftysix|fiftyseven|fiftyeight|fiftynine|sixty|sixtyone|sixtytwo|sixtythree|sixtyfour|sixtyfive|sixtysix|sixtyseven|sixtyeight|sixtynine|seventy|seventyone|seventytwo|seventythree|seventyfour|seventyfive|seventysix|seventyseven|seventyeight|seventynine|eighty|eightyone|eightytwo|eightythree|eightyfour|eightyfive|eightysix|eightyseven|eightyeight|eightynine|ninety|ninetyone|ninetytwo|ninetythree|ninetyfour|ninetyfive|ninetysix|ninetyseven|ninetyeight|ninetynine|onehundred)`) + if !longRegex.MatchString("two") { + t.Errorf("longRegex.MatchString(\"two\") was false, want true") + } + if longRegex.MatchString("xxx") { + t.Errorf("longRegex.MatchString(\"xxx\") was true, want false") + } +} diff --git a/src/regexp/find_test.go b/src/regexp/find_test.go new file mode 100644 index 0000000..2edbe9b --- /dev/null +++ b/src/regexp/find_test.go @@ -0,0 +1,518 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package regexp + +import ( + "fmt" + "strings" + "testing" +) + +// For each pattern/text pair, what is the expected output of each function? +// We can derive the textual results from the indexed results, the non-submatch +// results from the submatched results, the single results from the 'all' results, +// and the byte results from the string results. Therefore the table includes +// only the FindAllStringSubmatchIndex result. 
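+//
+// For example, the entry {`a(b*)`, "abbaab", ...} below stores the indices
+// [[0 3 1 3] [3 4 4 4] [4 6 5 6]]; every other form can be read off from it:
+//
+//	re := MustCompile(`a(b*)`)
+//	re.FindString("abbaab")         // "abb"
+//	re.FindStringIndex("abbaab")    // [0 3]
+//	re.FindStringSubmatch("abbaab") // ["abb" "bb"]
+//	re.FindAllString("abbaab", -1)  // ["abb" "a" "ab"]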
+type FindTest struct { + pat string + text string + matches [][]int +} + +func (t FindTest) String() string { + return fmt.Sprintf("pat: %#q text: %#q", t.pat, t.text) +} + +var findTests = []FindTest{ + {``, ``, build(1, 0, 0)}, + {`^abcdefg`, "abcdefg", build(1, 0, 7)}, + {`a+`, "baaab", build(1, 1, 4)}, + {"abcd..", "abcdef", build(1, 0, 6)}, + {`a`, "a", build(1, 0, 1)}, + {`x`, "y", nil}, + {`b`, "abc", build(1, 1, 2)}, + {`.`, "a", build(1, 0, 1)}, + {`.*`, "abcdef", build(1, 0, 6)}, + {`^`, "abcde", build(1, 0, 0)}, + {`$`, "abcde", build(1, 5, 5)}, + {`^abcd$`, "abcd", build(1, 0, 4)}, + {`^bcd'`, "abcdef", nil}, + {`^abcd$`, "abcde", nil}, + {`a+`, "baaab", build(1, 1, 4)}, + {`a*`, "baaab", build(3, 0, 0, 1, 4, 5, 5)}, + {`[a-z]+`, "abcd", build(1, 0, 4)}, + {`[^a-z]+`, "ab1234cd", build(1, 2, 6)}, + {`[a\-\]z]+`, "az]-bcz", build(2, 0, 4, 6, 7)}, + {`[^\n]+`, "abcd\n", build(1, 0, 4)}, + {`[日本語]+`, "日本語日本語", build(1, 0, 18)}, + {`日本語+`, "日本語", build(1, 0, 9)}, + {`日本語+`, "日本語語語語", build(1, 0, 18)}, + {`()`, "", build(1, 0, 0, 0, 0)}, + {`(a)`, "a", build(1, 0, 1, 0, 1)}, + {`(.)(.)`, "日a", build(1, 0, 4, 0, 3, 3, 4)}, + {`(.*)`, "", build(1, 0, 0, 0, 0)}, + {`(.*)`, "abcd", build(1, 0, 4, 0, 4)}, + {`(..)(..)`, "abcd", build(1, 0, 4, 0, 2, 2, 4)}, + {`(([^xyz]*)(d))`, "abcd", build(1, 0, 4, 0, 4, 0, 3, 3, 4)}, + {`((a|b|c)*(d))`, "abcd", build(1, 0, 4, 0, 4, 2, 3, 3, 4)}, + {`(((a|b|c)*)(d))`, "abcd", build(1, 0, 4, 0, 4, 0, 3, 2, 3, 3, 4)}, + {`\a\f\n\r\t\v`, "\a\f\n\r\t\v", build(1, 0, 6)}, + {`[\a\f\n\r\t\v]+`, "\a\f\n\r\t\v", build(1, 0, 6)}, + + {`a*(|(b))c*`, "aacc", build(1, 0, 4, 2, 2, -1, -1)}, + {`(.*).*`, "ab", build(1, 0, 2, 0, 2)}, + {`[.]`, ".", build(1, 0, 1)}, + {`/$`, "/abc/", build(1, 4, 5)}, + {`/$`, "/abc", nil}, + + // multiple matches + {`.`, "abc", build(3, 0, 1, 1, 2, 2, 3)}, + {`(.)`, "abc", build(3, 0, 1, 0, 1, 1, 2, 1, 2, 2, 3, 2, 3)}, + {`.(.)`, "abcd", build(2, 0, 2, 1, 2, 2, 4, 3, 4)}, + {`ab*`, "abbaab", build(3, 0, 3, 3, 4, 4, 6)}, + {`a(b*)`, "abbaab", build(3, 0, 3, 1, 3, 3, 4, 4, 4, 4, 6, 5, 6)}, + + // fixed bugs + {`ab$`, "cab", build(1, 1, 3)}, + {`axxb$`, "axxcb", nil}, + {`data`, "daXY data", build(1, 5, 9)}, + {`da(.)a$`, "daXY data", build(1, 5, 9, 7, 8)}, + {`zx+`, "zzx", build(1, 1, 3)}, + {`ab$`, "abcab", build(1, 3, 5)}, + {`(aa)*$`, "a", build(1, 1, 1, -1, -1)}, + {`(?:.|(?:.a))`, "", nil}, + {`(?:A(?:A|a))`, "Aa", build(1, 0, 2)}, + {`(?:A|(?:A|a))`, "a", build(1, 0, 1)}, + {`(a){0}`, "", build(1, 0, 0, -1, -1)}, + {`(?-s)(?:(?:^).)`, "\n", nil}, + {`(?s)(?:(?:^).)`, "\n", build(1, 0, 1)}, + {`(?:(?:^).)`, "\n", nil}, + {`\b`, "x", build(2, 0, 0, 1, 1)}, + {`\b`, "xx", build(2, 0, 0, 2, 2)}, + {`\b`, "x y", build(4, 0, 0, 1, 1, 2, 2, 3, 3)}, + {`\b`, "xx yy", build(4, 0, 0, 2, 2, 3, 3, 5, 5)}, + {`\B`, "x", nil}, + {`\B`, "xx", build(1, 1, 1)}, + {`\B`, "x y", nil}, + {`\B`, "xx yy", build(2, 1, 1, 4, 4)}, + {`(|a)*`, "aa", build(3, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2)}, + + // RE2 tests + {`[^\S\s]`, "abcd", nil}, + {`[^\S[:space:]]`, "abcd", nil}, + {`[^\D\d]`, "abcd", nil}, + {`[^\D[:digit:]]`, "abcd", nil}, + {`(?i)\W`, "x", nil}, + {`(?i)\W`, "k", nil}, + {`(?i)\W`, "s", nil}, + + // can backslash-escape any punctuation + {`\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~`, + `!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, build(1, 0, 31)}, + {`[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~]+`, + `!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, build(1, 0, 31)}, + {"\\`", "`", build(1, 0, 1)}, + {"[\\`]+", "`", build(1, 0, 
1)}, + + {"\ufffd", "\xff", build(1, 0, 1)}, + {"\ufffd", "hello\xffworld", build(1, 5, 6)}, + {`.*`, "hello\xffworld", build(1, 0, 11)}, + {`\x{fffd}`, "\xc2\x00", build(1, 0, 1)}, + {"[\ufffd]", "\xff", build(1, 0, 1)}, + {`[\x{fffd}]`, "\xc2\x00", build(1, 0, 1)}, + + // long set of matches (longer than startSize) + { + ".", + "qwertyuiopasdfghjklzxcvbnm1234567890", + build(36, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, + 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, + 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36), + }, +} + +// build is a helper to construct a [][]int by extracting n sequences from x. +// This represents n matches with len(x)/n submatches each. +func build(n int, x ...int) [][]int { + ret := make([][]int, n) + runLength := len(x) / n + j := 0 + for i := range ret { + ret[i] = make([]int, runLength) + copy(ret[i], x[j:]) + j += runLength + if j > len(x) { + panic("invalid build entry") + } + } + return ret +} + +// First the simple cases. + +func TestFind(t *testing.T) { + for _, test := range findTests { + re := MustCompile(test.pat) + if re.String() != test.pat { + t.Errorf("String() = `%s`; should be `%s`", re.String(), test.pat) + } + result := re.Find([]byte(test.text)) + switch { + case len(test.matches) == 0 && len(result) == 0: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + expect := test.text[test.matches[0][0]:test.matches[0][1]] + if len(result) != cap(result) { + t.Errorf("expected capacity %d got %d: %s", len(result), cap(result), test) + } + if expect != string(result) { + t.Errorf("expected %q got %q: %s", expect, result, test) + } + } + } +} + +func TestFindString(t *testing.T) { + for _, test := range findTests { + result := MustCompile(test.pat).FindString(test.text) + switch { + case len(test.matches) == 0 && len(result) == 0: + // ok + case test.matches == nil && result != "": + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == "": + // Tricky because an empty result has two meanings: no match or empty match. 
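+			// For example, the table entry {`$`, "abcde", build(1, 5, 5)}
+			// matches only the empty string at offset 5, so FindString
+			// returns "" even though a match exists; an empty result is
+			// accepted here only when the expected match is itself empty.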
+ if test.matches[0][0] != test.matches[0][1] { + t.Errorf("expected match; got none: %s", test) + } + case test.matches != nil && result != "": + expect := test.text[test.matches[0][0]:test.matches[0][1]] + if expect != result { + t.Errorf("expected %q got %q: %s", expect, result, test) + } + } + } +} + +func testFindIndex(test *FindTest, result []int, t *testing.T) { + switch { + case len(test.matches) == 0 && len(result) == 0: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + expect := test.matches[0] + if expect[0] != result[0] || expect[1] != result[1] { + t.Errorf("expected %v got %v: %s", expect, result, test) + } + } +} + +func TestFindIndex(t *testing.T) { + for _, test := range findTests { + testFindIndex(&test, MustCompile(test.pat).FindIndex([]byte(test.text)), t) + } +} + +func TestFindStringIndex(t *testing.T) { + for _, test := range findTests { + testFindIndex(&test, MustCompile(test.pat).FindStringIndex(test.text), t) + } +} + +func TestFindReaderIndex(t *testing.T) { + for _, test := range findTests { + testFindIndex(&test, MustCompile(test.pat).FindReaderIndex(strings.NewReader(test.text)), t) + } +} + +// Now come the simple All cases. + +func TestFindAll(t *testing.T) { + for _, test := range findTests { + result := MustCompile(test.pat).FindAll([]byte(test.text), -1) + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Fatalf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + if len(test.matches) != len(result) { + t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) + continue + } + for k, e := range test.matches { + got := result[k] + if len(got) != cap(got) { + t.Errorf("match %d: expected capacity %d got %d: %s", k, len(got), cap(got), test) + } + expect := test.text[e[0]:e[1]] + if expect != string(got) { + t.Errorf("match %d: expected %q got %q: %s", k, expect, got, test) + } + } + } + } +} + +func TestFindAllString(t *testing.T) { + for _, test := range findTests { + result := MustCompile(test.pat).FindAllString(test.text, -1) + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + if len(test.matches) != len(result) { + t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) + continue + } + for k, e := range test.matches { + expect := test.text[e[0]:e[1]] + if expect != result[k] { + t.Errorf("expected %q got %q: %s", expect, result, test) + } + } + } + } +} + +func testFindAllIndex(test *FindTest, result [][]int, t *testing.T) { + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + if len(test.matches) != len(result) { + t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) + return + } + for k, e := range 
test.matches { + if e[0] != result[k][0] || e[1] != result[k][1] { + t.Errorf("match %d: expected %v got %v: %s", k, e, result[k], test) + } + } + } +} + +func TestFindAllIndex(t *testing.T) { + for _, test := range findTests { + testFindAllIndex(&test, MustCompile(test.pat).FindAllIndex([]byte(test.text), -1), t) + } +} + +func TestFindAllStringIndex(t *testing.T) { + for _, test := range findTests { + testFindAllIndex(&test, MustCompile(test.pat).FindAllStringIndex(test.text, -1), t) + } +} + +// Now come the Submatch cases. + +func testSubmatchBytes(test *FindTest, n int, submatches []int, result [][]byte, t *testing.T) { + if len(submatches) != len(result)*2 { + t.Errorf("match %d: expected %d submatches; got %d: %s", n, len(submatches)/2, len(result), test) + return + } + for k := 0; k < len(submatches); k += 2 { + if submatches[k] == -1 { + if result[k/2] != nil { + t.Errorf("match %d: expected nil got %q: %s", n, result, test) + } + continue + } + got := result[k/2] + if len(got) != cap(got) { + t.Errorf("match %d: expected capacity %d got %d: %s", n, len(got), cap(got), test) + return + } + expect := test.text[submatches[k]:submatches[k+1]] + if expect != string(got) { + t.Errorf("match %d: expected %q got %q: %s", n, expect, got, test) + return + } + } +} + +func TestFindSubmatch(t *testing.T) { + for _, test := range findTests { + result := MustCompile(test.pat).FindSubmatch([]byte(test.text)) + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + testSubmatchBytes(&test, 0, test.matches[0], result, t) + } + } +} + +func testSubmatchString(test *FindTest, n int, submatches []int, result []string, t *testing.T) { + if len(submatches) != len(result)*2 { + t.Errorf("match %d: expected %d submatches; got %d: %s", n, len(submatches)/2, len(result), test) + return + } + for k := 0; k < len(submatches); k += 2 { + if submatches[k] == -1 { + if result[k/2] != "" { + t.Errorf("match %d: expected nil got %q: %s", n, result, test) + } + continue + } + expect := test.text[submatches[k]:submatches[k+1]] + if expect != result[k/2] { + t.Errorf("match %d: expected %q got %q: %s", n, expect, result, test) + return + } + } +} + +func TestFindStringSubmatch(t *testing.T) { + for _, test := range findTests { + result := MustCompile(test.pat).FindStringSubmatch(test.text) + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + testSubmatchString(&test, 0, test.matches[0], result, t) + } + } +} + +func testSubmatchIndices(test *FindTest, n int, expect, result []int, t *testing.T) { + if len(expect) != len(result) { + t.Errorf("match %d: expected %d matches; got %d: %s", n, len(expect)/2, len(result)/2, test) + return + } + for k, e := range expect { + if e != result[k] { + t.Errorf("match %d: submatch error: expected %v got %v: %s", n, expect, result, test) + } + } +} + +func testFindSubmatchIndex(test *FindTest, result []int, t *testing.T) { + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches 
!= nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + testSubmatchIndices(test, 0, test.matches[0], result, t) + } +} + +func TestFindSubmatchIndex(t *testing.T) { + for _, test := range findTests { + testFindSubmatchIndex(&test, MustCompile(test.pat).FindSubmatchIndex([]byte(test.text)), t) + } +} + +func TestFindStringSubmatchIndex(t *testing.T) { + for _, test := range findTests { + testFindSubmatchIndex(&test, MustCompile(test.pat).FindStringSubmatchIndex(test.text), t) + } +} + +func TestFindReaderSubmatchIndex(t *testing.T) { + for _, test := range findTests { + testFindSubmatchIndex(&test, MustCompile(test.pat).FindReaderSubmatchIndex(strings.NewReader(test.text)), t) + } +} + +// Now come the monster AllSubmatch cases. + +func TestFindAllSubmatch(t *testing.T) { + for _, test := range findTests { + result := MustCompile(test.pat).FindAllSubmatch([]byte(test.text), -1) + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case len(test.matches) != len(result): + t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) + case test.matches != nil && result != nil: + for k, match := range test.matches { + testSubmatchBytes(&test, k, match, result[k], t) + } + } + } +} + +func TestFindAllStringSubmatch(t *testing.T) { + for _, test := range findTests { + result := MustCompile(test.pat).FindAllStringSubmatch(test.text, -1) + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case len(test.matches) != len(result): + t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) + case test.matches != nil && result != nil: + for k, match := range test.matches { + testSubmatchString(&test, k, match, result[k], t) + } + } + } +} + +func testFindAllSubmatchIndex(test *FindTest, result [][]int, t *testing.T) { + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case len(test.matches) != len(result): + t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) + case test.matches != nil && result != nil: + for k, match := range test.matches { + testSubmatchIndices(test, k, match, result[k], t) + } + } +} + +func TestFindAllSubmatchIndex(t *testing.T) { + for _, test := range findTests { + testFindAllSubmatchIndex(&test, MustCompile(test.pat).FindAllSubmatchIndex([]byte(test.text), -1), t) + } +} + +func TestFindAllStringSubmatchIndex(t *testing.T) { + for _, test := range findTests { + testFindAllSubmatchIndex(&test, MustCompile(test.pat).FindAllStringSubmatchIndex(test.text, -1), t) + } +} diff --git a/src/regexp/onepass.go b/src/regexp/onepass.go new file mode 100644 index 0000000..b3066e8 --- /dev/null +++ b/src/regexp/onepass.go @@ -0,0 +1,507 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package regexp + +import ( + "regexp/syntax" + "sort" + "strings" + "unicode" + "unicode/utf8" +) + +// "One-pass" regexp execution. +// Some regexps can be analyzed to determine that they never need +// backtracking: they are guaranteed to run in one pass over the string +// without bothering to save all the usual NFA state. +// Detect those and execute them more quickly. + +// A onePassProg is a compiled one-pass regular expression program. +// It is the same as syntax.Prog except for the use of onePassInst. +type onePassProg struct { + Inst []onePassInst + Start int // index of start instruction + NumCap int // number of InstCapture insts in re +} + +// A onePassInst is a single instruction in a one-pass regular expression program. +// It is the same as syntax.Inst except for the new 'Next' field. +type onePassInst struct { + syntax.Inst + Next []uint32 +} + +// onePassPrefix returns a literal string that all matches for the +// regexp must start with. Complete is true if the prefix +// is the entire match. Pc is the index of the last rune instruction +// in the string. The onePassPrefix skips over the mandatory +// EmptyBeginText. +func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) { + i := &p.Inst[p.Start] + if i.Op != syntax.InstEmptyWidth || (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText == 0 { + return "", i.Op == syntax.InstMatch, uint32(p.Start) + } + pc = i.Out + i = &p.Inst[pc] + for i.Op == syntax.InstNop { + pc = i.Out + i = &p.Inst[pc] + } + // Avoid allocation of buffer if prefix is empty. + if iop(i) != syntax.InstRune || len(i.Rune) != 1 { + return "", i.Op == syntax.InstMatch, uint32(p.Start) + } + + // Have prefix; gather characters. + var buf strings.Builder + for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 && i.Rune[0] != utf8.RuneError { + buf.WriteRune(i.Rune[0]) + pc, i = i.Out, &p.Inst[i.Out] + } + if i.Op == syntax.InstEmptyWidth && + syntax.EmptyOp(i.Arg)&syntax.EmptyEndText != 0 && + p.Inst[i.Out].Op == syntax.InstMatch { + complete = true + } + return buf.String(), complete, pc +} + +// onePassNext selects the next actionable state of the prog, based on the input character. +// It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine. +// One of the alternates may ultimately lead without input to end of line. If the instruction +// is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next. +func onePassNext(i *onePassInst, r rune) uint32 { + next := i.MatchRunePos(r) + if next >= 0 { + return i.Next[next] + } + if i.Op == syntax.InstAltMatch { + return i.Out + } + return 0 +} + +func iop(i *syntax.Inst) syntax.InstOp { + op := i.Op + switch op { + case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: + op = syntax.InstRune + } + return op +} + +// Sparse Array implementation is used as a queueOnePass. 
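+// A pc u is present exactly when sparse[u] < size && dense[sparse[u]] == u
+// (see contains below), so insert, contains and clear are all O(1) and the
+// two arrays never need to be zeroed between uses; slots beyond size may
+// hold stale values, which is why membership is double-checked.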
+type queueOnePass struct { + sparse []uint32 + dense []uint32 + size, nextIndex uint32 +} + +func (q *queueOnePass) empty() bool { + return q.nextIndex >= q.size +} + +func (q *queueOnePass) next() (n uint32) { + n = q.dense[q.nextIndex] + q.nextIndex++ + return +} + +func (q *queueOnePass) clear() { + q.size = 0 + q.nextIndex = 0 +} + +func (q *queueOnePass) contains(u uint32) bool { + if u >= uint32(len(q.sparse)) { + return false + } + return q.sparse[u] < q.size && q.dense[q.sparse[u]] == u +} + +func (q *queueOnePass) insert(u uint32) { + if !q.contains(u) { + q.insertNew(u) + } +} + +func (q *queueOnePass) insertNew(u uint32) { + if u >= uint32(len(q.sparse)) { + return + } + q.sparse[u] = q.size + q.dense[q.size] = u + q.size++ +} + +func newQueue(size int) (q *queueOnePass) { + return &queueOnePass{ + sparse: make([]uint32, size), + dense: make([]uint32, size), + } +} + +// mergeRuneSets merges two non-intersecting runesets, and returns the merged result, +// and a NextIp array. The idea is that if a rune matches the OnePassRunes at index +// i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a +// NextIp array with the single element mergeFailed is returned. +// The code assumes that both inputs contain ordered and non-intersecting rune pairs. +const mergeFailed = uint32(0xffffffff) + +var ( + noRune = []rune{} + noNext = []uint32{mergeFailed} +) + +func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) { + leftLen := len(*leftRunes) + rightLen := len(*rightRunes) + if leftLen&0x1 != 0 || rightLen&0x1 != 0 { + panic("mergeRuneSets odd length []rune") + } + var ( + lx, rx int + ) + merged := make([]rune, 0) + next := make([]uint32, 0) + ok := true + defer func() { + if !ok { + merged = nil + next = nil + } + }() + + ix := -1 + extend := func(newLow *int, newArray *[]rune, pc uint32) bool { + if ix > 0 && (*newArray)[*newLow] <= merged[ix] { + return false + } + merged = append(merged, (*newArray)[*newLow], (*newArray)[*newLow+1]) + *newLow += 2 + ix += 2 + next = append(next, pc) + return true + } + + for lx < leftLen || rx < rightLen { + switch { + case rx >= rightLen: + ok = extend(&lx, leftRunes, leftPC) + case lx >= leftLen: + ok = extend(&rx, rightRunes, rightPC) + case (*rightRunes)[rx] < (*leftRunes)[lx]: + ok = extend(&rx, rightRunes, rightPC) + default: + ok = extend(&lx, leftRunes, leftPC) + } + if !ok { + return noRune, noNext + } + } + return merged, next +} + +// cleanupOnePass drops working memory, and restores certain shortcut instructions. +func cleanupOnePass(prog *onePassProg, original *syntax.Prog) { + for ix, instOriginal := range original.Inst { + switch instOriginal.Op { + case syntax.InstAlt, syntax.InstAltMatch, syntax.InstRune: + case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop, syntax.InstMatch, syntax.InstFail: + prog.Inst[ix].Next = nil + case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: + prog.Inst[ix].Next = nil + prog.Inst[ix] = onePassInst{Inst: instOriginal} + } + } +} + +// onePassCopy creates a copy of the original Prog, as we'll be modifying it. +func onePassCopy(prog *syntax.Prog) *onePassProg { + p := &onePassProg{ + Start: prog.Start, + NumCap: prog.NumCap, + Inst: make([]onePassInst, len(prog.Inst)), + } + for i, inst := range prog.Inst { + p.Inst[i] = onePassInst{Inst: inst} + } + + // rewrites one or more common Prog constructs that enable some otherwise + // non-onepass Progs to be onepass. 
A:BD (for example) means an InstAlt at + // ip A, that points to ips B & C. + // A:BC + B:DA => A:BC + B:CD + // A:BC + B:DC => A:DC + B:DC + for pc := range p.Inst { + switch p.Inst[pc].Op { + default: + continue + case syntax.InstAlt, syntax.InstAltMatch: + // A:Bx + B:Ay + p_A_Other := &p.Inst[pc].Out + p_A_Alt := &p.Inst[pc].Arg + // make sure a target is another Alt + instAlt := p.Inst[*p_A_Alt] + if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) { + p_A_Alt, p_A_Other = p_A_Other, p_A_Alt + instAlt = p.Inst[*p_A_Alt] + if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) { + continue + } + } + instOther := p.Inst[*p_A_Other] + // Analyzing both legs pointing to Alts is for another day + if instOther.Op == syntax.InstAlt || instOther.Op == syntax.InstAltMatch { + // too complicated + continue + } + // simple empty transition loop + // A:BC + B:DA => A:BC + B:DC + p_B_Alt := &p.Inst[*p_A_Alt].Out + p_B_Other := &p.Inst[*p_A_Alt].Arg + patch := false + if instAlt.Out == uint32(pc) { + patch = true + } else if instAlt.Arg == uint32(pc) { + patch = true + p_B_Alt, p_B_Other = p_B_Other, p_B_Alt + } + if patch { + *p_B_Alt = *p_A_Other + } + + // empty transition to common target + // A:BC + B:DC => A:DC + B:DC + if *p_A_Other == *p_B_Alt { + *p_A_Alt = *p_B_Other + } + } + } + return p +} + +// runeSlice exists to permit sorting the case-folded rune sets. +type runeSlice []rune + +func (p runeSlice) Len() int { return len(p) } +func (p runeSlice) Less(i, j int) bool { return p[i] < p[j] } +func (p runeSlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] } + +var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune} +var anyRune = []rune{0, unicode.MaxRune} + +// makeOnePass creates a onepass Prog, if possible. It is possible if at any alt, +// the match engine can always tell which branch to take. The routine may modify +// p if it is turned into a onepass Prog. If it isn't possible for this to be a +// onepass Prog, the Prog nil is returned. makeOnePass is recursive +// to the size of the Prog. +func makeOnePass(p *onePassProg) *onePassProg { + // If the machine is very long, it's not worth the time to check if we can use one pass. + if len(p.Inst) >= 1000 { + return nil + } + + var ( + instQueue = newQueue(len(p.Inst)) + visitQueue = newQueue(len(p.Inst)) + check func(uint32, []bool) bool + onePassRunes = make([][]rune, len(p.Inst)) + ) + + // check that paths from Alt instructions are unambiguous, and rebuild the new + // program as a onepass program + check = func(pc uint32, m []bool) (ok bool) { + ok = true + inst := &p.Inst[pc] + if visitQueue.contains(pc) { + return + } + visitQueue.insert(pc) + switch inst.Op { + case syntax.InstAlt, syntax.InstAltMatch: + ok = check(inst.Out, m) && check(inst.Arg, m) + // check no-input paths to InstMatch + matchOut := m[inst.Out] + matchArg := m[inst.Arg] + if matchOut && matchArg { + ok = false + break + } + // Match on empty goes in inst.Out + if matchArg { + inst.Out, inst.Arg = inst.Arg, inst.Out + matchOut, matchArg = matchArg, matchOut + } + if matchOut { + m[pc] = true + inst.Op = syntax.InstAltMatch + } + + // build a dispatch operator from the two legs of the alt. 
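+			// For example, if Out matches [a-c] and Arg matches [x-z], the
+			// merged set is [a c x z] with Next [Out Arg], so a single rune
+			// lookup picks the branch; overlapping sets come back as
+			// mergeFailed and the program is rejected as not one-pass.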
+ onePassRunes[pc], inst.Next = mergeRuneSets( + &onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg) + if len(inst.Next) > 0 && inst.Next[0] == mergeFailed { + ok = false + break + } + case syntax.InstCapture, syntax.InstNop: + ok = check(inst.Out, m) + m[pc] = m[inst.Out] + // pass matching runes back through these no-ops. + onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...) + inst.Next = make([]uint32, len(onePassRunes[pc])/2+1) + for i := range inst.Next { + inst.Next[i] = inst.Out + } + case syntax.InstEmptyWidth: + ok = check(inst.Out, m) + m[pc] = m[inst.Out] + onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...) + inst.Next = make([]uint32, len(onePassRunes[pc])/2+1) + for i := range inst.Next { + inst.Next[i] = inst.Out + } + case syntax.InstMatch, syntax.InstFail: + m[pc] = inst.Op == syntax.InstMatch + case syntax.InstRune: + m[pc] = false + if len(inst.Next) > 0 { + break + } + instQueue.insert(inst.Out) + if len(inst.Rune) == 0 { + onePassRunes[pc] = []rune{} + inst.Next = []uint32{inst.Out} + break + } + runes := make([]rune, 0) + if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 { + r0 := inst.Rune[0] + runes = append(runes, r0, r0) + for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { + runes = append(runes, r1, r1) + } + sort.Sort(runeSlice(runes)) + } else { + runes = append(runes, inst.Rune...) + } + onePassRunes[pc] = runes + inst.Next = make([]uint32, len(onePassRunes[pc])/2+1) + for i := range inst.Next { + inst.Next[i] = inst.Out + } + inst.Op = syntax.InstRune + case syntax.InstRune1: + m[pc] = false + if len(inst.Next) > 0 { + break + } + instQueue.insert(inst.Out) + runes := []rune{} + // expand case-folded runes + if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 { + r0 := inst.Rune[0] + runes = append(runes, r0, r0) + for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { + runes = append(runes, r1, r1) + } + sort.Sort(runeSlice(runes)) + } else { + runes = append(runes, inst.Rune[0], inst.Rune[0]) + } + onePassRunes[pc] = runes + inst.Next = make([]uint32, len(onePassRunes[pc])/2+1) + for i := range inst.Next { + inst.Next[i] = inst.Out + } + inst.Op = syntax.InstRune + case syntax.InstRuneAny: + m[pc] = false + if len(inst.Next) > 0 { + break + } + instQueue.insert(inst.Out) + onePassRunes[pc] = append([]rune{}, anyRune...) + inst.Next = []uint32{inst.Out} + case syntax.InstRuneAnyNotNL: + m[pc] = false + if len(inst.Next) > 0 { + break + } + instQueue.insert(inst.Out) + onePassRunes[pc] = append([]rune{}, anyRuneNotNL...) + inst.Next = make([]uint32, len(onePassRunes[pc])/2+1) + for i := range inst.Next { + inst.Next[i] = inst.Out + } + } + return + } + + instQueue.clear() + instQueue.insert(uint32(p.Start)) + m := make([]bool, len(p.Inst)) + for !instQueue.empty() { + visitQueue.clear() + pc := instQueue.next() + if !check(pc, m) { + p = nil + break + } + } + if p != nil { + for i := range p.Inst { + p.Inst[i].Rune = onePassRunes[i] + } + } + return p +} + +// compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog +// can be recharacterized as a one-pass regexp program, or syntax.nil if the +// Prog cannot be converted. For a one pass prog, the fundamental condition that must +// be true is: at any InstAlt, there must be no ambiguity about what branch to take. 
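+// For example (taken from the onepass tests), `^abcd$` and `^(?:(?:aa)|a)$`
+// are accepted, while `^(?:a|(?:a*))$` is not: on seeing an 'a' the two
+// alternatives cannot be told apart. Note also that only programs anchored
+// at the beginning of the text (^ or \A) are considered at all.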
+func compileOnePass(prog *syntax.Prog) (p *onePassProg) { + if prog.Start == 0 { + return nil + } + // onepass regexp is anchored + if prog.Inst[prog.Start].Op != syntax.InstEmptyWidth || + syntax.EmptyOp(prog.Inst[prog.Start].Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText { + return nil + } + // every instruction leading to InstMatch must be EmptyEndText + for _, inst := range prog.Inst { + opOut := prog.Inst[inst.Out].Op + switch inst.Op { + default: + if opOut == syntax.InstMatch { + return nil + } + case syntax.InstAlt, syntax.InstAltMatch: + if opOut == syntax.InstMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch { + return nil + } + case syntax.InstEmptyWidth: + if opOut == syntax.InstMatch { + if syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText == syntax.EmptyEndText { + continue + } + return nil + } + } + } + // Creates a slightly optimized copy of the original Prog + // that cleans up some Prog idioms that block valid onepass programs + p = onePassCopy(prog) + + // checkAmbiguity on InstAlts, build onepass Prog if possible + p = makeOnePass(p) + + if p != nil { + cleanupOnePass(p, prog) + } + return p +} diff --git a/src/regexp/onepass_test.go b/src/regexp/onepass_test.go new file mode 100644 index 0000000..6a42eda --- /dev/null +++ b/src/regexp/onepass_test.go @@ -0,0 +1,225 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package regexp + +import ( + "reflect" + "regexp/syntax" + "strings" + "testing" +) + +var runeMergeTests = []struct { + left, right, merged []rune + next []uint32 + leftPC, rightPC uint32 +}{ + { + // empty rhs + []rune{69, 69}, + []rune{}, + []rune{69, 69}, + []uint32{1}, + 1, 2, + }, + { + // identical runes, identical targets + []rune{69, 69}, + []rune{69, 69}, + []rune{}, + []uint32{mergeFailed}, + 1, 1, + }, + { + // identical runes, different targets + []rune{69, 69}, + []rune{69, 69}, + []rune{}, + []uint32{mergeFailed}, + 1, 2, + }, + { + // append right-first + []rune{69, 69}, + []rune{71, 71}, + []rune{69, 69, 71, 71}, + []uint32{1, 2}, + 1, 2, + }, + { + // append, left-first + []rune{71, 71}, + []rune{69, 69}, + []rune{69, 69, 71, 71}, + []uint32{2, 1}, + 1, 2, + }, + { + // successful interleave + []rune{60, 60, 71, 71, 101, 101}, + []rune{69, 69, 88, 88}, + []rune{60, 60, 69, 69, 71, 71, 88, 88, 101, 101}, + []uint32{1, 2, 1, 2, 1}, + 1, 2, + }, + { + // left surrounds right + []rune{69, 74}, + []rune{71, 71}, + []rune{}, + []uint32{mergeFailed}, + 1, 2, + }, + { + // right surrounds left + []rune{69, 74}, + []rune{68, 75}, + []rune{}, + []uint32{mergeFailed}, + 1, 2, + }, + { + // overlap at interval begin + []rune{69, 74}, + []rune{74, 75}, + []rune{}, + []uint32{mergeFailed}, + 1, 2, + }, + { + // overlap ar interval end + []rune{69, 74}, + []rune{65, 69}, + []rune{}, + []uint32{mergeFailed}, + 1, 2, + }, + { + // overlap from above + []rune{69, 74}, + []rune{71, 74}, + []rune{}, + []uint32{mergeFailed}, + 1, 2, + }, + { + // overlap from below + []rune{69, 74}, + []rune{65, 71}, + []rune{}, + []uint32{mergeFailed}, + 1, 2, + }, + { + // out of order []rune + []rune{69, 74, 60, 65}, + []rune{66, 67}, + []rune{}, + []uint32{mergeFailed}, + 1, 2, + }, +} + +func TestMergeRuneSet(t *testing.T) { + for ix, test := range runeMergeTests { + merged, next := mergeRuneSets(&test.left, &test.right, test.leftPC, test.rightPC) + if !reflect.DeepEqual(merged, test.merged) { + t.Errorf("mergeRuneSet :%d (%v, %v) merged\n 
have\n%v\nwant\n%v", ix, test.left, test.right, merged, test.merged) + } + if !reflect.DeepEqual(next, test.next) { + t.Errorf("mergeRuneSet :%d(%v, %v) next\n have\n%v\nwant\n%v", ix, test.left, test.right, next, test.next) + } + } +} + +var onePassTests = []struct { + re string + isOnePass bool +}{ + {`^(?:a|(?:a*))$`, false}, + {`^(?:(a)|(?:a*))$`, false}, + {`^(?:(?:(?:.(?:$))?))$`, true}, + {`^abcd$`, true}, + {`^(?:(?:a{0,})*?)$`, false}, + {`^(?:(?:a+)*)$`, true}, + {`^(?:(?:a|(?:aa)))$`, true}, + {`^(?:[^\s\S])$`, true}, + {`^(?:(?:a{3,4}){0,})$`, false}, + {`^(?:(?:(?:a*)+))$`, true}, + {`^[a-c]+$`, true}, + {`^[a-c]*$`, true}, + {`^(?:a*)$`, true}, + {`^(?:(?:aa)|a)$`, true}, + {`^[a-c]*`, false}, + {`^...$`, true}, + {`^(?:a|(?:aa))$`, true}, + {`^a((b))c$`, true}, + {`^a.[l-nA-Cg-j]?e$`, true}, + {`^a((b))$`, true}, + {`^a(?:(b)|(c))c$`, true}, + {`^a(?:(b*)|(c))c$`, false}, + {`^a(?:b|c)$`, true}, + {`^a(?:b?|c)$`, true}, + {`^a(?:b?|c?)$`, false}, + {`^a(?:b?|c+)$`, true}, + {`^a(?:b+|(bc))d$`, false}, + {`^a(?:bc)+$`, true}, + {`^a(?:[bcd])+$`, true}, + {`^a((?:[bcd])+)$`, true}, + {`^a(:?b|c)*d$`, true}, + {`^.bc(d|e)*$`, true}, + {`^(?:(?:aa)|.)$`, false}, + {`^(?:(?:a{1,2}){1,2})$`, false}, + {`^l` + strings.Repeat("o", 2<<8) + `ng$`, true}, +} + +func TestCompileOnePass(t *testing.T) { + var ( + p *syntax.Prog + re *syntax.Regexp + err error + ) + for _, test := range onePassTests { + if re, err = syntax.Parse(test.re, syntax.Perl); err != nil { + t.Errorf("Parse(%q) got err:%s, want success", test.re, err) + continue + } + // needs to be done before compile... + re = re.Simplify() + if p, err = syntax.Compile(re); err != nil { + t.Errorf("Compile(%q) got err:%s, want success", test.re, err) + continue + } + isOnePass := compileOnePass(p) != nil + if isOnePass != test.isOnePass { + t.Errorf("CompileOnePass(%q) got isOnePass=%v, expected %v", test.re, isOnePass, test.isOnePass) + } + } +} + +// TODO(cespare): Unify with onePassTests and rationalize one-pass test cases. +var onePassTests1 = []struct { + re string + match string +}{ + {`^a(/b+(#c+)*)*$`, "a/b#c"}, // golang.org/issue/11905 +} + +func TestRunOnePass(t *testing.T) { + for _, test := range onePassTests1 { + re, err := Compile(test.re) + if err != nil { + t.Errorf("Compile(%q): got err: %s", test.re, err) + continue + } + if re.onepass == nil { + t.Errorf("Compile(%q): got nil, want one-pass", test.re) + continue + } + if !re.MatchString(test.match) { + t.Errorf("onepass %q did not match %q", test.re, test.match) + } + } +} diff --git a/src/regexp/regexp.go b/src/regexp/regexp.go new file mode 100644 index 0000000..990c06e --- /dev/null +++ b/src/regexp/regexp.go @@ -0,0 +1,1285 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package regexp implements regular expression search. +// +// The syntax of the regular expressions accepted is the same +// general syntax used by Perl, Python, and other languages. +// More precisely, it is the syntax accepted by RE2 and described at +// https://golang.org/s/re2syntax, except for \C. +// For an overview of the syntax, run +// +// go doc regexp/syntax +// +// The regexp implementation provided by this package is +// guaranteed to run in time linear in the size of the input. +// (This is a property not guaranteed by most open source +// implementations of regular expressions.) 
For more information +// about this property, see +// +// https://swtch.com/~rsc/regexp/regexp1.html +// +// or any book about automata theory. +// +// All characters are UTF-8-encoded code points. +// Following utf8.DecodeRune, each byte of an invalid UTF-8 sequence +// is treated as if it encoded utf8.RuneError (U+FFFD). +// +// There are 16 methods of Regexp that match a regular expression and identify +// the matched text. Their names are matched by this regular expression: +// +// Find(All)?(String)?(Submatch)?(Index)? +// +// If 'All' is present, the routine matches successive non-overlapping +// matches of the entire expression. Empty matches abutting a preceding +// match are ignored. The return value is a slice containing the successive +// return values of the corresponding non-'All' routine. These routines take +// an extra integer argument, n. If n >= 0, the function returns at most n +// matches/submatches; otherwise, it returns all of them. +// +// If 'String' is present, the argument is a string; otherwise it is a slice +// of bytes; return values are adjusted as appropriate. +// +// If 'Submatch' is present, the return value is a slice identifying the +// successive submatches of the expression. Submatches are matches of +// parenthesized subexpressions (also known as capturing groups) within the +// regular expression, numbered from left to right in order of opening +// parenthesis. Submatch 0 is the match of the entire expression, submatch 1 is +// the match of the first parenthesized subexpression, and so on. +// +// If 'Index' is present, matches and submatches are identified by byte index +// pairs within the input string: result[2*n:2*n+2] identifies the indexes of +// the nth submatch. The pair for n==0 identifies the match of the entire +// expression. If 'Index' is not present, the match is identified by the text +// of the match/submatch. If an index is negative or text is nil, it means that +// subexpression did not match any string in the input. For 'String' versions +// an empty string means either no match or an empty match. +// +// There is also a subset of the methods that can be applied to text read +// from a RuneReader: +// +// MatchReader, FindReaderIndex, FindReaderSubmatchIndex +// +// This set may grow. Note that regular expression matches may need to +// examine text beyond the text returned by a match, so the methods that +// match text from a RuneReader may read arbitrarily far into the input +// before returning. +// +// (There are a few other methods that do not match this pattern.) +package regexp + +import ( + "bytes" + "io" + "regexp/syntax" + "strconv" + "strings" + "sync" + "unicode" + "unicode/utf8" +) + +// Regexp is the representation of a compiled regular expression. +// A Regexp is safe for concurrent use by multiple goroutines, +// except for configuration methods, such as Longest. 
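// Editor's illustrative sketch (not part of the upstream source): how the
// Find(All)?(String)?(Submatch)?(Index)? naming described in the package
// comment maps to concrete calls, using a made-up pattern and input.
//
//	package main
//
//	import (
//		"fmt"
//		"regexp"
//	)
//
//	func main() {
//		re := regexp.MustCompile(`a(x*)b`)
//		fmt.Println(re.FindString("-axxb-ab-"))                // "axxb"
//		fmt.Println(re.FindAllString("-axxb-ab-", -1))         // ["axxb" "ab"]
//		fmt.Println(re.FindStringSubmatch("-axxb-ab-"))        // ["axxb" "xx"]
//		fmt.Println(re.FindAllStringSubmatchIndex("-ab-", -1)) // [[1 3 2 2]]
//	}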
+type Regexp struct { + expr string // as passed to Compile + prog *syntax.Prog // compiled program + onepass *onePassProg // onepass program or nil + numSubexp int + maxBitStateLen int + subexpNames []string + prefix string // required prefix in unanchored matches + prefixBytes []byte // prefix, as a []byte + prefixRune rune // first rune in prefix + prefixEnd uint32 // pc for last rune in prefix + mpool int // pool for machines + matchcap int // size of recorded match lengths + prefixComplete bool // prefix is the entire regexp + cond syntax.EmptyOp // empty-width conditions required at start of match + minInputLen int // minimum length of the input in bytes + + // This field can be modified by the Longest method, + // but it is otherwise read-only. + longest bool // whether regexp prefers leftmost-longest match +} + +// String returns the source text used to compile the regular expression. +func (re *Regexp) String() string { + return re.expr +} + +// Copy returns a new Regexp object copied from re. +// Calling Longest on one copy does not affect another. +// +// Deprecated: In earlier releases, when using a Regexp in multiple goroutines, +// giving each goroutine its own copy helped to avoid lock contention. +// As of Go 1.12, using Copy is no longer necessary to avoid lock contention. +// Copy may still be appropriate if the reason for its use is to make +// two copies with different Longest settings. +func (re *Regexp) Copy() *Regexp { + re2 := *re + return &re2 +} + +// Compile parses a regular expression and returns, if successful, +// a Regexp object that can be used to match against text. +// +// When matching against text, the regexp returns a match that +// begins as early as possible in the input (leftmost), and among those +// it chooses the one that a backtracking search would have found first. +// This so-called leftmost-first matching is the same semantics +// that Perl, Python, and other implementations use, although this +// package implements it without the expense of backtracking. +// For POSIX leftmost-longest matching, see CompilePOSIX. +func Compile(expr string) (*Regexp, error) { + return compile(expr, syntax.Perl, false) +} + +// CompilePOSIX is like Compile but restricts the regular expression +// to POSIX ERE (egrep) syntax and changes the match semantics to +// leftmost-longest. +// +// That is, when matching against text, the regexp returns a match that +// begins as early as possible in the input (leftmost), and among those +// it chooses a match that is as long as possible. +// This so-called leftmost-longest matching is the same semantics +// that early regular expression implementations used and that POSIX +// specifies. +// +// However, there can be multiple leftmost-longest matches, with different +// submatch choices, and here this package diverges from POSIX. +// Among the possible leftmost-longest matches, this package chooses +// the one that a backtracking search would have found first, while POSIX +// specifies that the match be chosen to maximize the length of the first +// subexpression, then the second, and so on from left to right. +// The POSIX rule is computationally prohibitive and not even well-defined. +// See https://swtch.com/~rsc/regexp/regexp2.html#posix for details. +func CompilePOSIX(expr string) (*Regexp, error) { + return compile(expr, syntax.POSIX, true) +} + +// Longest makes future searches prefer the leftmost-longest match. 
+// That is, when matching against text, the regexp returns a match that +// begins as early as possible in the input (leftmost), and among those +// it chooses a match that is as long as possible. +// This method modifies the Regexp and may not be called concurrently +// with any other methods. +func (re *Regexp) Longest() { + re.longest = true +} + +func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) { + re, err := syntax.Parse(expr, mode) + if err != nil { + return nil, err + } + maxCap := re.MaxCap() + capNames := re.CapNames() + + re = re.Simplify() + prog, err := syntax.Compile(re) + if err != nil { + return nil, err + } + matchcap := prog.NumCap + if matchcap < 2 { + matchcap = 2 + } + regexp := &Regexp{ + expr: expr, + prog: prog, + onepass: compileOnePass(prog), + numSubexp: maxCap, + subexpNames: capNames, + cond: prog.StartCond(), + longest: longest, + matchcap: matchcap, + minInputLen: minInputLen(re), + } + if regexp.onepass == nil { + regexp.prefix, regexp.prefixComplete = prog.Prefix() + regexp.maxBitStateLen = maxBitStateLen(prog) + } else { + regexp.prefix, regexp.prefixComplete, regexp.prefixEnd = onePassPrefix(prog) + } + if regexp.prefix != "" { + // TODO(rsc): Remove this allocation by adding + // IndexString to package bytes. + regexp.prefixBytes = []byte(regexp.prefix) + regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix) + } + + n := len(prog.Inst) + i := 0 + for matchSize[i] != 0 && matchSize[i] < n { + i++ + } + regexp.mpool = i + + return regexp, nil +} + +// Pools of *machine for use during (*Regexp).doExecute, +// split up by the size of the execution queues. +// matchPool[i] machines have queue size matchSize[i]. +// On a 64-bit system each queue entry is 16 bytes, +// so matchPool[0] has 16*2*128 = 4kB queues, etc. +// The final matchPool is a catch-all for very large queues. +var ( + matchSize = [...]int{128, 512, 2048, 16384, 0} + matchPool [len(matchSize)]sync.Pool +) + +// get returns a machine to use for matching re. +// It uses the re's machine cache if possible, to avoid +// unnecessary allocation. +func (re *Regexp) get() *machine { + m, ok := matchPool[re.mpool].Get().(*machine) + if !ok { + m = new(machine) + } + m.re = re + m.p = re.prog + if cap(m.matchcap) < re.matchcap { + m.matchcap = make([]int, re.matchcap) + for _, t := range m.pool { + t.cap = make([]int, re.matchcap) + } + } + + // Allocate queues if needed. + // Or reallocate, for "large" match pool. + n := matchSize[re.mpool] + if n == 0 { // large pool + n = len(re.prog.Inst) + } + if len(m.q0.sparse) < n { + m.q0 = queue{make([]uint32, n), make([]entry, 0, n)} + m.q1 = queue{make([]uint32, n), make([]entry, 0, n)} + } + return m +} + +// put returns a machine to the correct machine pool. +func (re *Regexp) put(m *machine) { + m.re = nil + m.p = nil + m.inputs.clear() + matchPool[re.mpool].Put(m) +} + +// minInputLen walks the regexp to find the minimum length of any matchable input. 
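// Editor's worked example (not part of the upstream source), following the
// cases in the switch below: for the pattern `(a|bc)+d`, OpPlus contributes
// minInputLen(a|bc) = min(1, 2) = 1 and the trailing literal contributes 1,
// so the minimum matchable input is 2 bytes. An in-package sketch:
//
//	re, _ := syntax.Parse(`(a|bc)+d`, syntax.Perl)
//	fmt.Println(minInputLen(re.Simplify())) // 2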
+func minInputLen(re *syntax.Regexp) int { + switch re.Op { + default: + return 0 + case syntax.OpAnyChar, syntax.OpAnyCharNotNL, syntax.OpCharClass: + return 1 + case syntax.OpLiteral: + l := 0 + for _, r := range re.Rune { + if r == utf8.RuneError { + l++ + } else { + l += utf8.RuneLen(r) + } + } + return l + case syntax.OpCapture, syntax.OpPlus: + return minInputLen(re.Sub[0]) + case syntax.OpRepeat: + return re.Min * minInputLen(re.Sub[0]) + case syntax.OpConcat: + l := 0 + for _, sub := range re.Sub { + l += minInputLen(sub) + } + return l + case syntax.OpAlternate: + l := minInputLen(re.Sub[0]) + var lnext int + for _, sub := range re.Sub[1:] { + lnext = minInputLen(sub) + if lnext < l { + l = lnext + } + } + return l + } +} + +// MustCompile is like Compile but panics if the expression cannot be parsed. +// It simplifies safe initialization of global variables holding compiled regular +// expressions. +func MustCompile(str string) *Regexp { + regexp, err := Compile(str) + if err != nil { + panic(`regexp: Compile(` + quote(str) + `): ` + err.Error()) + } + return regexp +} + +// MustCompilePOSIX is like CompilePOSIX but panics if the expression cannot be parsed. +// It simplifies safe initialization of global variables holding compiled regular +// expressions. +func MustCompilePOSIX(str string) *Regexp { + regexp, err := CompilePOSIX(str) + if err != nil { + panic(`regexp: CompilePOSIX(` + quote(str) + `): ` + err.Error()) + } + return regexp +} + +func quote(s string) string { + if strconv.CanBackquote(s) { + return "`" + s + "`" + } + return strconv.Quote(s) +} + +// NumSubexp returns the number of parenthesized subexpressions in this Regexp. +func (re *Regexp) NumSubexp() int { + return re.numSubexp +} + +// SubexpNames returns the names of the parenthesized subexpressions +// in this Regexp. The name for the first sub-expression is names[1], +// so that if m is a match slice, the name for m[i] is SubexpNames()[i]. +// Since the Regexp as a whole cannot be named, names[0] is always +// the empty string. The slice should not be modified. +func (re *Regexp) SubexpNames() []string { + return re.subexpNames +} + +// SubexpIndex returns the index of the first subexpression with the given name, +// or -1 if there is no subexpression with that name. +// +// Note that multiple subexpressions can be written using the same name, as in +// (?P<bob>a+)(?P<bob>b+), which declares two subexpressions named "bob". +// In this case, SubexpIndex returns the index of the leftmost such subexpression +// in the regular expression. +func (re *Regexp) SubexpIndex(name string) int { + if name != "" { + for i, s := range re.subexpNames { + if name == s { + return i + } + } + } + return -1 +} + +const endOfText rune = -1 + +// input abstracts different representations of the input text. It provides +// one-character lookahead. +type input interface { + step(pos int) (r rune, width int) // advance one rune + canCheckPrefix() bool // can we look ahead without losing info? + hasPrefix(re *Regexp) bool + index(re *Regexp, pos int) int + context(pos int) lazyFlag +} + +// inputString scans a string. 
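// Editor's illustrative note (not part of the upstream source): step returns
// the rune starting at a byte offset together with its encoded width, taking
// the single-byte fast path for ASCII. For example, with the hypothetical
// input "héllo":
//
//	in := &inputString{str: "héllo"}
//	fmt.Println(in.step(0)) // 104 1  ('h', one byte)
//	fmt.Println(in.step(1)) // 233 2  ('é', two bytes)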
+type inputString struct { + str string +} + +func (i *inputString) step(pos int) (rune, int) { + if pos < len(i.str) { + c := i.str[pos] + if c < utf8.RuneSelf { + return rune(c), 1 + } + return utf8.DecodeRuneInString(i.str[pos:]) + } + return endOfText, 0 +} + +func (i *inputString) canCheckPrefix() bool { + return true +} + +func (i *inputString) hasPrefix(re *Regexp) bool { + return strings.HasPrefix(i.str, re.prefix) +} + +func (i *inputString) index(re *Regexp, pos int) int { + return strings.Index(i.str[pos:], re.prefix) +} + +func (i *inputString) context(pos int) lazyFlag { + r1, r2 := endOfText, endOfText + // 0 < pos && pos <= len(i.str) + if uint(pos-1) < uint(len(i.str)) { + r1 = rune(i.str[pos-1]) + if r1 >= utf8.RuneSelf { + r1, _ = utf8.DecodeLastRuneInString(i.str[:pos]) + } + } + // 0 <= pos && pos < len(i.str) + if uint(pos) < uint(len(i.str)) { + r2 = rune(i.str[pos]) + if r2 >= utf8.RuneSelf { + r2, _ = utf8.DecodeRuneInString(i.str[pos:]) + } + } + return newLazyFlag(r1, r2) +} + +// inputBytes scans a byte slice. +type inputBytes struct { + str []byte +} + +func (i *inputBytes) step(pos int) (rune, int) { + if pos < len(i.str) { + c := i.str[pos] + if c < utf8.RuneSelf { + return rune(c), 1 + } + return utf8.DecodeRune(i.str[pos:]) + } + return endOfText, 0 +} + +func (i *inputBytes) canCheckPrefix() bool { + return true +} + +func (i *inputBytes) hasPrefix(re *Regexp) bool { + return bytes.HasPrefix(i.str, re.prefixBytes) +} + +func (i *inputBytes) index(re *Regexp, pos int) int { + return bytes.Index(i.str[pos:], re.prefixBytes) +} + +func (i *inputBytes) context(pos int) lazyFlag { + r1, r2 := endOfText, endOfText + // 0 < pos && pos <= len(i.str) + if uint(pos-1) < uint(len(i.str)) { + r1 = rune(i.str[pos-1]) + if r1 >= utf8.RuneSelf { + r1, _ = utf8.DecodeLastRune(i.str[:pos]) + } + } + // 0 <= pos && pos < len(i.str) + if uint(pos) < uint(len(i.str)) { + r2 = rune(i.str[pos]) + if r2 >= utf8.RuneSelf { + r2, _ = utf8.DecodeRune(i.str[pos:]) + } + } + return newLazyFlag(r1, r2) +} + +// inputReader scans a RuneReader. +type inputReader struct { + r io.RuneReader + atEOT bool + pos int +} + +func (i *inputReader) step(pos int) (rune, int) { + if !i.atEOT && pos != i.pos { + return endOfText, 0 + + } + r, w, err := i.r.ReadRune() + if err != nil { + i.atEOT = true + return endOfText, 0 + } + i.pos += w + return r, w +} + +func (i *inputReader) canCheckPrefix() bool { + return false +} + +func (i *inputReader) hasPrefix(re *Regexp) bool { + return false +} + +func (i *inputReader) index(re *Regexp, pos int) int { + return -1 +} + +func (i *inputReader) context(pos int) lazyFlag { + return 0 // not used +} + +// LiteralPrefix returns a literal string that must begin any match +// of the regular expression re. It returns the boolean true if the +// literal string comprises the entire regular expression. +func (re *Regexp) LiteralPrefix() (prefix string, complete bool) { + return re.prefix, re.prefixComplete +} + +// MatchReader reports whether the text returned by the RuneReader +// contains any match of the regular expression re. +func (re *Regexp) MatchReader(r io.RuneReader) bool { + return re.doMatch(r, nil, "") +} + +// MatchString reports whether the string s +// contains any match of the regular expression re. +func (re *Regexp) MatchString(s string) bool { + return re.doMatch(nil, nil, s) +} + +// Match reports whether the byte slice b +// contains any match of the regular expression re. 
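// Editor's illustrative sketch (not part of the upstream source): the
// one-shot package-level helpers defined just below compile the pattern on
// every call, so the method forms are preferable when a pattern is reused.
//
//	ok, err := regexp.MatchString(`^[a-c]+$`, "abcba") // true, nil
//	_, _ = ok, err
//
//	re := regexp.MustCompile(`^[a-c]+$`)
//	fmt.Println(re.Match([]byte("abcba"))) // true
//	fmt.Println(re.MatchString("abzba"))   // false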
+func (re *Regexp) Match(b []byte) bool { + return re.doMatch(nil, b, "") +} + +// MatchReader reports whether the text returned by the RuneReader +// contains any match of the regular expression pattern. +// More complicated queries need to use Compile and the full Regexp interface. +func MatchReader(pattern string, r io.RuneReader) (matched bool, err error) { + re, err := Compile(pattern) + if err != nil { + return false, err + } + return re.MatchReader(r), nil +} + +// MatchString reports whether the string s +// contains any match of the regular expression pattern. +// More complicated queries need to use Compile and the full Regexp interface. +func MatchString(pattern string, s string) (matched bool, err error) { + re, err := Compile(pattern) + if err != nil { + return false, err + } + return re.MatchString(s), nil +} + +// Match reports whether the byte slice b +// contains any match of the regular expression pattern. +// More complicated queries need to use Compile and the full Regexp interface. +func Match(pattern string, b []byte) (matched bool, err error) { + re, err := Compile(pattern) + if err != nil { + return false, err + } + return re.Match(b), nil +} + +// ReplaceAllString returns a copy of src, replacing matches of the Regexp +// with the replacement string repl. Inside repl, $ signs are interpreted as +// in Expand, so for instance $1 represents the text of the first submatch. +func (re *Regexp) ReplaceAllString(src, repl string) string { + n := 2 + if strings.Contains(repl, "$") { + n = 2 * (re.numSubexp + 1) + } + b := re.replaceAll(nil, src, n, func(dst []byte, match []int) []byte { + return re.expand(dst, repl, nil, src, match) + }) + return string(b) +} + +// ReplaceAllLiteralString returns a copy of src, replacing matches of the Regexp +// with the replacement string repl. The replacement repl is substituted directly, +// without using Expand. +func (re *Regexp) ReplaceAllLiteralString(src, repl string) string { + return string(re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte { + return append(dst, repl...) + })) +} + +// ReplaceAllStringFunc returns a copy of src in which all matches of the +// Regexp have been replaced by the return value of function repl applied +// to the matched substring. The replacement returned by repl is substituted +// directly, without using Expand. +func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string { + b := re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte { + return append(dst, repl(src[match[0]:match[1]])...) + }) + return string(b) +} + +func (re *Regexp) replaceAll(bsrc []byte, src string, nmatch int, repl func(dst []byte, m []int) []byte) []byte { + lastMatchEnd := 0 // end position of the most recent match + searchPos := 0 // position where we next look for a match + var buf []byte + var endPos int + if bsrc != nil { + endPos = len(bsrc) + } else { + endPos = len(src) + } + if nmatch > re.prog.NumCap { + nmatch = re.prog.NumCap + } + + var dstCap [2]int + for searchPos <= endPos { + a := re.doExecute(nil, bsrc, src, searchPos, nmatch, dstCap[:0]) + if len(a) == 0 { + break // no more matches + } + + // Copy the unmatched characters before this match. + if bsrc != nil { + buf = append(buf, bsrc[lastMatchEnd:a[0]]...) + } else { + buf = append(buf, src[lastMatchEnd:a[0]]...) + } + + // Now insert a copy of the replacement string, but not for a + // match of the empty string immediately after another match. 
+ // (Otherwise, we get double replacement for patterns that + // match both empty and nonempty strings.) + if a[1] > lastMatchEnd || a[0] == 0 { + buf = repl(buf, a) + } + lastMatchEnd = a[1] + + // Advance past this match; always advance at least one character. + var width int + if bsrc != nil { + _, width = utf8.DecodeRune(bsrc[searchPos:]) + } else { + _, width = utf8.DecodeRuneInString(src[searchPos:]) + } + if searchPos+width > a[1] { + searchPos += width + } else if searchPos+1 > a[1] { + // This clause is only needed at the end of the input + // string. In that case, DecodeRuneInString returns width=0. + searchPos++ + } else { + searchPos = a[1] + } + } + + // Copy the unmatched characters after the last match. + if bsrc != nil { + buf = append(buf, bsrc[lastMatchEnd:]...) + } else { + buf = append(buf, src[lastMatchEnd:]...) + } + + return buf +} + +// ReplaceAll returns a copy of src, replacing matches of the Regexp +// with the replacement text repl. Inside repl, $ signs are interpreted as +// in Expand, so for instance $1 represents the text of the first submatch. +func (re *Regexp) ReplaceAll(src, repl []byte) []byte { + n := 2 + if bytes.IndexByte(repl, '$') >= 0 { + n = 2 * (re.numSubexp + 1) + } + srepl := "" + b := re.replaceAll(src, "", n, func(dst []byte, match []int) []byte { + if len(srepl) != len(repl) { + srepl = string(repl) + } + return re.expand(dst, srepl, src, "", match) + }) + return b +} + +// ReplaceAllLiteral returns a copy of src, replacing matches of the Regexp +// with the replacement bytes repl. The replacement repl is substituted directly, +// without using Expand. +func (re *Regexp) ReplaceAllLiteral(src, repl []byte) []byte { + return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte { + return append(dst, repl...) + }) +} + +// ReplaceAllFunc returns a copy of src in which all matches of the +// Regexp have been replaced by the return value of function repl applied +// to the matched byte slice. The replacement returned by repl is substituted +// directly, without using Expand. +func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte { + return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte { + return append(dst, repl(src[match[0]:match[1]])...) + }) +} + +// Bitmap used by func special to check whether a character needs to be escaped. +var specialBytes [16]byte + +// special reports whether byte b needs to be escaped by QuoteMeta. +func special(b byte) bool { + return b < utf8.RuneSelf && specialBytes[b%16]&(1<<(b/16)) != 0 +} + +func init() { + for _, b := range []byte(`\.+*?()|[]{}^$`) { + specialBytes[b%16] |= 1 << (b / 16) + } +} + +// QuoteMeta returns a string that escapes all regular expression metacharacters +// inside the argument text; the returned string is a regular expression matching +// the literal text. +func QuoteMeta(s string) string { + // A byte loop is correct because all metacharacters are ASCII. + var i int + for i = 0; i < len(s); i++ { + if special(s[i]) { + break + } + } + // No meta characters found, so return original string. + if i >= len(s) { + return s + } + + b := make([]byte, 2*len(s)-i) + copy(b, s[:i]) + j := i + for ; i < len(s); i++ { + if special(s[i]) { + b[j] = '\\' + j++ + } + b[j] = s[i] + j++ + } + return string(b[:j]) +} + +// The number of capture values in the program may correspond +// to fewer capturing expressions than are in the regexp. 
+// For example, "(a){0}" turns into an empty program, so the +// maximum capture in the program is 0 but we need to return +// an expression for \1. Pad appends -1s to the slice a as needed. +func (re *Regexp) pad(a []int) []int { + if a == nil { + // No match. + return nil + } + n := (1 + re.numSubexp) * 2 + for len(a) < n { + a = append(a, -1) + } + return a +} + +// allMatches calls deliver at most n times +// with the location of successive matches in the input text. +// The input text is b if non-nil, otherwise s. +func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) { + var end int + if b == nil { + end = len(s) + } else { + end = len(b) + } + + for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; { + matches := re.doExecute(nil, b, s, pos, re.prog.NumCap, nil) + if len(matches) == 0 { + break + } + + accept := true + if matches[1] == pos { + // We've found an empty match. + if matches[0] == prevMatchEnd { + // We don't allow an empty match right + // after a previous match, so ignore it. + accept = false + } + var width int + if b == nil { + is := inputString{str: s} + _, width = is.step(pos) + } else { + ib := inputBytes{str: b} + _, width = ib.step(pos) + } + if width > 0 { + pos += width + } else { + pos = end + 1 + } + } else { + pos = matches[1] + } + prevMatchEnd = matches[1] + + if accept { + deliver(re.pad(matches)) + i++ + } + } +} + +// Find returns a slice holding the text of the leftmost match in b of the regular expression. +// A return value of nil indicates no match. +func (re *Regexp) Find(b []byte) []byte { + var dstCap [2]int + a := re.doExecute(nil, b, "", 0, 2, dstCap[:0]) + if a == nil { + return nil + } + return b[a[0]:a[1]:a[1]] +} + +// FindIndex returns a two-element slice of integers defining the location of +// the leftmost match in b of the regular expression. The match itself is at +// b[loc[0]:loc[1]]. +// A return value of nil indicates no match. +func (re *Regexp) FindIndex(b []byte) (loc []int) { + a := re.doExecute(nil, b, "", 0, 2, nil) + if a == nil { + return nil + } + return a[0:2] +} + +// FindString returns a string holding the text of the leftmost match in s of the regular +// expression. If there is no match, the return value is an empty string, +// but it will also be empty if the regular expression successfully matches +// an empty string. Use FindStringIndex or FindStringSubmatch if it is +// necessary to distinguish these cases. +func (re *Regexp) FindString(s string) string { + var dstCap [2]int + a := re.doExecute(nil, nil, s, 0, 2, dstCap[:0]) + if a == nil { + return "" + } + return s[a[0]:a[1]] +} + +// FindStringIndex returns a two-element slice of integers defining the +// location of the leftmost match in s of the regular expression. The match +// itself is at s[loc[0]:loc[1]]. +// A return value of nil indicates no match. +func (re *Regexp) FindStringIndex(s string) (loc []int) { + a := re.doExecute(nil, nil, s, 0, 2, nil) + if a == nil { + return nil + } + return a[0:2] +} + +// FindReaderIndex returns a two-element slice of integers defining the +// location of the leftmost match of the regular expression in text read from +// the RuneReader. The match text was found in the input stream at +// byte offset loc[0] through loc[1]-1. +// A return value of nil indicates no match. 
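// Editor's illustrative sketch (not part of the upstream source): reading
// from an io.RuneReader; a strings.Reader serves as a stand-in input.
//
//	re := regexp.MustCompile(`b+`)
//	loc := re.FindReaderIndex(strings.NewReader("aabbbcc"))
//	fmt.Println(loc) // [2 5]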
+func (re *Regexp) FindReaderIndex(r io.RuneReader) (loc []int) { + a := re.doExecute(r, nil, "", 0, 2, nil) + if a == nil { + return nil + } + return a[0:2] +} + +// FindSubmatch returns a slice of slices holding the text of the leftmost +// match of the regular expression in b and the matches, if any, of its +// subexpressions, as defined by the 'Submatch' descriptions in the package +// comment. +// A return value of nil indicates no match. +func (re *Regexp) FindSubmatch(b []byte) [][]byte { + var dstCap [4]int + a := re.doExecute(nil, b, "", 0, re.prog.NumCap, dstCap[:0]) + if a == nil { + return nil + } + ret := make([][]byte, 1+re.numSubexp) + for i := range ret { + if 2*i < len(a) && a[2*i] >= 0 { + ret[i] = b[a[2*i]:a[2*i+1]:a[2*i+1]] + } + } + return ret +} + +// Expand appends template to dst and returns the result; during the +// append, Expand replaces variables in the template with corresponding +// matches drawn from src. The match slice should have been returned by +// FindSubmatchIndex. +// +// In the template, a variable is denoted by a substring of the form +// $name or ${name}, where name is a non-empty sequence of letters, +// digits, and underscores. A purely numeric name like $1 refers to +// the submatch with the corresponding index; other names refer to +// capturing parentheses named with the (?P<name>...) syntax. A +// reference to an out of range or unmatched index or a name that is not +// present in the regular expression is replaced with an empty slice. +// +// In the $name form, name is taken to be as long as possible: $1x is +// equivalent to ${1x}, not ${1}x, and, $10 is equivalent to ${10}, not ${1}0. +// +// To insert a literal $ in the output, use $$ in the template. +func (re *Regexp) Expand(dst []byte, template []byte, src []byte, match []int) []byte { + return re.expand(dst, string(template), src, "", match) +} + +// ExpandString is like Expand but the template and source are strings. +// It appends to and returns a byte slice in order to give the calling +// code control over allocation. +func (re *Regexp) ExpandString(dst []byte, template string, src string, match []int) []byte { + return re.expand(dst, template, nil, src, match) +} + +func (re *Regexp) expand(dst []byte, template string, bsrc []byte, src string, match []int) []byte { + for len(template) > 0 { + before, after, ok := strings.Cut(template, "$") + if !ok { + break + } + dst = append(dst, before...) + template = after + if template != "" && template[0] == '$' { + // Treat $$ as $. + dst = append(dst, '$') + template = template[1:] + continue + } + name, num, rest, ok := extract(template) + if !ok { + // Malformed; treat $ as raw text. + dst = append(dst, '$') + continue + } + template = rest + if num >= 0 { + if 2*num+1 < len(match) && match[2*num] >= 0 { + if bsrc != nil { + dst = append(dst, bsrc[match[2*num]:match[2*num+1]]...) + } else { + dst = append(dst, src[match[2*num]:match[2*num+1]]...) + } + } + } else { + for i, namei := range re.subexpNames { + if name == namei && 2*i+1 < len(match) && match[2*i] >= 0 { + if bsrc != nil { + dst = append(dst, bsrc[match[2*i]:match[2*i+1]]...) + } else { + dst = append(dst, src[match[2*i]:match[2*i+1]]...) + } + break + } + } + } + } + dst = append(dst, template...) + return dst +} + +// extract returns the name from a leading "name" or "{name}" in str. +// (The $ has already been removed by the caller.) +// If it is a number, extract returns num set to that number; otherwise num = -1. 
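// Editor's illustrative sketch (not part of the upstream source): the $name
// and ${name} forms parsed by extract, exercised through ExpandString with a
// made-up pattern.
//
//	re := regexp.MustCompile(`(?P<key>\w+):(?P<val>\w+)`)
//	m := re.FindStringSubmatchIndex("port:8080")
//	out := re.ExpandString(nil, "$key = ${val}", "port:8080", m)
//	fmt.Println(string(out)) // "port = 8080"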
+func extract(str string) (name string, num int, rest string, ok bool) { + if str == "" { + return + } + brace := false + if str[0] == '{' { + brace = true + str = str[1:] + } + i := 0 + for i < len(str) { + rune, size := utf8.DecodeRuneInString(str[i:]) + if !unicode.IsLetter(rune) && !unicode.IsDigit(rune) && rune != '_' { + break + } + i += size + } + if i == 0 { + // empty name is not okay + return + } + name = str[:i] + if brace { + if i >= len(str) || str[i] != '}' { + // missing closing brace + return + } + i++ + } + + // Parse number. + num = 0 + for i := 0; i < len(name); i++ { + if name[i] < '0' || '9' < name[i] || num >= 1e8 { + num = -1 + break + } + num = num*10 + int(name[i]) - '0' + } + // Disallow leading zeros. + if name[0] == '0' && len(name) > 1 { + num = -1 + } + + rest = str[i:] + ok = true + return +} + +// FindSubmatchIndex returns a slice holding the index pairs identifying the +// leftmost match of the regular expression in b and the matches, if any, of +// its subexpressions, as defined by the 'Submatch' and 'Index' descriptions +// in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindSubmatchIndex(b []byte) []int { + return re.pad(re.doExecute(nil, b, "", 0, re.prog.NumCap, nil)) +} + +// FindStringSubmatch returns a slice of strings holding the text of the +// leftmost match of the regular expression in s and the matches, if any, of +// its subexpressions, as defined by the 'Submatch' description in the +// package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindStringSubmatch(s string) []string { + var dstCap [4]int + a := re.doExecute(nil, nil, s, 0, re.prog.NumCap, dstCap[:0]) + if a == nil { + return nil + } + ret := make([]string, 1+re.numSubexp) + for i := range ret { + if 2*i < len(a) && a[2*i] >= 0 { + ret[i] = s[a[2*i]:a[2*i+1]] + } + } + return ret +} + +// FindStringSubmatchIndex returns a slice holding the index pairs +// identifying the leftmost match of the regular expression in s and the +// matches, if any, of its subexpressions, as defined by the 'Submatch' and +// 'Index' descriptions in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindStringSubmatchIndex(s string) []int { + return re.pad(re.doExecute(nil, nil, s, 0, re.prog.NumCap, nil)) +} + +// FindReaderSubmatchIndex returns a slice holding the index pairs +// identifying the leftmost match of the regular expression of text read by +// the RuneReader, and the matches, if any, of its subexpressions, as defined +// by the 'Submatch' and 'Index' descriptions in the package comment. A +// return value of nil indicates no match. +func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int { + return re.pad(re.doExecute(r, nil, "", 0, re.prog.NumCap, nil)) +} + +const startSize = 10 // The size at which to start a slice in the 'All' routines. + +// FindAll is the 'All' version of Find; it returns a slice of all successive +// matches of the expression, as defined by the 'All' description in the +// package comment. +// A return value of nil indicates no match. 
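// Editor's illustrative sketch (not part of the upstream source): the n
// argument shared by all 'All' routines, shown here with the String variant.
//
//	re := regexp.MustCompile(`[0-9]+`)
//	fmt.Println(re.FindAllString("a1 b22 c333", 2))  // ["1" "22"]
//	fmt.Println(re.FindAllString("a1 b22 c333", -1)) // ["1" "22" "333"]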
+func (re *Regexp) FindAll(b []byte, n int) [][]byte { + if n < 0 { + n = len(b) + 1 + } + var result [][]byte + re.allMatches("", b, n, func(match []int) { + if result == nil { + result = make([][]byte, 0, startSize) + } + result = append(result, b[match[0]:match[1]:match[1]]) + }) + return result +} + +// FindAllIndex is the 'All' version of FindIndex; it returns a slice of all +// successive matches of the expression, as defined by the 'All' description +// in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllIndex(b []byte, n int) [][]int { + if n < 0 { + n = len(b) + 1 + } + var result [][]int + re.allMatches("", b, n, func(match []int) { + if result == nil { + result = make([][]int, 0, startSize) + } + result = append(result, match[0:2]) + }) + return result +} + +// FindAllString is the 'All' version of FindString; it returns a slice of all +// successive matches of the expression, as defined by the 'All' description +// in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllString(s string, n int) []string { + if n < 0 { + n = len(s) + 1 + } + var result []string + re.allMatches(s, nil, n, func(match []int) { + if result == nil { + result = make([]string, 0, startSize) + } + result = append(result, s[match[0]:match[1]]) + }) + return result +} + +// FindAllStringIndex is the 'All' version of FindStringIndex; it returns a +// slice of all successive matches of the expression, as defined by the 'All' +// description in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllStringIndex(s string, n int) [][]int { + if n < 0 { + n = len(s) + 1 + } + var result [][]int + re.allMatches(s, nil, n, func(match []int) { + if result == nil { + result = make([][]int, 0, startSize) + } + result = append(result, match[0:2]) + }) + return result +} + +// FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice +// of all successive matches of the expression, as defined by the 'All' +// description in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte { + if n < 0 { + n = len(b) + 1 + } + var result [][][]byte + re.allMatches("", b, n, func(match []int) { + if result == nil { + result = make([][][]byte, 0, startSize) + } + slice := make([][]byte, len(match)/2) + for j := range slice { + if match[2*j] >= 0 { + slice[j] = b[match[2*j]:match[2*j+1]:match[2*j+1]] + } + } + result = append(result, slice) + }) + return result +} + +// FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns +// a slice of all successive matches of the expression, as defined by the +// 'All' description in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int { + if n < 0 { + n = len(b) + 1 + } + var result [][]int + re.allMatches("", b, n, func(match []int) { + if result == nil { + result = make([][]int, 0, startSize) + } + result = append(result, match) + }) + return result +} + +// FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it +// returns a slice of all successive matches of the expression, as defined by +// the 'All' description in the package comment. +// A return value of nil indicates no match. 
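// Editor's illustrative sketch (not part of the upstream source): each element
// of the result is itself a []string in the FindStringSubmatch layout.
//
//	re := regexp.MustCompile(`(\w+)=(\w+)`)
//	fmt.Println(re.FindAllStringSubmatch("a=1,b=2", -1))
//	// [[a=1 a 1] [b=2 b 2]]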
+func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string { + if n < 0 { + n = len(s) + 1 + } + var result [][]string + re.allMatches(s, nil, n, func(match []int) { + if result == nil { + result = make([][]string, 0, startSize) + } + slice := make([]string, len(match)/2) + for j := range slice { + if match[2*j] >= 0 { + slice[j] = s[match[2*j]:match[2*j+1]] + } + } + result = append(result, slice) + }) + return result +} + +// FindAllStringSubmatchIndex is the 'All' version of +// FindStringSubmatchIndex; it returns a slice of all successive matches of +// the expression, as defined by the 'All' description in the package +// comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int { + if n < 0 { + n = len(s) + 1 + } + var result [][]int + re.allMatches(s, nil, n, func(match []int) { + if result == nil { + result = make([][]int, 0, startSize) + } + result = append(result, match) + }) + return result +} + +// Split slices s into substrings separated by the expression and returns a slice of +// the substrings between those expression matches. +// +// The slice returned by this method consists of all the substrings of s +// not contained in the slice returned by FindAllString. When called on an expression +// that contains no metacharacters, it is equivalent to strings.SplitN. +// +// Example: +// +// s := regexp.MustCompile("a*").Split("abaabaccadaaae", 5) +// // s: ["", "b", "b", "c", "cadaaae"] +// +// The count determines the number of substrings to return: +// +// n > 0: at most n substrings; the last substring will be the unsplit remainder. +// n == 0: the result is nil (zero substrings) +// n < 0: all substrings +func (re *Regexp) Split(s string, n int) []string { + + if n == 0 { + return nil + } + + if len(re.expr) > 0 && len(s) == 0 { + return []string{""} + } + + matches := re.FindAllStringIndex(s, n) + strings := make([]string, 0, len(matches)) + + beg := 0 + end := 0 + for _, match := range matches { + if n > 0 && len(strings) >= n-1 { + break + } + + end = match[0] + if match[1] != 0 { + strings = append(strings, s[beg:end]) + } + beg = match[1] + } + + if end != len(s) { + strings = append(strings, s[beg:]) + } + + return strings +} diff --git a/src/regexp/syntax/compile.go b/src/regexp/syntax/compile.go new file mode 100644 index 0000000..c9f9fa0 --- /dev/null +++ b/src/regexp/syntax/compile.go @@ -0,0 +1,296 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package syntax + +import "unicode" + +// A patchList is a list of instruction pointers that need to be filled in (patched). +// Because the pointers haven't been filled in yet, we can reuse their storage +// to hold the list. It's kind of sleazy, but works well in practice. +// See https://swtch.com/~rsc/regexp/regexp1.html for inspiration. +// +// These aren't really pointers: they're integers, so we can reinterpret them +// this way without using package unsafe. A value l.head denotes +// p.inst[l.head>>1].Out (l.head&1==0) or .Arg (l.head&1==1). +// head == 0 denotes the empty list, okay because we start every program +// with a fail instruction, so we'll never want to point at its output link. 
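// Editor's worked example (not part of the upstream source): if a fragment
// consists of the single instruction p.Inst[5] whose Out field is still
// unfilled, its patch list is makePatchList(5<<1) = {head: 10, tail: 10},
// denoting that p.Inst[5].Out awaits a value. Calling patch(p, 7) decodes
// head 10 as p.Inst[5].Out, reads the old Out value (0, the end-of-list
// marker) as the next list element, and writes p.Inst[5].Out = 7. An odd
// head, e.g. 5<<1|1 = 11, would address p.Inst[5].Arg instead.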
+type patchList struct { + head, tail uint32 +} + +func makePatchList(n uint32) patchList { + return patchList{n, n} +} + +func (l patchList) patch(p *Prog, val uint32) { + head := l.head + for head != 0 { + i := &p.Inst[head>>1] + if head&1 == 0 { + head = i.Out + i.Out = val + } else { + head = i.Arg + i.Arg = val + } + } +} + +func (l1 patchList) append(p *Prog, l2 patchList) patchList { + if l1.head == 0 { + return l2 + } + if l2.head == 0 { + return l1 + } + + i := &p.Inst[l1.tail>>1] + if l1.tail&1 == 0 { + i.Out = l2.head + } else { + i.Arg = l2.head + } + return patchList{l1.head, l2.tail} +} + +// A frag represents a compiled program fragment. +type frag struct { + i uint32 // index of first instruction + out patchList // where to record end instruction + nullable bool // whether fragment can match empty string +} + +type compiler struct { + p *Prog +} + +// Compile compiles the regexp into a program to be executed. +// The regexp should have been simplified already (returned from re.Simplify). +func Compile(re *Regexp) (*Prog, error) { + var c compiler + c.init() + f := c.compile(re) + f.out.patch(c.p, c.inst(InstMatch).i) + c.p.Start = int(f.i) + return c.p, nil +} + +func (c *compiler) init() { + c.p = new(Prog) + c.p.NumCap = 2 // implicit ( and ) for whole match $0 + c.inst(InstFail) +} + +var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune} +var anyRune = []rune{0, unicode.MaxRune} + +func (c *compiler) compile(re *Regexp) frag { + switch re.Op { + case OpNoMatch: + return c.fail() + case OpEmptyMatch: + return c.nop() + case OpLiteral: + if len(re.Rune) == 0 { + return c.nop() + } + var f frag + for j := range re.Rune { + f1 := c.rune(re.Rune[j:j+1], re.Flags) + if j == 0 { + f = f1 + } else { + f = c.cat(f, f1) + } + } + return f + case OpCharClass: + return c.rune(re.Rune, re.Flags) + case OpAnyCharNotNL: + return c.rune(anyRuneNotNL, 0) + case OpAnyChar: + return c.rune(anyRune, 0) + case OpBeginLine: + return c.empty(EmptyBeginLine) + case OpEndLine: + return c.empty(EmptyEndLine) + case OpBeginText: + return c.empty(EmptyBeginText) + case OpEndText: + return c.empty(EmptyEndText) + case OpWordBoundary: + return c.empty(EmptyWordBoundary) + case OpNoWordBoundary: + return c.empty(EmptyNoWordBoundary) + case OpCapture: + bra := c.cap(uint32(re.Cap << 1)) + sub := c.compile(re.Sub[0]) + ket := c.cap(uint32(re.Cap<<1 | 1)) + return c.cat(c.cat(bra, sub), ket) + case OpStar: + return c.star(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0) + case OpPlus: + return c.plus(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0) + case OpQuest: + return c.quest(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0) + case OpConcat: + if len(re.Sub) == 0 { + return c.nop() + } + var f frag + for i, sub := range re.Sub { + if i == 0 { + f = c.compile(sub) + } else { + f = c.cat(f, c.compile(sub)) + } + } + return f + case OpAlternate: + var f frag + for _, sub := range re.Sub { + f = c.alt(f, c.compile(sub)) + } + return f + } + panic("regexp: unhandled case in compile") +} + +func (c *compiler) inst(op InstOp) frag { + // TODO: impose length limit + f := frag{i: uint32(len(c.p.Inst)), nullable: true} + c.p.Inst = append(c.p.Inst, Inst{Op: op}) + return f +} + +func (c *compiler) nop() frag { + f := c.inst(InstNop) + f.out = makePatchList(f.i << 1) + return f +} + +func (c *compiler) fail() frag { + return frag{} +} + +func (c *compiler) cap(arg uint32) frag { + f := c.inst(InstCapture) + f.out = makePatchList(f.i << 1) + c.p.Inst[f.i].Arg = arg + + if c.p.NumCap < int(arg)+1 { + 
c.p.NumCap = int(arg) + 1 + } + return f +} + +func (c *compiler) cat(f1, f2 frag) frag { + // concat of failure is failure + if f1.i == 0 || f2.i == 0 { + return frag{} + } + + // TODO: elide nop + + f1.out.patch(c.p, f2.i) + return frag{f1.i, f2.out, f1.nullable && f2.nullable} +} + +func (c *compiler) alt(f1, f2 frag) frag { + // alt of failure is other + if f1.i == 0 { + return f2 + } + if f2.i == 0 { + return f1 + } + + f := c.inst(InstAlt) + i := &c.p.Inst[f.i] + i.Out = f1.i + i.Arg = f2.i + f.out = f1.out.append(c.p, f2.out) + f.nullable = f1.nullable || f2.nullable + return f +} + +func (c *compiler) quest(f1 frag, nongreedy bool) frag { + f := c.inst(InstAlt) + i := &c.p.Inst[f.i] + if nongreedy { + i.Arg = f1.i + f.out = makePatchList(f.i << 1) + } else { + i.Out = f1.i + f.out = makePatchList(f.i<<1 | 1) + } + f.out = f.out.append(c.p, f1.out) + return f +} + +// loop returns the fragment for the main loop of a plus or star. +// For plus, it can be used after changing the entry to f1.i. +// For star, it can be used directly when f1 can't match an empty string. +// (When f1 can match an empty string, f1* must be implemented as (f1+)? +// to get the priority match order correct.) +func (c *compiler) loop(f1 frag, nongreedy bool) frag { + f := c.inst(InstAlt) + i := &c.p.Inst[f.i] + if nongreedy { + i.Arg = f1.i + f.out = makePatchList(f.i << 1) + } else { + i.Out = f1.i + f.out = makePatchList(f.i<<1 | 1) + } + f1.out.patch(c.p, f.i) + return f +} + +func (c *compiler) star(f1 frag, nongreedy bool) frag { + if f1.nullable { + // Use (f1+)? to get priority match order correct. + // See golang.org/issue/46123. + return c.quest(c.plus(f1, nongreedy), nongreedy) + } + return c.loop(f1, nongreedy) +} + +func (c *compiler) plus(f1 frag, nongreedy bool) frag { + return frag{f1.i, c.loop(f1, nongreedy).out, f1.nullable} +} + +func (c *compiler) empty(op EmptyOp) frag { + f := c.inst(InstEmptyWidth) + c.p.Inst[f.i].Arg = uint32(op) + f.out = makePatchList(f.i << 1) + return f +} + +func (c *compiler) rune(r []rune, flags Flags) frag { + f := c.inst(InstRune) + f.nullable = false + i := &c.p.Inst[f.i] + i.Rune = r + flags &= FoldCase // only relevant flag is FoldCase + if len(r) != 1 || unicode.SimpleFold(r[0]) == r[0] { + // and sometimes not even that + flags &^= FoldCase + } + i.Arg = uint32(flags) + f.out = makePatchList(f.i << 1) + + // Special cases for exec machine. + switch { + case flags&FoldCase == 0 && (len(r) == 1 || len(r) == 2 && r[0] == r[1]): + i.Op = InstRune1 + case len(r) == 2 && r[0] == 0 && r[1] == unicode.MaxRune: + i.Op = InstRuneAny + case len(r) == 4 && r[0] == 0 && r[1] == '\n'-1 && r[2] == '\n'+1 && r[3] == unicode.MaxRune: + i.Op = InstRuneAnyNotNL + } + + return f +} diff --git a/src/regexp/syntax/doc.go b/src/regexp/syntax/doc.go new file mode 100644 index 0000000..f6a4b43 --- /dev/null +++ b/src/regexp/syntax/doc.go @@ -0,0 +1,141 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// DO NOT EDIT. This file is generated by mksyntaxgo from the RE2 distribution. + +/* +Package syntax parses regular expressions into parse trees and compiles +parse trees into programs. Most clients of regular expressions will use the +facilities of package regexp (such as Compile and Match) instead of this package. + +# Syntax + +The regular expression syntax understood by this package when parsing with the Perl flag is as follows. 
+Parts of the syntax can be disabled by passing alternate flags to Parse. + +Single characters: + + . any character, possibly including newline (flag s=true) + [xyz] character class + [^xyz] negated character class + \d Perl character class + \D negated Perl character class + [[:alpha:]] ASCII character class + [[:^alpha:]] negated ASCII character class + \pN Unicode character class (one-letter name) + \p{Greek} Unicode character class + \PN negated Unicode character class (one-letter name) + \P{Greek} negated Unicode character class + +Composites: + + xy x followed by y + x|y x or y (prefer x) + +Repetitions: + + x* zero or more x, prefer more + x+ one or more x, prefer more + x? zero or one x, prefer one + x{n,m} n or n+1 or ... or m x, prefer more + x{n,} n or more x, prefer more + x{n} exactly n x + x*? zero or more x, prefer fewer + x+? one or more x, prefer fewer + x?? zero or one x, prefer zero + x{n,m}? n or n+1 or ... or m x, prefer fewer + x{n,}? n or more x, prefer fewer + x{n}? exactly n x + +Implementation restriction: The counting forms x{n,m}, x{n,}, and x{n} +reject forms that create a minimum or maximum repetition count above 1000. +Unlimited repetitions are not subject to this restriction. + +Grouping: + + (re) numbered capturing group (submatch) + (?P<name>re) named & numbered capturing group (submatch) + (?:re) non-capturing group + (?flags) set flags within current group; non-capturing + (?flags:re) set flags during re; non-capturing + + Flag syntax is xyz (set) or -xyz (clear) or xy-z (set xy, clear z). The flags are: + + i case-insensitive (default false) + m multi-line mode: ^ and $ match begin/end line in addition to begin/end text (default false) + s let . match \n (default false) + U ungreedy: swap meaning of x* and x*?, x+ and x+?, etc (default false) + +Empty strings: + + ^ at beginning of text or line (flag m=true) + $ at end of text (like \z not \Z) or line (flag m=true) + \A at beginning of text + \b at ASCII word boundary (\w on one side and \W, \A, or \z on the other) + \B not at ASCII word boundary + \z at end of text + +Escape sequences: + + \a bell (== \007) + \f form feed (== \014) + \t horizontal tab (== \011) + \n newline (== \012) + \r carriage return (== \015) + \v vertical tab character (== \013) + \* literal *, for any punctuation character * + \123 octal character code (up to three digits) + \x7F hex character code (exactly two digits) + \x{10FFFF} hex character code + \Q...\E literal text ... even if ... 
has punctuation + +Character class elements: + + x single character + A-Z character range (inclusive) + \d Perl character class + [:foo:] ASCII character class foo + \p{Foo} Unicode character class Foo + \pF Unicode character class F (one-letter name) + +Named character classes as character class elements: + + [\d] digits (== \d) + [^\d] not digits (== \D) + [\D] not digits (== \D) + [^\D] not not digits (== \d) + [[:name:]] named ASCII class inside character class (== [:name:]) + [^[:name:]] named ASCII class inside negated character class (== [:^name:]) + [\p{Name}] named Unicode property inside character class (== \p{Name}) + [^\p{Name}] named Unicode property inside negated character class (== \P{Name}) + +Perl character classes (all ASCII-only): + + \d digits (== [0-9]) + \D not digits (== [^0-9]) + \s whitespace (== [\t\n\f\r ]) + \S not whitespace (== [^\t\n\f\r ]) + \w word characters (== [0-9A-Za-z_]) + \W not word characters (== [^0-9A-Za-z_]) + +ASCII character classes: + + [[:alnum:]] alphanumeric (== [0-9A-Za-z]) + [[:alpha:]] alphabetic (== [A-Za-z]) + [[:ascii:]] ASCII (== [\x00-\x7F]) + [[:blank:]] blank (== [\t ]) + [[:cntrl:]] control (== [\x00-\x1F\x7F]) + [[:digit:]] digits (== [0-9]) + [[:graph:]] graphical (== [!-~] == [A-Za-z0-9!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~]) + [[:lower:]] lower case (== [a-z]) + [[:print:]] printable (== [ -~] == [ [:graph:]]) + [[:punct:]] punctuation (== [!-/:-@[-`{-~]) + [[:space:]] whitespace (== [\t\n\v\f\r ]) + [[:upper:]] upper case (== [A-Z]) + [[:word:]] word characters (== [0-9A-Za-z_]) + [[:xdigit:]] hex digit (== [0-9A-Fa-f]) + +Unicode character classes are those in unicode.Categories and unicode.Scripts. +*/ +package syntax diff --git a/src/regexp/syntax/make_perl_groups.pl b/src/regexp/syntax/make_perl_groups.pl new file mode 100755 index 0000000..80a2c9a --- /dev/null +++ b/src/regexp/syntax/make_perl_groups.pl @@ -0,0 +1,113 @@ +#!/usr/bin/perl +# Copyright 2008 The Go Authors. All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +# Modified version of RE2's make_perl_groups.pl. + +# Generate table entries giving character ranges +# for POSIX/Perl character classes. Rather than +# figure out what the definition is, it is easier to ask +# Perl about each letter from 0-128 and write down +# its answer. + +@posixclasses = ( + "[:alnum:]", + "[:alpha:]", + "[:ascii:]", + "[:blank:]", + "[:cntrl:]", + "[:digit:]", + "[:graph:]", + "[:lower:]", + "[:print:]", + "[:punct:]", + "[:space:]", + "[:upper:]", + "[:word:]", + "[:xdigit:]", +); + +@perlclasses = ( + "\\d", + "\\s", + "\\w", +); + +%overrides = ( + # Prior to Perl 5.18, \s did not match vertical tab. + # RE2 preserves that original behaviour. 
+ "\\s:11" => 0, +); + +sub ComputeClass($) { + my @ranges; + my ($class) = @_; + my $regexp = "[$class]"; + my $start = -1; + for (my $i=0; $i<=129; $i++) { + if ($i == 129) { $i = 256; } + if ($i <= 128 && ($overrides{"$class:$i"} // chr($i) =~ $regexp)) { + if ($start < 0) { + $start = $i; + } + } else { + if ($start >= 0) { + push @ranges, [$start, $i-1]; + } + $start = -1; + } + } + return @ranges; +} + +sub PrintClass($$@) { + my ($cname, $name, @ranges) = @_; + print "var code$cname = []rune{ /* $name */\n"; + for (my $i=0; $i<@ranges; $i++) { + my @a = @{$ranges[$i]}; + printf "\t0x%x, 0x%x,\n", $a[0], $a[1]; + } + print "}\n\n"; + my $n = @ranges; + $negname = $name; + if ($negname =~ /:/) { + $negname =~ s/:/:^/; + } else { + $negname =~ y/a-z/A-Z/; + } + return "\t`$name`: {+1, code$cname},\n" . + "\t`$negname`: {-1, code$cname},\n"; +} + +my $gen = 0; + +sub PrintClasses($@) { + my ($cname, @classes) = @_; + my @entries; + foreach my $cl (@classes) { + my @ranges = ComputeClass($cl); + push @entries, PrintClass(++$gen, $cl, @ranges); + } + print "var ${cname}Group = map[string]charGroup{\n"; + foreach my $e (@entries) { + print $e; + } + print "}\n"; + my $count = @entries; +} + +print <<EOF; +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// GENERATED BY make_perl_groups.pl; DO NOT EDIT. +// make_perl_groups.pl >perl_groups.go + +package syntax + +EOF + +PrintClasses("perl", @perlclasses); +PrintClasses("posix", @posixclasses); diff --git a/src/regexp/syntax/op_string.go b/src/regexp/syntax/op_string.go new file mode 100644 index 0000000..3952b2b --- /dev/null +++ b/src/regexp/syntax/op_string.go @@ -0,0 +1,26 @@ +// Code generated by "stringer -type Op -trimprefix Op"; DO NOT EDIT. + +package syntax + +import "strconv" + +const ( + _Op_name_0 = "NoMatchEmptyMatchLiteralCharClassAnyCharNotNLAnyCharBeginLineEndLineBeginTextEndTextWordBoundaryNoWordBoundaryCaptureStarPlusQuestRepeatConcatAlternate" + _Op_name_1 = "opPseudo" +) + +var ( + _Op_index_0 = [...]uint8{0, 7, 17, 24, 33, 45, 52, 61, 68, 77, 84, 96, 110, 117, 121, 125, 130, 136, 142, 151} +) + +func (i Op) String() string { + switch { + case 1 <= i && i <= 19: + i -= 1 + return _Op_name_0[_Op_index_0[i]:_Op_index_0[i+1]] + case i == 128: + return _Op_name_1 + default: + return "Op(" + strconv.FormatInt(int64(i), 10) + ")" + } +} diff --git a/src/regexp/syntax/parse.go b/src/regexp/syntax/parse.go new file mode 100644 index 0000000..accee9a --- /dev/null +++ b/src/regexp/syntax/parse.go @@ -0,0 +1,2115 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package syntax + +import ( + "sort" + "strings" + "unicode" + "unicode/utf8" +) + +// An Error describes a failure to parse a regular expression +// and gives the offending expression. +type Error struct { + Code ErrorCode + Expr string +} + +func (e *Error) Error() string { + return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`" +} + +// An ErrorCode describes a failure to parse a regular expression. 
+type ErrorCode string + +const ( + // Unexpected error + ErrInternalError ErrorCode = "regexp/syntax: internal error" + + // Parse errors + ErrInvalidCharClass ErrorCode = "invalid character class" + ErrInvalidCharRange ErrorCode = "invalid character class range" + ErrInvalidEscape ErrorCode = "invalid escape sequence" + ErrInvalidNamedCapture ErrorCode = "invalid named capture" + ErrInvalidPerlOp ErrorCode = "invalid or unsupported Perl syntax" + ErrInvalidRepeatOp ErrorCode = "invalid nested repetition operator" + ErrInvalidRepeatSize ErrorCode = "invalid repeat count" + ErrInvalidUTF8 ErrorCode = "invalid UTF-8" + ErrMissingBracket ErrorCode = "missing closing ]" + ErrMissingParen ErrorCode = "missing closing )" + ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator" + ErrTrailingBackslash ErrorCode = "trailing backslash at end of expression" + ErrUnexpectedParen ErrorCode = "unexpected )" + ErrNestingDepth ErrorCode = "expression nests too deeply" + ErrLarge ErrorCode = "expression too large" +) + +func (e ErrorCode) String() string { + return string(e) +} + +// Flags control the behavior of the parser and record information about regexp context. +type Flags uint16 + +const ( + FoldCase Flags = 1 << iota // case-insensitive match + Literal // treat pattern as literal string + ClassNL // allow character classes like [^a-z] and [[:space:]] to match newline + DotNL // allow . to match newline + OneLine // treat ^ and $ as only matching at beginning and end of text + NonGreedy // make repetition operators default to non-greedy + PerlX // allow Perl extensions + UnicodeGroups // allow \p{Han}, \P{Han} for Unicode group and negation + WasDollar // regexp OpEndText was $, not \z + Simple // regexp contains no counted repetition + + MatchNL = ClassNL | DotNL + + Perl = ClassNL | OneLine | PerlX | UnicodeGroups // as close to Perl as possible + POSIX Flags = 0 // POSIX syntax +) + +// Pseudo-ops for parsing stack. +const ( + opLeftParen = opPseudo + iota + opVerticalBar +) + +// maxHeight is the maximum height of a regexp parse tree. +// It is somewhat arbitrarily chosen, but the idea is to be large enough +// that no one will actually hit in real use but at the same time small enough +// that recursion on the Regexp tree will not hit the 1GB Go stack limit. +// The maximum amount of stack for a single recursive frame is probably +// closer to 1kB, so this could potentially be raised, but it seems unlikely +// that people have regexps nested even this deeply. +// We ran a test on Google's C++ code base and turned up only +// a single use case with depth > 100; it had depth 128. +// Using depth 1000 should be plenty of margin. +// As an optimization, we don't even bother calculating heights +// until we've allocated at least maxHeight Regexp structures. +const maxHeight = 1000 + +// maxSize is the maximum size of a compiled regexp in Insts. +// It too is somewhat arbitrarily chosen, but the idea is to be large enough +// to allow significant regexps while at the same time small enough that +// the compiled form will not take up too much memory. +// 128 MB is enough for a 3.3 million Inst structures, which roughly +// corresponds to a 3.3 MB regexp. +const ( + maxSize = 128 << 20 / instSize + instSize = 5 * 8 // byte, 2 uint32, slice is 5 64-bit words +) + +// maxRunes is the maximum number of runes allowed in a regexp tree +// counting the runes in all the nodes. +// Ignoring character classes p.numRunes is always less than the length of the regexp. 
+// Character classes can make it much larger: each \pL adds 1292 runes. +// 128 MB is enough for 32M runes, which is over 26k \pL instances. +// Note that repetitions do not make copies of the rune slices, +// so \pL{1000} is only one rune slice, not 1000. +// We could keep a cache of character classes we've seen, +// so that all the \pL we see use the same rune list, +// but that doesn't remove the problem entirely: +// consider something like [\pL01234][\pL01235][\pL01236]...[\pL^&*()]. +// And because the Rune slice is exposed directly in the Regexp, +// there is not an opportunity to change the representation to allow +// partial sharing between different character classes. +// So the limit is the best we can do. +const ( + maxRunes = 128 << 20 / runeSize + runeSize = 4 // rune is int32 +) + +type parser struct { + flags Flags // parse mode flags + stack []*Regexp // stack of parsed expressions + free *Regexp + numCap int // number of capturing groups seen + wholeRegexp string + tmpClass []rune // temporary char class work space + numRegexp int // number of regexps allocated + numRunes int // number of runes in char classes + repeats int64 // product of all repetitions seen + height map[*Regexp]int // regexp height, for height limit check + size map[*Regexp]int64 // regexp compiled size, for size limit check +} + +func (p *parser) newRegexp(op Op) *Regexp { + re := p.free + if re != nil { + p.free = re.Sub0[0] + *re = Regexp{} + } else { + re = new(Regexp) + p.numRegexp++ + } + re.Op = op + return re +} + +func (p *parser) reuse(re *Regexp) { + if p.height != nil { + delete(p.height, re) + } + re.Sub0[0] = p.free + p.free = re +} + +func (p *parser) checkLimits(re *Regexp) { + if p.numRunes > maxRunes { + panic(ErrLarge) + } + p.checkSize(re) + p.checkHeight(re) +} + +func (p *parser) checkSize(re *Regexp) { + if p.size == nil { + // We haven't started tracking size yet. + // Do a relatively cheap check to see if we need to start. + // Maintain the product of all the repeats we've seen + // and don't track if the total number of regexp nodes + // we've seen times the repeat product is in budget. + if p.repeats == 0 { + p.repeats = 1 + } + if re.Op == OpRepeat { + n := re.Max + if n == -1 { + n = re.Min + } + if n <= 0 { + n = 1 + } + if int64(n) > maxSize/p.repeats { + p.repeats = maxSize + } else { + p.repeats *= int64(n) + } + } + if int64(p.numRegexp) < maxSize/p.repeats { + return + } + + // We need to start tracking size. + // Make the map and belatedly populate it + // with info about everything we've constructed so far. 
+ p.size = make(map[*Regexp]int64) + for _, re := range p.stack { + p.checkSize(re) + } + } + + if p.calcSize(re, true) > maxSize { + panic(ErrLarge) + } +} + +func (p *parser) calcSize(re *Regexp, force bool) int64 { + if !force { + if size, ok := p.size[re]; ok { + return size + } + } + + var size int64 + switch re.Op { + case OpLiteral: + size = int64(len(re.Rune)) + case OpCapture, OpStar: + // star can be 1+ or 2+; assume 2 pessimistically + size = 2 + p.calcSize(re.Sub[0], false) + case OpPlus, OpQuest: + size = 1 + p.calcSize(re.Sub[0], false) + case OpConcat: + for _, sub := range re.Sub { + size += p.calcSize(sub, false) + } + case OpAlternate: + for _, sub := range re.Sub { + size += p.calcSize(sub, false) + } + if len(re.Sub) > 1 { + size += int64(len(re.Sub)) - 1 + } + case OpRepeat: + sub := p.calcSize(re.Sub[0], false) + if re.Max == -1 { + if re.Min == 0 { + size = 2 + sub // x* + } else { + size = 1 + int64(re.Min)*sub // xxx+ + } + break + } + // x{2,5} = xx(x(x(x)?)?)? + size = int64(re.Max)*sub + int64(re.Max-re.Min) + } + + if size < 1 { + size = 1 + } + p.size[re] = size + return size +} + +func (p *parser) checkHeight(re *Regexp) { + if p.numRegexp < maxHeight { + return + } + if p.height == nil { + p.height = make(map[*Regexp]int) + for _, re := range p.stack { + p.checkHeight(re) + } + } + if p.calcHeight(re, true) > maxHeight { + panic(ErrNestingDepth) + } +} + +func (p *parser) calcHeight(re *Regexp, force bool) int { + if !force { + if h, ok := p.height[re]; ok { + return h + } + } + h := 1 + for _, sub := range re.Sub { + hsub := p.calcHeight(sub, false) + if h < 1+hsub { + h = 1 + hsub + } + } + p.height[re] = h + return h +} + +// Parse stack manipulation. + +// push pushes the regexp re onto the parse stack and returns the regexp. +func (p *parser) push(re *Regexp) *Regexp { + p.numRunes += len(re.Rune) + if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] { + // Single rune. + if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) { + return nil + } + re.Op = OpLiteral + re.Rune = re.Rune[:1] + re.Flags = p.flags &^ FoldCase + } else if re.Op == OpCharClass && len(re.Rune) == 4 && + re.Rune[0] == re.Rune[1] && re.Rune[2] == re.Rune[3] && + unicode.SimpleFold(re.Rune[0]) == re.Rune[2] && + unicode.SimpleFold(re.Rune[2]) == re.Rune[0] || + re.Op == OpCharClass && len(re.Rune) == 2 && + re.Rune[0]+1 == re.Rune[1] && + unicode.SimpleFold(re.Rune[0]) == re.Rune[1] && + unicode.SimpleFold(re.Rune[1]) == re.Rune[0] { + // Case-insensitive rune like [Aa] or [Δδ]. + if p.maybeConcat(re.Rune[0], p.flags|FoldCase) { + return nil + } + + // Rewrite as (case-insensitive) literal. + re.Op = OpLiteral + re.Rune = re.Rune[:1] + re.Flags = p.flags | FoldCase + } else { + // Incremental concatenation. + p.maybeConcat(-1, 0) + } + + p.stack = append(p.stack, re) + p.checkLimits(re) + return re +} + +// maybeConcat implements incremental concatenation +// of literal runes into string nodes. The parser calls this +// before each push, so only the top fragment of the stack +// might need processing. Since this is called before a push, +// the topmost literal is no longer subject to operators like * +// (Otherwise ab* would turn into (ab)*.) +// If r >= 0 and there's a node left over, maybeConcat uses it +// to push r with the given flags. +// maybeConcat reports whether r was pushed. 
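// For illustration, a minimal sketch (not part of the upstream file) of the
// effect of the incremental concatenation described above, seen through the
// exported API (assumes imports "fmt" and "regexp/syntax"). Adjacent literal
// runes merge into one OpLiteral node, but a literal directly under a
// repetition operator stays separate so that ab* still means a(b*):
//
//	re, _ := syntax.Parse("abc", syntax.Perl)
//	fmt.Println(re.Op == syntax.OpLiteral, string(re.Rune)) // true abc
//
//	re, _ = syntax.Parse("ab*", syntax.Perl)
//	fmt.Println(re.Op == syntax.OpConcat, len(re.Sub)) // true 2: lit{a} and star{lit{b}}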
+func (p *parser) maybeConcat(r rune, flags Flags) bool { + n := len(p.stack) + if n < 2 { + return false + } + + re1 := p.stack[n-1] + re2 := p.stack[n-2] + if re1.Op != OpLiteral || re2.Op != OpLiteral || re1.Flags&FoldCase != re2.Flags&FoldCase { + return false + } + + // Push re1 into re2. + re2.Rune = append(re2.Rune, re1.Rune...) + + // Reuse re1 if possible. + if r >= 0 { + re1.Rune = re1.Rune0[:1] + re1.Rune[0] = r + re1.Flags = flags + return true + } + + p.stack = p.stack[:n-1] + p.reuse(re1) + return false // did not push r +} + +// literal pushes a literal regexp for the rune r on the stack. +func (p *parser) literal(r rune) { + re := p.newRegexp(OpLiteral) + re.Flags = p.flags + if p.flags&FoldCase != 0 { + r = minFoldRune(r) + } + re.Rune0[0] = r + re.Rune = re.Rune0[:1] + p.push(re) +} + +// minFoldRune returns the minimum rune fold-equivalent to r. +func minFoldRune(r rune) rune { + if r < minFold || r > maxFold { + return r + } + min := r + r0 := r + for r = unicode.SimpleFold(r); r != r0; r = unicode.SimpleFold(r) { + if min > r { + min = r + } + } + return min +} + +// op pushes a regexp with the given op onto the stack +// and returns that regexp. +func (p *parser) op(op Op) *Regexp { + re := p.newRegexp(op) + re.Flags = p.flags + return p.push(re) +} + +// repeat replaces the top stack element with itself repeated according to op, min, max. +// before is the regexp suffix starting at the repetition operator. +// after is the regexp suffix following after the repetition operator. +// repeat returns an updated 'after' and an error, if any. +func (p *parser) repeat(op Op, min, max int, before, after, lastRepeat string) (string, error) { + flags := p.flags + if p.flags&PerlX != 0 { + if len(after) > 0 && after[0] == '?' { + after = after[1:] + flags ^= NonGreedy + } + if lastRepeat != "" { + // In Perl it is not allowed to stack repetition operators: + // a** is a syntax error, not a doubled star, and a++ means + // something else entirely, which we don't support! + return "", &Error{ErrInvalidRepeatOp, lastRepeat[:len(lastRepeat)-len(after)]} + } + } + n := len(p.stack) + if n == 0 { + return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]} + } + sub := p.stack[n-1] + if sub.Op >= opPseudo { + return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]} + } + + re := p.newRegexp(op) + re.Min = min + re.Max = max + re.Flags = flags + re.Sub = re.Sub0[:1] + re.Sub[0] = sub + p.stack[n-1] = re + p.checkLimits(re) + + if op == OpRepeat && (min >= 2 || max >= 2) && !repeatIsValid(re, 1000) { + return "", &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]} + } + + return after, nil +} + +// repeatIsValid reports whether the repetition re is valid. +// Valid means that the combination of the top-level repetition +// and any inner repetitions does not exceed n copies of the +// innermost thing. +// This function rewalks the regexp tree and is called for every repetition, +// so we have to worry about inducing quadratic behavior in the parser. +// We avoid this by only calling repeatIsValid when min or max >= 2. +// In that case the depth of any >= 2 nesting can only get to 9 without +// triggering a parse error, so each subtree can only be rewalked 9 times. 
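// For illustration, a minimal sketch (not part of the upstream file) of the
// user-visible effect of the repetition checks described above, via the
// exported API (assumes imports "fmt" and "regexp/syntax"). Nested counted
// repeats may not multiply out to more than 1000 copies of the innermost
// expression:
//
//	_, err := syntax.Parse(`(x{10}){100}`, syntax.Perl) // 1000 copies: accepted
//	fmt.Println(err)                                    // <nil>
//
//	_, err = syntax.Parse(`(x{10}){101}`, syntax.Perl) // 1010 copies: rejected
//	fmt.Println(err) // error parsing regexp: invalid repeat count: `{101}`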
+func repeatIsValid(re *Regexp, n int) bool { + if re.Op == OpRepeat { + m := re.Max + if m == 0 { + return true + } + if m < 0 { + m = re.Min + } + if m > n { + return false + } + if m > 0 { + n /= m + } + } + for _, sub := range re.Sub { + if !repeatIsValid(sub, n) { + return false + } + } + return true +} + +// concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation. +func (p *parser) concat() *Regexp { + p.maybeConcat(-1, 0) + + // Scan down to find pseudo-operator | or (. + i := len(p.stack) + for i > 0 && p.stack[i-1].Op < opPseudo { + i-- + } + subs := p.stack[i:] + p.stack = p.stack[:i] + + // Empty concatenation is special case. + if len(subs) == 0 { + return p.push(p.newRegexp(OpEmptyMatch)) + } + + return p.push(p.collapse(subs, OpConcat)) +} + +// alternate replaces the top of the stack (above the topmost '(') with its alternation. +func (p *parser) alternate() *Regexp { + // Scan down to find pseudo-operator (. + // There are no | above (. + i := len(p.stack) + for i > 0 && p.stack[i-1].Op < opPseudo { + i-- + } + subs := p.stack[i:] + p.stack = p.stack[:i] + + // Make sure top class is clean. + // All the others already are (see swapVerticalBar). + if len(subs) > 0 { + cleanAlt(subs[len(subs)-1]) + } + + // Empty alternate is special case + // (shouldn't happen but easy to handle). + if len(subs) == 0 { + return p.push(p.newRegexp(OpNoMatch)) + } + + return p.push(p.collapse(subs, OpAlternate)) +} + +// cleanAlt cleans re for eventual inclusion in an alternation. +func cleanAlt(re *Regexp) { + switch re.Op { + case OpCharClass: + re.Rune = cleanClass(&re.Rune) + if len(re.Rune) == 2 && re.Rune[0] == 0 && re.Rune[1] == unicode.MaxRune { + re.Rune = nil + re.Op = OpAnyChar + return + } + if len(re.Rune) == 4 && re.Rune[0] == 0 && re.Rune[1] == '\n'-1 && re.Rune[2] == '\n'+1 && re.Rune[3] == unicode.MaxRune { + re.Rune = nil + re.Op = OpAnyCharNotNL + return + } + if cap(re.Rune)-len(re.Rune) > 100 { + // re.Rune will not grow any more. + // Make a copy or inline to reclaim storage. + re.Rune = append(re.Rune0[:0], re.Rune...) + } + } +} + +// collapse returns the result of applying op to sub. +// If sub contains op nodes, they all get hoisted up +// so that there is never a concat of a concat or an +// alternate of an alternate. +func (p *parser) collapse(subs []*Regexp, op Op) *Regexp { + if len(subs) == 1 { + return subs[0] + } + re := p.newRegexp(op) + re.Sub = re.Sub0[:0] + for _, sub := range subs { + if sub.Op == op { + re.Sub = append(re.Sub, sub.Sub...) + p.reuse(sub) + } else { + re.Sub = append(re.Sub, sub) + } + } + if op == OpAlternate { + re.Sub = p.factor(re.Sub) + if len(re.Sub) == 1 { + old := re + re = re.Sub[0] + p.reuse(old) + } + } + return re +} + +// factor factors common prefixes from the alternation list sub. +// It returns a replacement list that reuses the same storage and +// frees (passes to p.reuse) any removed *Regexps. +// +// For example, +// +// ABC|ABD|AEF|BCX|BCY +// +// simplifies by literal prefix extraction to +// +// A(B(C|D)|EF)|BC(X|Y) +// +// which simplifies by character class introduction to +// +// A(B[CD]|EF)|BC[XY] +func (p *parser) factor(sub []*Regexp) []*Regexp { + if len(sub) < 2 { + return sub + } + + // Round 1: Factor out common literal prefixes. 
+ var str []rune + var strflags Flags + start := 0 + out := sub[:0] + for i := 0; i <= len(sub); i++ { + // Invariant: the Regexps that were in sub[0:start] have been + // used or marked for reuse, and the slice space has been reused + // for out (len(out) <= start). + // + // Invariant: sub[start:i] consists of regexps that all begin + // with str as modified by strflags. + var istr []rune + var iflags Flags + if i < len(sub) { + istr, iflags = p.leadingString(sub[i]) + if iflags == strflags { + same := 0 + for same < len(str) && same < len(istr) && str[same] == istr[same] { + same++ + } + if same > 0 { + // Matches at least one rune in current range. + // Keep going around. + str = str[:same] + continue + } + } + } + + // Found end of a run with common leading literal string: + // sub[start:i] all begin with str[0:len(str)], but sub[i] + // does not even begin with str[0]. + // + // Factor out common string and append factored expression to out. + if i == start { + // Nothing to do - run of length 0. + } else if i == start+1 { + // Just one: don't bother factoring. + out = append(out, sub[start]) + } else { + // Construct factored form: prefix(suffix1|suffix2|...) + prefix := p.newRegexp(OpLiteral) + prefix.Flags = strflags + prefix.Rune = append(prefix.Rune[:0], str...) + + for j := start; j < i; j++ { + sub[j] = p.removeLeadingString(sub[j], len(str)) + p.checkLimits(sub[j]) + } + suffix := p.collapse(sub[start:i], OpAlternate) // recurse + + re := p.newRegexp(OpConcat) + re.Sub = append(re.Sub[:0], prefix, suffix) + out = append(out, re) + } + + // Prepare for next iteration. + start = i + str = istr + strflags = iflags + } + sub = out + + // Round 2: Factor out common simple prefixes, + // just the first piece of each concatenation. + // This will be good enough a lot of the time. + // + // Complex subexpressions (e.g. involving quantifiers) + // are not safe to factor because that collapses their + // distinct paths through the automaton, which affects + // correctness in some cases. + start = 0 + out = sub[:0] + var first *Regexp + for i := 0; i <= len(sub); i++ { + // Invariant: the Regexps that were in sub[0:start] have been + // used or marked for reuse, and the slice space has been reused + // for out (len(out) <= start). + // + // Invariant: sub[start:i] consists of regexps that all begin with ifirst. + var ifirst *Regexp + if i < len(sub) { + ifirst = p.leadingRegexp(sub[i]) + if first != nil && first.Equal(ifirst) && + // first must be a character class OR a fixed repeat of a character class. + (isCharClass(first) || (first.Op == OpRepeat && first.Min == first.Max && isCharClass(first.Sub[0]))) { + continue + } + } + + // Found end of a run with common leading regexp: + // sub[start:i] all begin with first but sub[i] does not. + // + // Factor out common regexp and append factored expression to out. + if i == start { + // Nothing to do - run of length 0. + } else if i == start+1 { + // Just one: don't bother factoring. + out = append(out, sub[start]) + } else { + // Construct factored form: prefix(suffix1|suffix2|...) + prefix := first + for j := start; j < i; j++ { + reuse := j != start // prefix came from sub[start] + sub[j] = p.removeLeadingRegexp(sub[j], reuse) + p.checkLimits(sub[j]) + } + suffix := p.collapse(sub[start:i], OpAlternate) // recurse + + re := p.newRegexp(OpConcat) + re.Sub = append(re.Sub[:0], prefix, suffix) + out = append(out, re) + } + + // Prepare for next iteration. 
+ start = i + first = ifirst + } + sub = out + + // Round 3: Collapse runs of single literals into character classes. + start = 0 + out = sub[:0] + for i := 0; i <= len(sub); i++ { + // Invariant: the Regexps that were in sub[0:start] have been + // used or marked for reuse, and the slice space has been reused + // for out (len(out) <= start). + // + // Invariant: sub[start:i] consists of regexps that are either + // literal runes or character classes. + if i < len(sub) && isCharClass(sub[i]) { + continue + } + + // sub[i] is not a char or char class; + // emit char class for sub[start:i]... + if i == start { + // Nothing to do - run of length 0. + } else if i == start+1 { + out = append(out, sub[start]) + } else { + // Make new char class. + // Start with most complex regexp in sub[start]. + max := start + for j := start + 1; j < i; j++ { + if sub[max].Op < sub[j].Op || sub[max].Op == sub[j].Op && len(sub[max].Rune) < len(sub[j].Rune) { + max = j + } + } + sub[start], sub[max] = sub[max], sub[start] + + for j := start + 1; j < i; j++ { + mergeCharClass(sub[start], sub[j]) + p.reuse(sub[j]) + } + cleanAlt(sub[start]) + out = append(out, sub[start]) + } + + // ... and then emit sub[i]. + if i < len(sub) { + out = append(out, sub[i]) + } + start = i + 1 + } + sub = out + + // Round 4: Collapse runs of empty matches into a single empty match. + start = 0 + out = sub[:0] + for i := range sub { + if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch { + continue + } + out = append(out, sub[i]) + } + sub = out + + return sub +} + +// leadingString returns the leading literal string that re begins with. +// The string refers to storage in re or its children. +func (p *parser) leadingString(re *Regexp) ([]rune, Flags) { + if re.Op == OpConcat && len(re.Sub) > 0 { + re = re.Sub[0] + } + if re.Op != OpLiteral { + return nil, 0 + } + return re.Rune, re.Flags & FoldCase +} + +// removeLeadingString removes the first n leading runes +// from the beginning of re. It returns the replacement for re. +func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp { + if re.Op == OpConcat && len(re.Sub) > 0 { + // Removing a leading string in a concatenation + // might simplify the concatenation. + sub := re.Sub[0] + sub = p.removeLeadingString(sub, n) + re.Sub[0] = sub + if sub.Op == OpEmptyMatch { + p.reuse(sub) + switch len(re.Sub) { + case 0, 1: + // Impossible but handle. + re.Op = OpEmptyMatch + re.Sub = nil + case 2: + old := re + re = re.Sub[1] + p.reuse(old) + default: + copy(re.Sub, re.Sub[1:]) + re.Sub = re.Sub[:len(re.Sub)-1] + } + } + return re + } + + if re.Op == OpLiteral { + re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])] + if len(re.Rune) == 0 { + re.Op = OpEmptyMatch + } + } + return re +} + +// leadingRegexp returns the leading regexp that re begins with. +// The regexp refers to storage in re or its children. +func (p *parser) leadingRegexp(re *Regexp) *Regexp { + if re.Op == OpEmptyMatch { + return nil + } + if re.Op == OpConcat && len(re.Sub) > 0 { + sub := re.Sub[0] + if sub.Op == OpEmptyMatch { + return nil + } + return sub + } + return re +} + +// removeLeadingRegexp removes the leading regexp in re. +// It returns the replacement for re. +// If reuse is true, it passes the removed regexp (if no longer needed) to p.reuse. 
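// For illustration, a minimal sketch (not part of the upstream file) of the
// end-to-end effect of the factoring pass above, seen through the exported API
// (assumes imports "fmt" and "regexp/syntax"). The common literal prefix is
// pulled out of the alternation and single-rune suffixes collapse into a
// character class:
//
//	re, _ := syntax.Parse(`abc|abd|aef`, syntax.Perl)
//	fmt.Println(re)                       // a(?:b[c-d]|ef)
//	fmt.Println(re.Op == syntax.OpConcat) // true: the prefix "a" was factored out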
+func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp { + if re.Op == OpConcat && len(re.Sub) > 0 { + if reuse { + p.reuse(re.Sub[0]) + } + re.Sub = re.Sub[:copy(re.Sub, re.Sub[1:])] + switch len(re.Sub) { + case 0: + re.Op = OpEmptyMatch + re.Sub = nil + case 1: + old := re + re = re.Sub[0] + p.reuse(old) + } + return re + } + if reuse { + p.reuse(re) + } + return p.newRegexp(OpEmptyMatch) +} + +func literalRegexp(s string, flags Flags) *Regexp { + re := &Regexp{Op: OpLiteral} + re.Flags = flags + re.Rune = re.Rune0[:0] // use local storage for small strings + for _, c := range s { + if len(re.Rune) >= cap(re.Rune) { + // string is too long to fit in Rune0. let Go handle it + re.Rune = []rune(s) + break + } + re.Rune = append(re.Rune, c) + } + return re +} + +// Parsing. + +// Parse parses a regular expression string s, controlled by the specified +// Flags, and returns a regular expression parse tree. The syntax is +// described in the top-level comment. +func Parse(s string, flags Flags) (*Regexp, error) { + return parse(s, flags) +} + +func parse(s string, flags Flags) (_ *Regexp, err error) { + defer func() { + switch r := recover(); r { + default: + panic(r) + case nil: + // ok + case ErrLarge: // too big + err = &Error{Code: ErrLarge, Expr: s} + case ErrNestingDepth: + err = &Error{Code: ErrNestingDepth, Expr: s} + } + }() + + if flags&Literal != 0 { + // Trivial parser for literal string. + if err := checkUTF8(s); err != nil { + return nil, err + } + return literalRegexp(s, flags), nil + } + + // Otherwise, must do real work. + var ( + p parser + c rune + op Op + lastRepeat string + ) + p.flags = flags + p.wholeRegexp = s + t := s + for t != "" { + repeat := "" + BigSwitch: + switch t[0] { + default: + if c, t, err = nextRune(t); err != nil { + return nil, err + } + p.literal(c) + + case '(': + if p.flags&PerlX != 0 && len(t) >= 2 && t[1] == '?' { + // Flag changes and non-capturing groups. + if t, err = p.parsePerlFlags(t); err != nil { + return nil, err + } + break + } + p.numCap++ + p.op(opLeftParen).Cap = p.numCap + t = t[1:] + case '|': + if err = p.parseVerticalBar(); err != nil { + return nil, err + } + t = t[1:] + case ')': + if err = p.parseRightParen(); err != nil { + return nil, err + } + t = t[1:] + case '^': + if p.flags&OneLine != 0 { + p.op(OpBeginText) + } else { + p.op(OpBeginLine) + } + t = t[1:] + case '$': + if p.flags&OneLine != 0 { + p.op(OpEndText).Flags |= WasDollar + } else { + p.op(OpEndLine) + } + t = t[1:] + case '.': + if p.flags&DotNL != 0 { + p.op(OpAnyChar) + } else { + p.op(OpAnyCharNotNL) + } + t = t[1:] + case '[': + if t, err = p.parseClass(t); err != nil { + return nil, err + } + case '*', '+', '?': + before := t + switch t[0] { + case '*': + op = OpStar + case '+': + op = OpPlus + case '?': + op = OpQuest + } + after := t[1:] + if after, err = p.repeat(op, 0, 0, before, after, lastRepeat); err != nil { + return nil, err + } + repeat = before + t = after + case '{': + op = OpRepeat + before := t + min, max, after, ok := p.parseRepeat(t) + if !ok { + // If the repeat cannot be parsed, { is a literal. + p.literal('{') + t = t[1:] + break + } + if min < 0 || min > 1000 || max > 1000 || max >= 0 && min > max { + // Numbers were too big, or max is present and min > max. 
+ return nil, &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]} + } + if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil { + return nil, err + } + repeat = before + t = after + case '\\': + if p.flags&PerlX != 0 && len(t) >= 2 { + switch t[1] { + case 'A': + p.op(OpBeginText) + t = t[2:] + break BigSwitch + case 'b': + p.op(OpWordBoundary) + t = t[2:] + break BigSwitch + case 'B': + p.op(OpNoWordBoundary) + t = t[2:] + break BigSwitch + case 'C': + // any byte; not supported + return nil, &Error{ErrInvalidEscape, t[:2]} + case 'Q': + // \Q ... \E: the ... is always literals + var lit string + lit, t, _ = strings.Cut(t[2:], `\E`) + for lit != "" { + c, rest, err := nextRune(lit) + if err != nil { + return nil, err + } + p.literal(c) + lit = rest + } + break BigSwitch + case 'z': + p.op(OpEndText) + t = t[2:] + break BigSwitch + } + } + + re := p.newRegexp(OpCharClass) + re.Flags = p.flags + + // Look for Unicode character group like \p{Han} + if len(t) >= 2 && (t[1] == 'p' || t[1] == 'P') { + r, rest, err := p.parseUnicodeClass(t, re.Rune0[:0]) + if err != nil { + return nil, err + } + if r != nil { + re.Rune = r + t = rest + p.push(re) + break BigSwitch + } + } + + // Perl character class escape. + if r, rest := p.parsePerlClassEscape(t, re.Rune0[:0]); r != nil { + re.Rune = r + t = rest + p.push(re) + break BigSwitch + } + p.reuse(re) + + // Ordinary single-character escape. + if c, t, err = p.parseEscape(t); err != nil { + return nil, err + } + p.literal(c) + } + lastRepeat = repeat + } + + p.concat() + if p.swapVerticalBar() { + // pop vertical bar + p.stack = p.stack[:len(p.stack)-1] + } + p.alternate() + + n := len(p.stack) + if n != 1 { + return nil, &Error{ErrMissingParen, s} + } + return p.stack[0], nil +} + +// parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}. +// If s is not of that form, it returns ok == false. +// If s has the right form but the values are too big, it returns min == -1, ok == true. +func (p *parser) parseRepeat(s string) (min, max int, rest string, ok bool) { + if s == "" || s[0] != '{' { + return + } + s = s[1:] + var ok1 bool + if min, s, ok1 = p.parseInt(s); !ok1 { + return + } + if s == "" { + return + } + if s[0] != ',' { + max = min + } else { + s = s[1:] + if s == "" { + return + } + if s[0] == '}' { + max = -1 + } else if max, s, ok1 = p.parseInt(s); !ok1 { + return + } else if max < 0 { + // parseInt found too big a number + min = -1 + } + } + if s == "" || s[0] != '}' { + return + } + rest = s[1:] + ok = true + return +} + +// parsePerlFlags parses a Perl flag setting or non-capturing group or both, +// like (?i) or (?: or (?i:. It removes the prefix from s and updates the parse state. +// The caller must have ensured that s begins with "(?". +func (p *parser) parsePerlFlags(s string) (rest string, err error) { + t := s + + // Check for named captures, first introduced in Python's regexp library. + // As usual, there are three slightly different syntaxes: + // + // (?P<name>expr) the original, introduced by Python + // (?<name>expr) the .NET alteration, adopted by Perl 5.10 + // (?'name'expr) another .NET alteration, adopted by Perl 5.10 + // + // Perl 5.10 gave in and implemented the Python version too, + // but they claim that the last two are the preferred forms. + // PCRE and languages based on it (specifically, PHP and Ruby) + // support all three as well. EcmaScript 4 uses only the Python form. 
+ // + // In both the open source world (via Code Search) and the + // Google source tree, (?P<expr>name) is the dominant form, + // so that's the one we implement. One is enough. + if len(t) > 4 && t[2] == 'P' && t[3] == '<' { + // Pull out name. + end := strings.IndexRune(t, '>') + if end < 0 { + if err = checkUTF8(t); err != nil { + return "", err + } + return "", &Error{ErrInvalidNamedCapture, s} + } + + capture := t[:end+1] // "(?P<name>" + name := t[4:end] // "name" + if err = checkUTF8(name); err != nil { + return "", err + } + if !isValidCaptureName(name) { + return "", &Error{ErrInvalidNamedCapture, capture} + } + + // Like ordinary capture, but named. + p.numCap++ + re := p.op(opLeftParen) + re.Cap = p.numCap + re.Name = name + return t[end+1:], nil + } + + // Non-capturing group. Might also twiddle Perl flags. + var c rune + t = t[2:] // skip (? + flags := p.flags + sign := +1 + sawFlag := false +Loop: + for t != "" { + if c, t, err = nextRune(t); err != nil { + return "", err + } + switch c { + default: + break Loop + + // Flags. + case 'i': + flags |= FoldCase + sawFlag = true + case 'm': + flags &^= OneLine + sawFlag = true + case 's': + flags |= DotNL + sawFlag = true + case 'U': + flags |= NonGreedy + sawFlag = true + + // Switch to negation. + case '-': + if sign < 0 { + break Loop + } + sign = -1 + // Invert flags so that | above turn into &^ and vice versa. + // We'll invert flags again before using it below. + flags = ^flags + sawFlag = false + + // End of flags, starting group or not. + case ':', ')': + if sign < 0 { + if !sawFlag { + break Loop + } + flags = ^flags + } + if c == ':' { + // Open new group + p.op(opLeftParen) + } + p.flags = flags + return t, nil + } + } + + return "", &Error{ErrInvalidPerlOp, s[:len(s)-len(t)]} +} + +// isValidCaptureName reports whether name +// is a valid capture name: [A-Za-z0-9_]+. +// PCRE limits names to 32 bytes. +// Python rejects names starting with digits. +// We don't enforce either of those. +func isValidCaptureName(name string) bool { + if name == "" { + return false + } + for _, c := range name { + if c != '_' && !isalnum(c) { + return false + } + } + return true +} + +// parseInt parses a decimal integer. +func (p *parser) parseInt(s string) (n int, rest string, ok bool) { + if s == "" || s[0] < '0' || '9' < s[0] { + return + } + // Disallow leading zeros. + if len(s) >= 2 && s[0] == '0' && '0' <= s[1] && s[1] <= '9' { + return + } + t := s + for s != "" && '0' <= s[0] && s[0] <= '9' { + s = s[1:] + } + rest = s + ok = true + // Have digits, compute value. + t = t[:len(t)-len(s)] + for i := 0; i < len(t); i++ { + // Avoid overflow. + if n >= 1e8 { + n = -1 + break + } + n = n*10 + int(t[i]) - '0' + } + return +} + +// can this be represented as a character class? +// single-rune literal string, char class, ., and .|\n. +func isCharClass(re *Regexp) bool { + return re.Op == OpLiteral && len(re.Rune) == 1 || + re.Op == OpCharClass || + re.Op == OpAnyCharNotNL || + re.Op == OpAnyChar +} + +// does re match r? +func matchRune(re *Regexp, r rune) bool { + switch re.Op { + case OpLiteral: + return len(re.Rune) == 1 && re.Rune[0] == r + case OpCharClass: + for i := 0; i < len(re.Rune); i += 2 { + if re.Rune[i] <= r && r <= re.Rune[i+1] { + return true + } + } + return false + case OpAnyCharNotNL: + return r != '\n' + case OpAnyChar: + return true + } + return false +} + +// parseVerticalBar handles a | in the input. 
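// For illustration, a minimal sketch (not part of the upstream file) of two
// behaviors described above, via the exported API (assumes imports "fmt" and
// "regexp/syntax"): (?P<name>...) produces a named OpCapture node, and an
// alternation of single literals is merged into one character class by the
// vertical-bar handling below:
//
//	re, _ := syntax.Parse(`(?P<word>\w+)`, syntax.Perl)
//	fmt.Println(re.Op == syntax.OpCapture, re.Cap, re.Name) // true 1 word
//
//	re, _ = syntax.Parse(`a|b|c`, syntax.Perl)
//	fmt.Println(re.Op == syntax.OpCharClass, re) // true [a-c]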
+func (p *parser) parseVerticalBar() error { + p.concat() + + // The concatenation we just parsed is on top of the stack. + // If it sits above an opVerticalBar, swap it below + // (things below an opVerticalBar become an alternation). + // Otherwise, push a new vertical bar. + if !p.swapVerticalBar() { + p.op(opVerticalBar) + } + + return nil +} + +// mergeCharClass makes dst = dst|src. +// The caller must ensure that dst.Op >= src.Op, +// to reduce the amount of copying. +func mergeCharClass(dst, src *Regexp) { + switch dst.Op { + case OpAnyChar: + // src doesn't add anything. + case OpAnyCharNotNL: + // src might add \n + if matchRune(src, '\n') { + dst.Op = OpAnyChar + } + case OpCharClass: + // src is simpler, so either literal or char class + if src.Op == OpLiteral { + dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags) + } else { + dst.Rune = appendClass(dst.Rune, src.Rune) + } + case OpLiteral: + // both literal + if src.Rune[0] == dst.Rune[0] && src.Flags == dst.Flags { + break + } + dst.Op = OpCharClass + dst.Rune = appendLiteral(dst.Rune[:0], dst.Rune[0], dst.Flags) + dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags) + } +} + +// If the top of the stack is an element followed by an opVerticalBar +// swapVerticalBar swaps the two and returns true. +// Otherwise it returns false. +func (p *parser) swapVerticalBar() bool { + // If above and below vertical bar are literal or char class, + // can merge into a single char class. + n := len(p.stack) + if n >= 3 && p.stack[n-2].Op == opVerticalBar && isCharClass(p.stack[n-1]) && isCharClass(p.stack[n-3]) { + re1 := p.stack[n-1] + re3 := p.stack[n-3] + // Make re3 the more complex of the two. + if re1.Op > re3.Op { + re1, re3 = re3, re1 + p.stack[n-3] = re3 + } + mergeCharClass(re3, re1) + p.reuse(re1) + p.stack = p.stack[:n-1] + return true + } + + if n >= 2 { + re1 := p.stack[n-1] + re2 := p.stack[n-2] + if re2.Op == opVerticalBar { + if n >= 3 { + // Now out of reach. + // Clean opportunistically. + cleanAlt(p.stack[n-3]) + } + p.stack[n-2] = re1 + p.stack[n-1] = re2 + return true + } + } + return false +} + +// parseRightParen handles a ) in the input. +func (p *parser) parseRightParen() error { + p.concat() + if p.swapVerticalBar() { + // pop vertical bar + p.stack = p.stack[:len(p.stack)-1] + } + p.alternate() + + n := len(p.stack) + if n < 2 { + return &Error{ErrUnexpectedParen, p.wholeRegexp} + } + re1 := p.stack[n-1] + re2 := p.stack[n-2] + p.stack = p.stack[:n-2] + if re2.Op != opLeftParen { + return &Error{ErrUnexpectedParen, p.wholeRegexp} + } + // Restore flags at time of paren. + p.flags = re2.Flags + if re2.Cap == 0 { + // Just for grouping. + p.push(re1) + } else { + re2.Op = OpCapture + re2.Sub = re2.Sub0[:1] + re2.Sub[0] = re1 + p.push(re2) + } + return nil +} + +// parseEscape parses an escape sequence at the beginning of s +// and returns the rune. +func (p *parser) parseEscape(s string) (r rune, rest string, err error) { + t := s[1:] + if t == "" { + return 0, "", &Error{ErrTrailingBackslash, ""} + } + c, t, err := nextRune(t) + if err != nil { + return 0, "", err + } + +Switch: + switch c { + default: + if c < utf8.RuneSelf && !isalnum(c) { + // Escaped non-word characters are always themselves. + // PCRE is not quite so rigorous: it accepts things like + // \q, but we don't. We once rejected \_, but too many + // programs and people insist on using it, so allow \_. + return c, t, nil + } + + // Octal escapes. 
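// For illustration, a minimal sketch (not part of the upstream file) of the
// escape forms handled below, via the exported API (assumes imports "fmt" and
// "regexp/syntax"). \101 (octal), \x41, and \x{41} (hex) all denote 'A', so the
// three escapes concatenate into a single literal node:
//
//	re, _ := syntax.Parse(`\101\x41\x{41}`, syntax.Perl)
//	fmt.Println(re.Op == syntax.OpLiteral, string(re.Rune)) // true AAA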
+ case '1', '2', '3', '4', '5', '6', '7': + // Single non-zero digit is a backreference; not supported + if t == "" || t[0] < '0' || t[0] > '7' { + break + } + fallthrough + case '0': + // Consume up to three octal digits; already have one. + r = c - '0' + for i := 1; i < 3; i++ { + if t == "" || t[0] < '0' || t[0] > '7' { + break + } + r = r*8 + rune(t[0]) - '0' + t = t[1:] + } + return r, t, nil + + // Hexadecimal escapes. + case 'x': + if t == "" { + break + } + if c, t, err = nextRune(t); err != nil { + return 0, "", err + } + if c == '{' { + // Any number of digits in braces. + // Perl accepts any text at all; it ignores all text + // after the first non-hex digit. We require only hex digits, + // and at least one. + nhex := 0 + r = 0 + for { + if t == "" { + break Switch + } + if c, t, err = nextRune(t); err != nil { + return 0, "", err + } + if c == '}' { + break + } + v := unhex(c) + if v < 0 { + break Switch + } + r = r*16 + v + if r > unicode.MaxRune { + break Switch + } + nhex++ + } + if nhex == 0 { + break Switch + } + return r, t, nil + } + + // Easy case: two hex digits. + x := unhex(c) + if c, t, err = nextRune(t); err != nil { + return 0, "", err + } + y := unhex(c) + if x < 0 || y < 0 { + break + } + return x*16 + y, t, nil + + // C escapes. There is no case 'b', to avoid misparsing + // the Perl word-boundary \b as the C backspace \b + // when in POSIX mode. In Perl, /\b/ means word-boundary + // but /[\b]/ means backspace. We don't support that. + // If you want a backspace, embed a literal backspace + // character or use \x08. + case 'a': + return '\a', t, err + case 'f': + return '\f', t, err + case 'n': + return '\n', t, err + case 'r': + return '\r', t, err + case 't': + return '\t', t, err + case 'v': + return '\v', t, err + } + return 0, "", &Error{ErrInvalidEscape, s[:len(s)-len(t)]} +} + +// parseClassChar parses a character class character at the beginning of s +// and returns it. +func (p *parser) parseClassChar(s, wholeClass string) (r rune, rest string, err error) { + if s == "" { + return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass} + } + + // Allow regular escape sequences even though + // many need not be escaped in this context. + if s[0] == '\\' { + return p.parseEscape(s) + } + + return nextRune(s) +} + +type charGroup struct { + sign int + class []rune +} + +// parsePerlClassEscape parses a leading Perl character class escape like \d +// from the beginning of s. If one is present, it appends the characters to r +// and returns the new slice r and the remainder of the string. +func (p *parser) parsePerlClassEscape(s string, r []rune) (out []rune, rest string) { + if p.flags&PerlX == 0 || len(s) < 2 || s[0] != '\\' { + return + } + g := perlGroup[s[0:2]] + if g.sign == 0 { + return + } + return p.appendGroup(r, g), s[2:] +} + +// parseNamedClass parses a leading POSIX named character class like [:alnum:] +// from the beginning of s. If one is present, it appends the characters to r +// and returns the new slice r and the remainder of the string. 
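// For illustration, a minimal sketch (not part of the upstream file): a POSIX
// class name handled by parseNamedClass below parses to the same character
// class as its Perl shorthand (assumes imports "fmt" and "regexp/syntax"):
//
//	posix, _ := syntax.Parse(`[[:digit:]]`, syntax.Perl)
//	perl, _ := syntax.Parse(`\d`, syntax.Perl)
//	fmt.Println(posix.Equal(perl), posix) // true [0-9]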
+func (p *parser) parseNamedClass(s string, r []rune) (out []rune, rest string, err error) { + if len(s) < 2 || s[0] != '[' || s[1] != ':' { + return + } + + i := strings.Index(s[2:], ":]") + if i < 0 { + return + } + i += 2 + name, s := s[0:i+2], s[i+2:] + g := posixGroup[name] + if g.sign == 0 { + return nil, "", &Error{ErrInvalidCharRange, name} + } + return p.appendGroup(r, g), s, nil +} + +func (p *parser) appendGroup(r []rune, g charGroup) []rune { + if p.flags&FoldCase == 0 { + if g.sign < 0 { + r = appendNegatedClass(r, g.class) + } else { + r = appendClass(r, g.class) + } + } else { + tmp := p.tmpClass[:0] + tmp = appendFoldedClass(tmp, g.class) + p.tmpClass = tmp + tmp = cleanClass(&p.tmpClass) + if g.sign < 0 { + r = appendNegatedClass(r, tmp) + } else { + r = appendClass(r, tmp) + } + } + return r +} + +var anyTable = &unicode.RangeTable{ + R16: []unicode.Range16{{Lo: 0, Hi: 1<<16 - 1, Stride: 1}}, + R32: []unicode.Range32{{Lo: 1 << 16, Hi: unicode.MaxRune, Stride: 1}}, +} + +// unicodeTable returns the unicode.RangeTable identified by name +// and the table of additional fold-equivalent code points. +func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) { + // Special case: "Any" means any. + if name == "Any" { + return anyTable, anyTable + } + if t := unicode.Categories[name]; t != nil { + return t, unicode.FoldCategory[name] + } + if t := unicode.Scripts[name]; t != nil { + return t, unicode.FoldScript[name] + } + return nil, nil +} + +// parseUnicodeClass parses a leading Unicode character class like \p{Han} +// from the beginning of s. If one is present, it appends the characters to r +// and returns the new slice r and the remainder of the string. +func (p *parser) parseUnicodeClass(s string, r []rune) (out []rune, rest string, err error) { + if p.flags&UnicodeGroups == 0 || len(s) < 2 || s[0] != '\\' || s[1] != 'p' && s[1] != 'P' { + return + } + + // Committed to parse or return error. + sign := +1 + if s[1] == 'P' { + sign = -1 + } + t := s[2:] + c, t, err := nextRune(t) + if err != nil { + return + } + var seq, name string + if c != '{' { + // Single-letter name. + seq = s[:len(s)-len(t)] + name = seq[2:] + } else { + // Name is in braces. + end := strings.IndexRune(s, '}') + if end < 0 { + if err = checkUTF8(s); err != nil { + return + } + return nil, "", &Error{ErrInvalidCharRange, s} + } + seq, t = s[:end+1], s[end+1:] + name = s[3:end] + if err = checkUTF8(name); err != nil { + return + } + } + + // Group can have leading negation too. \p{^Han} == \P{Han}, \P{^Han} == \p{Han}. + if name != "" && name[0] == '^' { + sign = -sign + name = name[1:] + } + + tab, fold := unicodeTable(name) + if tab == nil { + return nil, "", &Error{ErrInvalidCharRange, seq} + } + + if p.flags&FoldCase == 0 || fold == nil { + if sign > 0 { + r = appendTable(r, tab) + } else { + r = appendNegatedTable(r, tab) + } + } else { + // Merge and clean tab and fold in a temporary buffer. + // This is necessary for the negative case and just tidy + // for the positive case. + tmp := p.tmpClass[:0] + tmp = appendTable(tmp, tab) + tmp = appendTable(tmp, fold) + p.tmpClass = tmp + tmp = cleanClass(&p.tmpClass) + if sign > 0 { + r = appendClass(r, tmp) + } else { + r = appendNegatedClass(r, tmp) + } + } + return r, t, nil +} + +// parseClass parses a character class at the beginning of s +// and pushes it onto the parse stack. 
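// For illustration, a minimal sketch (not part of the upstream file) of the
// \p / \P forms parsed above, including the leading ^ negation inside braces
// (assumes imports "fmt" and "regexp/syntax"):
//
//	p1, _ := syntax.Parse(`\p{Braille}`, syntax.Perl)
//	p2, _ := syntax.Parse(`\P{^Braille}`, syntax.Perl)
//	fmt.Println(p1.Equal(p2)) // true: \P{^Name} double-negates back to \p{Name}
//
//	_, err := syntax.Parse(`\p{NoSuchScript}`, syntax.Perl)
//	fmt.Println(err) // error parsing regexp: invalid character class range: `\p{NoSuchScript}`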
+func (p *parser) parseClass(s string) (rest string, err error) { + t := s[1:] // chop [ + re := p.newRegexp(OpCharClass) + re.Flags = p.flags + re.Rune = re.Rune0[:0] + + sign := +1 + if t != "" && t[0] == '^' { + sign = -1 + t = t[1:] + + // If character class does not match \n, add it here, + // so that negation later will do the right thing. + if p.flags&ClassNL == 0 { + re.Rune = append(re.Rune, '\n', '\n') + } + } + + class := re.Rune + first := true // ] and - are okay as first char in class + for t == "" || t[0] != ']' || first { + // POSIX: - is only okay unescaped as first or last in class. + // Perl: - is okay anywhere. + if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') { + _, size := utf8.DecodeRuneInString(t[1:]) + return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]} + } + first = false + + // Look for POSIX [:alnum:] etc. + if len(t) > 2 && t[0] == '[' && t[1] == ':' { + nclass, nt, err := p.parseNamedClass(t, class) + if err != nil { + return "", err + } + if nclass != nil { + class, t = nclass, nt + continue + } + } + + // Look for Unicode character group like \p{Han}. + nclass, nt, err := p.parseUnicodeClass(t, class) + if err != nil { + return "", err + } + if nclass != nil { + class, t = nclass, nt + continue + } + + // Look for Perl character class symbols (extension). + if nclass, nt := p.parsePerlClassEscape(t, class); nclass != nil { + class, t = nclass, nt + continue + } + + // Single character or simple range. + rng := t + var lo, hi rune + if lo, t, err = p.parseClassChar(t, s); err != nil { + return "", err + } + hi = lo + // [a-] means (a|-) so check for final ]. + if len(t) >= 2 && t[0] == '-' && t[1] != ']' { + t = t[1:] + if hi, t, err = p.parseClassChar(t, s); err != nil { + return "", err + } + if hi < lo { + rng = rng[:len(rng)-len(t)] + return "", &Error{Code: ErrInvalidCharRange, Expr: rng} + } + } + if p.flags&FoldCase == 0 { + class = appendRange(class, lo, hi) + } else { + class = appendFoldedRange(class, lo, hi) + } + } + t = t[1:] // chop ] + + // Use &re.Rune instead of &class to avoid allocation. + re.Rune = class + class = cleanClass(&re.Rune) + if sign < 0 { + class = negateClass(class) + } + re.Rune = class + p.push(re) + return t, nil +} + +// cleanClass sorts the ranges (pairs of elements of r), +// merges them, and eliminates duplicates. +func cleanClass(rp *[]rune) []rune { + + // Sort by lo increasing, hi decreasing to break ties. + sort.Sort(ranges{rp}) + + r := *rp + if len(r) < 2 { + return r + } + + // Merge abutting, overlapping. + w := 2 // write index + for i := 2; i < len(r); i += 2 { + lo, hi := r[i], r[i+1] + if lo <= r[w-1]+1 { + // merge with previous range + if hi > r[w-1] { + r[w-1] = hi + } + continue + } + // new disjoint range + r[w] = lo + r[w+1] = hi + w += 2 + } + + return r[:w] +} + +// appendLiteral returns the result of appending the literal x to the class r. +func appendLiteral(r []rune, x rune, flags Flags) []rune { + if flags&FoldCase != 0 { + return appendFoldedRange(r, x, x) + } + return appendRange(r, x, x) +} + +// appendRange returns the result of appending the range lo-hi to the class r. +func appendRange(r []rune, lo, hi rune) []rune { + // Expand last range or next to last range if it overlaps or abuts. + // Checking two ranges helps when appending case-folded + // alphabets, so that one range can be expanding A-Z and the + // other expanding a-z. 
+ n := len(r) + for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4 + if n >= i { + rlo, rhi := r[n-i], r[n-i+1] + if lo <= rhi+1 && rlo <= hi+1 { + if lo < rlo { + r[n-i] = lo + } + if hi > rhi { + r[n-i+1] = hi + } + return r + } + } + } + + return append(r, lo, hi) +} + +const ( + // minimum and maximum runes involved in folding. + // checked during test. + minFold = 0x0041 + maxFold = 0x1e943 +) + +// appendFoldedRange returns the result of appending the range lo-hi +// and its case folding-equivalent runes to the class r. +func appendFoldedRange(r []rune, lo, hi rune) []rune { + // Optimizations. + if lo <= minFold && hi >= maxFold { + // Range is full: folding can't add more. + return appendRange(r, lo, hi) + } + if hi < minFold || lo > maxFold { + // Range is outside folding possibilities. + return appendRange(r, lo, hi) + } + if lo < minFold { + // [lo, minFold-1] needs no folding. + r = appendRange(r, lo, minFold-1) + lo = minFold + } + if hi > maxFold { + // [maxFold+1, hi] needs no folding. + r = appendRange(r, maxFold+1, hi) + hi = maxFold + } + + // Brute force. Depend on appendRange to coalesce ranges on the fly. + for c := lo; c <= hi; c++ { + r = appendRange(r, c, c) + f := unicode.SimpleFold(c) + for f != c { + r = appendRange(r, f, f) + f = unicode.SimpleFold(f) + } + } + return r +} + +// appendClass returns the result of appending the class x to the class r. +// It assume x is clean. +func appendClass(r []rune, x []rune) []rune { + for i := 0; i < len(x); i += 2 { + r = appendRange(r, x[i], x[i+1]) + } + return r +} + +// appendFoldedClass returns the result of appending the case folding of the class x to the class r. +func appendFoldedClass(r []rune, x []rune) []rune { + for i := 0; i < len(x); i += 2 { + r = appendFoldedRange(r, x[i], x[i+1]) + } + return r +} + +// appendNegatedClass returns the result of appending the negation of the class x to the class r. +// It assumes x is clean. +func appendNegatedClass(r []rune, x []rune) []rune { + nextLo := '\u0000' + for i := 0; i < len(x); i += 2 { + lo, hi := x[i], x[i+1] + if nextLo <= lo-1 { + r = appendRange(r, nextLo, lo-1) + } + nextLo = hi + 1 + } + if nextLo <= unicode.MaxRune { + r = appendRange(r, nextLo, unicode.MaxRune) + } + return r +} + +// appendTable returns the result of appending x to the class r. +func appendTable(r []rune, x *unicode.RangeTable) []rune { + for _, xr := range x.R16 { + lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) + if stride == 1 { + r = appendRange(r, lo, hi) + continue + } + for c := lo; c <= hi; c += stride { + r = appendRange(r, c, c) + } + } + for _, xr := range x.R32 { + lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) + if stride == 1 { + r = appendRange(r, lo, hi) + continue + } + for c := lo; c <= hi; c += stride { + r = appendRange(r, c, c) + } + } + return r +} + +// appendNegatedTable returns the result of appending the negation of x to the class r. 
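// For illustration, a minimal sketch (not part of the upstream file) of what the
// fold-aware appends above produce for a case-insensitive class (assumes imports
// "fmt" and "regexp/syntax"). Folding [a-z] pulls in A-Z plus the two non-ASCII
// fold equivalents U+017F (long s) and U+212A (Kelvin sign):
//
//	re, _ := syntax.Parse(`(?i)[a-z]`, syntax.Perl)
//	fmt.Println(len(re.Rune) / 2) // 4 ranges: A-Z, a-z, U+017F, U+212A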
+func appendNegatedTable(r []rune, x *unicode.RangeTable) []rune { + nextLo := '\u0000' // lo end of next class to add + for _, xr := range x.R16 { + lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) + if stride == 1 { + if nextLo <= lo-1 { + r = appendRange(r, nextLo, lo-1) + } + nextLo = hi + 1 + continue + } + for c := lo; c <= hi; c += stride { + if nextLo <= c-1 { + r = appendRange(r, nextLo, c-1) + } + nextLo = c + 1 + } + } + for _, xr := range x.R32 { + lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) + if stride == 1 { + if nextLo <= lo-1 { + r = appendRange(r, nextLo, lo-1) + } + nextLo = hi + 1 + continue + } + for c := lo; c <= hi; c += stride { + if nextLo <= c-1 { + r = appendRange(r, nextLo, c-1) + } + nextLo = c + 1 + } + } + if nextLo <= unicode.MaxRune { + r = appendRange(r, nextLo, unicode.MaxRune) + } + return r +} + +// negateClass overwrites r and returns r's negation. +// It assumes the class r is already clean. +func negateClass(r []rune) []rune { + nextLo := '\u0000' // lo end of next class to add + w := 0 // write index + for i := 0; i < len(r); i += 2 { + lo, hi := r[i], r[i+1] + if nextLo <= lo-1 { + r[w] = nextLo + r[w+1] = lo - 1 + w += 2 + } + nextLo = hi + 1 + } + r = r[:w] + if nextLo <= unicode.MaxRune { + // It's possible for the negation to have one more + // range - this one - than the original class, so use append. + r = append(r, nextLo, unicode.MaxRune) + } + return r +} + +// ranges implements sort.Interface on a []rune. +// The choice of receiver type definition is strange +// but avoids an allocation since we already have +// a *[]rune. +type ranges struct { + p *[]rune +} + +func (ra ranges) Less(i, j int) bool { + p := *ra.p + i *= 2 + j *= 2 + return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1] +} + +func (ra ranges) Len() int { + return len(*ra.p) / 2 +} + +func (ra ranges) Swap(i, j int) { + p := *ra.p + i *= 2 + j *= 2 + p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1] +} + +func checkUTF8(s string) error { + for s != "" { + rune, size := utf8.DecodeRuneInString(s) + if rune == utf8.RuneError && size == 1 { + return &Error{Code: ErrInvalidUTF8, Expr: s} + } + s = s[size:] + } + return nil +} + +func nextRune(s string) (c rune, t string, err error) { + c, size := utf8.DecodeRuneInString(s) + if c == utf8.RuneError && size == 1 { + return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s} + } + return c, s[size:], nil +} + +func isalnum(c rune) bool { + return '0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' +} + +func unhex(c rune) rune { + if '0' <= c && c <= '9' { + return c - '0' + } + if 'a' <= c && c <= 'f' { + return c - 'a' + 10 + } + if 'A' <= c && c <= 'F' { + return c - 'A' + 10 + } + return -1 +} diff --git a/src/regexp/syntax/parse_test.go b/src/regexp/syntax/parse_test.go new file mode 100644 index 0000000..67e3c56 --- /dev/null +++ b/src/regexp/syntax/parse_test.go @@ -0,0 +1,586 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package syntax + +import ( + "fmt" + "strings" + "testing" + "unicode" +) + +type parseTest struct { + Regexp string + Dump string +} + +var parseTests = []parseTest{ + // Base cases + {`a`, `lit{a}`}, + {`a.`, `cat{lit{a}dot{}}`}, + {`a.b`, `cat{lit{a}dot{}lit{b}}`}, + {`ab`, `str{ab}`}, + {`a.b.c`, `cat{lit{a}dot{}lit{b}dot{}lit{c}}`}, + {`abc`, `str{abc}`}, + {`a|^`, `alt{lit{a}bol{}}`}, + {`a|b`, `cc{0x61-0x62}`}, + {`(a)`, `cap{lit{a}}`}, + {`(a)|b`, `alt{cap{lit{a}}lit{b}}`}, + {`a*`, `star{lit{a}}`}, + {`a+`, `plus{lit{a}}`}, + {`a?`, `que{lit{a}}`}, + {`a{2}`, `rep{2,2 lit{a}}`}, + {`a{2,3}`, `rep{2,3 lit{a}}`}, + {`a{2,}`, `rep{2,-1 lit{a}}`}, + {`a*?`, `nstar{lit{a}}`}, + {`a+?`, `nplus{lit{a}}`}, + {`a??`, `nque{lit{a}}`}, + {`a{2}?`, `nrep{2,2 lit{a}}`}, + {`a{2,3}?`, `nrep{2,3 lit{a}}`}, + {`a{2,}?`, `nrep{2,-1 lit{a}}`}, + // Malformed { } are treated as literals. + {`x{1001`, `str{x{1001}`}, + {`x{9876543210`, `str{x{9876543210}`}, + {`x{9876543210,`, `str{x{9876543210,}`}, + {`x{2,1`, `str{x{2,1}`}, + {`x{1,9876543210`, `str{x{1,9876543210}`}, + {``, `emp{}`}, + {`|`, `emp{}`}, // alt{emp{}emp{}} but got factored + {`|x|`, `alt{emp{}lit{x}emp{}}`}, + {`.`, `dot{}`}, + {`^`, `bol{}`}, + {`$`, `eol{}`}, + {`\|`, `lit{|}`}, + {`\(`, `lit{(}`}, + {`\)`, `lit{)}`}, + {`\*`, `lit{*}`}, + {`\+`, `lit{+}`}, + {`\?`, `lit{?}`}, + {`{`, `lit{{}`}, + {`}`, `lit{}}`}, + {`\.`, `lit{.}`}, + {`\^`, `lit{^}`}, + {`\$`, `lit{$}`}, + {`\\`, `lit{\}`}, + {`[ace]`, `cc{0x61 0x63 0x65}`}, + {`[abc]`, `cc{0x61-0x63}`}, + {`[a-z]`, `cc{0x61-0x7a}`}, + {`[a]`, `lit{a}`}, + {`\-`, `lit{-}`}, + {`-`, `lit{-}`}, + {`\_`, `lit{_}`}, + {`abc`, `str{abc}`}, + {`abc|def`, `alt{str{abc}str{def}}`}, + {`abc|def|ghi`, `alt{str{abc}str{def}str{ghi}}`}, + + // Posix and Perl extensions + {`[[:lower:]]`, `cc{0x61-0x7a}`}, + {`[a-z]`, `cc{0x61-0x7a}`}, + {`[^[:lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`}, + {`[[:^lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`}, + {`(?i)[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`}, + {`(?i)[a-z]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`}, + {`(?i)[^[:lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`}, + {`(?i)[[:^lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`}, + {`\d`, `cc{0x30-0x39}`}, + {`\D`, `cc{0x0-0x2f 0x3a-0x10ffff}`}, + {`\s`, `cc{0x9-0xa 0xc-0xd 0x20}`}, + {`\S`, `cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}`}, + {`\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}`}, + {`\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}`}, + {`(?i)\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}`}, + {`(?i)\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`}, + {`[^\\]`, `cc{0x0-0x5b 0x5d-0x10ffff}`}, + // { `\C`, `byte{}` }, // probably never + + // Unicode, negatives, and a double negative. 
+ {`\p{Braille}`, `cc{0x2800-0x28ff}`}, + {`\P{Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, + {`\p{^Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, + {`\P{^Braille}`, `cc{0x2800-0x28ff}`}, + {`\pZ`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`}, + {`[\p{Braille}]`, `cc{0x2800-0x28ff}`}, + {`[\P{Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, + {`[\p{^Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, + {`[\P{^Braille}]`, `cc{0x2800-0x28ff}`}, + {`[\pZ]`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`}, + {`\p{Lu}`, mkCharClass(unicode.IsUpper)}, + {`[\p{Lu}]`, mkCharClass(unicode.IsUpper)}, + {`(?i)[\p{Lu}]`, mkCharClass(isUpperFold)}, + {`\p{Any}`, `dot{}`}, + {`\p{^Any}`, `cc{}`}, + + // Hex, octal. + {`[\012-\234]\141`, `cat{cc{0xa-0x9c}lit{a}}`}, + {`[\x{41}-\x7a]\x61`, `cat{cc{0x41-0x7a}lit{a}}`}, + + // More interesting regular expressions. + {`a{,2}`, `str{a{,2}}`}, + {`\.\^\$\\`, `str{.^$\}`}, + {`[a-zABC]`, `cc{0x41-0x43 0x61-0x7a}`}, + {`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`}, + {`[α-ε☺]`, `cc{0x3b1-0x3b5 0x263a}`}, // utf-8 + {`a*{`, `cat{star{lit{a}}lit{{}}`}, + + // Test precedences + {`(?:ab)*`, `star{str{ab}}`}, + {`(ab)*`, `star{cap{str{ab}}}`}, + {`ab|cd`, `alt{str{ab}str{cd}}`}, + {`a(b|c)d`, `cat{lit{a}cap{cc{0x62-0x63}}lit{d}}`}, + + // Test flattening. + {`(?:a)`, `lit{a}`}, + {`(?:ab)(?:cd)`, `str{abcd}`}, + {`(?:a+b+)(?:c+d+)`, `cat{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`}, + {`(?:a+|b+)|(?:c+|d+)`, `alt{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`}, + {`(?:a|b)|(?:c|d)`, `cc{0x61-0x64}`}, + {`a|.`, `dot{}`}, + {`.|a`, `dot{}`}, + {`(?:[abc]|A|Z|hello|world)`, `alt{cc{0x41 0x5a 0x61-0x63}str{hello}str{world}}`}, + {`(?:[abc]|A|Z)`, `cc{0x41 0x5a 0x61-0x63}`}, + + // Test Perl quoted literals + {`\Q+|*?{[\E`, `str{+|*?{[}`}, + {`\Q+\E+`, `plus{lit{+}}`}, + {`\Qab\E+`, `cat{lit{a}plus{lit{b}}}`}, + {`\Q\\E`, `lit{\}`}, + {`\Q\\\E`, `str{\\}`}, + + // Test Perl \A and \z + {`(?m)^`, `bol{}`}, + {`(?m)$`, `eol{}`}, + {`(?-m)^`, `bot{}`}, + {`(?-m)$`, `eot{}`}, + {`(?m)\A`, `bot{}`}, + {`(?m)\z`, `eot{\z}`}, + {`(?-m)\A`, `bot{}`}, + {`(?-m)\z`, `eot{\z}`}, + + // Test named captures + {`(?P<name>a)`, `cap{name:lit{a}}`}, + + // Case-folded literals + {`[Aa]`, `litfold{A}`}, + {`[\x{100}\x{101}]`, `litfold{Ā}`}, + {`[Δδ]`, `litfold{Δ}`}, + + // Strings + {`abcde`, `str{abcde}`}, + {`[Aa][Bb]cd`, `cat{strfold{AB}str{cd}}`}, + + // Factoring. + {`abc|abd|aef|bcx|bcy`, `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}cat{str{bc}cc{0x78-0x79}}}`}, + {`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}lit{y}}cat{plus{lit{x}}lit{z}}cat{plus{lit{y}}lit{w}}}}`}, + + // Bug fixes. 
+ {`(?:.)`, `dot{}`}, + {`(?:x|(?:xa))`, `cat{lit{x}alt{emp{}lit{a}}}`}, + {`(?:.|(?:.a))`, `cat{dot{}alt{emp{}lit{a}}}`}, + {`(?:A(?:A|a))`, `cat{lit{A}litfold{A}}`}, + {`(?:A|a)`, `litfold{A}`}, + {`A|(?:A|a)`, `litfold{A}`}, + {`(?s).`, `dot{}`}, + {`(?-s).`, `dnl{}`}, + {`(?:(?:^).)`, `cat{bol{}dot{}}`}, + {`(?-s)(?:(?:^).)`, `cat{bol{}dnl{}}`}, + {`[\s\S]a`, `cat{cc{0x0-0x10ffff}lit{a}}`}, + + // RE2 prefix_tests + {`abc|abd`, `cat{str{ab}cc{0x63-0x64}}`}, + {`a(?:b)c|abd`, `cat{str{ab}cc{0x63-0x64}}`}, + {`abc|abd|aef|bcx|bcy`, + `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}` + + `cat{str{bc}cc{0x78-0x79}}}`}, + {`abc|x|abd`, `alt{str{abc}lit{x}str{abd}}`}, + {`(?i)abc|ABD`, `cat{strfold{AB}cc{0x43-0x44 0x63-0x64}}`}, + {`[ab]c|[ab]d`, `cat{cc{0x61-0x62}cc{0x63-0x64}}`}, + {`.c|.d`, `cat{dot{}cc{0x63-0x64}}`}, + {`x{2}|x{2}[0-9]`, + `cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}`}, + {`x{2}y|x{2}[0-9]y`, + `cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}`}, + {`a.*?c|a.*?b`, + `cat{lit{a}alt{cat{nstar{dot{}}lit{c}}cat{nstar{dot{}}lit{b}}}}`}, + + // Valid repetitions. + {`((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}))`, ``}, + {`((((((((((x{1}){2}){2}){2}){2}){2}){2}){2}){2}){2})`, ``}, + + // Valid nesting. + {strings.Repeat("(", 999) + strings.Repeat(")", 999), ``}, + {strings.Repeat("(?:", 999) + strings.Repeat(")*", 999), ``}, + {"(" + strings.Repeat("|", 12345) + ")", ``}, // not nested at all +} + +const testFlags = MatchNL | PerlX | UnicodeGroups + +func TestParseSimple(t *testing.T) { + testParseDump(t, parseTests, testFlags) +} + +var foldcaseTests = []parseTest{ + {`AbCdE`, `strfold{ABCDE}`}, + {`[Aa]`, `litfold{A}`}, + {`a`, `litfold{A}`}, + + // 0x17F is an old English long s (looks like an f) and folds to s. + // 0x212A is the Kelvin symbol and folds to k. + {`A[F-g]`, `cat{litfold{A}cc{0x41-0x7a 0x17f 0x212a}}`}, // [Aa][A-z...] + {`[[:upper:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`}, + {`[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`}, +} + +func TestParseFoldCase(t *testing.T) { + testParseDump(t, foldcaseTests, FoldCase) +} + +var literalTests = []parseTest{ + {"(|)^$.[*+?]{5,10},\\", "str{(|)^$.[*+?]{5,10},\\}"}, +} + +func TestParseLiteral(t *testing.T) { + testParseDump(t, literalTests, Literal) +} + +var matchnlTests = []parseTest{ + {`.`, `dot{}`}, + {"\n", "lit{\n}"}, + {`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`}, + {`[a\n]`, `cc{0xa 0x61}`}, +} + +func TestParseMatchNL(t *testing.T) { + testParseDump(t, matchnlTests, MatchNL) +} + +var nomatchnlTests = []parseTest{ + {`.`, `dnl{}`}, + {"\n", "lit{\n}"}, + {`[^a]`, `cc{0x0-0x9 0xb-0x60 0x62-0x10ffff}`}, + {`[a\n]`, `cc{0xa 0x61}`}, +} + +func TestParseNoMatchNL(t *testing.T) { + testParseDump(t, nomatchnlTests, 0) +} + +// Test Parse -> Dump. +func testParseDump(t *testing.T, tests []parseTest, flags Flags) { + for _, tt := range tests { + re, err := Parse(tt.Regexp, flags) + if err != nil { + t.Errorf("Parse(%#q): %v", tt.Regexp, err) + continue + } + if tt.Dump == "" { + // It parsed. That's all we care about. + continue + } + d := dump(re) + if d != tt.Dump { + t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump) + } + } +} + +// dump prints a string representation of the regexp showing +// the structure explicitly. 
+func dump(re *Regexp) string { + var b strings.Builder + dumpRegexp(&b, re) + return b.String() +} + +var opNames = []string{ + OpNoMatch: "no", + OpEmptyMatch: "emp", + OpLiteral: "lit", + OpCharClass: "cc", + OpAnyCharNotNL: "dnl", + OpAnyChar: "dot", + OpBeginLine: "bol", + OpEndLine: "eol", + OpBeginText: "bot", + OpEndText: "eot", + OpWordBoundary: "wb", + OpNoWordBoundary: "nwb", + OpCapture: "cap", + OpStar: "star", + OpPlus: "plus", + OpQuest: "que", + OpRepeat: "rep", + OpConcat: "cat", + OpAlternate: "alt", +} + +// dumpRegexp writes an encoding of the syntax tree for the regexp re to b. +// It is used during testing to distinguish between parses that might print +// the same using re's String method. +func dumpRegexp(b *strings.Builder, re *Regexp) { + if int(re.Op) >= len(opNames) || opNames[re.Op] == "" { + fmt.Fprintf(b, "op%d", re.Op) + } else { + switch re.Op { + default: + b.WriteString(opNames[re.Op]) + case OpStar, OpPlus, OpQuest, OpRepeat: + if re.Flags&NonGreedy != 0 { + b.WriteByte('n') + } + b.WriteString(opNames[re.Op]) + case OpLiteral: + if len(re.Rune) > 1 { + b.WriteString("str") + } else { + b.WriteString("lit") + } + if re.Flags&FoldCase != 0 { + for _, r := range re.Rune { + if unicode.SimpleFold(r) != r { + b.WriteString("fold") + break + } + } + } + } + } + b.WriteByte('{') + switch re.Op { + case OpEndText: + if re.Flags&WasDollar == 0 { + b.WriteString(`\z`) + } + case OpLiteral: + for _, r := range re.Rune { + b.WriteRune(r) + } + case OpConcat, OpAlternate: + for _, sub := range re.Sub { + dumpRegexp(b, sub) + } + case OpStar, OpPlus, OpQuest: + dumpRegexp(b, re.Sub[0]) + case OpRepeat: + fmt.Fprintf(b, "%d,%d ", re.Min, re.Max) + dumpRegexp(b, re.Sub[0]) + case OpCapture: + if re.Name != "" { + b.WriteString(re.Name) + b.WriteByte(':') + } + dumpRegexp(b, re.Sub[0]) + case OpCharClass: + sep := "" + for i := 0; i < len(re.Rune); i += 2 { + b.WriteString(sep) + sep = " " + lo, hi := re.Rune[i], re.Rune[i+1] + if lo == hi { + fmt.Fprintf(b, "%#x", lo) + } else { + fmt.Fprintf(b, "%#x-%#x", lo, hi) + } + } + } + b.WriteByte('}') +} + +func mkCharClass(f func(rune) bool) string { + re := &Regexp{Op: OpCharClass} + lo := rune(-1) + for i := rune(0); i <= unicode.MaxRune; i++ { + if f(i) { + if lo < 0 { + lo = i + } + } else { + if lo >= 0 { + re.Rune = append(re.Rune, lo, i-1) + lo = -1 + } + } + } + if lo >= 0 { + re.Rune = append(re.Rune, lo, unicode.MaxRune) + } + return dump(re) +} + +func isUpperFold(r rune) bool { + if unicode.IsUpper(r) { + return true + } + c := unicode.SimpleFold(r) + for c != r { + if unicode.IsUpper(c) { + return true + } + c = unicode.SimpleFold(c) + } + return false +} + +func TestFoldConstants(t *testing.T) { + last := rune(-1) + for i := rune(0); i <= unicode.MaxRune; i++ { + if unicode.SimpleFold(i) == i { + continue + } + if last == -1 && minFold != i { + t.Errorf("minFold=%#U should be %#U", minFold, i) + } + last = i + } + if maxFold != last { + t.Errorf("maxFold=%#U should be %#U", maxFold, last) + } +} + +func TestAppendRangeCollapse(t *testing.T) { + // AppendRange should collapse each of the new ranges + // into the earlier ones (it looks back two ranges), so that + // the slice never grows very large. + // Note that we are not calling cleanClass. 
+ var r []rune + for i := rune('A'); i <= 'Z'; i++ { + r = appendRange(r, i, i) + r = appendRange(r, i+'a'-'A', i+'a'-'A') + } + if string(r) != "AZaz" { + t.Errorf("appendRange interlaced A-Z a-z = %s, want AZaz", string(r)) + } +} + +var invalidRegexps = []string{ + `(`, + `)`, + `(a`, + `a)`, + `(a))`, + `(a|b|`, + `a|b|)`, + `(a|b|))`, + `(a|b`, + `a|b)`, + `(a|b))`, + `[a-z`, + `([a-z)`, + `[a-z)`, + `([a-z]))`, + `x{1001}`, + `x{9876543210}`, + `x{2,1}`, + `x{1,9876543210}`, + "\xff", // Invalid UTF-8 + "[\xff]", + "[\\\xff]", + "\\\xff", + `(?P<name>a`, + `(?P<name>`, + `(?P<name`, + `(?P<x y>a)`, + `(?P<>a)`, + `[a-Z]`, + `(?i)[a-Z]`, + `\Q\E*`, + `a{100000}`, // too much repetition + `a{100000,}`, // too much repetition + "((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}){2})", // too much repetition + strings.Repeat("(", 1000) + strings.Repeat(")", 1000), // too deep + strings.Repeat("(?:", 1000) + strings.Repeat(")*", 1000), // too deep + "(" + strings.Repeat("(xx?)", 1000) + "){1000}", // too long + strings.Repeat("(xx?){1000}", 1000), // too long + strings.Repeat(`\pL`, 27000), // too many runes +} + +var onlyPerl = []string{ + `[a-b-c]`, + `\Qabc\E`, + `\Q*+?{[\E`, + `\Q\\E`, + `\Q\\\E`, + `\Q\\\\E`, + `\Q\\\\\E`, + `(?:a)`, + `(?P<name>a)`, +} + +var onlyPOSIX = []string{ + "a++", + "a**", + "a?*", + "a+*", + "a{1}*", + ".{1}{2}.{3}", +} + +func TestParseInvalidRegexps(t *testing.T) { + for _, regexp := range invalidRegexps { + if re, err := Parse(regexp, Perl); err == nil { + t.Errorf("Parse(%#q, Perl) = %s, should have failed", regexp, dump(re)) + } + if re, err := Parse(regexp, POSIX); err == nil { + t.Errorf("Parse(%#q, POSIX) = %s, should have failed", regexp, dump(re)) + } + } + for _, regexp := range onlyPerl { + if _, err := Parse(regexp, Perl); err != nil { + t.Errorf("Parse(%#q, Perl): %v", regexp, err) + } + if re, err := Parse(regexp, POSIX); err == nil { + t.Errorf("Parse(%#q, POSIX) = %s, should have failed", regexp, dump(re)) + } + } + for _, regexp := range onlyPOSIX { + if re, err := Parse(regexp, Perl); err == nil { + t.Errorf("Parse(%#q, Perl) = %s, should have failed", regexp, dump(re)) + } + if _, err := Parse(regexp, POSIX); err != nil { + t.Errorf("Parse(%#q, POSIX): %v", regexp, err) + } + } +} + +func TestToStringEquivalentParse(t *testing.T) { + for _, tt := range parseTests { + re, err := Parse(tt.Regexp, testFlags) + if err != nil { + t.Errorf("Parse(%#q): %v", tt.Regexp, err) + continue + } + if tt.Dump == "" { + // It parsed. That's all we care about. + continue + } + d := dump(re) + if d != tt.Dump { + t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump) + continue + } + + s := re.String() + if s != tt.Regexp { + // If ToString didn't return the original regexp, + // it must have found one with fewer parens. + // Unfortunately we can't check the length here, because + // ToString produces "\\{" for a literal brace, + // but "{" is a shorter equivalent in some contexts. + nre, err := Parse(s, testFlags) + if err != nil { + t.Errorf("Parse(%#q.String() = %#q): %v", tt.Regexp, s, err) + continue + } + nd := dump(nre) + if d != nd { + t.Errorf("Parse(%#q) -> %#q; %#q vs %#q", tt.Regexp, s, d, nd) + } + + ns := nre.String() + if s != ns { + t.Errorf("Parse(%#q) -> %#q -> %#q", tt.Regexp, s, ns) + } + } + } +} diff --git a/src/regexp/syntax/perl_groups.go b/src/regexp/syntax/perl_groups.go new file mode 100644 index 0000000..effe4e6 --- /dev/null +++ b/src/regexp/syntax/perl_groups.go @@ -0,0 +1,134 @@ +// Copyright 2013 The Go Authors. 
All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// GENERATED BY make_perl_groups.pl; DO NOT EDIT. +// make_perl_groups.pl >perl_groups.go + +package syntax + +var code1 = []rune{ /* \d */ + 0x30, 0x39, +} + +var code2 = []rune{ /* \s */ + 0x9, 0xa, + 0xc, 0xd, + 0x20, 0x20, +} + +var code3 = []rune{ /* \w */ + 0x30, 0x39, + 0x41, 0x5a, + 0x5f, 0x5f, + 0x61, 0x7a, +} + +var perlGroup = map[string]charGroup{ + `\d`: {+1, code1}, + `\D`: {-1, code1}, + `\s`: {+1, code2}, + `\S`: {-1, code2}, + `\w`: {+1, code3}, + `\W`: {-1, code3}, +} +var code4 = []rune{ /* [:alnum:] */ + 0x30, 0x39, + 0x41, 0x5a, + 0x61, 0x7a, +} + +var code5 = []rune{ /* [:alpha:] */ + 0x41, 0x5a, + 0x61, 0x7a, +} + +var code6 = []rune{ /* [:ascii:] */ + 0x0, 0x7f, +} + +var code7 = []rune{ /* [:blank:] */ + 0x9, 0x9, + 0x20, 0x20, +} + +var code8 = []rune{ /* [:cntrl:] */ + 0x0, 0x1f, + 0x7f, 0x7f, +} + +var code9 = []rune{ /* [:digit:] */ + 0x30, 0x39, +} + +var code10 = []rune{ /* [:graph:] */ + 0x21, 0x7e, +} + +var code11 = []rune{ /* [:lower:] */ + 0x61, 0x7a, +} + +var code12 = []rune{ /* [:print:] */ + 0x20, 0x7e, +} + +var code13 = []rune{ /* [:punct:] */ + 0x21, 0x2f, + 0x3a, 0x40, + 0x5b, 0x60, + 0x7b, 0x7e, +} + +var code14 = []rune{ /* [:space:] */ + 0x9, 0xd, + 0x20, 0x20, +} + +var code15 = []rune{ /* [:upper:] */ + 0x41, 0x5a, +} + +var code16 = []rune{ /* [:word:] */ + 0x30, 0x39, + 0x41, 0x5a, + 0x5f, 0x5f, + 0x61, 0x7a, +} + +var code17 = []rune{ /* [:xdigit:] */ + 0x30, 0x39, + 0x41, 0x46, + 0x61, 0x66, +} + +var posixGroup = map[string]charGroup{ + `[:alnum:]`: {+1, code4}, + `[:^alnum:]`: {-1, code4}, + `[:alpha:]`: {+1, code5}, + `[:^alpha:]`: {-1, code5}, + `[:ascii:]`: {+1, code6}, + `[:^ascii:]`: {-1, code6}, + `[:blank:]`: {+1, code7}, + `[:^blank:]`: {-1, code7}, + `[:cntrl:]`: {+1, code8}, + `[:^cntrl:]`: {-1, code8}, + `[:digit:]`: {+1, code9}, + `[:^digit:]`: {-1, code9}, + `[:graph:]`: {+1, code10}, + `[:^graph:]`: {-1, code10}, + `[:lower:]`: {+1, code11}, + `[:^lower:]`: {-1, code11}, + `[:print:]`: {+1, code12}, + `[:^print:]`: {-1, code12}, + `[:punct:]`: {+1, code13}, + `[:^punct:]`: {-1, code13}, + `[:space:]`: {+1, code14}, + `[:^space:]`: {-1, code14}, + `[:upper:]`: {+1, code15}, + `[:^upper:]`: {-1, code15}, + `[:word:]`: {+1, code16}, + `[:^word:]`: {-1, code16}, + `[:xdigit:]`: {+1, code17}, + `[:^xdigit:]`: {-1, code17}, +} diff --git a/src/regexp/syntax/prog.go b/src/regexp/syntax/prog.go new file mode 100644 index 0000000..896cdc4 --- /dev/null +++ b/src/regexp/syntax/prog.go @@ -0,0 +1,347 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package syntax + +import ( + "strconv" + "strings" + "unicode" + "unicode/utf8" +) + +// Compiled program. +// May not belong in this package, but convenient for now. + +// A Prog is a compiled regular expression program. +type Prog struct { + Inst []Inst + Start int // index of start instruction + NumCap int // number of InstCapture insts in re +} + +// An InstOp is an instruction opcode. 
+type InstOp uint8 + +const ( + InstAlt InstOp = iota + InstAltMatch + InstCapture + InstEmptyWidth + InstMatch + InstFail + InstNop + InstRune + InstRune1 + InstRuneAny + InstRuneAnyNotNL +) + +var instOpNames = []string{ + "InstAlt", + "InstAltMatch", + "InstCapture", + "InstEmptyWidth", + "InstMatch", + "InstFail", + "InstNop", + "InstRune", + "InstRune1", + "InstRuneAny", + "InstRuneAnyNotNL", +} + +func (i InstOp) String() string { + if uint(i) >= uint(len(instOpNames)) { + return "" + } + return instOpNames[i] +} + +// An EmptyOp specifies a kind or mixture of zero-width assertions. +type EmptyOp uint8 + +const ( + EmptyBeginLine EmptyOp = 1 << iota + EmptyEndLine + EmptyBeginText + EmptyEndText + EmptyWordBoundary + EmptyNoWordBoundary +) + +// EmptyOpContext returns the zero-width assertions +// satisfied at the position between the runes r1 and r2. +// Passing r1 == -1 indicates that the position is +// at the beginning of the text. +// Passing r2 == -1 indicates that the position is +// at the end of the text. +func EmptyOpContext(r1, r2 rune) EmptyOp { + var op EmptyOp = EmptyNoWordBoundary + var boundary byte + switch { + case IsWordChar(r1): + boundary = 1 + case r1 == '\n': + op |= EmptyBeginLine + case r1 < 0: + op |= EmptyBeginText | EmptyBeginLine + } + switch { + case IsWordChar(r2): + boundary ^= 1 + case r2 == '\n': + op |= EmptyEndLine + case r2 < 0: + op |= EmptyEndText | EmptyEndLine + } + if boundary != 0 { // IsWordChar(r1) != IsWordChar(r2) + op ^= (EmptyWordBoundary | EmptyNoWordBoundary) + } + return op +} + +// IsWordChar reports whether r is considered a “word character” +// during the evaluation of the \b and \B zero-width assertions. +// These assertions are ASCII-only: the word characters are [A-Za-z0-9_]. +func IsWordChar(r rune) bool { + return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_' +} + +// An Inst is a single instruction in a regular expression program. +type Inst struct { + Op InstOp + Out uint32 // all but InstMatch, InstFail + Arg uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth + Rune []rune +} + +func (p *Prog) String() string { + var b strings.Builder + dumpProg(&b, p) + return b.String() +} + +// skipNop follows any no-op or capturing instructions. +func (p *Prog) skipNop(pc uint32) *Inst { + i := &p.Inst[pc] + for i.Op == InstNop || i.Op == InstCapture { + i = &p.Inst[i.Out] + } + return i +} + +// op returns i.Op but merges all the Rune special cases into InstRune +func (i *Inst) op() InstOp { + op := i.Op + switch op { + case InstRune1, InstRuneAny, InstRuneAnyNotNL: + op = InstRune + } + return op +} + +// Prefix returns a literal string that all matches for the +// regexp must start with. Complete is true if the prefix +// is the entire match. +func (p *Prog) Prefix() (prefix string, complete bool) { + i := p.skipNop(uint32(p.Start)) + + // Avoid allocation of buffer if prefix is empty. + if i.op() != InstRune || len(i.Rune) != 1 { + return "", i.Op == InstMatch + } + + // Have prefix; gather characters. + var buf strings.Builder + for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 && i.Rune[0] != utf8.RuneError { + buf.WriteRune(i.Rune[0]) + i = p.skipNop(i.Out) + } + return buf.String(), i.Op == InstMatch +} + +// StartCond returns the leading empty-width conditions that must +// be true in any match. It returns ^EmptyOp(0) if no matches are possible. 
+func (p *Prog) StartCond() EmptyOp { + var flag EmptyOp + pc := uint32(p.Start) + i := &p.Inst[pc] +Loop: + for { + switch i.Op { + case InstEmptyWidth: + flag |= EmptyOp(i.Arg) + case InstFail: + return ^EmptyOp(0) + case InstCapture, InstNop: + // skip + default: + break Loop + } + pc = i.Out + i = &p.Inst[pc] + } + return flag +} + +const noMatch = -1 + +// MatchRune reports whether the instruction matches (and consumes) r. +// It should only be called when i.Op == InstRune. +func (i *Inst) MatchRune(r rune) bool { + return i.MatchRunePos(r) != noMatch +} + +// MatchRunePos checks whether the instruction matches (and consumes) r. +// If so, MatchRunePos returns the index of the matching rune pair +// (or, when len(i.Rune) == 1, rune singleton). +// If not, MatchRunePos returns -1. +// MatchRunePos should only be called when i.Op == InstRune. +func (i *Inst) MatchRunePos(r rune) int { + rune := i.Rune + + switch len(rune) { + case 0: + return noMatch + + case 1: + // Special case: single-rune slice is from literal string, not char class. + r0 := rune[0] + if r == r0 { + return 0 + } + if Flags(i.Arg)&FoldCase != 0 { + for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { + if r == r1 { + return 0 + } + } + } + return noMatch + + case 2: + if r >= rune[0] && r <= rune[1] { + return 0 + } + return noMatch + + case 4, 6, 8: + // Linear search for a few pairs. + // Should handle ASCII well. + for j := 0; j < len(rune); j += 2 { + if r < rune[j] { + return noMatch + } + if r <= rune[j+1] { + return j / 2 + } + } + return noMatch + } + + // Otherwise binary search. + lo := 0 + hi := len(rune) / 2 + for lo < hi { + m := lo + (hi-lo)/2 + if c := rune[2*m]; c <= r { + if r <= rune[2*m+1] { + return m + } + lo = m + 1 + } else { + hi = m + } + } + return noMatch +} + +// MatchEmptyWidth reports whether the instruction matches +// an empty string between the runes before and after. +// It should only be called when i.Op == InstEmptyWidth. 
+func (i *Inst) MatchEmptyWidth(before rune, after rune) bool { + switch EmptyOp(i.Arg) { + case EmptyBeginLine: + return before == '\n' || before == -1 + case EmptyEndLine: + return after == '\n' || after == -1 + case EmptyBeginText: + return before == -1 + case EmptyEndText: + return after == -1 + case EmptyWordBoundary: + return IsWordChar(before) != IsWordChar(after) + case EmptyNoWordBoundary: + return IsWordChar(before) == IsWordChar(after) + } + panic("unknown empty width arg") +} + +func (i *Inst) String() string { + var b strings.Builder + dumpInst(&b, i) + return b.String() +} + +func bw(b *strings.Builder, args ...string) { + for _, s := range args { + b.WriteString(s) + } +} + +func dumpProg(b *strings.Builder, p *Prog) { + for j := range p.Inst { + i := &p.Inst[j] + pc := strconv.Itoa(j) + if len(pc) < 3 { + b.WriteString(" "[len(pc):]) + } + if j == p.Start { + pc += "*" + } + bw(b, pc, "\t") + dumpInst(b, i) + bw(b, "\n") + } +} + +func u32(i uint32) string { + return strconv.FormatUint(uint64(i), 10) +} + +func dumpInst(b *strings.Builder, i *Inst) { + switch i.Op { + case InstAlt: + bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg)) + case InstAltMatch: + bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg)) + case InstCapture: + bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out)) + case InstEmptyWidth: + bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out)) + case InstMatch: + bw(b, "match") + case InstFail: + bw(b, "fail") + case InstNop: + bw(b, "nop -> ", u32(i.Out)) + case InstRune: + if i.Rune == nil { + // shouldn't happen + bw(b, "rune <nil>") + } + bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune))) + if Flags(i.Arg)&FoldCase != 0 { + bw(b, "/i") + } + bw(b, " -> ", u32(i.Out)) + case InstRune1: + bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out)) + case InstRuneAny: + bw(b, "any -> ", u32(i.Out)) + case InstRuneAnyNotNL: + bw(b, "anynotnl -> ", u32(i.Out)) + } +} diff --git a/src/regexp/syntax/prog_test.go b/src/regexp/syntax/prog_test.go new file mode 100644 index 0000000..5603aea --- /dev/null +++ b/src/regexp/syntax/prog_test.go @@ -0,0 +1,129 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package syntax + +import "testing" + +var compileTests = []struct { + Regexp string + Prog string +}{ + {"a", ` 0 fail + 1* rune1 "a" -> 2 + 2 match +`}, + {"[A-M][n-z]", ` 0 fail + 1* rune "AM" -> 2 + 2 rune "nz" -> 3 + 3 match +`}, + {"", ` 0 fail + 1* nop -> 2 + 2 match +`}, + {"a?", ` 0 fail + 1 rune1 "a" -> 3 + 2* alt -> 1, 3 + 3 match +`}, + {"a??", ` 0 fail + 1 rune1 "a" -> 3 + 2* alt -> 3, 1 + 3 match +`}, + {"a+", ` 0 fail + 1* rune1 "a" -> 2 + 2 alt -> 1, 3 + 3 match +`}, + {"a+?", ` 0 fail + 1* rune1 "a" -> 2 + 2 alt -> 3, 1 + 3 match +`}, + {"a*", ` 0 fail + 1 rune1 "a" -> 2 + 2* alt -> 1, 3 + 3 match +`}, + {"a*?", ` 0 fail + 1 rune1 "a" -> 2 + 2* alt -> 3, 1 + 3 match +`}, + {"a+b+", ` 0 fail + 1* rune1 "a" -> 2 + 2 alt -> 1, 3 + 3 rune1 "b" -> 4 + 4 alt -> 3, 5 + 5 match +`}, + {"(a+)(b+)", ` 0 fail + 1* cap 2 -> 2 + 2 rune1 "a" -> 3 + 3 alt -> 2, 4 + 4 cap 3 -> 5 + 5 cap 4 -> 6 + 6 rune1 "b" -> 7 + 7 alt -> 6, 8 + 8 cap 5 -> 9 + 9 match +`}, + {"a+|b+", ` 0 fail + 1 rune1 "a" -> 2 + 2 alt -> 1, 6 + 3 rune1 "b" -> 4 + 4 alt -> 3, 6 + 5* alt -> 1, 3 + 6 match +`}, + {"A[Aa]", ` 0 fail + 1* rune1 "A" -> 2 + 2 rune "A"/i -> 3 + 3 match +`}, + {"(?:(?:^).)", ` 0 fail + 1* empty 4 -> 2 + 2 anynotnl -> 3 + 3 match +`}, + {"(?:|a)+", ` 0 fail + 1 nop -> 4 + 2 rune1 "a" -> 4 + 3* alt -> 1, 2 + 4 alt -> 3, 5 + 5 match +`}, + {"(?:|a)*", ` 0 fail + 1 nop -> 4 + 2 rune1 "a" -> 4 + 3 alt -> 1, 2 + 4 alt -> 3, 6 + 5* alt -> 3, 6 + 6 match +`}, +} + +func TestCompile(t *testing.T) { + for _, tt := range compileTests { + re, _ := Parse(tt.Regexp, Perl) + p, _ := Compile(re) + s := p.String() + if s != tt.Prog { + t.Errorf("compiled %#q:\n--- have\n%s---\n--- want\n%s---", tt.Regexp, s, tt.Prog) + } + } +} + +func BenchmarkEmptyOpContext(b *testing.B) { + for i := 0; i < b.N; i++ { + var r1 rune = -1 + for _, r2 := range "foo, bar, baz\nsome input text.\n" { + EmptyOpContext(r1, r2) + r1 = r2 + } + EmptyOpContext(r1, -1) + } +} diff --git a/src/regexp/syntax/regexp.go b/src/regexp/syntax/regexp.go new file mode 100644 index 0000000..3a4d2d2 --- /dev/null +++ b/src/regexp/syntax/regexp.go @@ -0,0 +1,320 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package syntax + +// Note to implementers: +// In this package, re is always a *Regexp and r is always a rune. + +import ( + "strconv" + "strings" + "unicode" +) + +// A Regexp is a node in a regular expression syntax tree. +type Regexp struct { + Op Op // operator + Flags Flags + Sub []*Regexp // subexpressions, if any + Sub0 [1]*Regexp // storage for short Sub + Rune []rune // matched runes, for OpLiteral, OpCharClass + Rune0 [2]rune // storage for short Rune + Min, Max int // min, max for OpRepeat + Cap int // capturing index, for OpCapture + Name string // capturing name, for OpCapture +} + +//go:generate stringer -type Op -trimprefix Op + +// An Op is a single regular expression operator. +type Op uint8 + +// Operators are listed in precedence order, tightest binding to weakest. +// Character class operators are listed simplest to most complex +// (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar). 
+ +const ( + OpNoMatch Op = 1 + iota // matches no strings + OpEmptyMatch // matches empty string + OpLiteral // matches Runes sequence + OpCharClass // matches Runes interpreted as range pair list + OpAnyCharNotNL // matches any character except newline + OpAnyChar // matches any character + OpBeginLine // matches empty string at beginning of line + OpEndLine // matches empty string at end of line + OpBeginText // matches empty string at beginning of text + OpEndText // matches empty string at end of text + OpWordBoundary // matches word boundary `\b` + OpNoWordBoundary // matches word non-boundary `\B` + OpCapture // capturing subexpression with index Cap, optional name Name + OpStar // matches Sub[0] zero or more times + OpPlus // matches Sub[0] one or more times + OpQuest // matches Sub[0] zero or one times + OpRepeat // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit) + OpConcat // matches concatenation of Subs + OpAlternate // matches alternation of Subs +) + +const opPseudo Op = 128 // where pseudo-ops start + +// Equal reports whether x and y have identical structure. +func (x *Regexp) Equal(y *Regexp) bool { + if x == nil || y == nil { + return x == y + } + if x.Op != y.Op { + return false + } + switch x.Op { + case OpEndText: + // The parse flags remember whether this is \z or \Z. + if x.Flags&WasDollar != y.Flags&WasDollar { + return false + } + + case OpLiteral, OpCharClass: + if len(x.Rune) != len(y.Rune) { + return false + } + for i, r := range x.Rune { + if r != y.Rune[i] { + return false + } + } + + case OpAlternate, OpConcat: + if len(x.Sub) != len(y.Sub) { + return false + } + for i, sub := range x.Sub { + if !sub.Equal(y.Sub[i]) { + return false + } + } + + case OpStar, OpPlus, OpQuest: + if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) { + return false + } + + case OpRepeat: + if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) { + return false + } + + case OpCapture: + if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) { + return false + } + } + return true +} + +// writeRegexp writes the Perl syntax for the regular expression re to b. +func writeRegexp(b *strings.Builder, re *Regexp) { + switch re.Op { + default: + b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">") + case OpNoMatch: + b.WriteString(`[^\x00-\x{10FFFF}]`) + case OpEmptyMatch: + b.WriteString(`(?:)`) + case OpLiteral: + if re.Flags&FoldCase != 0 { + b.WriteString(`(?i:`) + } + for _, r := range re.Rune { + escape(b, r, false) + } + if re.Flags&FoldCase != 0 { + b.WriteString(`)`) + } + case OpCharClass: + if len(re.Rune)%2 != 0 { + b.WriteString(`[invalid char class]`) + break + } + b.WriteRune('[') + if len(re.Rune) == 0 { + b.WriteString(`^\x00-\x{10FFFF}`) + } else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune && len(re.Rune) > 2 { + // Contains 0 and MaxRune. Probably a negated class. + // Print the gaps. 
+ b.WriteRune('^') + for i := 1; i < len(re.Rune)-1; i += 2 { + lo, hi := re.Rune[i]+1, re.Rune[i+1]-1 + escape(b, lo, lo == '-') + if lo != hi { + b.WriteRune('-') + escape(b, hi, hi == '-') + } + } + } else { + for i := 0; i < len(re.Rune); i += 2 { + lo, hi := re.Rune[i], re.Rune[i+1] + escape(b, lo, lo == '-') + if lo != hi { + b.WriteRune('-') + escape(b, hi, hi == '-') + } + } + } + b.WriteRune(']') + case OpAnyCharNotNL: + b.WriteString(`(?-s:.)`) + case OpAnyChar: + b.WriteString(`(?s:.)`) + case OpBeginLine: + b.WriteString(`(?m:^)`) + case OpEndLine: + b.WriteString(`(?m:$)`) + case OpBeginText: + b.WriteString(`\A`) + case OpEndText: + if re.Flags&WasDollar != 0 { + b.WriteString(`(?-m:$)`) + } else { + b.WriteString(`\z`) + } + case OpWordBoundary: + b.WriteString(`\b`) + case OpNoWordBoundary: + b.WriteString(`\B`) + case OpCapture: + if re.Name != "" { + b.WriteString(`(?P<`) + b.WriteString(re.Name) + b.WriteRune('>') + } else { + b.WriteRune('(') + } + if re.Sub[0].Op != OpEmptyMatch { + writeRegexp(b, re.Sub[0]) + } + b.WriteRune(')') + case OpStar, OpPlus, OpQuest, OpRepeat: + if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 { + b.WriteString(`(?:`) + writeRegexp(b, sub) + b.WriteString(`)`) + } else { + writeRegexp(b, sub) + } + switch re.Op { + case OpStar: + b.WriteRune('*') + case OpPlus: + b.WriteRune('+') + case OpQuest: + b.WriteRune('?') + case OpRepeat: + b.WriteRune('{') + b.WriteString(strconv.Itoa(re.Min)) + if re.Max != re.Min { + b.WriteRune(',') + if re.Max >= 0 { + b.WriteString(strconv.Itoa(re.Max)) + } + } + b.WriteRune('}') + } + if re.Flags&NonGreedy != 0 { + b.WriteRune('?') + } + case OpConcat: + for _, sub := range re.Sub { + if sub.Op == OpAlternate { + b.WriteString(`(?:`) + writeRegexp(b, sub) + b.WriteString(`)`) + } else { + writeRegexp(b, sub) + } + } + case OpAlternate: + for i, sub := range re.Sub { + if i > 0 { + b.WriteRune('|') + } + writeRegexp(b, sub) + } + } +} + +func (re *Regexp) String() string { + var b strings.Builder + writeRegexp(&b, re) + return b.String() +} + +const meta = `\.+*?()|[]{}^$` + +func escape(b *strings.Builder, r rune, force bool) { + if unicode.IsPrint(r) { + if strings.ContainsRune(meta, r) || force { + b.WriteRune('\\') + } + b.WriteRune(r) + return + } + + switch r { + case '\a': + b.WriteString(`\a`) + case '\f': + b.WriteString(`\f`) + case '\n': + b.WriteString(`\n`) + case '\r': + b.WriteString(`\r`) + case '\t': + b.WriteString(`\t`) + case '\v': + b.WriteString(`\v`) + default: + if r < 0x100 { + b.WriteString(`\x`) + s := strconv.FormatInt(int64(r), 16) + if len(s) == 1 { + b.WriteRune('0') + } + b.WriteString(s) + break + } + b.WriteString(`\x{`) + b.WriteString(strconv.FormatInt(int64(r), 16)) + b.WriteString(`}`) + } +} + +// MaxCap walks the regexp to find the maximum capture index. +func (re *Regexp) MaxCap() int { + m := 0 + if re.Op == OpCapture { + m = re.Cap + } + for _, sub := range re.Sub { + if n := sub.MaxCap(); m < n { + m = n + } + } + return m +} + +// CapNames walks the regexp to find the names of capturing groups. 
+func (re *Regexp) CapNames() []string { + names := make([]string, re.MaxCap()+1) + re.capNames(names) + return names +} + +func (re *Regexp) capNames(names []string) { + if re.Op == OpCapture { + names[re.Cap] = re.Name + } + for _, sub := range re.Sub { + sub.capNames(names) + } +} diff --git a/src/regexp/syntax/simplify.go b/src/regexp/syntax/simplify.go new file mode 100644 index 0000000..e439325 --- /dev/null +++ b/src/regexp/syntax/simplify.go @@ -0,0 +1,151 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package syntax + +// Simplify returns a regexp equivalent to re but without counted repetitions +// and with various other simplifications, such as rewriting /(?:a+)+/ to /a+/. +// The resulting regexp will execute correctly but its string representation +// will not produce the same parse tree, because capturing parentheses +// may have been duplicated or removed. For example, the simplified form +// for /(x){1,2}/ is /(x)(x)?/ but both parentheses capture as $1. +// The returned regexp may share structure with or be the original. +func (re *Regexp) Simplify() *Regexp { + if re == nil { + return nil + } + switch re.Op { + case OpCapture, OpConcat, OpAlternate: + // Simplify children, building new Regexp if children change. + nre := re + for i, sub := range re.Sub { + nsub := sub.Simplify() + if nre == re && nsub != sub { + // Start a copy. + nre = new(Regexp) + *nre = *re + nre.Rune = nil + nre.Sub = append(nre.Sub0[:0], re.Sub[:i]...) + } + if nre != re { + nre.Sub = append(nre.Sub, nsub) + } + } + return nre + + case OpStar, OpPlus, OpQuest: + sub := re.Sub[0].Simplify() + return simplify1(re.Op, re.Flags, sub, re) + + case OpRepeat: + // Special special case: x{0} matches the empty string + // and doesn't even need to consider x. + if re.Min == 0 && re.Max == 0 { + return &Regexp{Op: OpEmptyMatch} + } + + // The fun begins. + sub := re.Sub[0].Simplify() + + // x{n,} means at least n matches of x. + if re.Max == -1 { + // Special case: x{0,} is x*. + if re.Min == 0 { + return simplify1(OpStar, re.Flags, sub, nil) + } + + // Special case: x{1,} is x+. + if re.Min == 1 { + return simplify1(OpPlus, re.Flags, sub, nil) + } + + // General case: x{4,} is xxxx+. + nre := &Regexp{Op: OpConcat} + nre.Sub = nre.Sub0[:0] + for i := 0; i < re.Min-1; i++ { + nre.Sub = append(nre.Sub, sub) + } + nre.Sub = append(nre.Sub, simplify1(OpPlus, re.Flags, sub, nil)) + return nre + } + + // Special case x{0} handled above. + + // Special case: x{1} is just x. + if re.Min == 1 && re.Max == 1 { + return sub + } + + // General case: x{n,m} means n copies of x and m copies of x? + // The machine will do less work if we nest the final m copies, + // so that x{2,5} = xx(x(x(x)?)?)? + + // Build leading prefix: xx. + var prefix *Regexp + if re.Min > 0 { + prefix = &Regexp{Op: OpConcat} + prefix.Sub = prefix.Sub0[:0] + for i := 0; i < re.Min; i++ { + prefix.Sub = append(prefix.Sub, sub) + } + } + + // Build and attach suffix: (x(x(x)?)?)? + if re.Max > re.Min { + suffix := simplify1(OpQuest, re.Flags, sub, nil) + for i := re.Min + 1; i < re.Max; i++ { + nre2 := &Regexp{Op: OpConcat} + nre2.Sub = append(nre2.Sub0[:0], sub, suffix) + suffix = simplify1(OpQuest, re.Flags, nre2, nil) + } + if prefix == nil { + return suffix + } + prefix.Sub = append(prefix.Sub, suffix) + } + if prefix != nil { + return prefix + } + + // Some degenerate case like min > max or min < max < 0. 
+ // Handle as impossible match. + return &Regexp{Op: OpNoMatch} + } + + return re +} + +// simplify1 implements Simplify for the unary OpStar, +// OpPlus, and OpQuest operators. It returns the simple regexp +// equivalent to +// +// Regexp{Op: op, Flags: flags, Sub: {sub}} +// +// under the assumption that sub is already simple, and +// without first allocating that structure. If the regexp +// to be returned turns out to be equivalent to re, simplify1 +// returns re instead. +// +// simplify1 is factored out of Simplify because the implementation +// for other operators generates these unary expressions. +// Letting them call simplify1 makes sure the expressions they +// generate are simple. +func simplify1(op Op, flags Flags, sub, re *Regexp) *Regexp { + // Special case: repeat the empty string as much as + // you want, but it's still the empty string. + if sub.Op == OpEmptyMatch { + return sub + } + // The operators are idempotent if the flags match. + if op == sub.Op && flags&NonGreedy == sub.Flags&NonGreedy { + return sub + } + if re != nil && re.Op == op && re.Flags&NonGreedy == flags&NonGreedy && sub == re.Sub[0] { + return re + } + + re = &Regexp{Op: op, Flags: flags} + re.Sub = append(re.Sub0[:0], sub) + return re +} diff --git a/src/regexp/syntax/simplify_test.go b/src/regexp/syntax/simplify_test.go new file mode 100644 index 0000000..9877db3 --- /dev/null +++ b/src/regexp/syntax/simplify_test.go @@ -0,0 +1,151 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package syntax + +import "testing" + +var simplifyTests = []struct { + Regexp string + Simple string +}{ + // Already-simple constructs + {`a`, `a`}, + {`ab`, `ab`}, + {`a|b`, `[a-b]`}, + {`ab|cd`, `ab|cd`}, + {`(ab)*`, `(ab)*`}, + {`(ab)+`, `(ab)+`}, + {`(ab)?`, `(ab)?`}, + {`.`, `(?s:.)`}, + {`^`, `(?m:^)`}, + {`$`, `(?m:$)`}, + {`[ac]`, `[ac]`}, + {`[^ac]`, `[^ac]`}, + + // Posix character classes + {`[[:alnum:]]`, `[0-9A-Za-z]`}, + {`[[:alpha:]]`, `[A-Za-z]`}, + {`[[:blank:]]`, `[\t ]`}, + {`[[:cntrl:]]`, `[\x00-\x1f\x7f]`}, + {`[[:digit:]]`, `[0-9]`}, + {`[[:graph:]]`, `[!-~]`}, + {`[[:lower:]]`, `[a-z]`}, + {`[[:print:]]`, `[ -~]`}, + {`[[:punct:]]`, "[!-/:-@\\[-`\\{-~]"}, + {`[[:space:]]`, `[\t-\r ]`}, + {`[[:upper:]]`, `[A-Z]`}, + {`[[:xdigit:]]`, `[0-9A-Fa-f]`}, + + // Perl character classes + {`\d`, `[0-9]`}, + {`\s`, `[\t-\n\f-\r ]`}, + {`\w`, `[0-9A-Z_a-z]`}, + {`\D`, `[^0-9]`}, + {`\S`, `[^\t-\n\f-\r ]`}, + {`\W`, `[^0-9A-Z_a-z]`}, + {`[\d]`, `[0-9]`}, + {`[\s]`, `[\t-\n\f-\r ]`}, + {`[\w]`, `[0-9A-Z_a-z]`}, + {`[\D]`, `[^0-9]`}, + {`[\S]`, `[^\t-\n\f-\r ]`}, + {`[\W]`, `[^0-9A-Z_a-z]`}, + + // Posix repetitions + {`a{1}`, `a`}, + {`a{2}`, `aa`}, + {`a{5}`, `aaaaa`}, + {`a{0,1}`, `a?`}, + // The next three are illegible because Simplify inserts (?:) + // parens instead of () parens to avoid creating extra + // captured subexpressions. The comments show a version with fewer parens. + {`(a){0,2}`, `(?:(a)(a)?)?`}, // (aa?)? + {`(a){0,4}`, `(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // (a(a(aa?)?)?)? + {`(a){2,6}`, `(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // aa(a(a(aa?)?)?)? + {`a{0,2}`, `(?:aa?)?`}, // (aa?)? + {`a{0,4}`, `(?:a(?:a(?:aa?)?)?)?`}, // (a(a(aa?)?)?)? + {`a{2,6}`, `aa(?:a(?:a(?:aa?)?)?)?`}, // aa(a(a(aa?)?)?)? + {`a{0,}`, `a*`}, + {`a{1,}`, `a+`}, + {`a{2,}`, `aa+`}, + {`a{5,}`, `aaaaa+`}, + + // Test that operators simplify their arguments. 
+ {`(?:a{1,}){1,}`, `a+`}, + {`(a{1,}b{1,})`, `(a+b+)`}, + {`a{1,}|b{1,}`, `a+|b+`}, + {`(?:a{1,})*`, `(?:a+)*`}, + {`(?:a{1,})+`, `a+`}, + {`(?:a{1,})?`, `(?:a+)?`}, + {``, `(?:)`}, + {`a{0}`, `(?:)`}, + + // Character class simplification + {`[ab]`, `[a-b]`}, + {`[a-za-za-z]`, `[a-z]`}, + {`[A-Za-zA-Za-z]`, `[A-Za-z]`}, + {`[ABCDEFGH]`, `[A-H]`}, + {`[AB-CD-EF-GH]`, `[A-H]`}, + {`[W-ZP-XE-R]`, `[E-Z]`}, + {`[a-ee-gg-m]`, `[a-m]`}, + {`[a-ea-ha-m]`, `[a-m]`}, + {`[a-ma-ha-e]`, `[a-m]`}, + {`[a-zA-Z0-9 -~]`, `[ -~]`}, + + // Empty character classes + {`[^[:cntrl:][:^cntrl:]]`, `[^\x00-\x{10FFFF}]`}, + + // Full character classes + {`[[:cntrl:][:^cntrl:]]`, `(?s:.)`}, + + // Unicode case folding. + {`(?i)A`, `(?i:A)`}, + {`(?i)a`, `(?i:A)`}, + {`(?i)[A]`, `(?i:A)`}, + {`(?i)[a]`, `(?i:A)`}, + {`(?i)K`, `(?i:K)`}, + {`(?i)k`, `(?i:K)`}, + {`(?i)\x{212a}`, "(?i:K)"}, + {`(?i)[K]`, "[Kk\u212A]"}, + {`(?i)[k]`, "[Kk\u212A]"}, + {`(?i)[\x{212a}]`, "[Kk\u212A]"}, + {`(?i)[a-z]`, "[A-Za-z\u017F\u212A]"}, + {`(?i)[\x00-\x{FFFD}]`, "[\\x00-\uFFFD]"}, + {`(?i)[\x00-\x{10FFFF}]`, `(?s:.)`}, + + // Empty string as a regular expression. + // The empty string must be preserved inside parens in order + // to make submatches work right, so these tests are less + // interesting than they might otherwise be. String inserts + // explicit (?:) in place of non-parenthesized empty strings, + // to make them easier to spot for other parsers. + {`(a|b|)`, `([a-b]|(?:))`}, + {`(|)`, `()`}, + {`a()`, `a()`}, + {`(()|())`, `(()|())`}, + {`(a|)`, `(a|(?:))`}, + {`ab()cd()`, `ab()cd()`}, + {`()`, `()`}, + {`()*`, `()*`}, + {`()+`, `()+`}, + {`()?`, `()?`}, + {`(){0}`, `(?:)`}, + {`(){1}`, `()`}, + {`(){1,}`, `()+`}, + {`(){0,2}`, `(?:()()?)?`}, +} + +func TestSimplify(t *testing.T) { + for _, tt := range simplifyTests { + re, err := Parse(tt.Regexp, MatchNL|Perl&^OneLine) + if err != nil { + t.Errorf("Parse(%#q) = error %v", tt.Regexp, err) + continue + } + s := re.Simplify().String() + if s != tt.Simple { + t.Errorf("Simplify(%#q) = %#q, want %#q", tt.Regexp, s, tt.Simple) + } + } +} diff --git a/src/regexp/testdata/README b/src/regexp/testdata/README new file mode 100644 index 0000000..58cec82 --- /dev/null +++ b/src/regexp/testdata/README @@ -0,0 +1,24 @@ +AT&T POSIX Test Files +See textregex.c for copyright + license. + +testregex.c http://www2.research.att.com/~gsf/testregex/testregex.c +basic.dat http://www2.research.att.com/~gsf/testregex/basic.dat +nullsubexpr.dat http://www2.research.att.com/~gsf/testregex/nullsubexpr.dat +repetition.dat http://www2.research.att.com/~gsf/testregex/repetition.dat + +The test data has been edited to reflect RE2/Go differences: + * In a star of a possibly empty match like (a*)* matching x, + the no match case runs the starred subexpression zero times, + not once. This is consistent with (a*)* matching a, which + runs the starred subexpression one time, not twice. + * The submatch choice is first match, not the POSIX rule. + +Such changes are marked with 'RE2/Go'. + + +RE2 Test Files + +re2-exhaustive.txt.bz2 and re2-search.txt are built by running +'make log' in the RE2 distribution https://github.com/google/re2/ + +The exhaustive file is compressed because it is huge. 
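As an aside for readers skimming the .dat files below: each test line carries a flags field, the pattern, the input text, and the expected match written as (start,end) offset pairs, which for these entries line up with the half-open [start, end) byte-index pairs returned by Go's Find*Index functions. A minimal illustrative sketch checking two basic.dat entries against the public regexp API (the field meanings are inferred from the data itself, and the real consumer of these files is the package's own test harness, not this snippet):

package main

import (
	"fmt"
	"regexp"
)

func main() {
	// basic.dat entry:  BE  abracadabra$  abracadabracadabra  (7,18)
	re := regexp.MustCompile(`abracadabra$`)
	fmt.Println(re.FindStringIndex("abracadabracadabra")) // [7 18]

	// basic.dat entry:  E  (a*)(b?)(b+)b{3}  aaabbbbbbb  (0,10)(0,3)(3,4)(4,7)
	// FindStringSubmatchIndex reports the whole match followed by each group;
	// groups that do not take part in a match (written (?,?) in the .dat
	// files) come back as -1 -1.
	re = regexp.MustCompile(`(a*)(b?)(b+)b{3}`)
	fmt.Println(re.FindStringSubmatchIndex("aaabbbbbbb")) // [0 10 0 3 3 4 4 7]
}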
diff --git a/src/regexp/testdata/basic.dat b/src/regexp/testdata/basic.dat new file mode 100644 index 0000000..1776b1f --- /dev/null +++ b/src/regexp/testdata/basic.dat @@ -0,0 +1,217 @@ +NOTE all standard compliant implementations should pass these : 2002-05-31 + +BE abracadabra$ abracadabracadabra (7,18) +BE a...b abababbb (2,7) +BE XXXXXX ..XXXXXX (2,8) +E \) () (1,2) +BE a] a]a (0,2) +B } } (0,1) +E \} } (0,1) +BE \] ] (0,1) +B ] ] (0,1) +E ] ] (0,1) +B { { (0,1) +B } } (0,1) +BE ^a ax (0,1) +BE \^a a^a (1,3) +BE a\^ a^ (0,2) +BE a$ aa (1,2) +BE a\$ a$ (0,2) +BE ^$ NULL (0,0) +E $^ NULL (0,0) +E a($) aa (1,2)(2,2) +E a*(^a) aa (0,1)(0,1) +E (..)*(...)* a (0,0) +E (..)*(...)* abcd (0,4)(2,4) +E (ab|a)(bc|c) abc (0,3)(0,2)(2,3) +E (ab)c|abc abc (0,3)(0,2) +E a{0}b ab (1,2) +E (a*)(b?)(b+)b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7) +E (a*)(b{0,1})(b{1,})b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7) +E a{9876543210} NULL BADBR +E ((a|a)|a) a (0,1)(0,1)(0,1) +E (a*)(a|aa) aaaa (0,4)(0,3)(3,4) +E a*(a.|aa) aaaa (0,4)(2,4) +E a(b)|c(d)|a(e)f aef (0,3)(?,?)(?,?)(1,2) +E (a|b)?.* b (0,1)(0,1) +E (a|b)c|a(b|c) ac (0,2)(0,1) +E (a|b)c|a(b|c) ab (0,2)(?,?)(1,2) +E (a|b)*c|(a|ab)*c abc (0,3)(1,2) +E (a|b)*c|(a|ab)*c xc (1,2) +E (.a|.b).*|.*(.a|.b) xa (0,2)(0,2) +E a?(ab|ba)ab abab (0,4)(0,2) +E a?(ac{0}b|ba)ab abab (0,4)(0,2) +E ab|abab abbabab (0,2) +E aba|bab|bba baaabbbaba (5,8) +E aba|bab baaabbbaba (6,9) +E (aa|aaa)*|(a|aaaaa) aa (0,2)(0,2) +E (a.|.a.)*|(a|.a...) aa (0,2)(0,2) +E ab|a xabc (1,3) +E ab|a xxabc (2,4) +Ei (Ab|cD)* aBcD (0,4)(2,4) +BE [^-] --a (2,3) +BE [a-]* --a (0,3) +BE [a-m-]* --amoma-- (0,4) +E :::1:::0:|:::1:1:0: :::0:::1:::1:::0: (8,17) +E :::1:::0:|:::1:1:1: :::0:::1:::1:::0: (8,17) +{E [[:upper:]] A (0,1) [[<element>]] not supported +E [[:lower:]]+ `az{ (1,3) +E [[:upper:]]+ @AZ[ (1,3) +# No collation in Go +#BE [[-]] [[-]] (2,4) +#BE [[.NIL.]] NULL ECOLLATE +#BE [[=aleph=]] NULL ECOLLATE +} +BE$ \n \n (0,1) +BEn$ \n \n (0,1) +BE$ [^a] \n (0,1) +BE$ \na \na (0,2) +E (a)(b)(c) abc (0,3)(0,1)(1,2)(2,3) +BE xxx xxx (0,3) +E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 6, (0,6) +E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/7 (0,3) +E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 1,Feb 6 (5,11) +E3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))) x (0,1)(0,1)(0,1) +E3 ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))* xx (0,2)(1,2)(1,2) +E a?(ab|ba)* ababababababababababababababababababababababababababababababababababababababababa (0,81)(79,81) +E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabbbbaa (18,25) +E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabaa (18,22) +E aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac (7,11) +BE$ .* \x01\xff (0,2) +E aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa (53,57) +L aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa NOMATCH +E a*a*a*a*a*b aaaaaaaaab (0,10) +BE ^ NULL (0,0) +BE $ NULL (0,0) +BE ^$ NULL (0,0) +BE ^a$ a (0,1) +BE abc abc (0,3) +BE abc xabcy (1,4) +BE abc ababc (2,5) +BE ab*c abc (0,3) +BE ab*bc abc (0,3) +BE ab*bc abbc (0,4) +BE ab*bc abbbbc (0,6) +E ab+bc abbc (0,4) +E ab+bc abbbbc (0,6) +E ab?bc abbc (0,4) +E ab?bc abc (0,3) +E ab?c abc (0,3) +BE ^abc$ abc (0,3) +BE ^abc abcc (0,3) +BE abc$ aabc (1,4) +BE ^ abc (0,0) +BE $ abc (3,3) +BE a.c abc (0,3) +BE a.c axc (0,3) +BE a.*c axyzc (0,5) 
+BE a[bc]d abd (0,3) +BE a[b-d]e ace (0,3) +BE a[b-d] aac (1,3) +BE a[-b] a- (0,2) +BE a[b-] a- (0,2) +BE a] a] (0,2) +BE a[]]b a]b (0,3) +BE a[^bc]d aed (0,3) +BE a[^-b]c adc (0,3) +BE a[^]b]c adc (0,3) +E ab|cd abc (0,2) +E ab|cd abcd (0,2) +E a\(b a(b (0,3) +E a\(*b ab (0,2) +E a\(*b a((b (0,4) +E ((a)) abc (0,1)(0,1)(0,1) +E (a)b(c) abc (0,3)(0,1)(2,3) +E a+b+c aabbabc (4,7) +E a* aaa (0,3) +E (a*)* - (0,0)(0,0) +E (a*)+ - (0,0)(0,0) +E (a*|b)* - (0,0)(0,0) +E (a+|b)* ab (0,2)(1,2) +E (a+|b)+ ab (0,2)(1,2) +E (a+|b)? ab (0,1)(0,1) +BE [^ab]* cde (0,3) +E (^)* - (0,0)(0,0) +BE a* NULL (0,0) +E ([abc])*d abbbcd (0,6)(4,5) +E ([abc])*bcd abcd (0,4)(0,1) +E a|b|c|d|e e (0,1) +E (a|b|c|d|e)f ef (0,2)(0,1) +E ((a*|b))* - (0,0)(0,0)(0,0) +BE abcd*efg abcdefg (0,7) +BE ab* xabyabbbz (1,3) +BE ab* xayabbbz (1,2) +E (ab|cd)e abcde (2,5)(2,4) +BE [abhgefdc]ij hij (0,3) +E (a|b)c*d abcd (1,4)(1,2) +E (ab|ab*)bc abc (0,3)(0,1) +E a([bc]*)c* abc (0,3)(1,3) +E a([bc]*)(c*d) abcd (0,4)(1,3)(3,4) +E a([bc]+)(c*d) abcd (0,4)(1,3)(3,4) +E a([bc]*)(c+d) abcd (0,4)(1,2)(2,4) +E a[bcd]*dcdcde adcdcde (0,7) +E (ab|a)b*c abc (0,3)(0,2) +E ((a)(b)c)(d) abcd (0,4)(0,3)(0,1)(1,2)(3,4) +BE [A-Za-z_][A-Za-z0-9_]* alpha (0,5) +E ^a(bc+|b[eh])g|.h$ abh (1,3) +E (bc+d$|ef*g.|h?i(j|k)) effgz (0,5)(0,5) +E (bc+d$|ef*g.|h?i(j|k)) ij (0,2)(0,2)(1,2) +E (bc+d$|ef*g.|h?i(j|k)) reffgz (1,6)(1,6) +E (((((((((a))))))))) a (0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1) +BE multiple words multiple words yeah (0,14) +E (.*)c(.*) abcde (0,5)(0,2)(3,5) +BE abcd abcd (0,4) +E a(bc)d abcd (0,4)(1,3) +E a[-]?c ac (0,3) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qaddafi (0,15)(?,?)(10,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mo'ammar Gadhafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Kaddafi (0,15)(?,?)(10,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qadhafi (0,15)(?,?)(10,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gadafi (0,14)(?,?)(10,11) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadafi (0,15)(?,?)(11,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moamar Gaddafi (0,14)(?,?)(9,11) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadhdhafi (0,18)(?,?)(13,15) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Khaddafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafy (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghadafi (0,15)(?,?)(11,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muamar Kaddafi (0,14)(?,?)(9,11) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Quathafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gheddafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Khadafy (0,15)(?,?)(11,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Qudhafi (0,15)(?,?)(10,12) +E a+(b|c)*d+ aabcdd (0,6)(3,4) +E ^.+$ vivi (0,4) +E ^(.+)$ vivi (0,4)(0,4) +E ^([^!.]+).att.com!(.+)$ gryphon.att.com!eby (0,19)(0,7)(16,19) +E ^([^!]+!)?([^!]+)$ bas 
(0,3)(?,?)(0,3) +E ^([^!]+!)?([^!]+)$ bar!bas (0,7)(0,4)(4,7) +E ^([^!]+!)?([^!]+)$ foo!bas (0,7)(0,4)(4,7) +E ^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(4,8)(8,11) +E ((foo)|(bar))!bas bar!bas (0,7)(0,3)(?,?)(0,3) +E ((foo)|(bar))!bas foo!bar!bas (4,11)(4,7)(?,?)(4,7) +E ((foo)|(bar))!bas foo!bas (0,7)(0,3)(0,3) +E ((foo)|bar)!bas bar!bas (0,7)(0,3) +E ((foo)|bar)!bas foo!bar!bas (4,11)(4,7) +E ((foo)|bar)!bas foo!bas (0,7)(0,3)(0,3) +E (foo|(bar))!bas bar!bas (0,7)(0,3)(0,3) +E (foo|(bar))!bas foo!bar!bas (4,11)(4,7)(4,7) +E (foo|(bar))!bas foo!bas (0,7)(0,3) +E (foo|bar)!bas bar!bas (0,7)(0,3) +E (foo|bar)!bas foo!bar!bas (4,11)(4,7) +E (foo|bar)!bas foo!bas (0,7)(0,3) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11) +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bas (0,3)(?,?)(0,3) +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bar!bas (0,7)(0,4)(4,7) +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(?,?)(?,?)(4,8)(8,11) +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bas (0,7)(0,4)(4,7) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bas (0,3)(0,3)(?,?)(0,3) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bar!bas (0,7)(0,7)(0,4)(4,7) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bas (0,7)(0,7)(0,4)(4,7) +E .*(/XXX).* /XXX (0,4)(0,4) +E .*(\\XXX).* \XXX (0,4)(0,4) +E \\XXX \XXX (0,4) +E .*(/000).* /000 (0,4)(0,4) +E .*(\\000).* \000 (0,4)(0,4) +E \\000 \000 (0,4) diff --git a/src/regexp/testdata/nullsubexpr.dat b/src/regexp/testdata/nullsubexpr.dat new file mode 100644 index 0000000..68d9c99 --- /dev/null +++ b/src/regexp/testdata/nullsubexpr.dat @@ -0,0 +1,73 @@ +NOTE null subexpression matches : 2002-06-06 + +E (a*)* a (0,1)(0,1) +E SAME x (0,0)(0,0) +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E (a*)+ a (0,1)(0,1) +E SAME x (0,0)(0,0) +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E (a+)* a (0,1)(0,1) +E SAME x (0,0) +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E (a+)+ a (0,1)(0,1) +E SAME x NOMATCH +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) + +E ([a]*)* a (0,1)(0,1) +E SAME x (0,0)(0,0) +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E ([a]*)+ a (0,1)(0,1) +E SAME x (0,0)(0,0) +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E ([^b]*)* a (0,1)(0,1) +E SAME b (0,0)(0,0) +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaab (0,6)(0,6) +E ([ab]*)* a (0,1)(0,1) +E SAME aaaaaa (0,6)(0,6) +E SAME ababab (0,6)(0,6) +E SAME bababa (0,6)(0,6) +E SAME b (0,1)(0,1) +E SAME bbbbbb (0,6)(0,6) +E SAME aaaabcde (0,5)(0,5) +E ([^a]*)* b (0,1)(0,1) +E SAME bbbbbb (0,6)(0,6) +E SAME aaaaaa (0,0)(0,0) +E ([^ab]*)* ccccxx (0,6)(0,6) +E SAME ababab (0,0)(0,0) + +E ((z)+|a)* zabcde (0,2)(1,2) + +#{E a+? aaaaaa (0,1) no *? +? mimimal match ops +#E (a) aaa (0,1)(0,1) +#E (a*?) aaa (0,0)(0,0) +#E (a)*? aaa (0,0) +#E (a*?)*? 
aaa (0,0) +#} + +B \(a*\)*\(x\) x (0,1)(0,0)(0,1) +B \(a*\)*\(x\) ax (0,2)(0,1)(1,2) +B \(a*\)*\(x\) axa (0,2)(0,1)(1,2) +B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1) +B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2) +B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3) +B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4) +B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3) + +E (a*)*(x) x (0,1)(0,0)(0,1) +E (a*)*(x) ax (0,2)(0,1)(1,2) +E (a*)*(x) axa (0,2)(0,1)(1,2) + +E (a*)+(x) x (0,1)(0,0)(0,1) +E (a*)+(x) ax (0,2)(0,1)(1,2) +E (a*)+(x) axa (0,2)(0,1)(1,2) + +E (a*){2}(x) x (0,1)(0,0)(0,1) +E (a*){2}(x) ax (0,2)(1,1)(1,2) +E (a*){2}(x) axa (0,2)(1,1)(1,2) diff --git a/src/regexp/testdata/re2-exhaustive.txt.bz2 b/src/regexp/testdata/re2-exhaustive.txt.bz2 Binary files differnew file mode 100644 index 0000000..6638476 --- /dev/null +++ b/src/regexp/testdata/re2-exhaustive.txt.bz2 diff --git a/src/regexp/testdata/re2-search.txt b/src/regexp/testdata/re2-search.txt new file mode 100644 index 0000000..8c4098a --- /dev/null +++ b/src/regexp/testdata/re2-search.txt @@ -0,0 +1,3779 @@ +# RE2 basic search tests built by make log +# Wed May 12 12:13:22 EDT 2021 +Regexp.SearchTests +strings +"" +"a" +regexps +"a" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:a)$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:a)" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:a)$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"zyzzyva" +regexps +"a" +-;-;-;- +-;6-7;-;6-7 +"^(?:a)$" +-;-;-;- +-;-;-;- +"^(?:a)" +-;-;-;- +-;-;-;- +"(?:a)$" +-;-;-;- +-;6-7;-;6-7 +strings +"" +"aa" +regexps +"a+" +-;-;-;- +0-2;0-2;0-2;0-2 +"^(?:a+)$" +-;-;-;- +0-2;0-2;0-2;0-2 +"^(?:a+)" +-;-;-;- +0-2;0-2;0-2;0-2 +"(?:a+)$" +-;-;-;- +0-2;0-2;0-2;0-2 +strings +"" +"ab" +regexps +"(a+|b)+" +-;-;-;- +0-2 1-2;0-2 1-2;0-2 1-2;0-2 1-2 +"^(?:(a+|b)+)$" +-;-;-;- +0-2 1-2;0-2 1-2;0-2 1-2;0-2 1-2 +"^(?:(a+|b)+)" +-;-;-;- +0-2 1-2;0-2 1-2;0-2 1-2;0-2 1-2 +"(?:(a+|b)+)$" +-;-;-;- +0-2 1-2;0-2 1-2;0-2 1-2;0-2 1-2 +strings +"" +"xabcdx" +regexps +"ab|cd" +-;-;-;- +-;1-3;-;1-3 +"^(?:ab|cd)$" +-;-;-;- +-;-;-;- +"^(?:ab|cd)" +-;-;-;- +-;-;-;- +"(?:ab|cd)$" +-;-;-;- +-;-;-;- +strings +"" +"hello\ngoodbye\n" +regexps +"h.*od?" 
+-;-;-;- +-;0-5;-;0-5 +"^(?:h.*od?)$" +-;-;-;- +-;-;-;- +"^(?:h.*od?)" +-;-;-;- +-;0-5;-;0-5 +"(?:h.*od?)$" +-;-;-;- +-;-;-;- +strings +"" +"hello\ngoodbye\n" +regexps +"h.*o" +-;-;-;- +-;0-5;-;0-5 +"^(?:h.*o)$" +-;-;-;- +-;-;-;- +"^(?:h.*o)" +-;-;-;- +-;0-5;-;0-5 +"(?:h.*o)$" +-;-;-;- +-;-;-;- +strings +"" +"goodbye\nhello\n" +regexps +"h.*o" +-;-;-;- +-;8-13;-;8-13 +"^(?:h.*o)$" +-;-;-;- +-;-;-;- +"^(?:h.*o)" +-;-;-;- +-;-;-;- +"(?:h.*o)$" +-;-;-;- +-;-;-;- +strings +"" +"hello world" +regexps +"h.*o" +-;-;-;- +-;0-8;-;0-8 +"^(?:h.*o)$" +-;-;-;- +-;-;-;- +"^(?:h.*o)" +-;-;-;- +-;0-8;-;0-8 +"(?:h.*o)$" +-;-;-;- +-;-;-;- +strings +"" +"othello, world" +regexps +"h.*o" +-;-;-;- +-;2-11;-;2-11 +"^(?:h.*o)$" +-;-;-;- +-;-;-;- +"^(?:h.*o)" +-;-;-;- +-;-;-;- +"(?:h.*o)$" +-;-;-;- +-;-;-;- +strings +"" +"aaaaaaa" +regexps +"[^\\s\\S]" +-;-;-;- +-;-;-;- +"^(?:[^\\s\\S])$" +-;-;-;- +-;-;-;- +"^(?:[^\\s\\S])" +-;-;-;- +-;-;-;- +"(?:[^\\s\\S])$" +-;-;-;- +-;-;-;- +strings +"" +"aaaaaaa" +regexps +"a" +-;-;-;- +-;0-1;-;0-1 +"^(?:a)$" +-;-;-;- +-;-;-;- +"^(?:a)" +-;-;-;- +-;0-1;-;0-1 +"(?:a)$" +-;-;-;- +-;6-7;-;6-7 +strings +"" +"aaaaaaa" +regexps +"a*" +0-0;0-0;0-0;0-0 +0-7;0-7;0-7;0-7 +"^(?:a*)$" +0-0;0-0;0-0;0-0 +0-7;0-7;0-7;0-7 +"^(?:a*)" +0-0;0-0;0-0;0-0 +0-7;0-7;0-7;0-7 +"(?:a*)$" +0-0;0-0;0-0;0-0 +0-7;0-7;0-7;0-7 +strings +"" +"" +regexps +"a*" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:a*)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:a*)" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"(?:a*)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +strings +"" +"xabcdx" +regexps +"ab|cd" +-;-;-;- +-;1-3;-;1-3 +"^(?:ab|cd)$" +-;-;-;- +-;-;-;- +"^(?:ab|cd)" +-;-;-;- +-;-;-;- +"(?:ab|cd)$" +-;-;-;- +-;-;-;- +strings +"" +"cab" +regexps +"a" +-;-;-;- +-;1-2;-;1-2 +"^(?:a)$" +-;-;-;- +-;-;-;- +"^(?:a)" +-;-;-;- +-;-;-;- +"(?:a)$" +-;-;-;- +-;-;-;- +strings +"" +"cab" +regexps +"a*b" +-;-;-;- +-;1-3;-;1-3 +"^(?:a*b)$" +-;-;-;- +-;-;-;- +"^(?:a*b)" +-;-;-;- +-;-;-;- +"(?:a*b)$" +-;-;-;- +-;1-3;-;1-3 +strings +"" +"x" +regexps +"((((((((((((((((((((x))))))))))))))))))))" +-;-;-;- +0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1;0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1;0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1;0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 +"^(?:((((((((((((((((((((x)))))))))))))))))))))$" +-;-;-;- +0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1;0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1;0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1;0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 +"^(?:((((((((((((((((((((x)))))))))))))))))))))" +-;-;-;- +0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1;0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1;0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1;0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 +"(?:((((((((((((((((((((x)))))))))))))))))))))$" +-;-;-;- +0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1;0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1;0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1;0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 
0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 +strings +"" +"xxxabcdxxx" +regexps +"[abcd]" +-;-;-;- +-;3-4;-;3-4 +"^(?:[abcd])$" +-;-;-;- +-;-;-;- +"^(?:[abcd])" +-;-;-;- +-;-;-;- +"(?:[abcd])$" +-;-;-;- +-;-;-;- +strings +"" +"xxxabcdxxx" +regexps +"[^x]" +-;-;-;- +-;3-4;-;3-4 +"^(?:[^x])$" +-;-;-;- +-;-;-;- +"^(?:[^x])" +-;-;-;- +-;-;-;- +"(?:[^x])$" +-;-;-;- +-;-;-;- +strings +"" +"xxxabcdxxx" +regexps +"[abcd]+" +-;-;-;- +-;3-7;-;3-7 +"^(?:[abcd]+)$" +-;-;-;- +-;-;-;- +"^(?:[abcd]+)" +-;-;-;- +-;-;-;- +"(?:[abcd]+)$" +-;-;-;- +-;-;-;- +strings +"" +"xxxabcdxxx" +regexps +"[^x]+" +-;-;-;- +-;3-7;-;3-7 +"^(?:[^x]+)$" +-;-;-;- +-;-;-;- +"^(?:[^x]+)" +-;-;-;- +-;-;-;- +"(?:[^x]+)$" +-;-;-;- +-;-;-;- +strings +"" +"fo" +regexps +"(fo|foo)" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +"^(?:(fo|foo))$" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +"^(?:(fo|foo))" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +"(?:(fo|foo))$" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +strings +"" +"foo" +regexps +"(foo|fo)" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:(foo|fo))$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:(foo|fo))" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"(?:(foo|fo))$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +strings +"" +"aA" +regexps +"aa" +-;-;-;- +-;-;-;- +"^(?:aa)$" +-;-;-;- +-;-;-;- +"^(?:aa)" +-;-;-;- +-;-;-;- +"(?:aa)$" +-;-;-;- +-;-;-;- +strings +"" +"Aa" +regexps +"a" +-;-;-;- +-;1-2;-;1-2 +"^(?:a)$" +-;-;-;- +-;-;-;- +"^(?:a)" +-;-;-;- +-;-;-;- +"(?:a)$" +-;-;-;- +-;1-2;-;1-2 +strings +"" +"A" +regexps +"a" +-;-;-;- +-;-;-;- +"^(?:a)$" +-;-;-;- +-;-;-;- +"^(?:a)" +-;-;-;- +-;-;-;- +"(?:a)$" +-;-;-;- +-;-;-;- +strings +"" +"abc" +regexps +"ABC" +-;-;-;- +-;-;-;- +"^(?:ABC)$" +-;-;-;- +-;-;-;- +"^(?:ABC)" +-;-;-;- +-;-;-;- +"(?:ABC)$" +-;-;-;- +-;-;-;- +strings +"" +"XABCY" +regexps +"abc" +-;-;-;- +-;-;-;- +"^(?:abc)$" +-;-;-;- +-;-;-;- +"^(?:abc)" +-;-;-;- +-;-;-;- +"(?:abc)$" +-;-;-;- +-;-;-;- +strings +"" +"xabcy" +regexps +"ABC" +-;-;-;- +-;-;-;- +"^(?:ABC)$" +-;-;-;- +-;-;-;- +"^(?:ABC)" +-;-;-;- +-;-;-;- +"(?:ABC)$" +-;-;-;- +-;-;-;- +strings +"" +"foo" +regexps +"foo|bar|[A-Z]" +-;-;-;- +0-3;0-3;0-3;0-3 +"^(?:foo|bar|[A-Z])$" +-;-;-;- +0-3;0-3;0-3;0-3 +"^(?:foo|bar|[A-Z])" +-;-;-;- +0-3;0-3;0-3;0-3 +"(?:foo|bar|[A-Z])$" +-;-;-;- +0-3;0-3;0-3;0-3 +strings +"" +"foo" +regexps +"^(foo|bar|[A-Z])" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:^(foo|bar|[A-Z]))$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:^(foo|bar|[A-Z]))" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"(?:^(foo|bar|[A-Z]))$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +strings +"" +"foo\n" +regexps +"(foo|bar|[A-Z])$" +-;-;-;- +-;-;-;- +"^(?:(foo|bar|[A-Z])$)$" +-;-;-;- +-;-;-;- +"^(?:(foo|bar|[A-Z])$)" +-;-;-;- +-;-;-;- +"(?:(foo|bar|[A-Z])$)$" +-;-;-;- +-;-;-;- +strings +"" +"foo" +regexps +"(foo|bar|[A-Z])$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:(foo|bar|[A-Z])$)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:(foo|bar|[A-Z])$)" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"(?:(foo|bar|[A-Z])$)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +strings +"" +"foo\n" +regexps +"^(foo|bar|[A-Z])$" +-;-;-;- +-;-;-;- +"^(?:^(foo|bar|[A-Z])$)$" +-;-;-;- +-;-;-;- +"^(?:^(foo|bar|[A-Z])$)" +-;-;-;- +-;-;-;- +"(?:^(foo|bar|[A-Z])$)$" +-;-;-;- +-;-;-;- +strings +"" +"foo" +regexps +"^(foo|bar|[A-Z])$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:^(foo|bar|[A-Z])$)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:^(foo|bar|[A-Z])$)" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"(?:^(foo|bar|[A-Z])$)$" +-;-;-;- 
+0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +strings +"" +"bar" +regexps +"^(foo|bar|[A-Z])$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:^(foo|bar|[A-Z])$)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:^(foo|bar|[A-Z])$)" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"(?:^(foo|bar|[A-Z])$)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +strings +"" +"X" +regexps +"^(foo|bar|[A-Z])$" +-;-;-;- +0-1 0-1;0-1 0-1;0-1 0-1;0-1 0-1 +"^(?:^(foo|bar|[A-Z])$)$" +-;-;-;- +0-1 0-1;0-1 0-1;0-1 0-1;0-1 0-1 +"^(?:^(foo|bar|[A-Z])$)" +-;-;-;- +0-1 0-1;0-1 0-1;0-1 0-1;0-1 0-1 +"(?:^(foo|bar|[A-Z])$)$" +-;-;-;- +0-1 0-1;0-1 0-1;0-1 0-1;0-1 0-1 +strings +"" +"XY" +regexps +"^(foo|bar|[A-Z])$" +-;-;-;- +-;-;-;- +"^(?:^(foo|bar|[A-Z])$)$" +-;-;-;- +-;-;-;- +"^(?:^(foo|bar|[A-Z])$)" +-;-;-;- +-;-;-;- +"(?:^(foo|bar|[A-Z])$)$" +-;-;-;- +-;-;-;- +strings +"" +"fo" +regexps +"^(fo|foo)$" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +"^(?:^(fo|foo)$)$" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +"^(?:^(fo|foo)$)" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +"(?:^(fo|foo)$)$" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +strings +"" +"foo" +regexps +"^(fo|foo)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:^(fo|foo)$)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:^(fo|foo)$)" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"(?:^(fo|foo)$)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +strings +"" +"fo" +regexps +"^^(fo|foo)$" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +"^(?:^^(fo|foo)$)$" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +"^(?:^^(fo|foo)$)" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +"(?:^^(fo|foo)$)$" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +strings +"" +"foo" +regexps +"^^(fo|foo)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:^^(fo|foo)$)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:^^(fo|foo)$)" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"(?:^^(fo|foo)$)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +strings +"" +"" +regexps +"^$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^$)" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"(?:^$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +strings +"" +"x" +regexps +"^$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^$)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:^$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"" +regexps +"^^$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^^$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^^$)" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"(?:^^$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +strings +"" +"" +regexps +"^$$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^$$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^$$)" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"(?:^$$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +strings +"" +"x" +regexps +"^^$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^^$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^^$)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:^^$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"x" +regexps +"^$$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^$$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^$$)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:^$$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"" +regexps +"^^$$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^^$$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^^$$)" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"(?:^^$$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +strings +"" +"x" +regexps +"^^$$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^^$$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^^$$)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:^^$$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"" +regexps +"^^^^^^^^$$$$$$$$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^^^^^^^^$$$$$$$$)$" +0-0;0-0;0-0;0-0 
+0-0;0-0;0-0;0-0 +"^(?:^^^^^^^^$$$$$$$$)" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"(?:^^^^^^^^$$$$$$$$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +strings +"" +"x" +regexps +"^" +0-0;0-0;0-0;0-0 +-;0-0;-;0-0 +"^(?:^)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^)" +0-0;0-0;0-0;0-0 +-;0-0;-;0-0 +"(?:^)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"x" +regexps +"$" +0-0;0-0;0-0;0-0 +-;1-1;-;1-1 +"^(?:$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:$)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:$)$" +0-0;0-0;0-0;0-0 +-;1-1;-;1-1 +strings +"" +"nofoo foo that" +regexps +"\\bfoo\\b" +-;-;-;- +-;6-9;-;6-9 +"^(?:\\bfoo\\b)$" +-;-;-;- +-;-;-;- +"^(?:\\bfoo\\b)" +-;-;-;- +-;-;-;- +"(?:\\bfoo\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"faoa x" +regexps +"a\\b" +-;-;-;- +-;3-4;-;3-4 +"^(?:a\\b)$" +-;-;-;- +-;-;-;- +"^(?:a\\b)" +-;-;-;- +-;-;-;- +"(?:a\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"bar x" +regexps +"\\bbar" +-;-;-;- +-;0-3;-;0-3 +"^(?:\\bbar)$" +-;-;-;- +-;-;-;- +"^(?:\\bbar)" +-;-;-;- +-;0-3;-;0-3 +"(?:\\bbar)$" +-;-;-;- +-;-;-;- +strings +"" +"foo\nbar x" +regexps +"\\bbar" +-;-;-;- +-;4-7;-;4-7 +"^(?:\\bbar)$" +-;-;-;- +-;-;-;- +"^(?:\\bbar)" +-;-;-;- +-;-;-;- +"(?:\\bbar)$" +-;-;-;- +-;-;-;- +strings +"" +"foobar" +regexps +"bar\\b" +-;-;-;- +-;3-6;-;3-6 +"^(?:bar\\b)$" +-;-;-;- +-;-;-;- +"^(?:bar\\b)" +-;-;-;- +-;-;-;- +"(?:bar\\b)$" +-;-;-;- +-;3-6;-;3-6 +strings +"" +"foobar\nxxx" +regexps +"bar\\b" +-;-;-;- +-;3-6;-;3-6 +"^(?:bar\\b)$" +-;-;-;- +-;-;-;- +"^(?:bar\\b)" +-;-;-;- +-;-;-;- +"(?:bar\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"foo" +regexps +"(foo|bar|[A-Z])\\b" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:(foo|bar|[A-Z])\\b)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:(foo|bar|[A-Z])\\b)" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"(?:(foo|bar|[A-Z])\\b)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +strings +"" +"foo\n" +regexps +"(foo|bar|[A-Z])\\b" +-;-;-;- +-;0-3 0-3;-;0-3 0-3 +"^(?:(foo|bar|[A-Z])\\b)$" +-;-;-;- +-;-;-;- +"^(?:(foo|bar|[A-Z])\\b)" +-;-;-;- +-;0-3 0-3;-;0-3 0-3 +"(?:(foo|bar|[A-Z])\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"" +regexps +"\\b" +-;-;-;- +-;-;-;- +"^(?:\\b)$" +-;-;-;- +-;-;-;- +"^(?:\\b)" +-;-;-;- +-;-;-;- +"(?:\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"x" +regexps +"\\b" +-;-;-;- +-;0-0;-;0-0 +"^(?:\\b)$" +-;-;-;- +-;-;-;- +"^(?:\\b)" +-;-;-;- +-;0-0;-;0-0 +"(?:\\b)$" +-;-;-;- +-;1-1;-;1-1 +strings +"" +"foo" +regexps +"\\b(foo|bar|[A-Z])" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:\\b(foo|bar|[A-Z]))$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:\\b(foo|bar|[A-Z]))" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"(?:\\b(foo|bar|[A-Z]))$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +strings +"" +"X" +regexps +"\\b(foo|bar|[A-Z])\\b" +-;-;-;- +0-1 0-1;0-1 0-1;0-1 0-1;0-1 0-1 +"^(?:\\b(foo|bar|[A-Z])\\b)$" +-;-;-;- +0-1 0-1;0-1 0-1;0-1 0-1;0-1 0-1 +"^(?:\\b(foo|bar|[A-Z])\\b)" +-;-;-;- +0-1 0-1;0-1 0-1;0-1 0-1;0-1 0-1 +"(?:\\b(foo|bar|[A-Z])\\b)$" +-;-;-;- +0-1 0-1;0-1 0-1;0-1 0-1;0-1 0-1 +strings +"" +"XY" +regexps +"\\b(foo|bar|[A-Z])\\b" +-;-;-;- +-;-;-;- +"^(?:\\b(foo|bar|[A-Z])\\b)$" +-;-;-;- +-;-;-;- +"^(?:\\b(foo|bar|[A-Z])\\b)" +-;-;-;- +-;-;-;- +"(?:\\b(foo|bar|[A-Z])\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"bar" +regexps +"\\b(foo|bar|[A-Z])\\b" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:\\b(foo|bar|[A-Z])\\b)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:\\b(foo|bar|[A-Z])\\b)" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"(?:\\b(foo|bar|[A-Z])\\b)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +strings +"" +"foo" +regexps +"\\b(foo|bar|[A-Z])\\b" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 
0-3;0-3 0-3 +"^(?:\\b(foo|bar|[A-Z])\\b)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:\\b(foo|bar|[A-Z])\\b)" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"(?:\\b(foo|bar|[A-Z])\\b)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +strings +"" +"foo\n" +regexps +"\\b(foo|bar|[A-Z])\\b" +-;-;-;- +-;0-3 0-3;-;0-3 0-3 +"^(?:\\b(foo|bar|[A-Z])\\b)$" +-;-;-;- +-;-;-;- +"^(?:\\b(foo|bar|[A-Z])\\b)" +-;-;-;- +-;0-3 0-3;-;0-3 0-3 +"(?:\\b(foo|bar|[A-Z])\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"ffoo bbar N x" +regexps +"\\b(foo|bar|[A-Z])\\b" +-;-;-;- +-;10-11 10-11;-;10-11 10-11 +"^(?:\\b(foo|bar|[A-Z])\\b)$" +-;-;-;- +-;-;-;- +"^(?:\\b(foo|bar|[A-Z])\\b)" +-;-;-;- +-;-;-;- +"(?:\\b(foo|bar|[A-Z])\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"fo" +regexps +"\\b(fo|foo)\\b" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +"^(?:\\b(fo|foo)\\b)$" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +"^(?:\\b(fo|foo)\\b)" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +"(?:\\b(fo|foo)\\b)$" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +strings +"" +"foo" +regexps +"\\b(fo|foo)\\b" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:\\b(fo|foo)\\b)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:\\b(fo|foo)\\b)" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"(?:\\b(fo|foo)\\b)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +strings +"" +"" +regexps +"\\b\\b" +-;-;-;- +-;-;-;- +"^(?:\\b\\b)$" +-;-;-;- +-;-;-;- +"^(?:\\b\\b)" +-;-;-;- +-;-;-;- +"(?:\\b\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"x" +regexps +"\\b\\b" +-;-;-;- +-;0-0;-;0-0 +"^(?:\\b\\b)$" +-;-;-;- +-;-;-;- +"^(?:\\b\\b)" +-;-;-;- +-;0-0;-;0-0 +"(?:\\b\\b)$" +-;-;-;- +-;1-1;-;1-1 +strings +"" +"" +regexps +"\\b$" +-;-;-;- +-;-;-;- +"^(?:\\b$)$" +-;-;-;- +-;-;-;- +"^(?:\\b$)" +-;-;-;- +-;-;-;- +"(?:\\b$)$" +-;-;-;- +-;-;-;- +strings +"" +"x" +regexps +"\\b$" +-;-;-;- +-;1-1;-;1-1 +"^(?:\\b$)$" +-;-;-;- +-;-;-;- +"^(?:\\b$)" +-;-;-;- +-;-;-;- +"(?:\\b$)$" +-;-;-;- +-;1-1;-;1-1 +strings +"" +"y x" +regexps +"\\b$" +-;-;-;- +-;3-3;-;3-3 +"^(?:\\b$)$" +-;-;-;- +-;-;-;- +"^(?:\\b$)" +-;-;-;- +-;-;-;- +"(?:\\b$)$" +-;-;-;- +-;3-3;-;3-3 +strings +"" +"x" +regexps +"\\b.$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:\\b.$)$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:\\b.$)" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:\\b.$)$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"fo" +regexps +"^\\b(fo|foo)\\b" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +"^(?:^\\b(fo|foo)\\b)$" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +"^(?:^\\b(fo|foo)\\b)" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +"(?:^\\b(fo|foo)\\b)$" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +strings +"" +"foo" +regexps +"^\\b(fo|foo)\\b" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:^\\b(fo|foo)\\b)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:^\\b(fo|foo)\\b)" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"(?:^\\b(fo|foo)\\b)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +strings +"" +"" +regexps +"^\\b" +-;-;-;- +-;-;-;- +"^(?:^\\b)$" +-;-;-;- +-;-;-;- +"^(?:^\\b)" +-;-;-;- +-;-;-;- +"(?:^\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"x" +regexps +"^\\b" +-;-;-;- +-;0-0;-;0-0 +"^(?:^\\b)$" +-;-;-;- +-;-;-;- +"^(?:^\\b)" +-;-;-;- +-;0-0;-;0-0 +"(?:^\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"" +regexps +"^\\b\\b" +-;-;-;- +-;-;-;- +"^(?:^\\b\\b)$" +-;-;-;- +-;-;-;- +"^(?:^\\b\\b)" +-;-;-;- +-;-;-;- +"(?:^\\b\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"x" +regexps +"^\\b\\b" +-;-;-;- +-;0-0;-;0-0 +"^(?:^\\b\\b)$" +-;-;-;- +-;-;-;- +"^(?:^\\b\\b)" +-;-;-;- +-;0-0;-;0-0 +"(?:^\\b\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"" +regexps +"^\\b$" +-;-;-;- +-;-;-;- +"^(?:^\\b$)$" +-;-;-;- +-;-;-;- +"^(?:^\\b$)" 
+-;-;-;- +-;-;-;- +"(?:^\\b$)$" +-;-;-;- +-;-;-;- +strings +"" +"x" +regexps +"^\\b$" +-;-;-;- +-;-;-;- +"^(?:^\\b$)$" +-;-;-;- +-;-;-;- +"^(?:^\\b$)" +-;-;-;- +-;-;-;- +"(?:^\\b$)$" +-;-;-;- +-;-;-;- +strings +"" +"x" +regexps +"^\\b.$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:^\\b.$)$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:^\\b.$)" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:^\\b.$)$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"x" +regexps +"^\\b.\\b$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:^\\b.\\b$)$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:^\\b.\\b$)" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:^\\b.\\b$)$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"" +regexps +"^^^^^^^^\\b$$$$$$$" +-;-;-;- +-;-;-;- +"^(?:^^^^^^^^\\b$$$$$$$)$" +-;-;-;- +-;-;-;- +"^(?:^^^^^^^^\\b$$$$$$$)" +-;-;-;- +-;-;-;- +"(?:^^^^^^^^\\b$$$$$$$)$" +-;-;-;- +-;-;-;- +strings +"" +"x" +regexps +"^^^^^^^^\\b.$$$$$$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:^^^^^^^^\\b.$$$$$$)$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:^^^^^^^^\\b.$$$$$$)" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:^^^^^^^^\\b.$$$$$$)$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"x" +regexps +"^^^^^^^^\\b$$$$$$$" +-;-;-;- +-;-;-;- +"^(?:^^^^^^^^\\b$$$$$$$)$" +-;-;-;- +-;-;-;- +"^(?:^^^^^^^^\\b$$$$$$$)" +-;-;-;- +-;-;-;- +"(?:^^^^^^^^\\b$$$$$$$)$" +-;-;-;- +-;-;-;- +strings +"" +"n foo xfoox that" +regexps +"\\Bfoo\\B" +-;-;-;- +-;7-10;-;7-10 +"^(?:\\Bfoo\\B)$" +-;-;-;- +-;-;-;- +"^(?:\\Bfoo\\B)" +-;-;-;- +-;-;-;- +"(?:\\Bfoo\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"faoa x" +regexps +"a\\B" +-;-;-;- +-;1-2;-;1-2 +"^(?:a\\B)$" +-;-;-;- +-;-;-;- +"^(?:a\\B)" +-;-;-;- +-;-;-;- +"(?:a\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"bar x" +regexps +"\\Bbar" +-;-;-;- +-;-;-;- +"^(?:\\Bbar)$" +-;-;-;- +-;-;-;- +"^(?:\\Bbar)" +-;-;-;- +-;-;-;- +"(?:\\Bbar)$" +-;-;-;- +-;-;-;- +strings +"" +"foo\nbar x" +regexps +"\\Bbar" +-;-;-;- +-;-;-;- +"^(?:\\Bbar)$" +-;-;-;- +-;-;-;- +"^(?:\\Bbar)" +-;-;-;- +-;-;-;- +"(?:\\Bbar)$" +-;-;-;- +-;-;-;- +strings +"" +"foobar" +regexps +"bar\\B" +-;-;-;- +-;-;-;- +"^(?:bar\\B)$" +-;-;-;- +-;-;-;- +"^(?:bar\\B)" +-;-;-;- +-;-;-;- +"(?:bar\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"foobar\nxxx" +regexps +"bar\\B" +-;-;-;- +-;-;-;- +"^(?:bar\\B)$" +-;-;-;- +-;-;-;- +"^(?:bar\\B)" +-;-;-;- +-;-;-;- +"(?:bar\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"foox" +regexps +"(foo|bar|[A-Z])\\B" +-;-;-;- +-;0-3 0-3;-;0-3 0-3 +"^(?:(foo|bar|[A-Z])\\B)$" +-;-;-;- +-;-;-;- +"^(?:(foo|bar|[A-Z])\\B)" +-;-;-;- +-;0-3 0-3;-;0-3 0-3 +"(?:(foo|bar|[A-Z])\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"foo\n" +regexps +"(foo|bar|[A-Z])\\B" +-;-;-;- +-;-;-;- +"^(?:(foo|bar|[A-Z])\\B)$" +-;-;-;- +-;-;-;- +"^(?:(foo|bar|[A-Z])\\B)" +-;-;-;- +-;-;-;- +"(?:(foo|bar|[A-Z])\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"" +regexps +"\\B" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:\\B)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:\\B)" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"(?:\\B)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +strings +"" +"x" +regexps +"\\B" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:\\B)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:\\B)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:\\B)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"foo" +regexps +"\\B(foo|bar|[A-Z])" +-;-;-;- +-;-;-;- +"^(?:\\B(foo|bar|[A-Z]))$" +-;-;-;- +-;-;-;- +"^(?:\\B(foo|bar|[A-Z]))" +-;-;-;- +-;-;-;- +"(?:\\B(foo|bar|[A-Z]))$" +-;-;-;- +-;-;-;- +strings +"" +"xXy" +regexps +"\\B(foo|bar|[A-Z])\\B" +-;-;-;- +-;1-2 1-2;-;1-2 1-2 +"^(?:\\B(foo|bar|[A-Z])\\B)$" +-;-;-;- +-;-;-;- +"^(?:\\B(foo|bar|[A-Z])\\B)" +-;-;-;- +-;-;-;- +"(?:\\B(foo|bar|[A-Z])\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"XY" +regexps +"\\B(foo|bar|[A-Z])\\B" +-;-;-;- +-;-;-;- 
+"^(?:\\B(foo|bar|[A-Z])\\B)$" +-;-;-;- +-;-;-;- +"^(?:\\B(foo|bar|[A-Z])\\B)" +-;-;-;- +-;-;-;- +"(?:\\B(foo|bar|[A-Z])\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"XYZ" +regexps +"\\B(foo|bar|[A-Z])\\B" +-;-;-;- +-;1-2 1-2;-;1-2 1-2 +"^(?:\\B(foo|bar|[A-Z])\\B)$" +-;-;-;- +-;-;-;- +"^(?:\\B(foo|bar|[A-Z])\\B)" +-;-;-;- +-;-;-;- +"(?:\\B(foo|bar|[A-Z])\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"abara" +regexps +"\\B(foo|bar|[A-Z])\\B" +-;-;-;- +-;1-4 1-4;-;1-4 1-4 +"^(?:\\B(foo|bar|[A-Z])\\B)$" +-;-;-;- +-;-;-;- +"^(?:\\B(foo|bar|[A-Z])\\B)" +-;-;-;- +-;-;-;- +"(?:\\B(foo|bar|[A-Z])\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"xfoo_" +regexps +"\\B(foo|bar|[A-Z])\\B" +-;-;-;- +-;1-4 1-4;-;1-4 1-4 +"^(?:\\B(foo|bar|[A-Z])\\B)$" +-;-;-;- +-;-;-;- +"^(?:\\B(foo|bar|[A-Z])\\B)" +-;-;-;- +-;-;-;- +"(?:\\B(foo|bar|[A-Z])\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"xfoo\n" +regexps +"\\B(foo|bar|[A-Z])\\B" +-;-;-;- +-;-;-;- +"^(?:\\B(foo|bar|[A-Z])\\B)$" +-;-;-;- +-;-;-;- +"^(?:\\B(foo|bar|[A-Z])\\B)" +-;-;-;- +-;-;-;- +"(?:\\B(foo|bar|[A-Z])\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"foo bar vNx" +regexps +"\\B(foo|bar|[A-Z])\\B" +-;-;-;- +-;9-10 9-10;-;9-10 9-10 +"^(?:\\B(foo|bar|[A-Z])\\B)$" +-;-;-;- +-;-;-;- +"^(?:\\B(foo|bar|[A-Z])\\B)" +-;-;-;- +-;-;-;- +"(?:\\B(foo|bar|[A-Z])\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"xfoo" +regexps +"\\B(fo|foo)\\B" +-;-;-;- +-;1-3 1-3;-;1-3 1-3 +"^(?:\\B(fo|foo)\\B)$" +-;-;-;- +-;-;-;- +"^(?:\\B(fo|foo)\\B)" +-;-;-;- +-;-;-;- +"(?:\\B(fo|foo)\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"xfooo" +regexps +"\\B(foo|fo)\\B" +-;-;-;- +-;1-4 1-4;-;1-4 1-4 +"^(?:\\B(foo|fo)\\B)$" +-;-;-;- +-;-;-;- +"^(?:\\B(foo|fo)\\B)" +-;-;-;- +-;-;-;- +"(?:\\B(foo|fo)\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"" +regexps +"\\B\\B" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:\\B\\B)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:\\B\\B)" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"(?:\\B\\B)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +strings +"" +"x" +regexps +"\\B\\B" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:\\B\\B)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:\\B\\B)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:\\B\\B)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"" +regexps +"\\B$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:\\B$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:\\B$)" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"(?:\\B$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +strings +"" +"x" +regexps +"\\B$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:\\B$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:\\B$)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:\\B$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"y x" +regexps +"\\B$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:\\B$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:\\B$)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:\\B$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"x" +regexps +"\\B.$" +-;-;-;- +-;-;-;- +"^(?:\\B.$)$" +-;-;-;- +-;-;-;- +"^(?:\\B.$)" +-;-;-;- +-;-;-;- +"(?:\\B.$)$" +-;-;-;- +-;-;-;- +strings +"" +"fo" +regexps +"^\\B(fo|foo)\\B" +-;-;-;- +-;-;-;- +"^(?:^\\B(fo|foo)\\B)$" +-;-;-;- +-;-;-;- +"^(?:^\\B(fo|foo)\\B)" +-;-;-;- +-;-;-;- +"(?:^\\B(fo|foo)\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"foo" +regexps +"^\\B(fo|foo)\\B" +-;-;-;- +-;-;-;- +"^(?:^\\B(fo|foo)\\B)$" +-;-;-;- +-;-;-;- +"^(?:^\\B(fo|foo)\\B)" +-;-;-;- +-;-;-;- +"(?:^\\B(fo|foo)\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"" +regexps +"^\\B" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^\\B)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^\\B)" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"(?:^\\B)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +strings +"" +"x" +regexps +"^\\B" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^\\B)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^\\B)" +0-0;0-0;0-0;0-0 
+-;-;-;- +"(?:^\\B)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"" +regexps +"^\\B\\B" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^\\B\\B)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^\\B\\B)" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"(?:^\\B\\B)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +strings +"" +"x" +regexps +"^\\B\\B" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^\\B\\B)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^\\B\\B)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:^\\B\\B)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"" +regexps +"^\\B$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^\\B$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^\\B$)" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"(?:^\\B$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +strings +"" +"x" +regexps +"^\\B$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^\\B$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^\\B$)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:^\\B$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"x" +regexps +"^\\B.$" +-;-;-;- +-;-;-;- +"^(?:^\\B.$)$" +-;-;-;- +-;-;-;- +"^(?:^\\B.$)" +-;-;-;- +-;-;-;- +"(?:^\\B.$)$" +-;-;-;- +-;-;-;- +strings +"" +"x" +regexps +"^\\B.\\B$" +-;-;-;- +-;-;-;- +"^(?:^\\B.\\B$)$" +-;-;-;- +-;-;-;- +"^(?:^\\B.\\B$)" +-;-;-;- +-;-;-;- +"(?:^\\B.\\B$)$" +-;-;-;- +-;-;-;- +strings +"" +"" +regexps +"^^^^^^^^\\B$$$$$$$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^^^^^^^^\\B$$$$$$$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^^^^^^^^\\B$$$$$$$)" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"(?:^^^^^^^^\\B$$$$$$$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +strings +"" +"x" +regexps +"^^^^^^^^\\B.$$$$$$" +-;-;-;- +-;-;-;- +"^(?:^^^^^^^^\\B.$$$$$$)$" +-;-;-;- +-;-;-;- +"^(?:^^^^^^^^\\B.$$$$$$)" +-;-;-;- +-;-;-;- +"(?:^^^^^^^^\\B.$$$$$$)$" +-;-;-;- +-;-;-;- +strings +"" +"x" +regexps +"^^^^^^^^\\B$$$$$$$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^^^^^^^^\\B$$$$$$$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^^^^^^^^\\B$$$$$$$)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:^^^^^^^^\\B$$$$$$$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"x" +regexps +"\\bx\\b" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:\\bx\\b)$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:\\bx\\b)" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:\\bx\\b)$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"x>" +regexps +"\\bx\\b" +-;-;-;- +-;0-1;-;0-1 +"^(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +"^(?:\\bx\\b)" +-;-;-;- +-;0-1;-;0-1 +"(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"<x" +regexps +"\\bx\\b" +-;-;-;- +-;1-2;-;1-2 +"^(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +"^(?:\\bx\\b)" +-;-;-;- +-;-;-;- +"(?:\\bx\\b)$" +-;-;-;- +-;1-2;-;1-2 +strings +"" +"<x>" +regexps +"\\bx\\b" +-;-;-;- +-;1-2;-;1-2 +"^(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +"^(?:\\bx\\b)" +-;-;-;- +-;-;-;- +"(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"ax" +regexps +"\\bx\\b" +-;-;-;- +-;-;-;- +"^(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +"^(?:\\bx\\b)" +-;-;-;- +-;-;-;- +"(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"xb" +regexps +"\\bx\\b" +-;-;-;- +-;-;-;- +"^(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +"^(?:\\bx\\b)" +-;-;-;- +-;-;-;- +"(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"axb" +regexps +"\\bx\\b" +-;-;-;- +-;-;-;- +"^(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +"^(?:\\bx\\b)" +-;-;-;- +-;-;-;- +"(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"«x" +regexps +"\\bx\\b" +-;-;-;- +-;2-3;-;2-3 +"^(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +"^(?:\\bx\\b)" +-;-;-;- +-;-;-;- +"(?:\\bx\\b)$" +-;-;-;- +-;2-3;-;2-3 +strings +"" +"x»" +regexps +"\\bx\\b" +-;-;-;- +-;0-1;-;0-1 +"^(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +"^(?:\\bx\\b)" +-;-;-;- +-;0-1;-;0-1 +"(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"«x»" +regexps +"\\bx\\b" +-;-;-;- +-;2-3;-;2-3 +"^(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +"^(?:\\bx\\b)" +-;-;-;- +-;-;-;- +"(?:\\bx\\b)$" +-;-;-;- 
+-;-;-;- +strings +"" +"axb" +regexps +"\\bx\\b" +-;-;-;- +-;-;-;- +"^(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +"^(?:\\bx\\b)" +-;-;-;- +-;-;-;- +"(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"áxβ" +regexps +"\\bx\\b" +-;-;-;- +-;2-3;-;2-3 +"^(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +"^(?:\\bx\\b)" +-;-;-;- +-;-;-;- +"(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"axb" +regexps +"\\Bx\\B" +-;-;-;- +-;1-2;-;1-2 +"^(?:\\Bx\\B)$" +-;-;-;- +-;-;-;- +"^(?:\\Bx\\B)" +-;-;-;- +-;-;-;- +"(?:\\Bx\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"áxβ" +regexps +"\\Bx\\B" +-;-;-;- +-;-;-;- +"^(?:\\Bx\\B)$" +-;-;-;- +-;-;-;- +"^(?:\\Bx\\B)" +-;-;-;- +-;-;-;- +"(?:\\Bx\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"" +regexps +"^$^$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^$^$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^$^$)" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"(?:^$^$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +strings +"" +"" +regexps +"^$^" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^$^)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^$^)" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"(?:^$^)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +strings +"" +"" +regexps +"$^$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:$^$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:$^$)" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"(?:$^$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +strings +"" +"x" +regexps +"^$^$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^$^$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^$^$)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:^$^$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"x" +regexps +"^$^" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^$^)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^$^)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:^$^)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"x" +regexps +"$^$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:$^$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:$^$)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:$^$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"x\ny" +regexps +"^$^$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^$^$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^$^$)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:^$^$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"x\ny" +regexps +"^$^" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^$^)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^$^)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:^$^)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"x\ny" +regexps +"$^$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:$^$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:$^$)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:$^$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"x\n\ny" +regexps +"^$^$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^$^$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^$^$)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:^$^$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"x\n\ny" +regexps +"^$^" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^$^)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^$^)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:^$^)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"x\n\ny" +regexps +"$^$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:$^$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:$^$)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:$^$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"foo$bar" +regexps +"^(foo\\$)$" +-;-;-;- +-;-;-;- +"^(?:^(foo\\$)$)$" +-;-;-;- +-;-;-;- +"^(?:^(foo\\$)$)" +-;-;-;- +-;-;-;- +"(?:^(foo\\$)$)$" +-;-;-;- +-;-;-;- +strings +"" +"foo$bar" +regexps +"(foo\\$)" +-;-;-;- +-;0-4 0-4;-;0-4 0-4 +"^(?:(foo\\$))$" +-;-;-;- +-;-;-;- +"^(?:(foo\\$))" +-;-;-;- +-;0-4 0-4;-;0-4 0-4 +"(?:(foo\\$))$" +-;-;-;- +-;-;-;- +strings +"" +"abc" +regexps +"^...$" +-;-;-;- +0-3;0-3;0-3;0-3 +"^(?:^...$)$" +-;-;-;- +0-3;0-3;0-3;0-3 +"^(?:^...$)" +-;-;-;- +0-3;0-3;0-3;0-3 +"(?:^...$)$" +-;-;-;- +0-3;0-3;0-3;0-3 +strings +"" +"本" +regexps +"^本$" +-;-;-;- +0-3;0-3;0-3;0-3 +"^(?:^本$)$" +-;-;-;- +0-3;0-3;0-3;0-3 
+"^(?:^本$)" +-;-;-;- +0-3;0-3;0-3;0-3 +"(?:^本$)$" +-;-;-;- +0-3;0-3;0-3;0-3 +strings +"" +"日本語" +regexps +"^...$" +-;-;-;- +0-9;0-9;0-9;0-9 +"^(?:^...$)$" +-;-;-;- +0-9;0-9;0-9;0-9 +"^(?:^...$)" +-;-;-;- +0-9;0-9;0-9;0-9 +"(?:^...$)$" +-;-;-;- +0-9;0-9;0-9;0-9 +strings +"" +".本." +regexps +"^...$" +-;-;-;- +0-5;0-5;0-5;0-5 +"^(?:^...$)$" +-;-;-;- +0-5;0-5;0-5;0-5 +"^(?:^...$)" +-;-;-;- +0-5;0-5;0-5;0-5 +"(?:^...$)$" +-;-;-;- +0-5;0-5;0-5;0-5 +strings +"" +"本" +regexps +"^\\C\\C\\C$" +-;-;-;- +0-3;0-3;0-3;0-3 +"^(?:^\\C\\C\\C$)$" +-;-;-;- +0-3;0-3;0-3;0-3 +"^(?:^\\C\\C\\C$)" +-;-;-;- +0-3;0-3;0-3;0-3 +"(?:^\\C\\C\\C$)$" +-;-;-;- +0-3;0-3;0-3;0-3 +strings +"" +"本" +regexps +"^\\C$" +-;-;-;- +-;-;-;- +"^(?:^\\C$)$" +-;-;-;- +-;-;-;- +"^(?:^\\C$)" +-;-;-;- +-;-;-;- +"(?:^\\C$)$" +-;-;-;- +-;-;-;- +strings +"" +"日本語" +regexps +"^\\C\\C\\C$" +-;-;-;- +-;-;-;- +"^(?:^\\C\\C\\C$)$" +-;-;-;- +-;-;-;- +"^(?:^\\C\\C\\C$)" +-;-;-;- +-;-;-;- +"(?:^\\C\\C\\C$)$" +-;-;-;- +-;-;-;- +strings +"" +"日本語" +regexps +"^...$" +-;-;-;- +0-9;0-9;0-9;0-9 +"^(?:^...$)$" +-;-;-;- +0-9;0-9;0-9;0-9 +"^(?:^...$)" +-;-;-;- +0-9;0-9;0-9;0-9 +"(?:^...$)$" +-;-;-;- +0-9;0-9;0-9;0-9 +strings +"" +"日本語" +regexps +"^.........$" +-;-;-;- +-;-;-;- +"^(?:^.........$)$" +-;-;-;- +-;-;-;- +"^(?:^.........$)" +-;-;-;- +-;-;-;- +"(?:^.........$)$" +-;-;-;- +-;-;-;- +strings +"" +".本." +regexps +"^...$" +-;-;-;- +0-5;0-5;0-5;0-5 +"^(?:^...$)$" +-;-;-;- +0-5;0-5;0-5;0-5 +"^(?:^...$)" +-;-;-;- +0-5;0-5;0-5;0-5 +"(?:^...$)$" +-;-;-;- +0-5;0-5;0-5;0-5 +strings +"" +".本." +regexps +"^.....$" +-;-;-;- +-;-;-;- +"^(?:^.....$)$" +-;-;-;- +-;-;-;- +"^(?:^.....$)" +-;-;-;- +-;-;-;- +"(?:^.....$)$" +-;-;-;- +-;-;-;- +strings +"" +"xfooo" +regexps +"\\B(fo|foo)\\B" +-;-;-;- +-;1-3 1-3;-;1-4 1-4 +"^(?:\\B(fo|foo)\\B)$" +-;-;-;- +-;-;-;- +"^(?:\\B(fo|foo)\\B)" +-;-;-;- +-;-;-;- +"(?:\\B(fo|foo)\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"foo" +regexps +"(fo|foo)" +-;-;-;- +0-3 0-3;0-2 0-2;0-3 0-3;0-3 0-3 +"^(?:(fo|foo))$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:(fo|foo))" +-;-;-;- +0-3 0-3;0-2 0-2;0-3 0-3;0-3 0-3 +"(?:(fo|foo))$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +strings +"" +"a" +regexps +"\\141" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:\\141)$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:\\141)" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:\\141)$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"0" +regexps +"\\060" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:\\060)$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:\\060)" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:\\060)$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"00" +regexps +"\\0600" +-;-;-;- +0-2;0-2;0-2;0-2 +"^(?:\\0600)$" +-;-;-;- +0-2;0-2;0-2;0-2 +"^(?:\\0600)" +-;-;-;- +0-2;0-2;0-2;0-2 +"(?:\\0600)$" +-;-;-;- +0-2;0-2;0-2;0-2 +strings +"" +"08" +regexps +"\\608" +-;-;-;- +0-2;0-2;0-2;0-2 +"^(?:\\608)$" +-;-;-;- +0-2;0-2;0-2;0-2 +"^(?:\\608)" +-;-;-;- +0-2;0-2;0-2;0-2 +"(?:\\608)$" +-;-;-;- +0-2;0-2;0-2;0-2 +strings +"" +"" +regexps +"\\01" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:\\01)$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:\\01)" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:\\01)$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"8" +regexps +"\\018" +-;-;-;- +0-2;0-2;0-2;0-2 +"^(?:\\018)$" +-;-;-;- +0-2;0-2;0-2;0-2 +"^(?:\\018)" +-;-;-;- +0-2;0-2;0-2;0-2 +"(?:\\018)$" +-;-;-;- +0-2;0-2;0-2;0-2 +strings +"" +"a" +regexps +"\\x{61}" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:\\x{61})$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:\\x{61})" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:\\x{61})$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"a" +regexps +"\\x61" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:\\x61)$" +-;-;-;- +0-1;0-1;0-1;0-1 
+"^(?:\\x61)" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:\\x61)$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"a" +regexps +"\\x{00000061}" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:\\x{00000061})$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:\\x{00000061})" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:\\x{00000061})$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"aαβb" +regexps +"\\p{Greek}+" +-;-;-;- +-;1-5;-;1-5 +"^(?:\\p{Greek}+)$" +-;-;-;- +-;-;-;- +"^(?:\\p{Greek}+)" +-;-;-;- +-;-;-;- +"(?:\\p{Greek}+)$" +-;-;-;- +-;-;-;- +strings +"" +"aαβb" +regexps +"\\P{Greek}+" +-;-;-;- +-;0-1;-;0-1 +"^(?:\\P{Greek}+)$" +-;-;-;- +-;-;-;- +"^(?:\\P{Greek}+)" +-;-;-;- +-;0-1;-;0-1 +"(?:\\P{Greek}+)$" +-;-;-;- +-;5-6;-;5-6 +strings +"" +"aαβb" +regexps +"\\p{^Greek}+" +-;-;-;- +-;0-1;-;0-1 +"^(?:\\p{^Greek}+)$" +-;-;-;- +-;-;-;- +"^(?:\\p{^Greek}+)" +-;-;-;- +-;0-1;-;0-1 +"(?:\\p{^Greek}+)$" +-;-;-;- +-;5-6;-;5-6 +strings +"" +"aαβb" +regexps +"\\P{^Greek}+" +-;-;-;- +-;1-5;-;1-5 +"^(?:\\P{^Greek}+)$" +-;-;-;- +-;-;-;- +"^(?:\\P{^Greek}+)" +-;-;-;- +-;-;-;- +"(?:\\P{^Greek}+)$" +-;-;-;- +-;-;-;- +strings +"" +"abc123" +regexps +"[^0-9]+" +-;-;-;- +-;0-3;-;0-3 +"^(?:[^0-9]+)$" +-;-;-;- +-;-;-;- +"^(?:[^0-9]+)" +-;-;-;- +-;0-3;-;0-3 +"(?:[^0-9]+)$" +-;-;-;- +-;-;-;- +strings +"" +"abc123²³¼½¾₀₉" +regexps +"\\p{Nd}+" +-;-;-;- +-;3-6;-;3-6 +"^(?:\\p{Nd}+)$" +-;-;-;- +-;-;-;- +"^(?:\\p{Nd}+)" +-;-;-;- +-;-;-;- +"(?:\\p{Nd}+)$" +-;-;-;- +-;-;-;- +strings +"" +"abc123²³¼½¾₀₉" +regexps +"\\p{^Nd}+" +-;-;-;- +-;0-3;-;0-3 +"^(?:\\p{^Nd}+)$" +-;-;-;- +-;-;-;- +"^(?:\\p{^Nd}+)" +-;-;-;- +-;0-3;-;0-3 +"(?:\\p{^Nd}+)$" +-;-;-;- +-;6-22;-;6-22 +strings +"" +"abc123²³¼½¾₀₉" +regexps +"\\P{Nd}+" +-;-;-;- +-;0-3;-;0-3 +"^(?:\\P{Nd}+)$" +-;-;-;- +-;-;-;- +"^(?:\\P{Nd}+)" +-;-;-;- +-;0-3;-;0-3 +"(?:\\P{Nd}+)$" +-;-;-;- +-;6-22;-;6-22 +strings +"" +"abc123²³¼½¾₀₉" +regexps +"\\P{^Nd}+" +-;-;-;- +-;3-6;-;3-6 +"^(?:\\P{^Nd}+)$" +-;-;-;- +-;-;-;- +"^(?:\\P{^Nd}+)" +-;-;-;- +-;-;-;- +"(?:\\P{^Nd}+)$" +-;-;-;- +-;-;-;- +strings +"" +"abc123²³¼½¾₀₉" +regexps +"\\pN+" +-;-;-;- +-;3-22;-;3-22 +"^(?:\\pN+)$" +-;-;-;- +-;-;-;- +"^(?:\\pN+)" +-;-;-;- +-;-;-;- +"(?:\\pN+)$" +-;-;-;- +-;3-22;-;3-22 +strings +"" +"abc123²³¼½¾₀₉" +regexps +"\\p{N}+" +-;-;-;- +-;3-22;-;3-22 +"^(?:\\p{N}+)$" +-;-;-;- +-;-;-;- +"^(?:\\p{N}+)" +-;-;-;- +-;-;-;- +"(?:\\p{N}+)$" +-;-;-;- +-;3-22;-;3-22 +strings +"" +"abc123²³¼½¾₀₉" +regexps +"\\p{^N}+" +-;-;-;- +-;0-3;-;0-3 +"^(?:\\p{^N}+)$" +-;-;-;- +-;-;-;- +"^(?:\\p{^N}+)" +-;-;-;- +-;0-3;-;0-3 +"(?:\\p{^N}+)$" +-;-;-;- +-;-;-;- +strings +"" +"abc123" +regexps +"\\p{Any}+" +-;-;-;- +0-6;0-6;0-6;0-6 +"^(?:\\p{Any}+)$" +-;-;-;- +0-6;0-6;0-6;0-6 +"^(?:\\p{Any}+)" +-;-;-;- +0-6;0-6;0-6;0-6 +"(?:\\p{Any}+)$" +-;-;-;- +0-6;0-6;0-6;0-6 +strings +"" +"@AaB" +regexps +"(?i)[@-A]+" +-;-;-;- +-;0-3;-;0-3 +"^(?:(?i)[@-A]+)$" +-;-;-;- +-;-;-;- +"^(?:(?i)[@-A]+)" +-;-;-;- +-;0-3;-;0-3 +"(?:(?i)[@-A]+)$" +-;-;-;- +-;-;-;- +strings +"" +"aAzZ" +regexps +"(?i)[A-Z]+" +-;-;-;- +0-4;0-4;0-4;0-4 +"^(?:(?i)[A-Z]+)$" +-;-;-;- +0-4;0-4;0-4;0-4 +"^(?:(?i)[A-Z]+)" +-;-;-;- +0-4;0-4;0-4;0-4 +"(?:(?i)[A-Z]+)$" +-;-;-;- +0-4;0-4;0-4;0-4 +strings +"" +"Aa\\" +regexps +"(?i)[^\\\\]+" +-;-;-;- +-;0-2;-;0-2 +"^(?:(?i)[^\\\\]+)$" +-;-;-;- +-;-;-;- +"^(?:(?i)[^\\\\]+)" +-;-;-;- +-;0-2;-;0-2 +"(?:(?i)[^\\\\]+)$" +-;-;-;- +-;-;-;- +strings +"" +"acegikmoqsuwyACEGIKMOQSUWY" +regexps +"(?i)[acegikmoqsuwy]+" +-;-;-;- +0-26;0-26;0-26;0-26 +"^(?:(?i)[acegikmoqsuwy]+)$" +-;-;-;- +0-26;0-26;0-26;0-26 +"^(?:(?i)[acegikmoqsuwy]+)" +-;-;-;- +0-26;0-26;0-26;0-26 +"(?:(?i)[acegikmoqsuwy]+)$" +-;-;-;- 
+0-26;0-26;0-26;0-26 +strings +"" +"@AaB" +regexps +"[@-A]+" +-;-;-;- +-;0-2;-;0-2 +"^(?:[@-A]+)$" +-;-;-;- +-;-;-;- +"^(?:[@-A]+)" +-;-;-;- +-;0-2;-;0-2 +"(?:[@-A]+)$" +-;-;-;- +-;-;-;- +strings +"" +"aAzZ" +regexps +"[A-Z]+" +-;-;-;- +-;1-2;-;1-2 +"^(?:[A-Z]+)$" +-;-;-;- +-;-;-;- +"^(?:[A-Z]+)" +-;-;-;- +-;-;-;- +"(?:[A-Z]+)$" +-;-;-;- +-;3-4;-;3-4 +strings +"" +"Aa\\" +regexps +"[^\\\\]+" +-;-;-;- +-;0-2;-;0-2 +"^(?:[^\\\\]+)$" +-;-;-;- +-;-;-;- +"^(?:[^\\\\]+)" +-;-;-;- +-;0-2;-;0-2 +"(?:[^\\\\]+)$" +-;-;-;- +-;-;-;- +strings +"" +"acegikmoqsuwyACEGIKMOQSUWY" +regexps +"[acegikmoqsuwy]+" +-;-;-;- +-;0-13;-;0-13 +"^(?:[acegikmoqsuwy]+)$" +-;-;-;- +-;-;-;- +"^(?:[acegikmoqsuwy]+)" +-;-;-;- +-;0-13;-;0-13 +"(?:[acegikmoqsuwy]+)$" +-;-;-;- +-;-;-;- +strings +"" +"abcdef" +regexps +"^abc" +-;-;-;- +-;0-3;-;0-3 +"^(?:^abc)$" +-;-;-;- +-;-;-;- +"^(?:^abc)" +-;-;-;- +-;0-3;-;0-3 +"(?:^abc)$" +-;-;-;- +-;-;-;- +strings +"" +"aabcdef" +regexps +"^abc" +-;-;-;- +-;-;-;- +"^(?:^abc)$" +-;-;-;- +-;-;-;- +"^(?:^abc)" +-;-;-;- +-;-;-;- +"(?:^abc)$" +-;-;-;- +-;-;-;- +strings +"" +"abcdef" +regexps +"^[ay]*[bx]+c" +-;-;-;- +-;0-3;-;0-3 +"^(?:^[ay]*[bx]+c)$" +-;-;-;- +-;-;-;- +"^(?:^[ay]*[bx]+c)" +-;-;-;- +-;0-3;-;0-3 +"(?:^[ay]*[bx]+c)$" +-;-;-;- +-;-;-;- +strings +"" +"aabcdef" +regexps +"^[ay]*[bx]+c" +-;-;-;- +-;0-4;-;0-4 +"^(?:^[ay]*[bx]+c)$" +-;-;-;- +-;-;-;- +"^(?:^[ay]*[bx]+c)" +-;-;-;- +-;0-4;-;0-4 +"(?:^[ay]*[bx]+c)$" +-;-;-;- +-;-;-;- +strings +"" +"abcdef" +regexps +"def$" +-;-;-;- +-;3-6;-;3-6 +"^(?:def$)$" +-;-;-;- +-;-;-;- +"^(?:def$)" +-;-;-;- +-;-;-;- +"(?:def$)$" +-;-;-;- +-;3-6;-;3-6 +strings +"" +"abcdeff" +regexps +"def$" +-;-;-;- +-;-;-;- +"^(?:def$)$" +-;-;-;- +-;-;-;- +"^(?:def$)" +-;-;-;- +-;-;-;- +"(?:def$)$" +-;-;-;- +-;-;-;- +strings +"" +"abcdef" +regexps +"d[ex][fy]$" +-;-;-;- +-;3-6;-;3-6 +"^(?:d[ex][fy]$)$" +-;-;-;- +-;-;-;- +"^(?:d[ex][fy]$)" +-;-;-;- +-;-;-;- +"(?:d[ex][fy]$)$" +-;-;-;- +-;3-6;-;3-6 +strings +"" +"abcdeff" +regexps +"d[ex][fy]$" +-;-;-;- +-;-;-;- +"^(?:d[ex][fy]$)$" +-;-;-;- +-;-;-;- +"^(?:d[ex][fy]$)" +-;-;-;- +-;-;-;- +"(?:d[ex][fy]$)$" +-;-;-;- +-;-;-;- +strings +"" +"abcdef" +regexps +"[dz][ex][fy]$" +-;-;-;- +-;3-6;-;3-6 +"^(?:[dz][ex][fy]$)$" +-;-;-;- +-;-;-;- +"^(?:[dz][ex][fy]$)" +-;-;-;- +-;-;-;- +"(?:[dz][ex][fy]$)$" +-;-;-;- +-;3-6;-;3-6 +strings +"" +"abcdeff" +regexps +"[dz][ex][fy]$" +-;-;-;- +-;-;-;- +"^(?:[dz][ex][fy]$)$" +-;-;-;- +-;-;-;- +"^(?:[dz][ex][fy]$)" +-;-;-;- +-;-;-;- +"(?:[dz][ex][fy]$)$" +-;-;-;- +-;-;-;- +strings +"" +"abcdef" +regexps +"(?m)^abc" +-;-;-;- +-;0-3;-;0-3 +"^(?:(?m)^abc)$" +-;-;-;- +-;-;-;- +"^(?:(?m)^abc)" +-;-;-;- +-;0-3;-;0-3 +"(?:(?m)^abc)$" +-;-;-;- +-;-;-;- +strings +"" +"aabcdef" +regexps +"(?m)^abc" +-;-;-;- +-;-;-;- +"^(?:(?m)^abc)$" +-;-;-;- +-;-;-;- +"^(?:(?m)^abc)" +-;-;-;- +-;-;-;- +"(?:(?m)^abc)$" +-;-;-;- +-;-;-;- +strings +"" +"abcdef" +regexps +"(?m)^[ay]*[bx]+c" +-;-;-;- +-;0-3;-;0-3 +"^(?:(?m)^[ay]*[bx]+c)$" +-;-;-;- +-;-;-;- +"^(?:(?m)^[ay]*[bx]+c)" +-;-;-;- +-;0-3;-;0-3 +"(?:(?m)^[ay]*[bx]+c)$" +-;-;-;- +-;-;-;- +strings +"" +"aabcdef" +regexps +"(?m)^[ay]*[bx]+c" +-;-;-;- +-;0-4;-;0-4 +"^(?:(?m)^[ay]*[bx]+c)$" +-;-;-;- +-;-;-;- +"^(?:(?m)^[ay]*[bx]+c)" +-;-;-;- +-;0-4;-;0-4 +"(?:(?m)^[ay]*[bx]+c)$" +-;-;-;- +-;-;-;- +strings +"" +"abcdef" +regexps +"(?m)def$" +-;-;-;- +-;3-6;-;3-6 +"^(?:(?m)def$)$" +-;-;-;- +-;-;-;- +"^(?:(?m)def$)" +-;-;-;- +-;-;-;- +"(?:(?m)def$)$" +-;-;-;- +-;3-6;-;3-6 +strings +"" +"abcdeff" +regexps +"(?m)def$" +-;-;-;- +-;-;-;- +"^(?:(?m)def$)$" +-;-;-;- +-;-;-;- 
+"^(?:(?m)def$)" +-;-;-;- +-;-;-;- +"(?:(?m)def$)$" +-;-;-;- +-;-;-;- +strings +"" +"abcdef" +regexps +"(?m)d[ex][fy]$" +-;-;-;- +-;3-6;-;3-6 +"^(?:(?m)d[ex][fy]$)$" +-;-;-;- +-;-;-;- +"^(?:(?m)d[ex][fy]$)" +-;-;-;- +-;-;-;- +"(?:(?m)d[ex][fy]$)$" +-;-;-;- +-;3-6;-;3-6 +strings +"" +"abcdeff" +regexps +"(?m)d[ex][fy]$" +-;-;-;- +-;-;-;- +"^(?:(?m)d[ex][fy]$)$" +-;-;-;- +-;-;-;- +"^(?:(?m)d[ex][fy]$)" +-;-;-;- +-;-;-;- +"(?:(?m)d[ex][fy]$)$" +-;-;-;- +-;-;-;- +strings +"" +"abcdef" +regexps +"(?m)[dz][ex][fy]$" +-;-;-;- +-;3-6;-;3-6 +"^(?:(?m)[dz][ex][fy]$)$" +-;-;-;- +-;-;-;- +"^(?:(?m)[dz][ex][fy]$)" +-;-;-;- +-;-;-;- +"(?:(?m)[dz][ex][fy]$)$" +-;-;-;- +-;3-6;-;3-6 +strings +"" +"abcdeff" +regexps +"(?m)[dz][ex][fy]$" +-;-;-;- +-;-;-;- +"^(?:(?m)[dz][ex][fy]$)$" +-;-;-;- +-;-;-;- +"^(?:(?m)[dz][ex][fy]$)" +-;-;-;- +-;-;-;- +"(?:(?m)[dz][ex][fy]$)$" +-;-;-;- +-;-;-;- +strings +"" +"a" +regexps +"^" +0-0;0-0;0-0;0-0 +-;0-0;-;0-0 +"^(?:^)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^)" +0-0;0-0;0-0;0-0 +-;0-0;-;0-0 +"(?:^)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"a" +regexps +"^^" +0-0;0-0;0-0;0-0 +-;0-0;-;0-0 +"^(?:^^)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^^)" +0-0;0-0;0-0;0-0 +-;0-0;-;0-0 +"(?:^^)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"a" +regexps +"a" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:a)$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:a)" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:a)$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"a" +regexps +"ab*" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:ab*)$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:ab*)" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:ab*)$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"a" +regexps +"a\\C*" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:a\\C*)$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:a\\C*)" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:a\\C*)$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"a" +regexps +"a\\C+" +-;-;-;- +-;-;-;- +"^(?:a\\C+)$" +-;-;-;- +-;-;-;- +"^(?:a\\C+)" +-;-;-;- +-;-;-;- +"(?:a\\C+)$" +-;-;-;- +-;-;-;- +strings +"" +"a" +regexps +"a\\C?" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:a\\C?)$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:a\\C?)" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:a\\C?)$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"a" +regexps +"a\\C*?" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:a\\C*?)$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:a\\C*?)" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:a\\C*?)$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"a" +regexps +"a\\C+?" +-;-;-;- +-;-;-;- +"^(?:a\\C+?)$" +-;-;-;- +-;-;-;- +"^(?:a\\C+?)" +-;-;-;- +-;-;-;- +"(?:a\\C+?)$" +-;-;-;- +-;-;-;- +strings +"" +"a" +regexps +"a\\C??" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:a\\C??)$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:a\\C??)" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:a\\C??)$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"baba" +regexps +"a\\C*|ba\\C" +-;-;-;- +-;0-3;-;0-3 +"^(?:a\\C*|ba\\C)$" +-;-;-;- +-;-;-;- +"^(?:a\\C*|ba\\C)" +-;-;-;- +-;0-3;-;0-3 +"(?:a\\C*|ba\\C)$" +-;-;-;- +-;1-4;-;1-4 +strings +"" +"Inc." 
+regexps +"\\w*I\\w*" +-;-;-;- +-;0-3;-;0-3 +"^(?:\\w*I\\w*)$" +-;-;-;- +-;-;-;- +"^(?:\\w*I\\w*)" +-;-;-;- +-;0-3;-;0-3 +"(?:\\w*I\\w*)$" +-;-;-;- +-;-;-;- +strings +"" +"aaa" +regexps +"(?:|a)*" +0-0;0-0;0-0;0-0 +0-3;0-0;0-3;0-3 +"^(?:(?:|a)*)$" +0-0;0-0;0-0;0-0 +0-3;0-3;0-3;0-3 +"^(?:(?:|a)*)" +0-0;0-0;0-0;0-0 +0-3;0-0;0-3;0-3 +"(?:(?:|a)*)$" +0-0;0-0;0-0;0-0 +0-3;0-3;0-3;0-3 +strings +"" +"aaa" +regexps +"(?:|a)+" +0-0;0-0;0-0;0-0 +0-3;0-0;0-3;0-3 +"^(?:(?:|a)+)$" +0-0;0-0;0-0;0-0 +0-3;0-3;0-3;0-3 +"^(?:(?:|a)+)" +0-0;0-0;0-0;0-0 +0-3;0-0;0-3;0-3 +"(?:(?:|a)+)$" +0-0;0-0;0-0;0-0 +0-3;0-3;0-3;0-3 diff --git a/src/regexp/testdata/repetition.dat b/src/regexp/testdata/repetition.dat new file mode 100644 index 0000000..e6361f5 --- /dev/null +++ b/src/regexp/testdata/repetition.dat @@ -0,0 +1,163 @@ +NOTE implicit vs. explicit repetitions : 2009-02-02 + +# Glenn Fowler <gsf@research.att.com> +# conforming matches (column 4) must match one of the following BREs +# NOMATCH +# (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)* +# (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)* +# i.e., each 3-tuple has two identical elements and one (?,?) + +E ((..)|(.)) NULL NOMATCH +E ((..)|(.))((..)|(.)) NULL NOMATCH +E ((..)|(.))((..)|(.))((..)|(.)) NULL NOMATCH + +E ((..)|(.)){1} NULL NOMATCH +E ((..)|(.)){2} NULL NOMATCH +E ((..)|(.)){3} NULL NOMATCH + +E ((..)|(.))* NULL (0,0) + +E ((..)|(.)) a (0,1)(0,1)(?,?)(0,1) +E ((..)|(.))((..)|(.)) a NOMATCH +E ((..)|(.))((..)|(.))((..)|(.)) a NOMATCH + +E ((..)|(.)){1} a (0,1)(0,1)(?,?)(0,1) +E ((..)|(.)){2} a NOMATCH +E ((..)|(.)){3} a NOMATCH + +E ((..)|(.))* a (0,1)(0,1)(?,?)(0,1) + +E ((..)|(.)) aa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aa (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2) +E ((..)|(.))((..)|(.))((..)|(.)) aa NOMATCH + +E ((..)|(.)){1} aa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.)){2} aa (0,2)(1,2)(?,?)(1,2) +E ((..)|(.)){3} aa NOMATCH + +E ((..)|(.))* aa (0,2)(0,2)(0,2)(?,?) + +E ((..)|(.)) aaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaa (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3) +E ((..)|(.))((..)|(.))((..)|(.)) aaa (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3) + +E ((..)|(.)){1} aaa (0,2)(0,2)(0,2)(?,?) +#E ((..)|(.)){2} aaa (0,3)(2,3)(?,?)(2,3) +E ((..)|(.)){2} aaa (0,3)(2,3)(0,2)(2,3) RE2/Go +E ((..)|(.)){3} aaa (0,3)(2,3)(?,?)(2,3) + +#E ((..)|(.))* aaa (0,3)(2,3)(?,?)(2,3) +E ((..)|(.))* aaa (0,3)(2,3)(0,2)(2,3) RE2/Go + +E ((..)|(.)) aaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) +E ((..)|(.))((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4) + +E ((..)|(.)){1} aaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.)){2} aaaa (0,4)(2,4)(2,4)(?,?) +#E ((..)|(.)){3} aaaa (0,4)(3,4)(?,?)(3,4) +E ((..)|(.)){3} aaaa (0,4)(3,4)(0,2)(3,4) RE2/Go + +E ((..)|(.))* aaaa (0,4)(2,4)(2,4)(?,?) + +E ((..)|(.)) aaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) +E ((..)|(.))((..)|(.))((..)|(.)) aaaaa (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5) + +E ((..)|(.)){1} aaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.)){2} aaaaa (0,4)(2,4)(2,4)(?,?) +#E ((..)|(.)){3} aaaaa (0,5)(4,5)(?,?)(4,5) +E ((..)|(.)){3} aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go + +#E ((..)|(.))* aaaaa (0,5)(4,5)(?,?)(4,5) +E ((..)|(.))* aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go + +E ((..)|(.)) aaaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) +E ((..)|(.))((..)|(.))((..)|(.)) aaaaaa (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?) + +E ((..)|(.)){1} aaaaaa (0,2)(0,2)(0,2)(?,?) 
+E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?) +E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?) + +E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?) + +NOTE additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02 + +# These test a bug in OS X / FreeBSD / NetBSD, and libtree. +# Linux/GLIBC gets the {8,} and {8,8} wrong. + +:HA#100:E X(.?){0,}Y X1234567Y (0,9)(7,8) +:HA#101:E X(.?){1,}Y X1234567Y (0,9)(7,8) +:HA#102:E X(.?){2,}Y X1234567Y (0,9)(7,8) +:HA#103:E X(.?){3,}Y X1234567Y (0,9)(7,8) +:HA#104:E X(.?){4,}Y X1234567Y (0,9)(7,8) +:HA#105:E X(.?){5,}Y X1234567Y (0,9)(7,8) +:HA#106:E X(.?){6,}Y X1234567Y (0,9)(7,8) +:HA#107:E X(.?){7,}Y X1234567Y (0,9)(7,8) +:HA#108:E X(.?){8,}Y X1234567Y (0,9)(8,8) +#:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(7,8) +:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(7,8) +:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(7,8) +:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(7,8) +:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(7,8) +:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(7,8) +:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(7,8) +:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(7,8) +:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(8,8) RE2/Go +:HA#118:E X(.?){8,8}Y X1234567Y (0,9)(8,8) + +# These test a fixed bug in my regex-tdfa that did not keep the expanded +# form properly grouped, so right association did the wrong thing with +# these ambiguous patterns (crafted just to test my code when I became +# suspicious of my implementation). The first subexpression should use +# "ab" then "a" then "bcd". + +# OS X / FreeBSD / NetBSD badly fail many of these, with impossible +# results like (0,6)(4,5)(6,6). + +:HA#260:E (a|ab|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#261:E (a|ab|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#262:E (a|ab|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#263:E (a|ab|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#264:E (a|ab|c|bcd){4,}(d*) ababcd NOMATCH +:HA#265:E (a|ab|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#266:E (a|ab|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#267:E (a|ab|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#268:E (a|ab|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#269:E (a|ab|c|bcd){4,10}(d*) ababcd NOMATCH +:HA#270:E (a|ab|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6) +:HA#271:E (a|ab|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6) + +# The above worked on Linux/GLIBC but the following often fail. 
+# They also trip up OS X / FreeBSD / NetBSD: + +#:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +:HA#284:E (ab|a|c|bcd){4,}(d*) ababcd NOMATCH +#:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +:HA#289:E (ab|a|c|bcd){4,10}(d*) ababcd NOMATCH +#:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6) +:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6) +:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(4,5)(5,6) RE2/Go diff --git a/src/regexp/testdata/testregex.c b/src/regexp/testdata/testregex.c new file mode 100644 index 0000000..37545d0 --- /dev/null +++ b/src/regexp/testdata/testregex.c @@ -0,0 +1,2286 @@ +#pragma prototyped noticed + +/* + * regex(3) test harness + * + * build: cc -o testregex testregex.c + * help: testregex --man + * note: REG_* features are detected by #ifdef; if REG_* are enums + * then supply #define REG_foo REG_foo for each enum REG_foo + * + * Glenn Fowler <gsf@research.att.com> + * AT&T Research + * + * PLEASE: publish your tests so everyone can benefit + * + * The following license covers testregex.c and all associated test data. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of THIS SOFTWARE FILE (the "Software"), to deal in the Software + * without restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, and/or sell copies of the + * Software, and to permit persons to whom the Software is furnished to do + * so, subject to the following disclaimer: + * + * THIS SOFTWARE IS PROVIDED BY AT&T ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL AT&T BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +static const char id[] = "\n@(#)$Id: testregex (AT&T Research) 2010-06-10 $\0\n"; + +#if _PACKAGE_ast +#include <ast.h> +#else +#include <sys/types.h> +#endif + +#include <stdio.h> +#include <regex.h> +#include <ctype.h> +#include <setjmp.h> +#include <signal.h> +#include <string.h> +#include <unistd.h> + +#ifdef __STDC__ +#include <stdlib.h> +#include <locale.h> +#endif + +#ifndef RE_DUP_MAX +#define RE_DUP_MAX 32767 +#endif + +#if !_PACKAGE_ast +#undef REG_DISCIPLINE +#endif + +#ifndef REG_DELIMITED +#undef _REG_subcomp +#endif + +#define TEST_ARE 0x00000001 +#define TEST_BRE 0x00000002 +#define TEST_ERE 0x00000004 +#define TEST_KRE 0x00000008 +#define TEST_LRE 0x00000010 +#define TEST_SRE 0x00000020 + +#define TEST_EXPAND 0x00000100 +#define TEST_LENIENT 0x00000200 + +#define TEST_QUERY 0x00000400 +#define TEST_SUB 0x00000800 +#define TEST_UNSPECIFIED 0x00001000 +#define TEST_VERIFY 0x00002000 +#define TEST_AND 0x00004000 +#define TEST_OR 0x00008000 + +#define TEST_DELIMIT 0x00010000 +#define TEST_OK 0x00020000 +#define TEST_SAME 0x00040000 + +#define TEST_ACTUAL 0x00100000 +#define TEST_BASELINE 0x00200000 +#define TEST_FAIL 0x00400000 +#define TEST_PASS 0x00800000 +#define TEST_SUMMARY 0x01000000 + +#define TEST_IGNORE_ERROR 0x02000000 +#define TEST_IGNORE_OVER 0x04000000 +#define TEST_IGNORE_POSITION 0x08000000 + +#define TEST_CATCH 0x10000000 +#define TEST_VERBOSE 0x20000000 + +#define TEST_DECOMP 0x40000000 + +#define TEST_GLOBAL (TEST_ACTUAL|TEST_AND|TEST_BASELINE|TEST_CATCH|TEST_FAIL|TEST_IGNORE_ERROR|TEST_IGNORE_OVER|TEST_IGNORE_POSITION|TEST_OR|TEST_PASS|TEST_SUMMARY|TEST_VERBOSE) + +#ifdef REG_DISCIPLINE + + +#include <stk.h> + +typedef struct Disc_s +{ + regdisc_t disc; + int ordinal; + Sfio_t* sp; +} Disc_t; + +static void* +compf(const regex_t* re, const char* xstr, size_t xlen, regdisc_t* disc) +{ + Disc_t* dp = (Disc_t*)disc; + + return (void*)((char*)0 + ++dp->ordinal); +} + +static int +execf(const regex_t* re, void* data, const char* xstr, size_t xlen, const char* sstr, size_t slen, char** snxt, regdisc_t* disc) +{ + Disc_t* dp = (Disc_t*)disc; + + sfprintf(dp->sp, "{%-.*s}(%lu:%d)", xlen, xstr, (char*)data - (char*)0, slen); + return atoi(xstr); +} + +static void* +resizef(void* handle, void* data, size_t size) +{ + if (!size) + return 0; + return stkalloc((Sfio_t*)handle, size); +} + +#endif + +#ifndef NiL +#ifdef __STDC__ +#define NiL 0 +#else +#define NiL (char*)0 +#endif +#endif + +#define H(x) do{if(html)fprintf(stderr,x);}while(0) +#define T(x) fprintf(stderr,x) + +static void +help(int html) +{ +H("<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML//EN\">\n"); +H("<HTML>\n"); +H("<HEAD>\n"); +H("<TITLE>testregex man document</TITLE>\n"); +H("</HEAD>\n"); +H("<BODY bgcolor=white>\n"); +H("<PRE>\n"); +T("NAME\n"); +T(" testregex - regex(3) test harness\n"); +T("\n"); +T("SYNOPSIS\n"); +T(" testregex [ options ]\n"); +T("\n"); +T("DESCRIPTION\n"); +T(" testregex reads regex(3) test specifications, one per line, from the\n"); +T(" standard input and writes one output line for each failed test. A\n"); +T(" summary line is written after all tests are done. Each successful\n"); +T(" test is run again with REG_NOSUB. 
Unsupported features are noted\n"); +T(" before the first test, and tests requiring these features are\n"); +T(" silently ignored.\n"); +T("\n"); +T("OPTIONS\n"); +T(" -c catch signals and non-terminating calls\n"); +T(" -e ignore error return mismatches\n"); +T(" -h list help on standard error\n"); +T(" -n do not repeat successful tests with regnexec()\n"); +T(" -o ignore match[] overrun errors\n"); +T(" -p ignore negative position mismatches\n"); +T(" -s use stack instead of malloc\n"); +T(" -x do not repeat successful tests with REG_NOSUB\n"); +T(" -v list each test line\n"); +T(" -A list failed test lines with actual answers\n"); +T(" -B list all test lines with actual answers\n"); +T(" -F list failed test lines\n"); +T(" -P list passed test lines\n"); +T(" -S output one summary line\n"); +T("\n"); +T("INPUT FORMAT\n"); +T(" Input lines may be blank, a comment beginning with #, or a test\n"); +T(" specification. A specification is five fields separated by one\n"); +T(" or more tabs. NULL denotes the empty string and NIL denotes the\n"); +T(" 0 pointer.\n"); +T("\n"); +T(" Field 1: the regex(3) flags to apply, one character per REG_feature\n"); +T(" flag. The test is skipped if REG_feature is not supported by the\n"); +T(" implementation. If the first character is not [BEASKLP] then the\n"); +T(" specification is a global control line. One or more of [BEASKLP] may be\n"); +T(" specified; the test will be repeated for each mode.\n"); +T("\n"); +T(" B basic BRE (grep, ed, sed)\n"); +T(" E REG_EXTENDED ERE (egrep)\n"); +T(" A REG_AUGMENTED ARE (egrep with negation)\n"); +T(" S REG_SHELL SRE (sh glob)\n"); +T(" K REG_SHELL|REG_AUGMENTED KRE (ksh glob)\n"); +T(" L REG_LITERAL LRE (fgrep)\n"); +T("\n"); +T(" a REG_LEFT|REG_RIGHT implicit ^...$\n"); +T(" b REG_NOTBOL lhs does not match ^\n"); +T(" c REG_COMMENT ignore space and #...\\n\n"); +T(" d REG_SHELL_DOT explicit leading . match\n"); +T(" e REG_NOTEOL rhs does not match $\n"); +T(" f REG_MULTIPLE multiple \\n separated patterns\n"); +T(" g FNM_LEADING_DIR testfnmatch only -- match until /\n"); +T(" h REG_MULTIREF multiple digit backref\n"); +T(" i REG_ICASE ignore case\n"); +T(" j REG_SPAN . matches \\n\n"); +T(" k REG_ESCAPE \\ to ecape [...] delimiter\n"); +T(" l REG_LEFT implicit ^...\n"); +T(" m REG_MINIMAL minimal match\n"); +T(" n REG_NEWLINE explicit \\n match\n"); +T(" o REG_ENCLOSED (|&) magic inside [@|&](...)\n"); +T(" p REG_SHELL_PATH explicit / match\n"); +T(" q REG_DELIMITED delimited pattern\n"); +T(" r REG_RIGHT implicit ...$\n"); +T(" s REG_SHELL_ESCAPED \\ not special\n"); +T(" t REG_MUSTDELIM all delimiters must be specified\n"); +T(" u standard unspecified behavior -- errors not counted\n"); +T(" v REG_CLASS_ESCAPE \\ special inside [...]\n"); +T(" w REG_NOSUB no subexpression match array\n"); +T(" x REG_LENIENT let some errors slide\n"); +T(" y REG_LEFT regexec() implicit ^...\n"); +T(" z REG_NULL NULL subexpressions ok\n"); +T(" $ expand C \\c escapes in fields 2 and 3\n"); +T(" / field 2 is a regsubcomp() expression\n"); +T(" = field 3 is a regdecomp() expression\n"); +T("\n"); +T(" Field 1 control lines:\n"); +T("\n"); +T(" C set LC_COLLATE and LC_CTYPE to locale in field 2\n"); +T("\n"); +T(" ?test ... output field 5 if passed and != EXPECTED, silent otherwise\n"); +T(" &test ... output field 5 if current and previous passed\n"); +T(" |test ... output field 5 if current passed and previous failed\n"); +T(" ; ... output field 2 if previous failed\n"); +T(" {test ... 
skip if failed until }\n"); +T(" } end of skip\n"); +T("\n"); +T(" : comment comment copied as output NOTE\n"); +T(" :comment:test :comment: ignored\n"); +T(" N[OTE] comment comment copied as output NOTE\n"); +T(" T[EST] comment comment\n"); +T("\n"); +T(" number use number for nmatch (20 by default)\n"); +T("\n"); +T(" Field 2: the regular expression pattern; SAME uses the pattern from\n"); +T(" the previous specification. RE_DUP_MAX inside {...} expands to the\n"); +T(" value from <limits.h>.\n"); +T("\n"); +T(" Field 3: the string to match. X...{RE_DUP_MAX} expands to RE_DUP_MAX\n"); +T(" copies of X.\n"); +T("\n"); +T(" Field 4: the test outcome. This is either one of the posix error\n"); +T(" codes (with REG_ omitted) or the match array, a list of (m,n)\n"); +T(" entries with m and n being first and last+1 positions in the\n"); +T(" field 3 string, or NULL if REG_NOSUB is in effect and success\n"); +T(" is expected. BADPAT is acceptable in place of any regcomp(3)\n"); +T(" error code. The match[] array is initialized to (-2,-2) before\n"); +T(" each test. All array elements from 0 to nmatch-1 must be specified\n"); +T(" in the outcome. Unspecified endpoints (offset -1) are denoted by ?.\n"); +T(" Unset endpoints (offset -2) are denoted by X. {x}(o:n) denotes a\n"); +T(" matched (?{...}) expression, where x is the text enclosed by {...},\n"); +T(" o is the expression ordinal counting from 1, and n is the length of\n"); +T(" the unmatched portion of the subject string. If x starts with a\n"); +T(" number then that is the return value of re_execf(), otherwise 0 is\n"); +T(" returned. RE_DUP_MAX[-+]N expands to the <limits.h> value -+N.\n"); +T("\n"); +T(" Field 5: optional comment appended to the report.\n"); +T("\n"); +T("CAVEAT\n"); +T(" If a regex implementation misbehaves with memory then all bets are off.\n"); +T("\n"); +T("CONTRIBUTORS\n"); +T(" Glenn Fowler gsf@research.att.com (ksh strmatch, regex extensions)\n"); +T(" David Korn dgk@research.att.com (ksh glob matcher)\n"); +T(" Doug McIlroy mcilroy@dartmouth.edu (ast regex/testre in C++)\n"); +T(" Tom Lord lord@regexps.com (rx tests)\n"); +T(" Henry Spencer henry@zoo.toronto.edu (original public regex)\n"); +T(" Andrew Hume andrew@research.att.com (gre tests)\n"); +T(" John Maddock John_Maddock@compuserve.com (regex++ tests)\n"); +T(" Philip Hazel ph10@cam.ac.uk (pcre tests)\n"); +T(" Ville Laurikari vl@iki.fi (libtre tests)\n"); +H("</PRE>\n"); +H("</BODY>\n"); +H("</HTML>\n"); +} + +#ifndef elementsof +#define elementsof(x) (sizeof(x)/sizeof(x[0])) +#endif + +#ifndef streq +#define streq(a,b) (*(a)==*(b)&&!strcmp(a,b)) +#endif + +#define HUNG 2 +#define NOTEST (~0) + +#ifndef REG_TEST_DEFAULT +#define REG_TEST_DEFAULT 0 +#endif + +#ifndef REG_EXEC_DEFAULT +#define REG_EXEC_DEFAULT 0 +#endif + +static const char* unsupported[] = +{ + "BASIC", +#ifndef REG_EXTENDED + "EXTENDED", +#endif +#ifndef REG_AUGMENTED + "AUGMENTED", +#endif +#ifndef REG_SHELL + "SHELL", +#endif + +#ifndef REG_CLASS_ESCAPE + "CLASS_ESCAPE", +#endif +#ifndef REG_COMMENT + "COMMENT", +#endif +#ifndef REG_DELIMITED + "DELIMITED", +#endif +#ifndef REG_DISCIPLINE + "DISCIPLINE", +#endif +#ifndef REG_ESCAPE + "ESCAPE", +#endif +#ifndef REG_ICASE + "ICASE", +#endif +#ifndef REG_LEFT + "LEFT", +#endif +#ifndef REG_LENIENT + "LENIENT", +#endif +#ifndef REG_LITERAL + "LITERAL", +#endif +#ifndef REG_MINIMAL + "MINIMAL", +#endif +#ifndef REG_MULTIPLE + "MULTIPLE", +#endif +#ifndef REG_MULTIREF + "MULTIREF", +#endif +#ifndef REG_MUSTDELIM + "MUSTDELIM", +#endif 
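+/*
+ * Note: a feature name is compiled into unsupported[] only when the
+ * corresponding REG_* macro is absent from the local <regex.h>; the
+ * harness notes this list before the first test and silently ignores
+ * any test line that requires a missing feature.
+ */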
+#ifndef REG_NEWLINE + "NEWLINE", +#endif +#ifndef REG_NOTBOL + "NOTBOL", +#endif +#ifndef REG_NOTEOL + "NOTEOL", +#endif +#ifndef REG_NULL + "NULL", +#endif +#ifndef REG_RIGHT + "RIGHT", +#endif +#ifndef REG_SHELL_DOT + "SHELL_DOT", +#endif +#ifndef REG_SHELL_ESCAPED + "SHELL_ESCAPED", +#endif +#ifndef REG_SHELL_GROUP + "SHELL_GROUP", +#endif +#ifndef REG_SHELL_PATH + "SHELL_PATH", +#endif +#ifndef REG_SPAN + "SPAN", +#endif +#if REG_NOSUB & REG_TEST_DEFAULT + "SUBMATCH", +#endif +#if !_REG_nexec + "regnexec", +#endif +#if !_REG_subcomp + "regsubcomp", +#endif +#if !_REG_decomp + "redecomp", +#endif + 0 +}; + +#ifndef REG_CLASS_ESCAPE +#define REG_CLASS_ESCAPE NOTEST +#endif +#ifndef REG_COMMENT +#define REG_COMMENT NOTEST +#endif +#ifndef REG_DELIMITED +#define REG_DELIMITED NOTEST +#endif +#ifndef REG_ESCAPE +#define REG_ESCAPE NOTEST +#endif +#ifndef REG_ICASE +#define REG_ICASE NOTEST +#endif +#ifndef REG_LEFT +#define REG_LEFT NOTEST +#endif +#ifndef REG_LENIENT +#define REG_LENIENT 0 +#endif +#ifndef REG_MINIMAL +#define REG_MINIMAL NOTEST +#endif +#ifndef REG_MULTIPLE +#define REG_MULTIPLE NOTEST +#endif +#ifndef REG_MULTIREF +#define REG_MULTIREF NOTEST +#endif +#ifndef REG_MUSTDELIM +#define REG_MUSTDELIM NOTEST +#endif +#ifndef REG_NEWLINE +#define REG_NEWLINE NOTEST +#endif +#ifndef REG_NOTBOL +#define REG_NOTBOL NOTEST +#endif +#ifndef REG_NOTEOL +#define REG_NOTEOL NOTEST +#endif +#ifndef REG_NULL +#define REG_NULL NOTEST +#endif +#ifndef REG_RIGHT +#define REG_RIGHT NOTEST +#endif +#ifndef REG_SHELL_DOT +#define REG_SHELL_DOT NOTEST +#endif +#ifndef REG_SHELL_ESCAPED +#define REG_SHELL_ESCAPED NOTEST +#endif +#ifndef REG_SHELL_GROUP +#define REG_SHELL_GROUP NOTEST +#endif +#ifndef REG_SHELL_PATH +#define REG_SHELL_PATH NOTEST +#endif +#ifndef REG_SPAN +#define REG_SPAN NOTEST +#endif + +#define REG_UNKNOWN (-1) + +#ifndef REG_ENEWLINE +#define REG_ENEWLINE (REG_UNKNOWN-1) +#endif +#ifndef REG_ENULL +#ifndef REG_EMPTY +#define REG_ENULL (REG_UNKNOWN-2) +#else +#define REG_ENULL REG_EMPTY +#endif +#endif +#ifndef REG_ECOUNT +#define REG_ECOUNT (REG_UNKNOWN-3) +#endif +#ifndef REG_BADESC +#define REG_BADESC (REG_UNKNOWN-4) +#endif +#ifndef REG_EMEM +#define REG_EMEM (REG_UNKNOWN-5) +#endif +#ifndef REG_EHUNG +#define REG_EHUNG (REG_UNKNOWN-6) +#endif +#ifndef REG_EBUS +#define REG_EBUS (REG_UNKNOWN-7) +#endif +#ifndef REG_EFAULT +#define REG_EFAULT (REG_UNKNOWN-8) +#endif +#ifndef REG_EFLAGS +#define REG_EFLAGS (REG_UNKNOWN-9) +#endif +#ifndef REG_EDELIM +#define REG_EDELIM (REG_UNKNOWN-9) +#endif + +static const struct { int code; char* name; } codes[] = +{ + REG_UNKNOWN, "UNKNOWN", + REG_NOMATCH, "NOMATCH", + REG_BADPAT, "BADPAT", + REG_ECOLLATE, "ECOLLATE", + REG_ECTYPE, "ECTYPE", + REG_EESCAPE, "EESCAPE", + REG_ESUBREG, "ESUBREG", + REG_EBRACK, "EBRACK", + REG_EPAREN, "EPAREN", + REG_EBRACE, "EBRACE", + REG_BADBR, "BADBR", + REG_ERANGE, "ERANGE", + REG_ESPACE, "ESPACE", + REG_BADRPT, "BADRPT", + REG_ENEWLINE, "ENEWLINE", + REG_ENULL, "ENULL", + REG_ECOUNT, "ECOUNT", + REG_BADESC, "BADESC", + REG_EMEM, "EMEM", + REG_EHUNG, "EHUNG", + REG_EBUS, "EBUS", + REG_EFAULT, "EFAULT", + REG_EFLAGS, "EFLAGS", + REG_EDELIM, "EDELIM", +}; + +static struct +{ + regmatch_t NOMATCH; + int errors; + int extracted; + int ignored; + int lineno; + int passed; + int signals; + int unspecified; + int verify; + int warnings; + char* file; + char* stack; + char* which; + jmp_buf gotcha; +#ifdef REG_DISCIPLINE + Disc_t disc; +#endif +} state; + +static void +quote(char* s, int len, unsigned long 
test) +{ + unsigned char* u = (unsigned char*)s; + unsigned char* e; + int c; +#ifdef MB_CUR_MAX + int w; +#endif + + if (!u) + printf("NIL"); + else if (!*u && len <= 1) + printf("NULL"); + else if (test & TEST_EXPAND) + { + if (len < 0) + len = strlen((char*)u); + e = u + len; + if (test & TEST_DELIMIT) + printf("\""); + while (u < e) + switch (c = *u++) + { + case '\\': + printf("\\\\"); + break; + case '"': + if (test & TEST_DELIMIT) + printf("\\\""); + else + printf("\""); + break; + case '\a': + printf("\\a"); + break; + case '\b': + printf("\\b"); + break; + case 033: + printf("\\e"); + break; + case '\f': + printf("\\f"); + break; + case '\n': + printf("\\n"); + break; + case '\r': + printf("\\r"); + break; + case '\t': + printf("\\t"); + break; + case '\v': + printf("\\v"); + break; + default: +#ifdef MB_CUR_MAX + s = (char*)u - 1; + if ((w = mblen(s, (char*)e - s)) > 1) + { + u += w - 1; + fwrite(s, 1, w, stdout); + } + else +#endif + if (!iscntrl(c) && isprint(c)) + putchar(c); + else + printf("\\x%02x", c); + break; + } + if (test & TEST_DELIMIT) + printf("\""); + } + else + printf("%s", s); +} + +static void +report(char* comment, char* fun, char* re, char* s, int len, char* msg, int flags, unsigned long test) +{ + if (state.file) + printf("%s:", state.file); + printf("%d:", state.lineno); + if (re) + { + printf(" "); + quote(re, -1, test|TEST_DELIMIT); + if (s) + { + printf(" versus "); + quote(s, len, test|TEST_DELIMIT); + } + } + if (test & TEST_UNSPECIFIED) + { + state.unspecified++; + printf(" unspecified behavior"); + } + else + state.errors++; + if (state.which) + printf(" %s", state.which); + if (flags & REG_NOSUB) + printf(" NOSUB"); + if (fun) + printf(" %s", fun); + if (comment[strlen(comment)-1] == '\n') + printf(" %s", comment); + else + { + printf(" %s: ", comment); + if (msg) + printf("%s: ", msg); + } +} + +static void +error(regex_t* preg, int code) +{ + char* msg; + char buf[256]; + + switch (code) + { + case REG_EBUS: + msg = "bus error"; + break; + case REG_EFAULT: + msg = "memory fault"; + break; + case REG_EHUNG: + msg = "did not terminate"; + break; + default: + regerror(code, preg, msg = buf, sizeof buf); + break; + } + printf("%s\n", msg); +} + +static void +bad(char* comment, char* re, char* s, int len, unsigned long test) +{ + printf("bad test case "); + report(comment, NiL, re, s, len, NiL, 0, test); + exit(1); +} + +static int +escape(char* s) +{ + char* b; + char* t; + char* q; + char* e; + int c; + + for (b = t = s; *t = *s; s++, t++) + if (*s == '\\') + switch (*++s) + { + case '\\': + break; + case 'a': + *t = '\a'; + break; + case 'b': + *t = '\b'; + break; + case 'c': + if (*t = *++s) + *t &= 037; + else + s--; + break; + case 'e': + case 'E': + *t = 033; + break; + case 'f': + *t = '\f'; + break; + case 'n': + *t = '\n'; + break; + case 'r': + *t = '\r'; + break; + case 's': + *t = ' '; + break; + case 't': + *t = '\t'; + break; + case 'v': + *t = '\v'; + break; + case 'u': + case 'x': + c = 0; + q = c == 'u' ? 
(s + 5) : (char*)0; + e = s + 1; + while (!e || !q || s < q) + { + switch (*++s) + { + case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': + c = (c << 4) + *s - 'a' + 10; + continue; + case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': + c = (c << 4) + *s - 'A' + 10; + continue; + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + c = (c << 4) + *s - '0'; + continue; + case '{': + case '[': + if (s != e) + { + s--; + break; + } + e = 0; + continue; + case '}': + case ']': + if (e) + s--; + break; + default: + s--; + break; + } + break; + } + *t = c; + break; + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + c = *s - '0'; + q = s + 2; + while (s < q) + { + switch (*++s) + { + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + c = (c << 3) + *s - '0'; + break; + default: + q = --s; + break; + } + } + *t = c; + break; + default: + *(s + 1) = 0; + bad("invalid C \\ escape\n", s - 1, NiL, 0, 0); + } + return t - b; +} + +static void +matchoffprint(int off) +{ + switch (off) + { + case -2: + printf("X"); + break; + case -1: + printf("?"); + break; + default: + printf("%d", off); + break; + } +} + +static void +matchprint(regmatch_t* match, int nmatch, int nsub, char* ans, unsigned long test) +{ + int i; + + for (; nmatch > nsub + 1; nmatch--) + if ((match[nmatch-1].rm_so != -1 || match[nmatch-1].rm_eo != -1) && (!(test & TEST_IGNORE_POSITION) || match[nmatch-1].rm_so >= 0 && match[nmatch-1].rm_eo >= 0)) + break; + for (i = 0; i < nmatch; i++) + { + printf("("); + matchoffprint(match[i].rm_so); + printf(","); + matchoffprint(match[i].rm_eo); + printf(")"); + } + if (!(test & (TEST_ACTUAL|TEST_BASELINE))) + { + if (ans) + printf(" expected: %s", ans); + printf("\n"); + } +} + +static int +matchcheck(regmatch_t* match, int nmatch, int nsub, char* ans, char* re, char* s, int len, int flags, unsigned long test) +{ + char* p; + int i; + int m; + int n; + + if (streq(ans, "OK")) + return test & (TEST_BASELINE|TEST_PASS|TEST_VERIFY); + for (i = 0, p = ans; i < nmatch && *p; i++) + { + if (*p == '{') + { +#ifdef REG_DISCIPLINE + char* x; + + if (!(x = sfstruse(state.disc.sp))) + bad("out of space [discipline string]\n", NiL, NiL, 0, 0); + if (strcmp(p, x)) + { + if (test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_QUERY|TEST_SUMMARY|TEST_VERIFY)) + return 0; + report("callout failed", NiL, re, s, len, NiL, flags, test); + quote(p, -1, test); + printf(" expected, "); + quote(x, -1, test); + printf(" returned\n"); + } +#endif + break; + } + if (*p++ != '(') + bad("improper answer\n", re, s, -1, test); + if (*p == '?') + { + m = -1; + p++; + } + else if (*p == 'R' && !memcmp(p, "RE_DUP_MAX", 10)) + { + m = RE_DUP_MAX; + p += 10; + if (*p == '+' || *p == '-') + m += strtol(p, &p, 10); + } + else + m = strtol(p, &p, 10); + if (*p++ != ',') + bad("improper answer\n", re, s, -1, test); + if (*p == '?') + { + n = -1; + p++; + } + else if (*p == 'R' && !memcmp(p, "RE_DUP_MAX", 10)) + { + n = RE_DUP_MAX; + p += 10; + if (*p == '+' || *p == '-') + n += strtol(p, &p, 10); + } + else + n = strtol(p, &p, 10); + if (*p++ != ')') + bad("improper answer\n", re, s, -1, test); + if (m!=match[i].rm_so || n!=match[i].rm_eo) + { + if (!(test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_QUERY|TEST_SUMMARY|TEST_VERIFY))) + { + report("failed: match was", NiL, re, s, len, NiL, flags, test); + matchprint(match, nmatch, nsub, ans, test); + } + return 0; + } + } + for (; i < 
nmatch; i++) + { + if (match[i].rm_so!=-1 || match[i].rm_eo!=-1) + { + if (!(test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_QUERY|TEST_VERIFY))) + { + if ((test & TEST_IGNORE_POSITION) && (match[i].rm_so<0 || match[i].rm_eo<0)) + { + state.ignored++; + return 0; + } + if (!(test & TEST_SUMMARY)) + { + report("failed: match was", NiL, re, s, len, NiL, flags, test); + matchprint(match, nmatch, nsub, ans, test); + } + } + return 0; + } + } + if (!(test & TEST_IGNORE_OVER) && match[nmatch].rm_so != state.NOMATCH.rm_so) + { + if (!(test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_QUERY|TEST_SUMMARY|TEST_VERIFY))) + { + report("failed: overran match array", NiL, re, s, len, NiL, flags, test); + matchprint(match, nmatch + 1, nsub, NiL, test); + } + return 0; + } + return 1; +} + +static void +sigunblock(int s) +{ +#ifdef SIG_SETMASK + int op; + sigset_t mask; + + sigemptyset(&mask); + if (s) + { + sigaddset(&mask, s); + op = SIG_UNBLOCK; + } + else op = SIG_SETMASK; + sigprocmask(op, &mask, NiL); +#else +#ifdef sigmask + sigsetmask(s ? (sigsetmask(0L) & ~sigmask(s)) : 0L); +#endif +#endif +} + +static void +gotcha(int sig) +{ + int ret; + + signal(sig, gotcha); + alarm(0); + state.signals++; + switch (sig) + { + case SIGALRM: + ret = REG_EHUNG; + break; + case SIGBUS: + ret = REG_EBUS; + break; + default: + ret = REG_EFAULT; + break; + } + sigunblock(sig); + longjmp(state.gotcha, ret); +} + +static char* +getline(FILE* fp) +{ + static char buf[32 * 1024]; + + register char* s = buf; + register char* e = &buf[sizeof(buf)]; + register char* b; + + for (;;) + { + if (!(b = fgets(s, e - s, fp))) + return 0; + state.lineno++; + s += strlen(s); + if (s == b || *--s != '\n' || s == b || *(s - 1) != '\\') + { + *s = 0; + break; + } + s--; + } + return buf; +} + +static unsigned long +note(unsigned long level, char* msg, unsigned long skip, unsigned long test) +{ + if (!(test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_SUMMARY)) && !skip) + { + printf("NOTE\t"); + if (msg) + printf("%s: ", msg); + printf("skipping lines %d", state.lineno); + } + return skip | level; +} + +#define TABS(n) &ts[7-((n)&7)] + +static char ts[] = "\t\t\t\t\t\t\t"; + +static unsigned long +extract(int* tabs, char* spec, char* re, char* s, char* ans, char* msg, char* accept, regmatch_t* match, int nmatch, int nsub, unsigned long skip, unsigned long level, unsigned long test) +{ + if (test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_OK|TEST_PASS|TEST_SUMMARY)) + { + state.extracted = 1; + if (test & TEST_OK) + { + state.passed++; + if ((test & TEST_VERIFY) && !(test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_SUMMARY))) + { + if (msg && strcmp(msg, "EXPECTED")) + printf("NOTE\t%s\n", msg); + return skip; + } + test &= ~(TEST_PASS|TEST_QUERY); + } + if (test & (TEST_QUERY|TEST_VERIFY)) + { + if (test & TEST_BASELINE) + test &= ~(TEST_BASELINE|TEST_PASS); + else + test |= TEST_PASS; + skip |= level; + } + if (!(test & TEST_OK)) + { + if (test & TEST_UNSPECIFIED) + state.unspecified++; + else + state.errors++; + } + if (test & (TEST_PASS|TEST_SUMMARY)) + return skip; + test &= ~TEST_DELIMIT; + printf("%s%s", spec, TABS(*tabs++)); + if ((test & (TEST_BASELINE|TEST_SAME)) == (TEST_BASELINE|TEST_SAME)) + printf("SAME"); + else + quote(re, -1, test); + printf("%s", TABS(*tabs++)); + quote(s, -1, test); + printf("%s", TABS(*tabs++)); + if (!(test & (TEST_ACTUAL|TEST_BASELINE)) || !accept && !match) + printf("%s", ans); + else if (accept) + printf("%s", accept); + else + matchprint(match, nmatch, 
nsub, NiL, test); + if (msg) + printf("%s%s", TABS(*tabs++), msg); + putchar('\n'); + } + else if (test & TEST_QUERY) + skip = note(level, msg, skip, test); + else if (test & TEST_VERIFY) + state.extracted = 1; + return skip; +} + +static int +catchfree(regex_t* preg, int flags, int* tabs, char* spec, char* re, char* s, char* ans, char* msg, char* accept, regmatch_t* match, int nmatch, int nsub, unsigned long skip, unsigned long level, unsigned long test) +{ + int eret; + + if (!(test & TEST_CATCH)) + { + regfree(preg); + eret = 0; + } + else if (!(eret = setjmp(state.gotcha))) + { + alarm(HUNG); + regfree(preg); + alarm(0); + } + else if (test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_QUERY|TEST_SUMMARY|TEST_VERIFY)) + extract(tabs, spec, re, s, ans, msg, NiL, NiL, 0, 0, skip, level, test); + else + { + report("failed", "regfree", re, NiL, -1, msg, flags, test); + error(preg, eret); + } + return eret; +} + +static char* +expand(char* os, char* ot) +{ + char* s = os; + char* t; + int n = 0; + int r; + long m; + + for (;;) + { + switch (*s++) + { + case 0: + break; + case '{': + n++; + continue; + case '}': + n--; + continue; + case 'R': + if (n == 1 && !memcmp(s, "E_DUP_MAX", 9)) + { + s--; + for (t = ot; os < s; *t++ = *os++); + r = ((t - ot) >= 5 && t[-1] == '{' && t[-2] == '.' && t[-3] == '.' && t[-4] == '.') ? t[-5] : 0; + os = ot; + m = RE_DUP_MAX; + if (*(s += 10) == '+' || *s == '-') + m += strtol(s, &s, 10); + if (r) + { + t -= 5; + while (m-- > 0) + *t++ = r; + while (*s && *s++ != '}'); + } + else + t += snprintf(t, 32, "%ld", m); + while (*t = *s++) + t++; + break; + } + continue; + default: + continue; + } + break; + } + return os; +} + +int +main(int argc, char** argv) +{ + int flags; + int cflags; + int eflags; + int nmatch; + int nexec; + int nstr; + int cret; + int eret; + int nsub; + int i; + int j; + int expected; + int got; + int locale; + int subunitlen; + int testno; + unsigned long level; + unsigned long skip; + char* p; + char* line; + char* spec; + char* re; + char* s; + char* ans; + char* msg; + char* fun; + char* ppat; + char* subunit; + char* version; + char* field[6]; + char* delim[6]; + FILE* fp; + int tabs[6]; + char unit[64]; + regmatch_t match[100]; + regex_t preg; + + static char pat[32 * 1024]; + static char patbuf[32 * 1024]; + static char strbuf[32 * 1024]; + + int nonosub = REG_NOSUB == 0; + int nonexec = 0; + + unsigned long test = 0; + + static char* filter[] = { "-", 0 }; + + state.NOMATCH.rm_so = state.NOMATCH.rm_eo = -2; + p = unit; + version = (char*)id + 10; + while (p < &unit[sizeof(unit)-1] && (*p = *version++) && !isspace(*p)) + p++; + *p = 0; + while ((p = *++argv) && *p == '-') + for (;;) + { + switch (*++p) + { + case 0: + break; + case 'c': + test |= TEST_CATCH; + continue; + case 'e': + test |= TEST_IGNORE_ERROR; + continue; + case 'h': + case '?': + help(0); + return 2; + case '-': + help(p[1] == 'h'); + return 2; + case 'n': + nonexec = 1; + continue; + case 'o': + test |= TEST_IGNORE_OVER; + continue; + case 'p': + test |= TEST_IGNORE_POSITION; + continue; + case 's': +#ifdef REG_DISCIPLINE + if (!(state.stack = stkalloc(stkstd, 0))) + fprintf(stderr, "%s: out of space [stack]", unit); + state.disc.disc.re_resizef = resizef; + state.disc.disc.re_resizehandle = (void*)stkstd; +#endif + continue; + case 'x': + nonosub = 1; + continue; + case 'v': + test |= TEST_VERBOSE; + continue; + case 'A': + test |= TEST_ACTUAL; + continue; + case 'B': + test |= TEST_BASELINE; + continue; + case 'F': + test |= TEST_FAIL; + continue; + 
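+ /*
+  * Illustrative note (added; not in the original source): option letters are
+  * consumed one character at a time by the loop above, so they may be
+  * bundled on the command line, e.g. "testregex -cS tests.dat" behaves like
+  * "testregex -c -S tests.dat".  The program and file names are hypothetical.
+  */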
case 'P': + test |= TEST_PASS; + continue; + case 'S': + test |= TEST_SUMMARY; + continue; + default: + fprintf(stderr, "%s: %c: invalid option\n", unit, *p); + return 2; + } + break; + } + if (!*argv) + argv = filter; + locale = 0; + while (state.file = *argv++) + { + if (streq(state.file, "-") || streq(state.file, "/dev/stdin") || streq(state.file, "/dev/fd/0")) + { + state.file = 0; + fp = stdin; + } + else if (!(fp = fopen(state.file, "r"))) + { + fprintf(stderr, "%s: %s: cannot read\n", unit, state.file); + return 2; + } + testno = state.errors = state.ignored = state.lineno = state.passed = + state.signals = state.unspecified = state.warnings = 0; + skip = 0; + level = 1; + if (!(test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_SUMMARY))) + { + printf("TEST\t%s ", unit); + if (s = state.file) + { + subunit = p = 0; + for (;;) + { + switch (*s++) + { + case 0: + break; + case '/': + subunit = s; + continue; + case '.': + p = s - 1; + continue; + default: + continue; + } + break; + } + if (!subunit) + subunit = state.file; + if (p < subunit) + p = s - 1; + subunitlen = p - subunit; + printf("%-.*s ", subunitlen, subunit); + } + else + subunit = 0; + for (s = version; *s && (*s != ' ' || *(s + 1) != '$'); s++) + putchar(*s); + if (test & TEST_CATCH) + printf(", catch"); + if (test & TEST_IGNORE_ERROR) + printf(", ignore error code mismatches"); + if (test & TEST_IGNORE_POSITION) + printf(", ignore negative position mismatches"); +#ifdef REG_DISCIPLINE + if (state.stack) + printf(", stack"); +#endif + if (test & TEST_VERBOSE) + printf(", verbose"); + printf("\n"); +#ifdef REG_VERSIONID + if (regerror(REG_VERSIONID, NiL, pat, sizeof(pat)) > 0) + s = pat; + else +#endif +#ifdef REG_TEST_VERSION + s = REG_TEST_VERSION; +#else + s = "regex"; +#endif + printf("NOTE\t%s\n", s); + if (elementsof(unsupported) > 1) + { +#if (REG_TEST_DEFAULT & (REG_AUGMENTED|REG_EXTENDED|REG_SHELL)) || !defined(REG_EXTENDED) + i = 0; +#else + i = REG_EXTENDED != 0; +#endif + for (got = 0; i < elementsof(unsupported) - 1; i++) + { + if (!got) + { + got = 1; + printf("NOTE\tunsupported: %s", unsupported[i]); + } + else + printf(",%s", unsupported[i]); + } + if (got) + printf("\n"); + } + } +#ifdef REG_DISCIPLINE + state.disc.disc.re_version = REG_VERSION; + state.disc.disc.re_compf = compf; + state.disc.disc.re_execf = execf; + if (!(state.disc.sp = sfstropen())) + bad("out of space [discipline string stream]\n", NiL, NiL, 0, 0); + preg.re_disc = &state.disc.disc; +#endif + if (test & TEST_CATCH) + { + signal(SIGALRM, gotcha); + signal(SIGBUS, gotcha); + signal(SIGSEGV, gotcha); + } + while (p = getline(fp)) + { + + /* parse: */ + + line = p; + if (*p == ':' && !isspace(*(p + 1))) + { + while (*++p && *p != ':'); + if (!*p++) + { + if (test & TEST_BASELINE) + printf("%s\n", line); + continue; + } + } + while (isspace(*p)) + p++; + if (*p == 0 || *p == '#' || *p == 'T') + { + if (test & TEST_BASELINE) + printf("%s\n", line); + continue; + } + if (*p == ':' || *p == 'N') + { + if (test & TEST_BASELINE) + printf("%s\n", line); + else if (!(test & (TEST_ACTUAL|TEST_FAIL|TEST_PASS|TEST_SUMMARY))) + { + while (*++p && !isspace(*p)); + while (isspace(*p)) + p++; + printf("NOTE %s\n", p); + } + continue; + } + j = 0; + i = 0; + field[i++] = p; + for (;;) + { + switch (*p++) + { + case 0: + p--; + j = 0; + goto checkfield; + case '\t': + *(delim[i] = p - 1) = 0; + j = 1; + checkfield: + s = field[i - 1]; + if (streq(s, "NIL")) + field[i - 1] = 0; + else if (streq(s, "NULL")) + *s = 0; + while (*p == '\t') + { + 
p++; + j++; + } + tabs[i - 1] = j; + if (!*p) + break; + if (i >= elementsof(field)) + bad("too many fields\n", NiL, NiL, 0, 0); + field[i++] = p; + /*FALLTHROUGH*/ + default: + continue; + } + break; + } + if (!(spec = field[0])) + bad("NIL spec\n", NiL, NiL, 0, 0); + + /* interpret: */ + + cflags = REG_TEST_DEFAULT; + eflags = REG_EXEC_DEFAULT; + test &= TEST_GLOBAL; + state.extracted = 0; + nmatch = 20; + nsub = -1; + for (p = spec; *p; p++) + { + if (isdigit(*p)) + { + nmatch = strtol(p, &p, 10); + if (nmatch >= elementsof(match)) + bad("nmatch must be < 100\n", NiL, NiL, 0, 0); + p--; + continue; + } + switch (*p) + { + case 'A': + test |= TEST_ARE; + continue; + case 'B': + test |= TEST_BRE; + continue; + case 'C': + if (!(test & TEST_QUERY) && !(skip & level)) + bad("locale must be nested\n", NiL, NiL, 0, 0); + test &= ~TEST_QUERY; + if (locale) + bad("locale nesting not supported\n", NiL, NiL, 0, 0); + if (i != 2) + bad("locale field expected\n", NiL, NiL, 0, 0); + if (!(skip & level)) + { +#if defined(LC_COLLATE) && defined(LC_CTYPE) + s = field[1]; + if (!s || streq(s, "POSIX")) + s = "C"; + if ((ans = setlocale(LC_COLLATE, s)) && streq(ans, "POSIX")) + ans = "C"; + if (!ans || !streq(ans, s) && streq(s, "C")) + ans = 0; + else if ((ans = setlocale(LC_CTYPE, s)) && streq(ans, "POSIX")) + ans = "C"; + if (!ans || !streq(ans, s) && streq(s, "C")) + skip = note(level, s, skip, test); + else + { + if (!(test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_SUMMARY))) + printf("NOTE \"%s\" locale\n", s); + locale = level; + } +#else + skip = note(level, skip, test, "locales not supported"); +#endif + } + cflags = NOTEST; + continue; + case 'E': + test |= TEST_ERE; + continue; + case 'K': + test |= TEST_KRE; + continue; + case 'L': + test |= TEST_LRE; + continue; + case 'S': + test |= TEST_SRE; + continue; + + case 'a': + cflags |= REG_LEFT|REG_RIGHT; + continue; + case 'b': + eflags |= REG_NOTBOL; + continue; + case 'c': + cflags |= REG_COMMENT; + continue; + case 'd': + cflags |= REG_SHELL_DOT; + continue; + case 'e': + eflags |= REG_NOTEOL; + continue; + case 'f': + cflags |= REG_MULTIPLE; + continue; + case 'g': + cflags |= NOTEST; + continue; + case 'h': + cflags |= REG_MULTIREF; + continue; + case 'i': + cflags |= REG_ICASE; + continue; + case 'j': + cflags |= REG_SPAN; + continue; + case 'k': + cflags |= REG_ESCAPE; + continue; + case 'l': + cflags |= REG_LEFT; + continue; + case 'm': + cflags |= REG_MINIMAL; + continue; + case 'n': + cflags |= REG_NEWLINE; + continue; + case 'o': + cflags |= REG_SHELL_GROUP; + continue; + case 'p': + cflags |= REG_SHELL_PATH; + continue; + case 'q': + cflags |= REG_DELIMITED; + continue; + case 'r': + cflags |= REG_RIGHT; + continue; + case 's': + cflags |= REG_SHELL_ESCAPED; + continue; + case 't': + cflags |= REG_MUSTDELIM; + continue; + case 'u': + test |= TEST_UNSPECIFIED; + continue; + case 'v': + cflags |= REG_CLASS_ESCAPE; + continue; + case 'w': + cflags |= REG_NOSUB; + continue; + case 'x': + if (REG_LENIENT) + cflags |= REG_LENIENT; + else + test |= TEST_LENIENT; + continue; + case 'y': + eflags |= REG_LEFT; + continue; + case 'z': + cflags |= REG_NULL; + continue; + + case '$': + test |= TEST_EXPAND; + continue; + + case '/': + test |= TEST_SUB; + continue; + + case '=': + test |= TEST_DECOMP; + continue; + + case '?': + test |= TEST_VERIFY; + test &= ~(TEST_AND|TEST_OR); + state.verify = state.passed; + continue; + case '&': + test |= TEST_VERIFY|TEST_AND; + test &= ~TEST_OR; + continue; + case '|': + test |= 
TEST_VERIFY|TEST_OR; + test &= ~TEST_AND; + continue; + case ';': + test |= TEST_OR; + test &= ~TEST_AND; + continue; + + case '{': + level <<= 1; + if (skip & (level >> 1)) + { + skip |= level; + cflags = NOTEST; + } + else + { + skip &= ~level; + test |= TEST_QUERY; + } + continue; + case '}': + if (level == 1) + bad("invalid {...} nesting\n", NiL, NiL, 0, 0); + if ((skip & level) && !(skip & (level>>1))) + { + if (!(test & (TEST_BASELINE|TEST_SUMMARY))) + { + if (test & (TEST_ACTUAL|TEST_FAIL)) + printf("}\n"); + else if (!(test & TEST_PASS)) + printf("-%d\n", state.lineno); + } + } +#if defined(LC_COLLATE) && defined(LC_CTYPE) + else if (locale & level) + { + locale = 0; + if (!(skip & level)) + { + s = "C"; + setlocale(LC_COLLATE, s); + setlocale(LC_CTYPE, s); + if (!(test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_SUMMARY))) + printf("NOTE \"%s\" locale\n", s); + else if (test & (TEST_ACTUAL|TEST_BASELINE|TEST_PASS)) + printf("}\n"); + } + else if (test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL)) + printf("}\n"); + } +#endif + level >>= 1; + cflags = NOTEST; + continue; + + default: + bad("bad spec\n", spec, NiL, 0, test); + break; + + } + break; + } + if ((cflags|eflags) == NOTEST || (skip & level) && (test & TEST_BASELINE)) + { + if (test & TEST_BASELINE) + { + while (i > 1) + *delim[--i] = '\t'; + printf("%s\n", line); + } + continue; + } + if (test & TEST_OR) + { + if (!(test & TEST_VERIFY)) + { + test &= ~TEST_OR; + if (state.passed == state.verify && i > 1) + printf("NOTE\t%s\n", field[1]); + continue; + } + else if (state.passed > state.verify) + continue; + } + else if (test & TEST_AND) + { + if (state.passed == state.verify) + continue; + state.passed = state.verify; + } + if (i < ((test & TEST_DECOMP) ? 3 : 4)) + bad("too few fields\n", NiL, NiL, 0, test); + while (i < elementsof(field)) + field[i++] = 0; + if (re = field[1]) + { + if (streq(re, "SAME")) + { + re = ppat; + test |= TEST_SAME; + } + else + { + if (test & TEST_EXPAND) + escape(re); + re = expand(re, patbuf); + strcpy(ppat = pat, re); + } + } + else + ppat = 0; + nstr = -1; + if (s = field[2]) + { + s = expand(s, strbuf); + if (test & TEST_EXPAND) + { + nstr = escape(s); +#if _REG_nexec + if (nstr != strlen(s)) + nexec = nstr; +#endif + } + } + if (!(ans = field[(test & TEST_DECOMP) ? 
2 : 3])) + bad("NIL answer\n", NiL, NiL, 0, test); + msg = field[4]; + fflush(stdout); + if (test & TEST_SUB) +#if _REG_subcomp + cflags |= REG_DELIMITED; +#else + continue; +#endif +#if !_REG_decomp + if (test & TEST_DECOMP) + continue; +#endif + + compile: + + if (state.extracted || (skip & level)) + continue; +#if !(REG_TEST_DEFAULT & (REG_AUGMENTED|REG_EXTENDED|REG_SHELL)) +#ifdef REG_EXTENDED + if (REG_EXTENDED != 0 && (test & TEST_BRE)) +#else + if (test & TEST_BRE) +#endif + { + test &= ~TEST_BRE; + flags = cflags; + state.which = "BRE"; + } + else +#endif +#ifdef REG_EXTENDED + if (test & TEST_ERE) + { + test &= ~TEST_ERE; + flags = cflags | REG_EXTENDED; + state.which = "ERE"; + } + else +#endif +#ifdef REG_AUGMENTED + if (test & TEST_ARE) + { + test &= ~TEST_ARE; + flags = cflags | REG_AUGMENTED; + state.which = "ARE"; + } + else +#endif +#ifdef REG_LITERAL + if (test & TEST_LRE) + { + test &= ~TEST_LRE; + flags = cflags | REG_LITERAL; + state.which = "LRE"; + } + else +#endif +#ifdef REG_SHELL + if (test & TEST_SRE) + { + test &= ~TEST_SRE; + flags = cflags | REG_SHELL; + state.which = "SRE"; + } + else +#ifdef REG_AUGMENTED + if (test & TEST_KRE) + { + test &= ~TEST_KRE; + flags = cflags | REG_SHELL | REG_AUGMENTED; + state.which = "KRE"; + } + else +#endif +#endif + { + if (test & (TEST_BASELINE|TEST_PASS|TEST_VERIFY)) + extract(tabs, line, re, s, ans, msg, NiL, NiL, 0, 0, skip, level, test|TEST_OK); + continue; + } + if ((test & (TEST_QUERY|TEST_VERBOSE|TEST_VERIFY)) == TEST_VERBOSE) + { + printf("test %-3d %s ", state.lineno, state.which); + quote(re, -1, test|TEST_DELIMIT); + printf(" "); + quote(s, nstr, test|TEST_DELIMIT); + printf("\n"); + } + + nosub: + fun = "regcomp"; +#if _REG_nexec + if (nstr >= 0 && nstr != strlen(s)) + nexec = nstr; + + else +#endif + nexec = -1; + if (state.extracted || (skip & level)) + continue; + if (!(test & TEST_QUERY)) + testno++; +#ifdef REG_DISCIPLINE + if (state.stack) + stkset(stkstd, state.stack, 0); + flags |= REG_DISCIPLINE; + state.disc.ordinal = 0; + sfstrseek(state.disc.sp, 0, SEEK_SET); +#endif + if (!(test & TEST_CATCH)) + cret = regcomp(&preg, re, flags); + else if (!(cret = setjmp(state.gotcha))) + { + alarm(HUNG); + cret = regcomp(&preg, re, flags); + alarm(0); + } +#if _REG_subcomp + if (!cret && (test & TEST_SUB)) + { + fun = "regsubcomp"; + p = re + preg.re_npat; + if (!(test & TEST_CATCH)) + cret = regsubcomp(&preg, p, NiL, 0, 0); + else if (!(cret = setjmp(state.gotcha))) + { + alarm(HUNG); + cret = regsubcomp(&preg, p, NiL, 0, 0); + alarm(0); + } + if (!cret && *(p += preg.re_npat) && !(preg.re_sub->re_flags & REG_SUB_LAST)) + { + if (catchfree(&preg, flags, tabs, line, re, s, ans, msg, NiL, NiL, 0, 0, skip, level, test)) + continue; + cret = REG_EFLAGS; + } + } +#endif +#if _REG_decomp + if (!cret && (test & TEST_DECOMP)) + { + char buf[128]; + + if ((j = nmatch) > sizeof(buf)) + j = sizeof(buf); + fun = "regdecomp"; + p = re + preg.re_npat; + if (!(test & TEST_CATCH)) + i = regdecomp(&preg, -1, buf, j); + else if (!(cret = setjmp(state.gotcha))) + { + alarm(HUNG); + i = regdecomp(&preg, -1, buf, j); + alarm(0); + } + if (!cret) + { + catchfree(&preg, flags, tabs, line, re, s, ans, msg, NiL, NiL, 0, 0, skip, level, test); + if (i > j) + { + if (i != (strlen(ans) + 1)) + { + report("failed", fun, re, s, nstr, msg, flags, test); + printf(" %d byte buffer supplied, %d byte buffer required\n", j, i); + } + } + else if (strcmp(buf, ans)) + { + report("failed", fun, re, s, nstr, msg, flags, test); + quote(ans, -1, 
test|TEST_DELIMIT); + printf(" expected, "); + quote(buf, -1, test|TEST_DELIMIT); + printf(" returned\n"); + } + continue; + } + } +#endif + if (!cret) + { + if (!(flags & REG_NOSUB) && nsub < 0 && *ans == '(') + { + for (p = ans; *p; p++) + if (*p == '(') + nsub++; + else if (*p == '{') + nsub--; + if (nsub >= 0) + { + if (test & TEST_IGNORE_OVER) + { + if (nmatch > nsub) + nmatch = nsub + 1; + } + else if (nsub != preg.re_nsub) + { + if (nsub > preg.re_nsub) + { + if (test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_QUERY|TEST_SUMMARY|TEST_VERIFY)) + skip = extract(tabs, line, re, s, ans, msg, "OK", NiL, 0, 0, skip, level, test|TEST_DELIMIT); + else + { + report("re_nsub incorrect", fun, re, NiL, -1, msg, flags, test); + printf("at least %d expected, %d returned\n", nsub, preg.re_nsub); + state.errors++; + } + } + else + nsub = preg.re_nsub; + } + } + } + if (!(test & (TEST_DECOMP|TEST_SUB)) && *ans && *ans != '(' && !streq(ans, "OK") && !streq(ans, "NOMATCH")) + { + if (test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_QUERY|TEST_SUMMARY|TEST_VERIFY)) + skip = extract(tabs, line, re, s, ans, msg, "OK", NiL, 0, 0, skip, level, test|TEST_DELIMIT); + else if (!(test & TEST_LENIENT)) + { + report("failed", fun, re, NiL, -1, msg, flags, test); + printf("%s expected, OK returned\n", ans); + } + catchfree(&preg, flags, tabs, line, re, s, ans, msg, NiL, NiL, 0, 0, skip, level, test); + continue; + } + } + else + { + if (test & TEST_LENIENT) + /* we'll let it go this time */; + else if (!*ans || ans[0]=='(' || cret == REG_BADPAT && streq(ans, "NOMATCH")) + { + got = 0; + for (i = 1; i < elementsof(codes); i++) + if (cret==codes[i].code) + got = i; + if (test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_QUERY|TEST_SUMMARY|TEST_VERIFY)) + skip = extract(tabs, line, re, s, ans, msg, codes[got].name, NiL, 0, 0, skip, level, test|TEST_DELIMIT); + else + { + report("failed", fun, re, NiL, -1, msg, flags, test); + printf("%s returned: ", codes[got].name); + error(&preg, cret); + } + } + else + { + expected = got = 0; + for (i = 1; i < elementsof(codes); i++) + { + if (streq(ans, codes[i].name)) + expected = i; + if (cret==codes[i].code) + got = i; + } + if (!expected) + { + if (test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_QUERY|TEST_SUMMARY|TEST_VERIFY)) + skip = extract(tabs, line, re, s, ans, msg, codes[got].name, NiL, 0, 0, skip, level, test|TEST_DELIMIT); + else + { + report("failed: invalid error code", NiL, re, NiL, -1, msg, flags, test); + printf("%s expected, %s returned\n", ans, codes[got].name); + } + } + else if (cret != codes[expected].code && cret != REG_BADPAT) + { + if (test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_QUERY|TEST_SUMMARY|TEST_VERIFY)) + skip = extract(tabs, line, re, s, ans, msg, codes[got].name, NiL, 0, 0, skip, level, test|TEST_DELIMIT); + else if (test & TEST_IGNORE_ERROR) + state.ignored++; + else + { + report("should fail and did", fun, re, NiL, -1, msg, flags, test); + printf("%s expected, %s returned: ", ans, codes[got].name); + state.errors--; + state.warnings++; + error(&preg, cret); + } + } + } + goto compile; + } + +#if _REG_nexec + execute: + if (nexec >= 0) + fun = "regnexec"; + else +#endif + fun = "regexec"; + + for (i = 0; i < elementsof(match); i++) + match[i] = state.NOMATCH; + +#if _REG_nexec + if (nexec >= 0) + { + eret = regnexec(&preg, s, nexec, nmatch, match, eflags); + s[nexec] = 0; + } + else +#endif + { + if (!(test & TEST_CATCH)) + eret = regexec(&preg, s, nmatch, match, eflags); + else if 
(!(eret = setjmp(state.gotcha))) + { + alarm(HUNG); + eret = regexec(&preg, s, nmatch, match, eflags); + alarm(0); + } + } +#if _REG_subcomp + if ((test & TEST_SUB) && !eret) + { + fun = "regsubexec"; + if (!(test & TEST_CATCH)) + eret = regsubexec(&preg, s, nmatch, match); + else if (!(eret = setjmp(state.gotcha))) + { + alarm(HUNG); + eret = regsubexec(&preg, s, nmatch, match); + alarm(0); + } + } +#endif + if (flags & REG_NOSUB) + { + if (eret) + { + if (eret != REG_NOMATCH || !streq(ans, "NOMATCH")) + { + if (test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_QUERY|TEST_SUMMARY|TEST_VERIFY)) + skip = extract(tabs, line, re, s, ans, msg, "NOMATCH", NiL, 0, 0, skip, level, test|TEST_DELIMIT); + else + { + report("REG_NOSUB failed", fun, re, s, nstr, msg, flags, test); + error(&preg, eret); + } + } + } + else if (streq(ans, "NOMATCH")) + { + if (test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_QUERY|TEST_SUMMARY|TEST_VERIFY)) + skip = extract(tabs, line, re, s, ans, msg, NiL, match, nmatch, nsub, skip, level, test|TEST_DELIMIT); + else + { + report("should fail and didn't", fun, re, s, nstr, msg, flags, test); + error(&preg, eret); + } + } + } + else if (eret) + { + if (eret != REG_NOMATCH || !streq(ans, "NOMATCH")) + { + if (test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_QUERY|TEST_SUMMARY|TEST_VERIFY)) + skip = extract(tabs, line, re, s, ans, msg, "NOMATCH", NiL, 0, nsub, skip, level, test|TEST_DELIMIT); + else + { + report("failed", fun, re, s, nstr, msg, flags, test); + if (eret != REG_NOMATCH) + error(&preg, eret); + else if (*ans) + printf("expected: %s\n", ans); + else + printf("\n"); + } + } + } + else if (streq(ans, "NOMATCH")) + { + if (test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_QUERY|TEST_SUMMARY|TEST_VERIFY)) + skip = extract(tabs, line, re, s, ans, msg, NiL, match, nmatch, nsub, skip, level, test|TEST_DELIMIT); + else + { + report("should fail and didn't", fun, re, s, nstr, msg, flags, test); + matchprint(match, nmatch, nsub, NiL, test); + } + } +#if _REG_subcomp + else if (test & TEST_SUB) + { + p = preg.re_sub->re_buf; + if (strcmp(p, ans)) + { + report("failed", fun, re, s, nstr, msg, flags, test); + quote(ans, -1, test|TEST_DELIMIT); + printf(" expected, "); + quote(p, -1, test|TEST_DELIMIT); + printf(" returned\n"); + } + } +#endif + else if (!*ans) + { + if (match[0].rm_so != state.NOMATCH.rm_so) + { + if (test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_QUERY|TEST_SUMMARY|TEST_VERIFY)) + skip = extract(tabs, line, re, s, ans, msg, NiL, NiL, 0, 0, skip, level, test); + else + { + report("failed: no match but match array assigned", NiL, re, s, nstr, msg, flags, test); + matchprint(match, nmatch, nsub, NiL, test); + } + } + } + else if (matchcheck(match, nmatch, nsub, ans, re, s, nstr, flags, test)) + { +#if _REG_nexec + if (nexec < 0 && !nonexec) + { + nexec = nstr >= 0 ? 
nstr : strlen(s);
+ s[nexec] = '\n';
+ testno++;
+ goto execute;
+ }
+#endif
+ if (!(test & (TEST_DECOMP|TEST_SUB|TEST_VERIFY)) && !nonosub)
+ {
+ if (catchfree(&preg, flags, tabs, line, re, s, ans, msg, NiL, NiL, 0, 0, skip, level, test))
+ continue;
+ flags |= REG_NOSUB;
+ goto nosub;
+ }
+ if (test & (TEST_BASELINE|TEST_PASS|TEST_VERIFY))
+ skip = extract(tabs, line, re, s, ans, msg, NiL, match, nmatch, nsub, skip, level, test|TEST_OK);
+ }
+ else if (test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_QUERY|TEST_SUMMARY|TEST_VERIFY))
+ skip = extract(tabs, line, re, s, ans, msg, NiL, match, nmatch, nsub, skip, level, test|TEST_DELIMIT);
+ if (catchfree(&preg, flags, tabs, line, re, s, ans, msg, NiL, NiL, 0, 0, skip, level, test))
+ continue;
+ goto compile;
+ }
+ if (test & TEST_SUMMARY)
+ printf("tests=%-4d errors=%-4d warnings=%-2d ignored=%-2d unspecified=%-2d signals=%d\n", testno, state.errors, state.warnings, state.ignored, state.unspecified, state.signals);
+ else if (!(test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS)))
+ {
+ printf("TEST\t%s", unit);
+ if (subunit)
+ printf(" %-.*s", subunitlen, subunit);
+ printf(", %d test%s", testno, testno == 1 ? "" : "s");
+ if (state.ignored)
+ printf(", %d ignored mismatche%s", state.ignored, state.ignored == 1 ? "" : "s");
+ if (state.warnings)
+ printf(", %d warning%s", state.warnings, state.warnings == 1 ? "" : "s");
+ if (state.unspecified)
+ printf(", %d unspecified difference%s", state.unspecified, state.unspecified == 1 ? "" : "s");
+ if (state.signals)
+ printf(", %d signal%s", state.signals, state.signals == 1 ? "" : "s");
+ printf(", %d error%s\n", state.errors, state.errors == 1 ? "" : "s");
+ }
+ if (fp != stdin)
+ fclose(fp);
+ }
+ return 0;
+}
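+
+/*
+ * Illustrative output (added; not part of the original source): with -S the
+ * summary printf above emits a single line of the form
+ *
+ *	tests=489 errors=0 warnings=0 ignored=0 unspecified=0 signals=0
+ *
+ * where the spacing comes from the printf field widths and the counts shown
+ * here are invented for illustration only.
+ */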