Adding upstream version 1.22.1.upstream/1.22.1

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-16 19:25:22 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-16 19:25:22 +0000
commit: f6ad4dcef54c5ce997a4bad5a6d86de229015700 (patch)
tree: 7cfa4e31ace5c2bd95c72b154d15af494b2bcbef /src/regexp/onepass.go
parent: Initial commit. (diff)
download: golang-1.22-f6ad4dcef54c5ce997a4bad5a6d86de229015700.tar.xz
golang-1.22-f6ad4dcef54c5ce997a4bad5a6d86de229015700.zip
1 files changed, 507 insertions, 0 deletions
diff --git a/src/regexp/onepass.go b/src/regexp/onepass.go
new file mode 100644
index 0000000..b3066e8
--- /dev/null
+++ b/src/regexp/onepass.go
@@ -0,0 +1,507 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package regexp
+
+import (
+	"regexp/syntax"
+	"sort"
+	"strings"
+	"unicode"
+	"unicode/utf8"
+)
+
+// "One-pass" regexp execution.
+// Some regexps can be analyzed to determine that they never need
+// backtracking: they are guaranteed to run in one pass over the string
+// without bothering to save all the usual NFA state.
+// Detect those and execute them more quickly.
+
+// A onePassProg is a compiled one-pass regular expression program.
+// It is the same as syntax.Prog except that its instructions are
+// onePassInst values rather than syntax.Inst values.
+type onePassProg struct {
+	Inst   []onePassInst
+	Start  int // index of start instruction
+	NumCap int // number of InstCapture insts in re
+}
+
+// A onePassInst is a single instruction in a one-pass regular expression program.
+// It is the same as syntax.Inst except for the new 'Next' field.
+type onePassInst struct {
+	syntax.Inst
+	// Next holds target instruction indices. onePassNext indexes it with the
+	// result of MatchRunePos, so entry k is the target taken when the input
+	// rune matches the k'th range of this instruction's rune set.
+	Next []uint32
+}
+
+// onePassPrefix reports the literal string that every match of the regexp
+// must begin with. complete is true when that literal, followed by an
+// end-of-text assertion and a match, is the whole program. pc is the index
+// of the instruction reached after the gathered literal. The leading
+// EmptyBeginText instruction is mandatory and is skipped; when it is
+// missing, or when no literal follows it, the prefix is empty and pc is
+// p.Start.
+func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) {
+	inst := &p.Inst[p.Start]
+	anchored := inst.Op == syntax.InstEmptyWidth && syntax.EmptyOp(inst.Arg)&syntax.EmptyBeginText != 0
+	if !anchored {
+		return "", inst.Op == syntax.InstMatch, uint32(p.Start)
+	}
+
+	// Step over the anchor and any no-ops that follow it.
+	pc = inst.Out
+	for inst = &p.Inst[pc]; inst.Op == syntax.InstNop; inst = &p.Inst[pc] {
+		pc = inst.Out
+	}
+
+	// singleRune reports whether in matches exactly one listed rune.
+	singleRune := func(in *syntax.Inst) bool {
+		return iop(in) == syntax.InstRune && len(in.Rune) == 1
+	}
+
+	// Bail out before touching the builder when there is nothing to gather.
+	if !singleRune(inst) {
+		return "", inst.Op == syntax.InstMatch, uint32(p.Start)
+	}
+
+	// Collect runes until one is case-folded, is RuneError, or is not a
+	// single-rune instruction.
+	var sb strings.Builder
+	for singleRune(inst) {
+		if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 || inst.Rune[0] == utf8.RuneError {
+			break
+		}
+		sb.WriteRune(inst.Rune[0])
+		pc = inst.Out
+		inst = &p.Inst[pc]
+	}
+
+	complete = inst.Op == syntax.InstEmptyWidth &&
+		syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText != 0 &&
+		p.Inst[inst.Out].Op == syntax.InstMatch
+	return sb.String(), complete, pc
+}
+
+// onePassNext picks the next instruction for the one-pass machine given the
+// input rune r. It is meant to be called only for InstAlt and InstAltMatch
+// instructions. When r matches one of the instruction's rune ranges, the
+// corresponding entry of i.Next is returned. Otherwise an InstAltMatch
+// falls back to i.Out (its path to InstMatch), and any other instruction
+// yields 0.
+func onePassNext(i *onePassInst, r rune) uint32 {
+	if idx := i.MatchRunePos(r); idx >= 0 {
+		return i.Next[idx]
+	}
+	if i.Op != syntax.InstAltMatch {
+		return 0
+	}
+	return i.Out
+}
+
+// iop returns i.Op with the specialised rune opcodes (InstRune1,
+// InstRuneAny, InstRuneAnyNotNL) folded into the generic InstRune.
+func iop(i *syntax.Inst) syntax.InstOp {
+	if op := i.Op; op == syntax.InstRune1 || op == syntax.InstRuneAny || op == syntax.InstRuneAnyNotNL {
+		return syntax.InstRune
+	}
+	return i.Op
+}
+
+// queueOnePass is a queue of instruction indices built on a sparse array.
+// dense[0:size] lists the inserted values in insertion order, sparse[u]
+// records where u sits in dense, and nextIndex is the read cursor.
+type queueOnePass struct {
+	sparse          []uint32
+	dense           []uint32
+	size, nextIndex uint32
+}
+
+// empty reports whether every inserted value has already been read.
+func (q *queueOnePass) empty() bool {
+	return !(q.nextIndex < q.size)
+}
+
+// next returns the value under the read cursor and advances the cursor.
+func (q *queueOnePass) next() (n uint32) {
+	n = q.dense[q.nextIndex]
+	q.nextIndex++
+	return n
+}
+
+// clear resets the queue to empty without touching the backing arrays.
+func (q *queueOnePass) clear() {
+	q.size, q.nextIndex = 0, 0
+}
+
+// contains reports whether u has been inserted since the last clear.
+// Values outside the queue's capacity are never contained.
+func (q *queueOnePass) contains(u uint32) bool {
+	if u < uint32(len(q.sparse)) {
+		slot := q.sparse[u]
+		return slot < q.size && q.dense[slot] == u
+	}
+	return false
+}
+
+// insert adds u unless it is already present.
+func (q *queueOnePass) insert(u uint32) {
+	if q.contains(u) {
+		return
+	}
+	q.insertNew(u)
+}
+
+// insertNew appends u without checking for membership. Values outside
+// the queue's capacity are silently ignored.
+func (q *queueOnePass) insertNew(u uint32) {
+	if u < uint32(len(q.sparse)) {
+		slot := q.size
+		q.sparse[u] = slot
+		q.dense[slot] = u
+		q.size = slot + 1
+	}
+}
+
+// newQueue returns an empty queue able to hold the values 0..size-1.
+func newQueue(size int) (q *queueOnePass) {
+	q = new(queueOnePass)
+	q.sparse = make([]uint32, size)
+	q.dense = make([]uint32, size)
+	return q
+}
+
+// mergeFailed is the sentinel placed in the next-pc slice returned by
+// mergeRuneSets when the two input rune sets intersect.
+const mergeFailed = uint32(0xffffffff)
+
+// noRune and noNext are the values mergeRuneSets returns on failure.
+var (
+	noRune = []rune{}
+	noNext = []uint32{mergeFailed}
+)
+
+// mergeRuneSets merges two non-intersecting runesets, and returns the merged
+// result together with a parallel slice of target pcs. The idea is that if a
+// rune matches the merged set at index i, the returned pc slice holds the
+// target at i/2: leftPC for pairs taken from leftRunes, rightPC for pairs
+// taken from rightRunes.
+//
+// If the input sets intersect, an empty runeset and a pc slice with the
+// single element mergeFailed are returned. The code assumes that both inputs
+// contain ordered and non-intersecting rune pairs, and panics if either
+// input has odd length.
+func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) {
+	left, right := *leftRunes, *rightRunes
+	if len(left)&0x1 != 0 || len(right)&0x1 != 0 {
+		panic("mergeRuneSets odd length []rune")
+	}
+
+	// The output sizes are known up front, so allocate once.
+	merged := make([]rune, 0, len(left)+len(right))
+	next := make([]uint32, 0, (len(left)+len(right))/2)
+
+	// extend appends the pair starting at src[*low] to merged, records pc as
+	// its target and advances *low. It refuses, returning false, when the
+	// pair's low bound does not lie strictly above the last merged high
+	// bound, i.e. when the two sets overlap.
+	extend := func(low *int, src []rune, pc uint32) bool {
+		if n := len(merged); n > 0 && src[*low] <= merged[n-1] {
+			return false
+		}
+		merged = append(merged, src[*low], src[*low+1])
+		*low += 2
+		next = append(next, pc)
+		return true
+	}
+
+	var lx, rx int
+	for lx < len(left) || rx < len(right) {
+		var ok bool
+		switch {
+		case rx >= len(right):
+			ok = extend(&lx, left, leftPC)
+		case lx >= len(left):
+			ok = extend(&rx, right, rightPC)
+		case right[rx] < left[lx]:
+			ok = extend(&rx, right, rightPC)
+		default:
+			ok = extend(&lx, left, leftPC)
+		}
+		if !ok {
+			return noRune, noNext
+		}
+	}
+	return merged, next
+}
+
+// cleanupOnePass drops working memory, and restores certain shortcut
+// instructions. For every instruction of original:
+//   - InstRune1, InstRuneAny and InstRuneAnyNotNL are restored in prog from
+//     the original instruction, with no Next table;
+//   - InstCapture, InstEmptyWidth, InstNop, InstMatch and InstFail keep
+//     their instruction but have their Next table dropped;
+//   - InstAlt, InstAltMatch and InstRune are left untouched.
+func cleanupOnePass(prog *onePassProg, original *syntax.Prog) {
+	for pc := range original.Inst {
+		orig := original.Inst[pc]
+		switch orig.Op {
+		case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
+			// Replacing the whole element also clears Next.
+			prog.Inst[pc] = onePassInst{Inst: orig}
+		case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop, syntax.InstMatch, syntax.InstFail:
+			prog.Inst[pc].Next = nil
+		}
+	}
+}
+
+// onePassCopy creates a copy of the original Prog, as we'll be modifying it.
+// Each instruction is wrapped in a onePassInst with an empty Next table, and
+// some Alt instructions in the copy are then rewritten as described below.
+// The Inst slice of prog itself is not written to.
+func onePassCopy(prog *syntax.Prog) *onePassProg {
+	p := &onePassProg{
+		Start:  prog.Start,
+		NumCap: prog.NumCap,
+		Inst:   make([]onePassInst, len(prog.Inst)),
+	}
+	for i, inst := range prog.Inst {
+		p.Inst[i] = onePassInst{Inst: inst}
+	}
+
+	// rewrites one or more common Prog constructs that enable some otherwise
+	// non-onepass Progs to be onepass. A:BC (for example) means an InstAlt at
+	// ip A, that points to ips B & C.
+	// A:BC + B:DA => A:BC + B:DC
+	// A:BC + B:DC => A:DC + B:DC
+	for pc := range p.Inst {
+		switch p.Inst[pc].Op {
+		default:
+			continue
+		case syntax.InstAlt, syntax.InstAltMatch:
+			// A:Bx + B:Ay
+			// p_A_Alt will point at the leg of A that targets another Alt (B);
+			// p_A_Other at A's remaining leg. Start by guessing Arg is the Alt leg.
+			p_A_Other := &p.Inst[pc].Out
+			p_A_Alt := &p.Inst[pc].Arg
+			// make sure a target is another Alt
+			instAlt := p.Inst[*p_A_Alt]
+			if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
+				// Arg was not an Alt; try the legs the other way round.
+				p_A_Alt, p_A_Other = p_A_Other, p_A_Alt
+				instAlt = p.Inst[*p_A_Alt]
+				if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
+					continue
+				}
+			}
+			instOther := p.Inst[*p_A_Other]
+			// Analyzing both legs pointing to Alts is for another day
+			if instOther.Op == syntax.InstAlt || instOther.Op == syntax.InstAltMatch {
+				// too complicated
+				continue
+			}
+			// simple empty transition loop
+			// A:BC + B:DA => A:BC + B:DC
+			// p_B_Alt is the leg of B that is compared with, and possibly
+			// redirected to, A's other leg; p_B_Other is B's remaining leg.
+			p_B_Alt := &p.Inst[*p_A_Alt].Out
+			p_B_Other := &p.Inst[*p_A_Alt].Arg
+			patch := false
+			if instAlt.Out == uint32(pc) {
+				patch = true
+			} else if instAlt.Arg == uint32(pc) {
+				patch = true
+				p_B_Alt, p_B_Other = p_B_Other, p_B_Alt
+			}
+			if patch {
+				// B loops straight back to A: send that leg to A's other
+				// target instead.
+				*p_B_Alt = *p_A_Other
+			}
+
+			// empty transition to common target
+			// A:BC + B:DC => A:DC + B:DC
+			if *p_A_Other == *p_B_Alt {
+				// A and B share a target, so A can skip B and go directly
+				// to B's other leg.
+				*p_A_Alt = *p_B_Other
+			}
+		}
+	}
+	return p
+}
+
+// runeSlice adapts a []rune to the Len/Less/Swap method set so that the
+// case-folded rune sets can be sorted in ascending order.
+type runeSlice []rune
+
+// Len returns the number of runes in the slice.
+func (p runeSlice) Len() int {
+	return len(p)
+}
+
+// Less reports whether the rune at index i is smaller than the one at j.
+func (p runeSlice) Less(i, j int) bool {
+	return p[i] < p[j]
+}
+
+// Swap exchanges the runes at indices i and j.
+func (p runeSlice) Swap(i, j int) {
+	p[i], p[j] = p[j], p[i]
+}
+
+// Rune sets, written as inclusive lo/hi pairs, that makeOnePass copies for
+// the InstRuneAnyNotNL and InstRuneAny instructions respectively.
+var (
+	anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
+	anyRune      = []rune{0, unicode.MaxRune}
+)
+
+// makeOnePass creates a onepass Prog, if possible. It is possible if at any alt,
+// the match engine can always tell which branch to take. The routine may modify
+// p if it is turned into a onepass Prog. If it isn't possible for this to be a
+// onepass Prog, nil is returned. makeOnePass is recursive
+// to the size of the Prog.
+//
+// Instructions of p are rewritten in place while they are being checked, so
+// p may already have been partially modified when nil is returned.
+func makeOnePass(p *onePassProg) *onePassProg {
+	// If the machine is very long, it's not worth the time to check if we can use one pass.
+	if len(p.Inst) >= 1000 {
+		return nil
+	}
+
+	// instQueue holds the pcs still to be checked: the start instruction plus
+	// the successor of every rune instruction encountered.
+	// visitQueue records the pcs already seen during the current call chain
+	// of check; it is cleared before each queued pc is processed.
+	// onePassRunes[pc] is the rune set (lo/hi pairs) computed for pc; it is
+	// copied into p.Inst[pc].Rune at the end on success.
+	var (
+		instQueue    = newQueue(len(p.Inst))
+		visitQueue   = newQueue(len(p.Inst))
+		check        func(uint32, []bool) bool
+		onePassRunes = make([][]rune, len(p.Inst))
+	)
+
+	// check that paths from Alt instructions are unambiguous, and rebuild the new
+	// program as a onepass program
+	// m[pc] is set to whether pc leads to InstMatch without going through a
+	// rune instruction. check returns false when an ambiguity is found.
+	check = func(pc uint32, m []bool) (ok bool) {
+		ok = true
+		inst := &p.Inst[pc]
+		if visitQueue.contains(pc) {
+			return
+		}
+		visitQueue.insert(pc)
+		switch inst.Op {
+		case syntax.InstAlt, syntax.InstAltMatch:
+			ok = check(inst.Out, m) && check(inst.Arg, m)
+			// check no-input paths to InstMatch
+			matchOut := m[inst.Out]
+			matchArg := m[inst.Arg]
+			if matchOut && matchArg {
+				// Both legs can match without input: ambiguous.
+				ok = false
+				break
+			}
+			// Match on empty goes in inst.Out
+			if matchArg {
+				inst.Out, inst.Arg = inst.Arg, inst.Out
+				matchOut, matchArg = matchArg, matchOut
+			}
+			if matchOut {
+				m[pc] = true
+				inst.Op = syntax.InstAltMatch
+			}
+
+			// build a dispatch operator from the two legs of the alt.
+			onePassRunes[pc], inst.Next = mergeRuneSets(
+				&onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg)
+			if len(inst.Next) > 0 && inst.Next[0] == mergeFailed {
+				// The two legs accept overlapping runes: ambiguous.
+				ok = false
+				break
+			}
+		case syntax.InstCapture, syntax.InstNop:
+			ok = check(inst.Out, m)
+			m[pc] = m[inst.Out]
+			// pass matching runes back through these no-ops.
+			onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
+			inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
+			for i := range inst.Next {
+				inst.Next[i] = inst.Out
+			}
+		case syntax.InstEmptyWidth:
+			// Handled exactly like InstCapture/InstNop above: inherit the
+			// successor's match flag and rune set.
+			ok = check(inst.Out, m)
+			m[pc] = m[inst.Out]
+			onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
+			inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
+			for i := range inst.Next {
+				inst.Next[i] = inst.Out
+			}
+		case syntax.InstMatch, syntax.InstFail:
+			m[pc] = inst.Op == syntax.InstMatch
+		case syntax.InstRune:
+			m[pc] = false
+			// A non-empty Next means this instruction was already rebuilt.
+			if len(inst.Next) > 0 {
+				break
+			}
+			// The successor is checked later, from the main loop.
+			instQueue.insert(inst.Out)
+			if len(inst.Rune) == 0 {
+				onePassRunes[pc] = []rune{}
+				inst.Next = []uint32{inst.Out}
+				break
+			}
+			runes := make([]rune, 0)
+			if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
+				// Expand a single case-folded rune into one r,r pair per
+				// rune in its SimpleFold orbit, sorted ascending.
+				r0 := inst.Rune[0]
+				runes = append(runes, r0, r0)
+				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
+					runes = append(runes, r1, r1)
+				}
+				sort.Sort(runeSlice(runes))
+			} else {
+				runes = append(runes, inst.Rune...)
+			}
+			onePassRunes[pc] = runes
+			inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
+			for i := range inst.Next {
+				inst.Next[i] = inst.Out
+			}
+			inst.Op = syntax.InstRune
+		case syntax.InstRune1:
+			m[pc] = false
+			// A non-empty Next means this instruction was already rebuilt.
+			if len(inst.Next) > 0 {
+				break
+			}
+			instQueue.insert(inst.Out)
+			runes := []rune{}
+			// expand case-folded runes
+			if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
+				r0 := inst.Rune[0]
+				runes = append(runes, r0, r0)
+				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
+					runes = append(runes, r1, r1)
+				}
+				sort.Sort(runeSlice(runes))
+			} else {
+				runes = append(runes, inst.Rune[0], inst.Rune[0])
+			}
+			onePassRunes[pc] = runes
+			inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
+			for i := range inst.Next {
+				inst.Next[i] = inst.Out
+			}
+			// Rewritten as a generic InstRune over the expanded pair list.
+			inst.Op = syntax.InstRune
+		case syntax.InstRuneAny:
+			m[pc] = false
+			// A non-empty Next means this instruction was already rebuilt.
+			if len(inst.Next) > 0 {
+				break
+			}
+			instQueue.insert(inst.Out)
+			onePassRunes[pc] = append([]rune{}, anyRune...)
+			inst.Next = []uint32{inst.Out}
+		case syntax.InstRuneAnyNotNL:
+			m[pc] = false
+			// A non-empty Next means this instruction was already rebuilt.
+			if len(inst.Next) > 0 {
+				break
+			}
+			instQueue.insert(inst.Out)
+			onePassRunes[pc] = append([]rune{}, anyRuneNotNL...)
+			inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
+			for i := range inst.Next {
+				inst.Next[i] = inst.Out
+			}
+		}
+		return
+	}
+
+	// Process the start instruction, then every pc queued by a rune
+	// instruction, until the queue drains or a check fails.
+	instQueue.clear()
+	instQueue.insert(uint32(p.Start))
+	m := make([]bool, len(p.Inst))
+	for !instQueue.empty() {
+		visitQueue.clear()
+		pc := instQueue.next()
+		if !check(pc, m) {
+			p = nil
+			break
+		}
+	}
+	if p != nil {
+		// Install the computed rune sets into the instructions.
+		for i := range p.Inst {
+			p.Inst[i].Rune = onePassRunes[i]
+		}
+	}
+	return p
+}
+
+// compileOnePass returns a *onePassProg suitable for onePass execution if the
+// original Prog can be recharacterized as a one-pass regexp program, or nil
+// if the Prog cannot be converted. For a one pass prog, the fundamental
+// condition that must be true is: at any InstAlt, there must be no ambiguity
+// about what branch to take.
+func compileOnePass(prog *syntax.Prog) (p *onePassProg) {
+	if prog.Start == 0 {
+		return nil
+	}
+
+	// A onepass regexp must be anchored at the beginning of the text.
+	start := &prog.Inst[prog.Start]
+	if start.Op != syntax.InstEmptyWidth ||
+		syntax.EmptyOp(start.Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText {
+		return nil
+	}
+
+	// The only instruction allowed to lead directly to InstMatch is an
+	// InstEmptyWidth that asserts EmptyEndText.
+	for pc := range prog.Inst {
+		inst := &prog.Inst[pc]
+		outIsMatch := prog.Inst[inst.Out].Op == syntax.InstMatch
+		switch inst.Op {
+		case syntax.InstAlt, syntax.InstAltMatch:
+			if outIsMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch {
+				return nil
+			}
+		case syntax.InstEmptyWidth:
+			if outIsMatch && syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText != syntax.EmptyEndText {
+				return nil
+			}
+		default:
+			if outIsMatch {
+				return nil
+			}
+		}
+	}
+
+	// Work on a slightly optimized copy of the original Prog, with some
+	// idioms that block valid onepass programs cleaned up, then check the
+	// InstAlts for ambiguity and build the onepass Prog if possible.
+	p = makeOnePass(onePassCopy(prog))
+	if p == nil {
+		return nil
+	}
+	cleanupOnePass(p, prog)
+	return p
+}
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-16 19:25:22 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-16 19:25:22 +0000
commit	f6ad4dcef54c5ce997a4bad5a6d86de229015700 (patch)
tree	7cfa4e31ace5c2bd95c72b154d15af494b2bcbef /src/regexp/onepass.go
parent	Initial commit. (diff)
download	golang-1.22-f6ad4dcef54c5ce997a4bad5a6d86de229015700.tar.xz golang-1.22-f6ad4dcef54c5ce997a4bad5a6d86de229015700.zip