From 43a123c1ae6613b3efeed291fa552ecd909d3acf Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Tue, 16 Apr 2024 21:23:18 +0200 Subject: Adding upstream version 1.20.14. Signed-off-by: Daniel Baumann --- src/strings/builder.go | 117 +++ src/strings/builder_test.go | 371 ++++++++ src/strings/clone.go | 28 + src/strings/clone_test.go | 45 + src/strings/compare.go | 28 + src/strings/compare_test.go | 119 +++ src/strings/example_test.go | 404 +++++++++ src/strings/export_test.go | 47 + src/strings/reader.go | 160 ++++ src/strings/reader_test.go | 233 +++++ src/strings/replace.go | 578 ++++++++++++ src/strings/replace_test.go | 583 ++++++++++++ src/strings/search.go | 124 +++ src/strings/search_test.go | 90 ++ src/strings/strings.go | 1289 +++++++++++++++++++++++++++ src/strings/strings_test.go | 2042 +++++++++++++++++++++++++++++++++++++++++++ 16 files changed, 6258 insertions(+) create mode 100644 src/strings/builder.go create mode 100644 src/strings/builder_test.go create mode 100644 src/strings/clone.go create mode 100644 src/strings/clone_test.go create mode 100644 src/strings/compare.go create mode 100644 src/strings/compare_test.go create mode 100644 src/strings/example_test.go create mode 100644 src/strings/export_test.go create mode 100644 src/strings/reader.go create mode 100644 src/strings/reader_test.go create mode 100644 src/strings/replace.go create mode 100644 src/strings/replace_test.go create mode 100644 src/strings/search.go create mode 100644 src/strings/search_test.go create mode 100644 src/strings/strings.go create mode 100644 src/strings/strings_test.go (limited to 'src/strings') diff --git a/src/strings/builder.go b/src/strings/builder.go new file mode 100644 index 0000000..7710464 --- /dev/null +++ b/src/strings/builder.go @@ -0,0 +1,117 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package strings + +import ( + "unicode/utf8" + "unsafe" +) + +// A Builder is used to efficiently build a string using Write methods. +// It minimizes memory copying. The zero value is ready to use. +// Do not copy a non-zero Builder. +type Builder struct { + addr *Builder // of receiver, to detect copies by value + buf []byte +} + +// noescape hides a pointer from escape analysis. It is the identity function +// but escape analysis doesn't think the output depends on the input. +// noescape is inlined and currently compiles down to zero instructions. +// USE CAREFULLY! +// This was copied from the runtime; see issues 23382 and 7921. +// +//go:nosplit +//go:nocheckptr +func noescape(p unsafe.Pointer) unsafe.Pointer { + x := uintptr(p) + return unsafe.Pointer(x ^ 0) +} + +func (b *Builder) copyCheck() { + if b.addr == nil { + // This hack works around a failing of Go's escape analysis + // that was causing b to escape and be heap allocated. + // See issue 23382. + // TODO: once issue 7921 is fixed, this should be reverted to + // just "b.addr = b". + b.addr = (*Builder)(noescape(unsafe.Pointer(b))) + } else if b.addr != b { + panic("strings: illegal use of non-zero Builder copied by value") + } +} + +// String returns the accumulated string. +func (b *Builder) String() string { + return unsafe.String(unsafe.SliceData(b.buf), len(b.buf)) +} + +// Len returns the number of accumulated bytes; b.Len() == len(b.String()). +func (b *Builder) Len() int { return len(b.buf) } + +// Cap returns the capacity of the builder's underlying byte slice. It is the +// total space allocated for the string being built and includes any bytes +// already written. +func (b *Builder) Cap() int { return cap(b.buf) } + +// Reset resets the Builder to be empty. +func (b *Builder) Reset() { + b.addr = nil + b.buf = nil +} + +// grow copies the buffer to a new, larger buffer so that there are at least n +// bytes of capacity beyond len(b.buf). +func (b *Builder) grow(n int) { + buf := make([]byte, len(b.buf), 2*cap(b.buf)+n) + copy(buf, b.buf) + b.buf = buf +} + +// Grow grows b's capacity, if necessary, to guarantee space for +// another n bytes. After Grow(n), at least n bytes can be written to b +// without another allocation. If n is negative, Grow panics. +func (b *Builder) Grow(n int) { + b.copyCheck() + if n < 0 { + panic("strings.Builder.Grow: negative count") + } + if cap(b.buf)-len(b.buf) < n { + b.grow(n) + } +} + +// Write appends the contents of p to b's buffer. +// Write always returns len(p), nil. +func (b *Builder) Write(p []byte) (int, error) { + b.copyCheck() + b.buf = append(b.buf, p...) + return len(p), nil +} + +// WriteByte appends the byte c to b's buffer. +// The returned error is always nil. +func (b *Builder) WriteByte(c byte) error { + b.copyCheck() + b.buf = append(b.buf, c) + return nil +} + +// WriteRune appends the UTF-8 encoding of Unicode code point r to b's buffer. +// It returns the length of r and a nil error. +func (b *Builder) WriteRune(r rune) (int, error) { + b.copyCheck() + n := len(b.buf) + b.buf = utf8.AppendRune(b.buf, r) + return len(b.buf) - n, nil +} + +// WriteString appends the contents of s to b's buffer. +// It returns the length of s and a nil error. +func (b *Builder) WriteString(s string) (int, error) { + b.copyCheck() + b.buf = append(b.buf, s...) + return len(s), nil +} diff --git a/src/strings/builder_test.go b/src/strings/builder_test.go new file mode 100644 index 0000000..dbc2c19 --- /dev/null +++ b/src/strings/builder_test.go @@ -0,0 +1,371 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package strings_test + +import ( + "bytes" + . "strings" + "testing" + "unicode/utf8" +) + +func check(t *testing.T, b *Builder, want string) { + t.Helper() + got := b.String() + if got != want { + t.Errorf("String: got %#q; want %#q", got, want) + return + } + if n := b.Len(); n != len(got) { + t.Errorf("Len: got %d; but len(String()) is %d", n, len(got)) + } + if n := b.Cap(); n < len(got) { + t.Errorf("Cap: got %d; but len(String()) is %d", n, len(got)) + } +} + +func TestBuilder(t *testing.T) { + var b Builder + check(t, &b, "") + n, err := b.WriteString("hello") + if err != nil || n != 5 { + t.Errorf("WriteString: got %d,%s; want 5,nil", n, err) + } + check(t, &b, "hello") + if err = b.WriteByte(' '); err != nil { + t.Errorf("WriteByte: %s", err) + } + check(t, &b, "hello ") + n, err = b.WriteString("world") + if err != nil || n != 5 { + t.Errorf("WriteString: got %d,%s; want 5,nil", n, err) + } + check(t, &b, "hello world") +} + +func TestBuilderString(t *testing.T) { + var b Builder + b.WriteString("alpha") + check(t, &b, "alpha") + s1 := b.String() + b.WriteString("beta") + check(t, &b, "alphabeta") + s2 := b.String() + b.WriteString("gamma") + check(t, &b, "alphabetagamma") + s3 := b.String() + + // Check that subsequent operations didn't change the returned strings. + if want := "alpha"; s1 != want { + t.Errorf("first String result is now %q; want %q", s1, want) + } + if want := "alphabeta"; s2 != want { + t.Errorf("second String result is now %q; want %q", s2, want) + } + if want := "alphabetagamma"; s3 != want { + t.Errorf("third String result is now %q; want %q", s3, want) + } +} + +func TestBuilderReset(t *testing.T) { + var b Builder + check(t, &b, "") + b.WriteString("aaa") + s := b.String() + check(t, &b, "aaa") + b.Reset() + check(t, &b, "") + + // Ensure that writing after Reset doesn't alter + // previously returned strings. + b.WriteString("bbb") + check(t, &b, "bbb") + if want := "aaa"; s != want { + t.Errorf("previous String result changed after Reset: got %q; want %q", s, want) + } +} + +func TestBuilderGrow(t *testing.T) { + for _, growLen := range []int{0, 100, 1000, 10000, 100000} { + p := bytes.Repeat([]byte{'a'}, growLen) + allocs := testing.AllocsPerRun(100, func() { + var b Builder + b.Grow(growLen) // should be only alloc, when growLen > 0 + if b.Cap() < growLen { + t.Fatalf("growLen=%d: Cap() is lower than growLen", growLen) + } + b.Write(p) + if b.String() != string(p) { + t.Fatalf("growLen=%d: bad data written after Grow", growLen) + } + }) + wantAllocs := 1 + if growLen == 0 { + wantAllocs = 0 + } + if g, w := int(allocs), wantAllocs; g != w { + t.Errorf("growLen=%d: got %d allocs during Write; want %v", growLen, g, w) + } + } + // when growLen < 0, should panic + var a Builder + n := -1 + defer func() { + if r := recover(); r == nil { + t.Errorf("a.Grow(%d) should panic()", n) + } + }() + a.Grow(n) +} + +func TestBuilderWrite2(t *testing.T) { + const s0 = "hello 世界" + for _, tt := range []struct { + name string + fn func(b *Builder) (int, error) + n int + want string + }{ + { + "Write", + func(b *Builder) (int, error) { return b.Write([]byte(s0)) }, + len(s0), + s0, + }, + { + "WriteRune", + func(b *Builder) (int, error) { return b.WriteRune('a') }, + 1, + "a", + }, + { + "WriteRuneWide", + func(b *Builder) (int, error) { return b.WriteRune('世') }, + 3, + "世", + }, + { + "WriteString", + func(b *Builder) (int, error) { return b.WriteString(s0) }, + len(s0), + s0, + }, + } { + t.Run(tt.name, func(t *testing.T) { + var b Builder + n, err := tt.fn(&b) + if err != nil { + t.Fatalf("first call: got %s", err) + } + if n != tt.n { + t.Errorf("first call: got n=%d; want %d", n, tt.n) + } + check(t, &b, tt.want) + + n, err = tt.fn(&b) + if err != nil { + t.Fatalf("second call: got %s", err) + } + if n != tt.n { + t.Errorf("second call: got n=%d; want %d", n, tt.n) + } + check(t, &b, tt.want+tt.want) + }) + } +} + +func TestBuilderWriteByte(t *testing.T) { + var b Builder + if err := b.WriteByte('a'); err != nil { + t.Error(err) + } + if err := b.WriteByte(0); err != nil { + t.Error(err) + } + check(t, &b, "a\x00") +} + +func TestBuilderAllocs(t *testing.T) { + // Issue 23382; verify that copyCheck doesn't force the + // Builder to escape and be heap allocated. + n := testing.AllocsPerRun(10000, func() { + var b Builder + b.Grow(5) + b.WriteString("abcde") + _ = b.String() + }) + if n != 1 { + t.Errorf("Builder allocs = %v; want 1", n) + } +} + +func TestBuilderCopyPanic(t *testing.T) { + tests := []struct { + name string + fn func() + wantPanic bool + }{ + { + name: "String", + wantPanic: false, + fn: func() { + var a Builder + a.WriteByte('x') + b := a + _ = b.String() // appease vet + }, + }, + { + name: "Len", + wantPanic: false, + fn: func() { + var a Builder + a.WriteByte('x') + b := a + b.Len() + }, + }, + { + name: "Cap", + wantPanic: false, + fn: func() { + var a Builder + a.WriteByte('x') + b := a + b.Cap() + }, + }, + { + name: "Reset", + wantPanic: false, + fn: func() { + var a Builder + a.WriteByte('x') + b := a + b.Reset() + b.WriteByte('y') + }, + }, + { + name: "Write", + wantPanic: true, + fn: func() { + var a Builder + a.Write([]byte("x")) + b := a + b.Write([]byte("y")) + }, + }, + { + name: "WriteByte", + wantPanic: true, + fn: func() { + var a Builder + a.WriteByte('x') + b := a + b.WriteByte('y') + }, + }, + { + name: "WriteString", + wantPanic: true, + fn: func() { + var a Builder + a.WriteString("x") + b := a + b.WriteString("y") + }, + }, + { + name: "WriteRune", + wantPanic: true, + fn: func() { + var a Builder + a.WriteRune('x') + b := a + b.WriteRune('y') + }, + }, + { + name: "Grow", + wantPanic: true, + fn: func() { + var a Builder + a.Grow(1) + b := a + b.Grow(2) + }, + }, + } + for _, tt := range tests { + didPanic := make(chan bool) + go func() { + defer func() { didPanic <- recover() != nil }() + tt.fn() + }() + if got := <-didPanic; got != tt.wantPanic { + t.Errorf("%s: panicked = %v; want %v", tt.name, got, tt.wantPanic) + } + } +} + +func TestBuilderWriteInvalidRune(t *testing.T) { + // Invalid runes, including negative ones, should be written as + // utf8.RuneError. + for _, r := range []rune{-1, utf8.MaxRune + 1} { + var b Builder + b.WriteRune(r) + check(t, &b, "\uFFFD") + } +} + +var someBytes = []byte("some bytes sdljlk jsklj3lkjlk djlkjw") + +var sinkS string + +func benchmarkBuilder(b *testing.B, f func(b *testing.B, numWrite int, grow bool)) { + b.Run("1Write_NoGrow", func(b *testing.B) { + b.ReportAllocs() + f(b, 1, false) + }) + b.Run("3Write_NoGrow", func(b *testing.B) { + b.ReportAllocs() + f(b, 3, false) + }) + b.Run("3Write_Grow", func(b *testing.B) { + b.ReportAllocs() + f(b, 3, true) + }) +} + +func BenchmarkBuildString_Builder(b *testing.B) { + benchmarkBuilder(b, func(b *testing.B, numWrite int, grow bool) { + for i := 0; i < b.N; i++ { + var buf Builder + if grow { + buf.Grow(len(someBytes) * numWrite) + } + for i := 0; i < numWrite; i++ { + buf.Write(someBytes) + } + sinkS = buf.String() + } + }) +} + +func BenchmarkBuildString_ByteBuffer(b *testing.B) { + benchmarkBuilder(b, func(b *testing.B, numWrite int, grow bool) { + for i := 0; i < b.N; i++ { + var buf bytes.Buffer + if grow { + buf.Grow(len(someBytes) * numWrite) + } + for i := 0; i < numWrite; i++ { + buf.Write(someBytes) + } + sinkS = buf.String() + } + }) +} diff --git a/src/strings/clone.go b/src/strings/clone.go new file mode 100644 index 0000000..d14df11 --- /dev/null +++ b/src/strings/clone.go @@ -0,0 +1,28 @@ +// Copyright 2021 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package strings + +import ( + "unsafe" +) + +// Clone returns a fresh copy of s. +// It guarantees to make a copy of s into a new allocation, +// which can be important when retaining only a small substring +// of a much larger string. Using Clone can help such programs +// use less memory. Of course, since using Clone makes a copy, +// overuse of Clone can make programs use more memory. +// Clone should typically be used only rarely, and only when +// profiling indicates that it is needed. +// For strings of length zero the string "" will be returned +// and no allocation is made. +func Clone(s string) string { + if len(s) == 0 { + return "" + } + b := make([]byte, len(s)) + copy(b, s) + return unsafe.String(&b[0], len(b)) +} diff --git a/src/strings/clone_test.go b/src/strings/clone_test.go new file mode 100644 index 0000000..64f2760 --- /dev/null +++ b/src/strings/clone_test.go @@ -0,0 +1,45 @@ +// Copyright 2021 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package strings_test + +import ( + "strings" + "testing" + "unsafe" +) + +var emptyString string + +func TestClone(t *testing.T) { + var cloneTests = []string{ + "", + strings.Clone(""), + strings.Repeat("a", 42)[:0], + "short", + strings.Repeat("a", 42), + } + for _, input := range cloneTests { + clone := strings.Clone(input) + if clone != input { + t.Errorf("Clone(%q) = %q; want %q", input, clone, input) + } + + if len(input) != 0 && unsafe.StringData(clone) == unsafe.StringData(input) { + t.Errorf("Clone(%q) return value should not reference inputs backing memory.", input) + } + + if len(input) == 0 && unsafe.StringData(clone) != unsafe.StringData(emptyString) { + t.Errorf("Clone(%#v) return value should be equal to empty string.", unsafe.StringData(input)) + } + } +} + +func BenchmarkClone(b *testing.B) { + var str = strings.Repeat("a", 42) + b.ReportAllocs() + for i := 0; i < b.N; i++ { + stringSink = strings.Clone(str) + } +} diff --git a/src/strings/compare.go b/src/strings/compare.go new file mode 100644 index 0000000..2bd4a24 --- /dev/null +++ b/src/strings/compare.go @@ -0,0 +1,28 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package strings + +// Compare returns an integer comparing two strings lexicographically. +// The result will be 0 if a == b, -1 if a < b, and +1 if a > b. +// +// Compare is included only for symmetry with package bytes. +// It is usually clearer and always faster to use the built-in +// string comparison operators ==, <, >, and so on. +func Compare(a, b string) int { + // NOTE(rsc): This function does NOT call the runtime cmpstring function, + // because we do not want to provide any performance justification for + // using strings.Compare. Basically no one should use strings.Compare. + // As the comment above says, it is here only for symmetry with package bytes. + // If performance is important, the compiler should be changed to recognize + // the pattern so that all code doing three-way comparisons, not just code + // using strings.Compare, can benefit. + if a == b { + return 0 + } + if a < b { + return -1 + } + return +1 +} diff --git a/src/strings/compare_test.go b/src/strings/compare_test.go new file mode 100644 index 0000000..a435784 --- /dev/null +++ b/src/strings/compare_test.go @@ -0,0 +1,119 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package strings_test + +// Derived from bytes/compare_test.go. +// Benchmarks omitted since the underlying implementation is identical. + +import ( + "internal/testenv" + . "strings" + "testing" + "unsafe" +) + +var compareTests = []struct { + a, b string + i int +}{ + {"", "", 0}, + {"a", "", 1}, + {"", "a", -1}, + {"abc", "abc", 0}, + {"ab", "abc", -1}, + {"abc", "ab", 1}, + {"x", "ab", 1}, + {"ab", "x", -1}, + {"x", "a", 1}, + {"b", "x", -1}, + // test runtime·memeq's chunked implementation + {"abcdefgh", "abcdefgh", 0}, + {"abcdefghi", "abcdefghi", 0}, + {"abcdefghi", "abcdefghj", -1}, +} + +func TestCompare(t *testing.T) { + for _, tt := range compareTests { + cmp := Compare(tt.a, tt.b) + if cmp != tt.i { + t.Errorf(`Compare(%q, %q) = %v`, tt.a, tt.b, cmp) + } + } +} + +func TestCompareIdenticalString(t *testing.T) { + var s = "Hello Gophers!" + if Compare(s, s) != 0 { + t.Error("s != s") + } + if Compare(s, s[:1]) != 1 { + t.Error("s > s[:1] failed") + } +} + +func TestCompareStrings(t *testing.T) { + // unsafeString converts a []byte to a string with no allocation. + // The caller must not modify b while the result string is in use. + unsafeString := func(b []byte) string { + return unsafe.String(unsafe.SliceData(b), len(b)) + } + + lengths := make([]int, 0) // lengths to test in ascending order + for i := 0; i <= 128; i++ { + lengths = append(lengths, i) + } + lengths = append(lengths, 256, 512, 1024, 1333, 4095, 4096, 4097) + + if !testing.Short() || testenv.Builder() != "" { + lengths = append(lengths, 65535, 65536, 65537, 99999) + } + + n := lengths[len(lengths)-1] + a := make([]byte, n+1) + b := make([]byte, n+1) + lastLen := 0 + for _, len := range lengths { + // randomish but deterministic data. No 0 or 255. + for i := 0; i < len; i++ { + a[i] = byte(1 + 31*i%254) + b[i] = byte(1 + 31*i%254) + } + // data past the end is different + for i := len; i <= n; i++ { + a[i] = 8 + b[i] = 9 + } + + sa, sb := unsafeString(a), unsafeString(b) + cmp := Compare(sa[:len], sb[:len]) + if cmp != 0 { + t.Errorf(`CompareIdentical(%d) = %d`, len, cmp) + } + if len > 0 { + cmp = Compare(sa[:len-1], sb[:len]) + if cmp != -1 { + t.Errorf(`CompareAshorter(%d) = %d`, len, cmp) + } + cmp = Compare(sa[:len], sb[:len-1]) + if cmp != 1 { + t.Errorf(`CompareBshorter(%d) = %d`, len, cmp) + } + } + for k := lastLen; k < len; k++ { + b[k] = a[k] - 1 + cmp = Compare(unsafeString(a[:len]), unsafeString(b[:len])) + if cmp != 1 { + t.Errorf(`CompareAbigger(%d,%d) = %d`, len, k, cmp) + } + b[k] = a[k] + 1 + cmp = Compare(unsafeString(a[:len]), unsafeString(b[:len])) + if cmp != -1 { + t.Errorf(`CompareBbigger(%d,%d) = %d`, len, k, cmp) + } + b[k] = a[k] + } + lastLen = len + } +} diff --git a/src/strings/example_test.go b/src/strings/example_test.go new file mode 100644 index 0000000..2a59512 --- /dev/null +++ b/src/strings/example_test.go @@ -0,0 +1,404 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package strings_test + +import ( + "fmt" + "strings" + "unicode" +) + +func ExampleBuilder() { + var b strings.Builder + for i := 3; i >= 1; i-- { + fmt.Fprintf(&b, "%d...", i) + } + b.WriteString("ignition") + fmt.Println(b.String()) + + // Output: 3...2...1...ignition +} + +func ExampleCompare() { + fmt.Println(strings.Compare("a", "b")) + fmt.Println(strings.Compare("a", "a")) + fmt.Println(strings.Compare("b", "a")) + // Output: + // -1 + // 0 + // 1 +} + +func ExampleContains() { + fmt.Println(strings.Contains("seafood", "foo")) + fmt.Println(strings.Contains("seafood", "bar")) + fmt.Println(strings.Contains("seafood", "")) + fmt.Println(strings.Contains("", "")) + // Output: + // true + // false + // true + // true +} + +func ExampleContainsAny() { + fmt.Println(strings.ContainsAny("team", "i")) + fmt.Println(strings.ContainsAny("fail", "ui")) + fmt.Println(strings.ContainsAny("ure", "ui")) + fmt.Println(strings.ContainsAny("failure", "ui")) + fmt.Println(strings.ContainsAny("foo", "")) + fmt.Println(strings.ContainsAny("", "")) + // Output: + // false + // true + // true + // true + // false + // false +} + +func ExampleContainsRune() { + // Finds whether a string contains a particular Unicode code point. + // The code point for the lowercase letter "a", for example, is 97. + fmt.Println(strings.ContainsRune("aardvark", 97)) + fmt.Println(strings.ContainsRune("timeout", 97)) + // Output: + // true + // false +} + +func ExampleCount() { + fmt.Println(strings.Count("cheese", "e")) + fmt.Println(strings.Count("five", "")) // before & after each rune + // Output: + // 3 + // 5 +} + +func ExampleCut() { + show := func(s, sep string) { + before, after, found := strings.Cut(s, sep) + fmt.Printf("Cut(%q, %q) = %q, %q, %v\n", s, sep, before, after, found) + } + show("Gopher", "Go") + show("Gopher", "ph") + show("Gopher", "er") + show("Gopher", "Badger") + // Output: + // Cut("Gopher", "Go") = "", "pher", true + // Cut("Gopher", "ph") = "Go", "er", true + // Cut("Gopher", "er") = "Goph", "", true + // Cut("Gopher", "Badger") = "Gopher", "", false +} + +func ExampleEqualFold() { + fmt.Println(strings.EqualFold("Go", "go")) + fmt.Println(strings.EqualFold("AB", "ab")) // true because comparison uses simple case-folding + fmt.Println(strings.EqualFold("ß", "ss")) // false because comparison does not use full case-folding + // Output: + // true + // true + // false +} + +func ExampleFields() { + fmt.Printf("Fields are: %q", strings.Fields(" foo bar baz ")) + // Output: Fields are: ["foo" "bar" "baz"] +} + +func ExampleFieldsFunc() { + f := func(c rune) bool { + return !unicode.IsLetter(c) && !unicode.IsNumber(c) + } + fmt.Printf("Fields are: %q", strings.FieldsFunc(" foo1;bar2,baz3...", f)) + // Output: Fields are: ["foo1" "bar2" "baz3"] +} + +func ExampleHasPrefix() { + fmt.Println(strings.HasPrefix("Gopher", "Go")) + fmt.Println(strings.HasPrefix("Gopher", "C")) + fmt.Println(strings.HasPrefix("Gopher", "")) + // Output: + // true + // false + // true +} + +func ExampleHasSuffix() { + fmt.Println(strings.HasSuffix("Amigo", "go")) + fmt.Println(strings.HasSuffix("Amigo", "O")) + fmt.Println(strings.HasSuffix("Amigo", "Ami")) + fmt.Println(strings.HasSuffix("Amigo", "")) + // Output: + // true + // false + // false + // true +} + +func ExampleIndex() { + fmt.Println(strings.Index("chicken", "ken")) + fmt.Println(strings.Index("chicken", "dmr")) + // Output: + // 4 + // -1 +} + +func ExampleIndexFunc() { + f := func(c rune) bool { + return unicode.Is(unicode.Han, c) + } + fmt.Println(strings.IndexFunc("Hello, 世界", f)) + fmt.Println(strings.IndexFunc("Hello, world", f)) + // Output: + // 7 + // -1 +} + +func ExampleIndexAny() { + fmt.Println(strings.IndexAny("chicken", "aeiouy")) + fmt.Println(strings.IndexAny("crwth", "aeiouy")) + // Output: + // 2 + // -1 +} + +func ExampleIndexByte() { + fmt.Println(strings.IndexByte("golang", 'g')) + fmt.Println(strings.IndexByte("gophers", 'h')) + fmt.Println(strings.IndexByte("golang", 'x')) + // Output: + // 0 + // 3 + // -1 +} +func ExampleIndexRune() { + fmt.Println(strings.IndexRune("chicken", 'k')) + fmt.Println(strings.IndexRune("chicken", 'd')) + // Output: + // 4 + // -1 +} + +func ExampleLastIndex() { + fmt.Println(strings.Index("go gopher", "go")) + fmt.Println(strings.LastIndex("go gopher", "go")) + fmt.Println(strings.LastIndex("go gopher", "rodent")) + // Output: + // 0 + // 3 + // -1 +} + +func ExampleLastIndexAny() { + fmt.Println(strings.LastIndexAny("go gopher", "go")) + fmt.Println(strings.LastIndexAny("go gopher", "rodent")) + fmt.Println(strings.LastIndexAny("go gopher", "fail")) + // Output: + // 4 + // 8 + // -1 +} + +func ExampleLastIndexByte() { + fmt.Println(strings.LastIndexByte("Hello, world", 'l')) + fmt.Println(strings.LastIndexByte("Hello, world", 'o')) + fmt.Println(strings.LastIndexByte("Hello, world", 'x')) + // Output: + // 10 + // 8 + // -1 +} + +func ExampleLastIndexFunc() { + fmt.Println(strings.LastIndexFunc("go 123", unicode.IsNumber)) + fmt.Println(strings.LastIndexFunc("123 go", unicode.IsNumber)) + fmt.Println(strings.LastIndexFunc("go", unicode.IsNumber)) + // Output: + // 5 + // 2 + // -1 +} + +func ExampleJoin() { + s := []string{"foo", "bar", "baz"} + fmt.Println(strings.Join(s, ", ")) + // Output: foo, bar, baz +} + +func ExampleRepeat() { + fmt.Println("ba" + strings.Repeat("na", 2)) + // Output: banana +} + +func ExampleReplace() { + fmt.Println(strings.Replace("oink oink oink", "k", "ky", 2)) + fmt.Println(strings.Replace("oink oink oink", "oink", "moo", -1)) + // Output: + // oinky oinky oink + // moo moo moo +} + +func ExampleReplaceAll() { + fmt.Println(strings.ReplaceAll("oink oink oink", "oink", "moo")) + // Output: + // moo moo moo +} + +func ExampleSplit() { + fmt.Printf("%q\n", strings.Split("a,b,c", ",")) + fmt.Printf("%q\n", strings.Split("a man a plan a canal panama", "a ")) + fmt.Printf("%q\n", strings.Split(" xyz ", "")) + fmt.Printf("%q\n", strings.Split("", "Bernardo O'Higgins")) + // Output: + // ["a" "b" "c"] + // ["" "man " "plan " "canal panama"] + // [" " "x" "y" "z" " "] + // [""] +} + +func ExampleSplitN() { + fmt.Printf("%q\n", strings.SplitN("a,b,c", ",", 2)) + z := strings.SplitN("a,b,c", ",", 0) + fmt.Printf("%q (nil = %v)\n", z, z == nil) + // Output: + // ["a" "b,c"] + // [] (nil = true) +} + +func ExampleSplitAfter() { + fmt.Printf("%q\n", strings.SplitAfter("a,b,c", ",")) + // Output: ["a," "b," "c"] +} + +func ExampleSplitAfterN() { + fmt.Printf("%q\n", strings.SplitAfterN("a,b,c", ",", 2)) + // Output: ["a," "b,c"] +} + +func ExampleTitle() { + // Compare this example to the ToTitle example. + fmt.Println(strings.Title("her royal highness")) + fmt.Println(strings.Title("loud noises")) + fmt.Println(strings.Title("хлеб")) + // Output: + // Her Royal Highness + // Loud Noises + // Хлеб +} + +func ExampleToTitle() { + // Compare this example to the Title example. + fmt.Println(strings.ToTitle("her royal highness")) + fmt.Println(strings.ToTitle("loud noises")) + fmt.Println(strings.ToTitle("хлеб")) + // Output: + // HER ROYAL HIGHNESS + // LOUD NOISES + // ХЛЕБ +} + +func ExampleToTitleSpecial() { + fmt.Println(strings.ToTitleSpecial(unicode.TurkishCase, "dünyanın ilk borsa yapısı Aizonai kabul edilir")) + // Output: + // DÜNYANIN İLK BORSA YAPISI AİZONAİ KABUL EDİLİR +} + +func ExampleMap() { + rot13 := func(r rune) rune { + switch { + case r >= 'A' && r <= 'Z': + return 'A' + (r-'A'+13)%26 + case r >= 'a' && r <= 'z': + return 'a' + (r-'a'+13)%26 + } + return r + } + fmt.Println(strings.Map(rot13, "'Twas brillig and the slithy gopher...")) + // Output: 'Gjnf oevyyvt naq gur fyvgul tbcure... +} + +func ExampleNewReplacer() { + r := strings.NewReplacer("<", "<", ">", ">") + fmt.Println(r.Replace("This is HTML!")) + // Output: This is <b>HTML</b>! +} + +func ExampleToUpper() { + fmt.Println(strings.ToUpper("Gopher")) + // Output: GOPHER +} + +func ExampleToUpperSpecial() { + fmt.Println(strings.ToUpperSpecial(unicode.TurkishCase, "örnek iş")) + // Output: ÖRNEK İŞ +} + +func ExampleToLower() { + fmt.Println(strings.ToLower("Gopher")) + // Output: gopher +} + +func ExampleToLowerSpecial() { + fmt.Println(strings.ToLowerSpecial(unicode.TurkishCase, "Önnek İş")) + // Output: önnek iş +} + +func ExampleTrim() { + fmt.Print(strings.Trim("¡¡¡Hello, Gophers!!!", "!¡")) + // Output: Hello, Gophers +} + +func ExampleTrimSpace() { + fmt.Println(strings.TrimSpace(" \t\n Hello, Gophers \n\t\r\n")) + // Output: Hello, Gophers +} + +func ExampleTrimPrefix() { + var s = "¡¡¡Hello, Gophers!!!" + s = strings.TrimPrefix(s, "¡¡¡Hello, ") + s = strings.TrimPrefix(s, "¡¡¡Howdy, ") + fmt.Print(s) + // Output: Gophers!!! +} + +func ExampleTrimSuffix() { + var s = "¡¡¡Hello, Gophers!!!" + s = strings.TrimSuffix(s, ", Gophers!!!") + s = strings.TrimSuffix(s, ", Marmots!!!") + fmt.Print(s) + // Output: ¡¡¡Hello +} + +func ExampleTrimFunc() { + fmt.Print(strings.TrimFunc("¡¡¡Hello, Gophers!!!", func(r rune) bool { + return !unicode.IsLetter(r) && !unicode.IsNumber(r) + })) + // Output: Hello, Gophers +} + +func ExampleTrimLeft() { + fmt.Print(strings.TrimLeft("¡¡¡Hello, Gophers!!!", "!¡")) + // Output: Hello, Gophers!!! +} + +func ExampleTrimLeftFunc() { + fmt.Print(strings.TrimLeftFunc("¡¡¡Hello, Gophers!!!", func(r rune) bool { + return !unicode.IsLetter(r) && !unicode.IsNumber(r) + })) + // Output: Hello, Gophers!!! +} + +func ExampleTrimRight() { + fmt.Print(strings.TrimRight("¡¡¡Hello, Gophers!!!", "!¡")) + // Output: ¡¡¡Hello, Gophers +} + +func ExampleTrimRightFunc() { + fmt.Print(strings.TrimRightFunc("¡¡¡Hello, Gophers!!!", func(r rune) bool { + return !unicode.IsLetter(r) && !unicode.IsNumber(r) + })) + // Output: ¡¡¡Hello, Gophers +} diff --git a/src/strings/export_test.go b/src/strings/export_test.go new file mode 100644 index 0000000..81d1cab --- /dev/null +++ b/src/strings/export_test.go @@ -0,0 +1,47 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package strings + +func (r *Replacer) Replacer() any { + r.once.Do(r.buildOnce) + return r.r +} + +func (r *Replacer) PrintTrie() string { + r.once.Do(r.buildOnce) + gen := r.r.(*genericReplacer) + return gen.printNode(&gen.root, 0) +} + +func (r *genericReplacer) printNode(t *trieNode, depth int) (s string) { + if t.priority > 0 { + s += "+" + } else { + s += "-" + } + s += "\n" + + if t.prefix != "" { + s += Repeat(".", depth) + t.prefix + s += r.printNode(t.next, depth+len(t.prefix)) + } else if t.table != nil { + for b, m := range r.mapping { + if int(m) != r.tableSize && t.table[m] != nil { + s += Repeat(".", depth) + string([]byte{byte(b)}) + s += r.printNode(t.table[m], depth+1) + } + } + } + return +} + +func StringFind(pattern, text string) int { + return makeStringFinder(pattern).next(text) +} + +func DumpTables(pattern string) ([]int, []int) { + finder := makeStringFinder(pattern) + return finder.badCharSkip[:], finder.goodSuffixSkip +} diff --git a/src/strings/reader.go b/src/strings/reader.go new file mode 100644 index 0000000..6f069a6 --- /dev/null +++ b/src/strings/reader.go @@ -0,0 +1,160 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package strings + +import ( + "errors" + "io" + "unicode/utf8" +) + +// A Reader implements the io.Reader, io.ReaderAt, io.ByteReader, io.ByteScanner, +// io.RuneReader, io.RuneScanner, io.Seeker, and io.WriterTo interfaces by reading +// from a string. +// The zero value for Reader operates like a Reader of an empty string. +type Reader struct { + s string + i int64 // current reading index + prevRune int // index of previous rune; or < 0 +} + +// Len returns the number of bytes of the unread portion of the +// string. +func (r *Reader) Len() int { + if r.i >= int64(len(r.s)) { + return 0 + } + return int(int64(len(r.s)) - r.i) +} + +// Size returns the original length of the underlying string. +// Size is the number of bytes available for reading via ReadAt. +// The returned value is always the same and is not affected by calls +// to any other method. +func (r *Reader) Size() int64 { return int64(len(r.s)) } + +// Read implements the io.Reader interface. +func (r *Reader) Read(b []byte) (n int, err error) { + if r.i >= int64(len(r.s)) { + return 0, io.EOF + } + r.prevRune = -1 + n = copy(b, r.s[r.i:]) + r.i += int64(n) + return +} + +// ReadAt implements the io.ReaderAt interface. +func (r *Reader) ReadAt(b []byte, off int64) (n int, err error) { + // cannot modify state - see io.ReaderAt + if off < 0 { + return 0, errors.New("strings.Reader.ReadAt: negative offset") + } + if off >= int64(len(r.s)) { + return 0, io.EOF + } + n = copy(b, r.s[off:]) + if n < len(b) { + err = io.EOF + } + return +} + +// ReadByte implements the io.ByteReader interface. +func (r *Reader) ReadByte() (byte, error) { + r.prevRune = -1 + if r.i >= int64(len(r.s)) { + return 0, io.EOF + } + b := r.s[r.i] + r.i++ + return b, nil +} + +// UnreadByte implements the io.ByteScanner interface. +func (r *Reader) UnreadByte() error { + if r.i <= 0 { + return errors.New("strings.Reader.UnreadByte: at beginning of string") + } + r.prevRune = -1 + r.i-- + return nil +} + +// ReadRune implements the io.RuneReader interface. +func (r *Reader) ReadRune() (ch rune, size int, err error) { + if r.i >= int64(len(r.s)) { + r.prevRune = -1 + return 0, 0, io.EOF + } + r.prevRune = int(r.i) + if c := r.s[r.i]; c < utf8.RuneSelf { + r.i++ + return rune(c), 1, nil + } + ch, size = utf8.DecodeRuneInString(r.s[r.i:]) + r.i += int64(size) + return +} + +// UnreadRune implements the io.RuneScanner interface. +func (r *Reader) UnreadRune() error { + if r.i <= 0 { + return errors.New("strings.Reader.UnreadRune: at beginning of string") + } + if r.prevRune < 0 { + return errors.New("strings.Reader.UnreadRune: previous operation was not ReadRune") + } + r.i = int64(r.prevRune) + r.prevRune = -1 + return nil +} + +// Seek implements the io.Seeker interface. +func (r *Reader) Seek(offset int64, whence int) (int64, error) { + r.prevRune = -1 + var abs int64 + switch whence { + case io.SeekStart: + abs = offset + case io.SeekCurrent: + abs = r.i + offset + case io.SeekEnd: + abs = int64(len(r.s)) + offset + default: + return 0, errors.New("strings.Reader.Seek: invalid whence") + } + if abs < 0 { + return 0, errors.New("strings.Reader.Seek: negative position") + } + r.i = abs + return abs, nil +} + +// WriteTo implements the io.WriterTo interface. +func (r *Reader) WriteTo(w io.Writer) (n int64, err error) { + r.prevRune = -1 + if r.i >= int64(len(r.s)) { + return 0, nil + } + s := r.s[r.i:] + m, err := io.WriteString(w, s) + if m > len(s) { + panic("strings.Reader.WriteTo: invalid WriteString count") + } + r.i += int64(m) + n = int64(m) + if m != len(s) && err == nil { + err = io.ErrShortWrite + } + return +} + +// Reset resets the Reader to be reading from s. +func (r *Reader) Reset(s string) { *r = Reader{s, 0, -1} } + +// NewReader returns a new Reader reading from s. +// It is similar to bytes.NewBufferString but more efficient and read-only. +func NewReader(s string) *Reader { return &Reader{s, 0, -1} } diff --git a/src/strings/reader_test.go b/src/strings/reader_test.go new file mode 100644 index 0000000..dc99f9c --- /dev/null +++ b/src/strings/reader_test.go @@ -0,0 +1,233 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package strings_test + +import ( + "bytes" + "fmt" + "io" + "strings" + "sync" + "testing" +) + +func TestReader(t *testing.T) { + r := strings.NewReader("0123456789") + tests := []struct { + off int64 + seek int + n int + want string + wantpos int64 + readerr error + seekerr string + }{ + {seek: io.SeekStart, off: 0, n: 20, want: "0123456789"}, + {seek: io.SeekStart, off: 1, n: 1, want: "1"}, + {seek: io.SeekCurrent, off: 1, wantpos: 3, n: 2, want: "34"}, + {seek: io.SeekStart, off: -1, seekerr: "strings.Reader.Seek: negative position"}, + {seek: io.SeekStart, off: 1 << 33, wantpos: 1 << 33, readerr: io.EOF}, + {seek: io.SeekCurrent, off: 1, wantpos: 1<<33 + 1, readerr: io.EOF}, + {seek: io.SeekStart, n: 5, want: "01234"}, + {seek: io.SeekCurrent, n: 5, want: "56789"}, + {seek: io.SeekEnd, off: -1, n: 1, wantpos: 9, want: "9"}, + } + + for i, tt := range tests { + pos, err := r.Seek(tt.off, tt.seek) + if err == nil && tt.seekerr != "" { + t.Errorf("%d. want seek error %q", i, tt.seekerr) + continue + } + if err != nil && err.Error() != tt.seekerr { + t.Errorf("%d. seek error = %q; want %q", i, err.Error(), tt.seekerr) + continue + } + if tt.wantpos != 0 && tt.wantpos != pos { + t.Errorf("%d. pos = %d, want %d", i, pos, tt.wantpos) + } + buf := make([]byte, tt.n) + n, err := r.Read(buf) + if err != tt.readerr { + t.Errorf("%d. read = %v; want %v", i, err, tt.readerr) + continue + } + got := string(buf[:n]) + if got != tt.want { + t.Errorf("%d. got %q; want %q", i, got, tt.want) + } + } +} + +func TestReadAfterBigSeek(t *testing.T) { + r := strings.NewReader("0123456789") + if _, err := r.Seek(1<<31+5, io.SeekStart); err != nil { + t.Fatal(err) + } + if n, err := r.Read(make([]byte, 10)); n != 0 || err != io.EOF { + t.Errorf("Read = %d, %v; want 0, EOF", n, err) + } +} + +func TestReaderAt(t *testing.T) { + r := strings.NewReader("0123456789") + tests := []struct { + off int64 + n int + want string + wanterr any + }{ + {0, 10, "0123456789", nil}, + {1, 10, "123456789", io.EOF}, + {1, 9, "123456789", nil}, + {11, 10, "", io.EOF}, + {0, 0, "", nil}, + {-1, 0, "", "strings.Reader.ReadAt: negative offset"}, + } + for i, tt := range tests { + b := make([]byte, tt.n) + rn, err := r.ReadAt(b, tt.off) + got := string(b[:rn]) + if got != tt.want { + t.Errorf("%d. got %q; want %q", i, got, tt.want) + } + if fmt.Sprintf("%v", err) != fmt.Sprintf("%v", tt.wanterr) { + t.Errorf("%d. got error = %v; want %v", i, err, tt.wanterr) + } + } +} + +func TestReaderAtConcurrent(t *testing.T) { + // Test for the race detector, to verify ReadAt doesn't mutate + // any state. + r := strings.NewReader("0123456789") + var wg sync.WaitGroup + for i := 0; i < 5; i++ { + wg.Add(1) + go func(i int) { + defer wg.Done() + var buf [1]byte + r.ReadAt(buf[:], int64(i)) + }(i) + } + wg.Wait() +} + +func TestEmptyReaderConcurrent(t *testing.T) { + // Test for the race detector, to verify a Read that doesn't yield any bytes + // is okay to use from multiple goroutines. This was our historic behavior. + // See golang.org/issue/7856 + r := strings.NewReader("") + var wg sync.WaitGroup + for i := 0; i < 5; i++ { + wg.Add(2) + go func() { + defer wg.Done() + var buf [1]byte + r.Read(buf[:]) + }() + go func() { + defer wg.Done() + r.Read(nil) + }() + } + wg.Wait() +} + +func TestWriteTo(t *testing.T) { + const str = "0123456789" + for i := 0; i <= len(str); i++ { + s := str[i:] + r := strings.NewReader(s) + var b bytes.Buffer + n, err := r.WriteTo(&b) + if expect := int64(len(s)); n != expect { + t.Errorf("got %v; want %v", n, expect) + } + if err != nil { + t.Errorf("for length %d: got error = %v; want nil", len(s), err) + } + if b.String() != s { + t.Errorf("got string %q; want %q", b.String(), s) + } + if r.Len() != 0 { + t.Errorf("reader contains %v bytes; want 0", r.Len()) + } + } +} + +// tests that Len is affected by reads, but Size is not. +func TestReaderLenSize(t *testing.T) { + r := strings.NewReader("abc") + io.CopyN(io.Discard, r, 1) + if r.Len() != 2 { + t.Errorf("Len = %d; want 2", r.Len()) + } + if r.Size() != 3 { + t.Errorf("Size = %d; want 3", r.Size()) + } +} + +func TestReaderReset(t *testing.T) { + r := strings.NewReader("世界") + if _, _, err := r.ReadRune(); err != nil { + t.Errorf("ReadRune: unexpected error: %v", err) + } + + const want = "abcdef" + r.Reset(want) + if err := r.UnreadRune(); err == nil { + t.Errorf("UnreadRune: expected error, got nil") + } + buf, err := io.ReadAll(r) + if err != nil { + t.Errorf("ReadAll: unexpected error: %v", err) + } + if got := string(buf); got != want { + t.Errorf("ReadAll: got %q, want %q", got, want) + } +} + +func TestReaderZero(t *testing.T) { + if l := (&strings.Reader{}).Len(); l != 0 { + t.Errorf("Len: got %d, want 0", l) + } + + if n, err := (&strings.Reader{}).Read(nil); n != 0 || err != io.EOF { + t.Errorf("Read: got %d, %v; want 0, io.EOF", n, err) + } + + if n, err := (&strings.Reader{}).ReadAt(nil, 11); n != 0 || err != io.EOF { + t.Errorf("ReadAt: got %d, %v; want 0, io.EOF", n, err) + } + + if b, err := (&strings.Reader{}).ReadByte(); b != 0 || err != io.EOF { + t.Errorf("ReadByte: got %d, %v; want 0, io.EOF", b, err) + } + + if ch, size, err := (&strings.Reader{}).ReadRune(); ch != 0 || size != 0 || err != io.EOF { + t.Errorf("ReadRune: got %d, %d, %v; want 0, 0, io.EOF", ch, size, err) + } + + if offset, err := (&strings.Reader{}).Seek(11, io.SeekStart); offset != 11 || err != nil { + t.Errorf("Seek: got %d, %v; want 11, nil", offset, err) + } + + if s := (&strings.Reader{}).Size(); s != 0 { + t.Errorf("Size: got %d, want 0", s) + } + + if (&strings.Reader{}).UnreadByte() == nil { + t.Errorf("UnreadByte: got nil, want error") + } + + if (&strings.Reader{}).UnreadRune() == nil { + t.Errorf("UnreadRune: got nil, want error") + } + + if n, err := (&strings.Reader{}).WriteTo(io.Discard); n != 0 || err != nil { + t.Errorf("WriteTo: got %d, %v; want 0, nil", n, err) + } +} diff --git a/src/strings/replace.go b/src/strings/replace.go new file mode 100644 index 0000000..f504fb4 --- /dev/null +++ b/src/strings/replace.go @@ -0,0 +1,578 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package strings + +import ( + "io" + "sync" +) + +// Replacer replaces a list of strings with replacements. +// It is safe for concurrent use by multiple goroutines. +type Replacer struct { + once sync.Once // guards buildOnce method + r replacer + oldnew []string +} + +// replacer is the interface that a replacement algorithm needs to implement. +type replacer interface { + Replace(s string) string + WriteString(w io.Writer, s string) (n int, err error) +} + +// NewReplacer returns a new Replacer from a list of old, new string +// pairs. Replacements are performed in the order they appear in the +// target string, without overlapping matches. The old string +// comparisons are done in argument order. +// +// NewReplacer panics if given an odd number of arguments. +func NewReplacer(oldnew ...string) *Replacer { + if len(oldnew)%2 == 1 { + panic("strings.NewReplacer: odd argument count") + } + return &Replacer{oldnew: append([]string(nil), oldnew...)} +} + +func (r *Replacer) buildOnce() { + r.r = r.build() + r.oldnew = nil +} + +func (b *Replacer) build() replacer { + oldnew := b.oldnew + if len(oldnew) == 2 && len(oldnew[0]) > 1 { + return makeSingleStringReplacer(oldnew[0], oldnew[1]) + } + + allNewBytes := true + for i := 0; i < len(oldnew); i += 2 { + if len(oldnew[i]) != 1 { + return makeGenericReplacer(oldnew) + } + if len(oldnew[i+1]) != 1 { + allNewBytes = false + } + } + + if allNewBytes { + r := byteReplacer{} + for i := range r { + r[i] = byte(i) + } + // The first occurrence of old->new map takes precedence + // over the others with the same old string. + for i := len(oldnew) - 2; i >= 0; i -= 2 { + o := oldnew[i][0] + n := oldnew[i+1][0] + r[o] = n + } + return &r + } + + r := byteStringReplacer{toReplace: make([]string, 0, len(oldnew)/2)} + // The first occurrence of old->new map takes precedence + // over the others with the same old string. + for i := len(oldnew) - 2; i >= 0; i -= 2 { + o := oldnew[i][0] + n := oldnew[i+1] + // To avoid counting repetitions multiple times. + if r.replacements[o] == nil { + // We need to use string([]byte{o}) instead of string(o), + // to avoid utf8 encoding of o. + // E. g. byte(150) produces string of length 2. + r.toReplace = append(r.toReplace, string([]byte{o})) + } + r.replacements[o] = []byte(n) + + } + return &r +} + +// Replace returns a copy of s with all replacements performed. +func (r *Replacer) Replace(s string) string { + r.once.Do(r.buildOnce) + return r.r.Replace(s) +} + +// WriteString writes s to w with all replacements performed. +func (r *Replacer) WriteString(w io.Writer, s string) (n int, err error) { + r.once.Do(r.buildOnce) + return r.r.WriteString(w, s) +} + +// trieNode is a node in a lookup trie for prioritized key/value pairs. Keys +// and values may be empty. For example, the trie containing keys "ax", "ay", +// "bcbc", "x" and "xy" could have eight nodes: +// +// n0 - +// n1 a- +// n2 .x+ +// n3 .y+ +// n4 b- +// n5 .cbc+ +// n6 x+ +// n7 .y+ +// +// n0 is the root node, and its children are n1, n4 and n6; n1's children are +// n2 and n3; n4's child is n5; n6's child is n7. Nodes n0, n1 and n4 (marked +// with a trailing "-") are partial keys, and nodes n2, n3, n5, n6 and n7 +// (marked with a trailing "+") are complete keys. +type trieNode struct { + // value is the value of the trie node's key/value pair. It is empty if + // this node is not a complete key. + value string + // priority is the priority (higher is more important) of the trie node's + // key/value pair; keys are not necessarily matched shortest- or longest- + // first. Priority is positive if this node is a complete key, and zero + // otherwise. In the example above, positive/zero priorities are marked + // with a trailing "+" or "-". + priority int + + // A trie node may have zero, one or more child nodes: + // * if the remaining fields are zero, there are no children. + // * if prefix and next are non-zero, there is one child in next. + // * if table is non-zero, it defines all the children. + // + // Prefixes are preferred over tables when there is one child, but the + // root node always uses a table for lookup efficiency. + + // prefix is the difference in keys between this trie node and the next. + // In the example above, node n4 has prefix "cbc" and n4's next node is n5. + // Node n5 has no children and so has zero prefix, next and table fields. + prefix string + next *trieNode + + // table is a lookup table indexed by the next byte in the key, after + // remapping that byte through genericReplacer.mapping to create a dense + // index. In the example above, the keys only use 'a', 'b', 'c', 'x' and + // 'y', which remap to 0, 1, 2, 3 and 4. All other bytes remap to 5, and + // genericReplacer.tableSize will be 5. Node n0's table will be + // []*trieNode{ 0:n1, 1:n4, 3:n6 }, where the 0, 1 and 3 are the remapped + // 'a', 'b' and 'x'. + table []*trieNode +} + +func (t *trieNode) add(key, val string, priority int, r *genericReplacer) { + if key == "" { + if t.priority == 0 { + t.value = val + t.priority = priority + } + return + } + + if t.prefix != "" { + // Need to split the prefix among multiple nodes. + var n int // length of the longest common prefix + for ; n < len(t.prefix) && n < len(key); n++ { + if t.prefix[n] != key[n] { + break + } + } + if n == len(t.prefix) { + t.next.add(key[n:], val, priority, r) + } else if n == 0 { + // First byte differs, start a new lookup table here. Looking up + // what is currently t.prefix[0] will lead to prefixNode, and + // looking up key[0] will lead to keyNode. + var prefixNode *trieNode + if len(t.prefix) == 1 { + prefixNode = t.next + } else { + prefixNode = &trieNode{ + prefix: t.prefix[1:], + next: t.next, + } + } + keyNode := new(trieNode) + t.table = make([]*trieNode, r.tableSize) + t.table[r.mapping[t.prefix[0]]] = prefixNode + t.table[r.mapping[key[0]]] = keyNode + t.prefix = "" + t.next = nil + keyNode.add(key[1:], val, priority, r) + } else { + // Insert new node after the common section of the prefix. + next := &trieNode{ + prefix: t.prefix[n:], + next: t.next, + } + t.prefix = t.prefix[:n] + t.next = next + next.add(key[n:], val, priority, r) + } + } else if t.table != nil { + // Insert into existing table. + m := r.mapping[key[0]] + if t.table[m] == nil { + t.table[m] = new(trieNode) + } + t.table[m].add(key[1:], val, priority, r) + } else { + t.prefix = key + t.next = new(trieNode) + t.next.add("", val, priority, r) + } +} + +func (r *genericReplacer) lookup(s string, ignoreRoot bool) (val string, keylen int, found bool) { + // Iterate down the trie to the end, and grab the value and keylen with + // the highest priority. + bestPriority := 0 + node := &r.root + n := 0 + for node != nil { + if node.priority > bestPriority && !(ignoreRoot && node == &r.root) { + bestPriority = node.priority + val = node.value + keylen = n + found = true + } + + if s == "" { + break + } + if node.table != nil { + index := r.mapping[s[0]] + if int(index) == r.tableSize { + break + } + node = node.table[index] + s = s[1:] + n++ + } else if node.prefix != "" && HasPrefix(s, node.prefix) { + n += len(node.prefix) + s = s[len(node.prefix):] + node = node.next + } else { + break + } + } + return +} + +// genericReplacer is the fully generic algorithm. +// It's used as a fallback when nothing faster can be used. +type genericReplacer struct { + root trieNode + // tableSize is the size of a trie node's lookup table. It is the number + // of unique key bytes. + tableSize int + // mapping maps from key bytes to a dense index for trieNode.table. + mapping [256]byte +} + +func makeGenericReplacer(oldnew []string) *genericReplacer { + r := new(genericReplacer) + // Find each byte used, then assign them each an index. + for i := 0; i < len(oldnew); i += 2 { + key := oldnew[i] + for j := 0; j < len(key); j++ { + r.mapping[key[j]] = 1 + } + } + + for _, b := range r.mapping { + r.tableSize += int(b) + } + + var index byte + for i, b := range r.mapping { + if b == 0 { + r.mapping[i] = byte(r.tableSize) + } else { + r.mapping[i] = index + index++ + } + } + // Ensure root node uses a lookup table (for performance). + r.root.table = make([]*trieNode, r.tableSize) + + for i := 0; i < len(oldnew); i += 2 { + r.root.add(oldnew[i], oldnew[i+1], len(oldnew)-i, r) + } + return r +} + +type appendSliceWriter []byte + +// Write writes to the buffer to satisfy io.Writer. +func (w *appendSliceWriter) Write(p []byte) (int, error) { + *w = append(*w, p...) + return len(p), nil +} + +// WriteString writes to the buffer without string->[]byte->string allocations. +func (w *appendSliceWriter) WriteString(s string) (int, error) { + *w = append(*w, s...) + return len(s), nil +} + +type stringWriter struct { + w io.Writer +} + +func (w stringWriter) WriteString(s string) (int, error) { + return w.w.Write([]byte(s)) +} + +func getStringWriter(w io.Writer) io.StringWriter { + sw, ok := w.(io.StringWriter) + if !ok { + sw = stringWriter{w} + } + return sw +} + +func (r *genericReplacer) Replace(s string) string { + buf := make(appendSliceWriter, 0, len(s)) + r.WriteString(&buf, s) + return string(buf) +} + +func (r *genericReplacer) WriteString(w io.Writer, s string) (n int, err error) { + sw := getStringWriter(w) + var last, wn int + var prevMatchEmpty bool + for i := 0; i <= len(s); { + // Fast path: s[i] is not a prefix of any pattern. + if i != len(s) && r.root.priority == 0 { + index := int(r.mapping[s[i]]) + if index == r.tableSize || r.root.table[index] == nil { + i++ + continue + } + } + + // Ignore the empty match iff the previous loop found the empty match. + val, keylen, match := r.lookup(s[i:], prevMatchEmpty) + prevMatchEmpty = match && keylen == 0 + if match { + wn, err = sw.WriteString(s[last:i]) + n += wn + if err != nil { + return + } + wn, err = sw.WriteString(val) + n += wn + if err != nil { + return + } + i += keylen + last = i + continue + } + i++ + } + if last != len(s) { + wn, err = sw.WriteString(s[last:]) + n += wn + } + return +} + +// singleStringReplacer is the implementation that's used when there is only +// one string to replace (and that string has more than one byte). +type singleStringReplacer struct { + finder *stringFinder + // value is the new string that replaces that pattern when it's found. + value string +} + +func makeSingleStringReplacer(pattern string, value string) *singleStringReplacer { + return &singleStringReplacer{finder: makeStringFinder(pattern), value: value} +} + +func (r *singleStringReplacer) Replace(s string) string { + var buf Builder + i, matched := 0, false + for { + match := r.finder.next(s[i:]) + if match == -1 { + break + } + matched = true + buf.Grow(match + len(r.value)) + buf.WriteString(s[i : i+match]) + buf.WriteString(r.value) + i += match + len(r.finder.pattern) + } + if !matched { + return s + } + buf.WriteString(s[i:]) + return buf.String() +} + +func (r *singleStringReplacer) WriteString(w io.Writer, s string) (n int, err error) { + sw := getStringWriter(w) + var i, wn int + for { + match := r.finder.next(s[i:]) + if match == -1 { + break + } + wn, err = sw.WriteString(s[i : i+match]) + n += wn + if err != nil { + return + } + wn, err = sw.WriteString(r.value) + n += wn + if err != nil { + return + } + i += match + len(r.finder.pattern) + } + wn, err = sw.WriteString(s[i:]) + n += wn + return +} + +// byteReplacer is the implementation that's used when all the "old" +// and "new" values are single ASCII bytes. +// The array contains replacement bytes indexed by old byte. +type byteReplacer [256]byte + +func (r *byteReplacer) Replace(s string) string { + var buf []byte // lazily allocated + for i := 0; i < len(s); i++ { + b := s[i] + if r[b] != b { + if buf == nil { + buf = []byte(s) + } + buf[i] = r[b] + } + } + if buf == nil { + return s + } + return string(buf) +} + +func (r *byteReplacer) WriteString(w io.Writer, s string) (n int, err error) { + sw := getStringWriter(w) + last := 0 + for i := 0; i < len(s); i++ { + b := s[i] + if r[b] == b { + continue + } + if last != i { + wn, err := sw.WriteString(s[last:i]) + n += wn + if err != nil { + return n, err + } + } + last = i + 1 + nw, err := w.Write(r[b : int(b)+1]) + n += nw + if err != nil { + return n, err + } + } + if last != len(s) { + nw, err := sw.WriteString(s[last:]) + n += nw + if err != nil { + return n, err + } + } + return n, nil +} + +// byteStringReplacer is the implementation that's used when all the +// "old" values are single ASCII bytes but the "new" values vary in size. +type byteStringReplacer struct { + // replacements contains replacement byte slices indexed by old byte. + // A nil []byte means that the old byte should not be replaced. + replacements [256][]byte + // toReplace keeps a list of bytes to replace. Depending on length of toReplace + // and length of target string it may be faster to use Count, or a plain loop. + // We store single byte as a string, because Count takes a string. + toReplace []string +} + +// countCutOff controls the ratio of a string length to a number of replacements +// at which (*byteStringReplacer).Replace switches algorithms. +// For strings with higher ration of length to replacements than that value, +// we call Count, for each replacement from toReplace. +// For strings, with a lower ratio we use simple loop, because of Count overhead. +// countCutOff is an empirically determined overhead multiplier. +// TODO(tocarip) revisit once we have register-based abi/mid-stack inlining. +const countCutOff = 8 + +func (r *byteStringReplacer) Replace(s string) string { + newSize := len(s) + anyChanges := false + // Is it faster to use Count? + if len(r.toReplace)*countCutOff <= len(s) { + for _, x := range r.toReplace { + if c := Count(s, x); c != 0 { + // The -1 is because we are replacing 1 byte with len(replacements[b]) bytes. + newSize += c * (len(r.replacements[x[0]]) - 1) + anyChanges = true + } + + } + } else { + for i := 0; i < len(s); i++ { + b := s[i] + if r.replacements[b] != nil { + // See above for explanation of -1 + newSize += len(r.replacements[b]) - 1 + anyChanges = true + } + } + } + if !anyChanges { + return s + } + buf := make([]byte, newSize) + j := 0 + for i := 0; i < len(s); i++ { + b := s[i] + if r.replacements[b] != nil { + j += copy(buf[j:], r.replacements[b]) + } else { + buf[j] = b + j++ + } + } + return string(buf) +} + +func (r *byteStringReplacer) WriteString(w io.Writer, s string) (n int, err error) { + sw := getStringWriter(w) + last := 0 + for i := 0; i < len(s); i++ { + b := s[i] + if r.replacements[b] == nil { + continue + } + if last != i { + nw, err := sw.WriteString(s[last:i]) + n += nw + if err != nil { + return n, err + } + } + last = i + 1 + nw, err := w.Write(r.replacements[b]) + n += nw + if err != nil { + return n, err + } + } + if last != len(s) { + var nw int + nw, err = sw.WriteString(s[last:]) + n += nw + } + return +} diff --git a/src/strings/replace_test.go b/src/strings/replace_test.go new file mode 100644 index 0000000..34b5bad --- /dev/null +++ b/src/strings/replace_test.go @@ -0,0 +1,583 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package strings_test + +import ( + "bytes" + "fmt" + . "strings" + "testing" +) + +var htmlEscaper = NewReplacer( + "&", "&", + "<", "<", + ">", ">", + `"`, """, + "'", "'", +) + +var htmlUnescaper = NewReplacer( + "&", "&", + "<", "<", + ">", ">", + """, `"`, + "'", "'", +) + +// The http package's old HTML escaping function. +func oldHTMLEscape(s string) string { + s = Replace(s, "&", "&", -1) + s = Replace(s, "<", "<", -1) + s = Replace(s, ">", ">", -1) + s = Replace(s, `"`, """, -1) + s = Replace(s, "'", "'", -1) + return s +} + +var capitalLetters = NewReplacer("a", "A", "b", "B") + +// TestReplacer tests the replacer implementations. +func TestReplacer(t *testing.T) { + type testCase struct { + r *Replacer + in, out string + } + var testCases []testCase + + // str converts 0xff to "\xff". This isn't just string(b) since that converts to UTF-8. + str := func(b byte) string { + return string([]byte{b}) + } + var s []string + + // inc maps "\x00"->"\x01", ..., "a"->"b", "b"->"c", ..., "\xff"->"\x00". + s = nil + for i := 0; i < 256; i++ { + s = append(s, str(byte(i)), str(byte(i+1))) + } + inc := NewReplacer(s...) + + // Test cases with 1-byte old strings, 1-byte new strings. + testCases = append(testCases, + testCase{capitalLetters, "brad", "BrAd"}, + testCase{capitalLetters, Repeat("a", (32<<10)+123), Repeat("A", (32<<10)+123)}, + testCase{capitalLetters, "", ""}, + + testCase{inc, "brad", "csbe"}, + testCase{inc, "\x00\xff", "\x01\x00"}, + testCase{inc, "", ""}, + + testCase{NewReplacer("a", "1", "a", "2"), "brad", "br1d"}, + ) + + // repeat maps "a"->"a", "b"->"bb", "c"->"ccc", ... + s = nil + for i := 0; i < 256; i++ { + n := i + 1 - 'a' + if n < 1 { + n = 1 + } + s = append(s, str(byte(i)), Repeat(str(byte(i)), n)) + } + repeat := NewReplacer(s...) + + // Test cases with 1-byte old strings, variable length new strings. + testCases = append(testCases, + testCase{htmlEscaper, "No changes", "No changes"}, + testCase{htmlEscaper, "I <3 escaping & stuff", "I <3 escaping & stuff"}, + testCase{htmlEscaper, "&&&", "&&&"}, + testCase{htmlEscaper, "", ""}, + + testCase{repeat, "brad", "bbrrrrrrrrrrrrrrrrrradddd"}, + testCase{repeat, "abba", "abbbba"}, + testCase{repeat, "", ""}, + + testCase{NewReplacer("a", "11", "a", "22"), "brad", "br11d"}, + ) + + // The remaining test cases have variable length old strings. + + testCases = append(testCases, + testCase{htmlUnescaper, "&amp;", "&"}, + testCase{htmlUnescaper, "<b>HTML's neat</b>", "HTML's neat"}, + testCase{htmlUnescaper, "", ""}, + + testCase{NewReplacer("a", "1", "a", "2", "xxx", "xxx"), "brad", "br1d"}, + + testCase{NewReplacer("a", "1", "aa", "2", "aaa", "3"), "aaaa", "1111"}, + + testCase{NewReplacer("aaa", "3", "aa", "2", "a", "1"), "aaaa", "31"}, + ) + + // gen1 has multiple old strings of variable length. There is no + // overall non-empty common prefix, but some pairwise common prefixes. + gen1 := NewReplacer( + "aaa", "3[aaa]", + "aa", "2[aa]", + "a", "1[a]", + "i", "i", + "longerst", "most long", + "longer", "medium", + "long", "short", + "xx", "xx", + "x", "X", + "X", "Y", + "Y", "Z", + ) + testCases = append(testCases, + testCase{gen1, "fooaaabar", "foo3[aaa]b1[a]r"}, + testCase{gen1, "long, longerst, longer", "short, most long, medium"}, + testCase{gen1, "xxxxx", "xxxxX"}, + testCase{gen1, "XiX", "YiY"}, + testCase{gen1, "", ""}, + ) + + // gen2 has multiple old strings with no pairwise common prefix. + gen2 := NewReplacer( + "roses", "red", + "violets", "blue", + "sugar", "sweet", + ) + testCases = append(testCases, + testCase{gen2, "roses are red, violets are blue...", "red are red, blue are blue..."}, + testCase{gen2, "", ""}, + ) + + // gen3 has multiple old strings with an overall common prefix. + gen3 := NewReplacer( + "abracadabra", "poof", + "abracadabrakazam", "splat", + "abraham", "lincoln", + "abrasion", "scrape", + "abraham", "isaac", + ) + testCases = append(testCases, + testCase{gen3, "abracadabrakazam abraham", "poofkazam lincoln"}, + testCase{gen3, "abrasion abracad", "scrape abracad"}, + testCase{gen3, "abba abram abrasive", "abba abram abrasive"}, + testCase{gen3, "", ""}, + ) + + // foo{1,2,3,4} have multiple old strings with an overall common prefix + // and 1- or 2- byte extensions from the common prefix. + foo1 := NewReplacer( + "foo1", "A", + "foo2", "B", + "foo3", "C", + ) + foo2 := NewReplacer( + "foo1", "A", + "foo2", "B", + "foo31", "C", + "foo32", "D", + ) + foo3 := NewReplacer( + "foo11", "A", + "foo12", "B", + "foo31", "C", + "foo32", "D", + ) + foo4 := NewReplacer( + "foo12", "B", + "foo32", "D", + ) + testCases = append(testCases, + testCase{foo1, "fofoofoo12foo32oo", "fofooA2C2oo"}, + testCase{foo1, "", ""}, + + testCase{foo2, "fofoofoo12foo32oo", "fofooA2Doo"}, + testCase{foo2, "", ""}, + + testCase{foo3, "fofoofoo12foo32oo", "fofooBDoo"}, + testCase{foo3, "", ""}, + + testCase{foo4, "fofoofoo12foo32oo", "fofooBDoo"}, + testCase{foo4, "", ""}, + ) + + // genAll maps "\x00\x01\x02...\xfe\xff" to "[all]", amongst other things. + allBytes := make([]byte, 256) + for i := range allBytes { + allBytes[i] = byte(i) + } + allString := string(allBytes) + genAll := NewReplacer( + allString, "[all]", + "\xff", "[ff]", + "\x00", "[00]", + ) + testCases = append(testCases, + testCase{genAll, allString, "[all]"}, + testCase{genAll, "a\xff" + allString + "\x00", "a[ff][all][00]"}, + testCase{genAll, "", ""}, + ) + + // Test cases with empty old strings. + + blankToX1 := NewReplacer("", "X") + blankToX2 := NewReplacer("", "X", "", "") + blankHighPriority := NewReplacer("", "X", "o", "O") + blankLowPriority := NewReplacer("o", "O", "", "X") + blankNoOp1 := NewReplacer("", "") + blankNoOp2 := NewReplacer("", "", "", "A") + blankFoo := NewReplacer("", "X", "foobar", "R", "foobaz", "Z") + testCases = append(testCases, + testCase{blankToX1, "foo", "XfXoXoX"}, + testCase{blankToX1, "", "X"}, + + testCase{blankToX2, "foo", "XfXoXoX"}, + testCase{blankToX2, "", "X"}, + + testCase{blankHighPriority, "oo", "XOXOX"}, + testCase{blankHighPriority, "ii", "XiXiX"}, + testCase{blankHighPriority, "oiio", "XOXiXiXOX"}, + testCase{blankHighPriority, "iooi", "XiXOXOXiX"}, + testCase{blankHighPriority, "", "X"}, + + testCase{blankLowPriority, "oo", "OOX"}, + testCase{blankLowPriority, "ii", "XiXiX"}, + testCase{blankLowPriority, "oiio", "OXiXiOX"}, + testCase{blankLowPriority, "iooi", "XiOOXiX"}, + testCase{blankLowPriority, "", "X"}, + + testCase{blankNoOp1, "foo", "foo"}, + testCase{blankNoOp1, "", ""}, + + testCase{blankNoOp2, "foo", "foo"}, + testCase{blankNoOp2, "", ""}, + + testCase{blankFoo, "foobarfoobaz", "XRXZX"}, + testCase{blankFoo, "foobar-foobaz", "XRX-XZX"}, + testCase{blankFoo, "", "X"}, + ) + + // single string replacer + + abcMatcher := NewReplacer("abc", "[match]") + + testCases = append(testCases, + testCase{abcMatcher, "", ""}, + testCase{abcMatcher, "ab", "ab"}, + testCase{abcMatcher, "abc", "[match]"}, + testCase{abcMatcher, "abcd", "[match]d"}, + testCase{abcMatcher, "cabcabcdabca", "c[match][match]d[match]a"}, + ) + + // Issue 6659 cases (more single string replacer) + + noHello := NewReplacer("Hello", "") + testCases = append(testCases, + testCase{noHello, "Hello", ""}, + testCase{noHello, "Hellox", "x"}, + testCase{noHello, "xHello", "x"}, + testCase{noHello, "xHellox", "xx"}, + ) + + // No-arg test cases. + + nop := NewReplacer() + testCases = append(testCases, + testCase{nop, "abc", "abc"}, + testCase{nop, "", ""}, + ) + + // Run the test cases. + + for i, tc := range testCases { + if s := tc.r.Replace(tc.in); s != tc.out { + t.Errorf("%d. Replace(%q) = %q, want %q", i, tc.in, s, tc.out) + } + var buf bytes.Buffer + n, err := tc.r.WriteString(&buf, tc.in) + if err != nil { + t.Errorf("%d. WriteString: %v", i, err) + continue + } + got := buf.String() + if got != tc.out { + t.Errorf("%d. WriteString(%q) wrote %q, want %q", i, tc.in, got, tc.out) + continue + } + if n != len(tc.out) { + t.Errorf("%d. WriteString(%q) wrote correct string but reported %d bytes; want %d (%q)", + i, tc.in, n, len(tc.out), tc.out) + } + } +} + +var algorithmTestCases = []struct { + r *Replacer + want string +}{ + {capitalLetters, "*strings.byteReplacer"}, + {htmlEscaper, "*strings.byteStringReplacer"}, + {NewReplacer("12", "123"), "*strings.singleStringReplacer"}, + {NewReplacer("1", "12"), "*strings.byteStringReplacer"}, + {NewReplacer("", "X"), "*strings.genericReplacer"}, + {NewReplacer("a", "1", "b", "12", "cde", "123"), "*strings.genericReplacer"}, +} + +// TestPickAlgorithm tests that NewReplacer picks the correct algorithm. +func TestPickAlgorithm(t *testing.T) { + for i, tc := range algorithmTestCases { + got := fmt.Sprintf("%T", tc.r.Replacer()) + if got != tc.want { + t.Errorf("%d. algorithm = %s, want %s", i, got, tc.want) + } + } +} + +type errWriter struct{} + +func (errWriter) Write(p []byte) (n int, err error) { + return 0, fmt.Errorf("unwritable") +} + +// TestWriteStringError tests that WriteString returns an error +// received from the underlying io.Writer. +func TestWriteStringError(t *testing.T) { + for i, tc := range algorithmTestCases { + n, err := tc.r.WriteString(errWriter{}, "abc") + if n != 0 || err == nil || err.Error() != "unwritable" { + t.Errorf("%d. WriteStringError = %d, %v, want 0, unwritable", i, n, err) + } + } +} + +// TestGenericTrieBuilding verifies the structure of the generated trie. There +// is one node per line, and the key ending with the current line is in the +// trie if it ends with a "+". +func TestGenericTrieBuilding(t *testing.T) { + testCases := []struct{ in, out string }{ + {"abc;abdef;abdefgh;xx;xy;z", `- + a- + .b- + ..c+ + ..d- + ...ef+ + .....gh+ + x- + .x+ + .y+ + z+ + `}, + {"abracadabra;abracadabrakazam;abraham;abrasion", `- + a- + .bra- + ....c- + .....adabra+ + ...........kazam+ + ....h- + .....am+ + ....s- + .....ion+ + `}, + {"aaa;aa;a;i;longerst;longer;long;xx;x;X;Y", `- + X+ + Y+ + a+ + .a+ + ..a+ + i+ + l- + .ong+ + ....er+ + ......st+ + x+ + .x+ + `}, + {"foo;;foo;foo1", `+ + f- + .oo+ + ...1+ + `}, + } + + for _, tc := range testCases { + keys := Split(tc.in, ";") + args := make([]string, len(keys)*2) + for i, key := range keys { + args[i*2] = key + } + + got := NewReplacer(args...).PrintTrie() + // Remove tabs from tc.out + wantbuf := make([]byte, 0, len(tc.out)) + for i := 0; i < len(tc.out); i++ { + if tc.out[i] != '\t' { + wantbuf = append(wantbuf, tc.out[i]) + } + } + want := string(wantbuf) + + if got != want { + t.Errorf("PrintTrie(%q)\ngot\n%swant\n%s", tc.in, got, want) + } + } +} + +func BenchmarkGenericNoMatch(b *testing.B) { + str := Repeat("A", 100) + Repeat("B", 100) + generic := NewReplacer("a", "A", "b", "B", "12", "123") // varying lengths forces generic + for i := 0; i < b.N; i++ { + generic.Replace(str) + } +} + +func BenchmarkGenericMatch1(b *testing.B) { + str := Repeat("a", 100) + Repeat("b", 100) + generic := NewReplacer("a", "A", "b", "B", "12", "123") + for i := 0; i < b.N; i++ { + generic.Replace(str) + } +} + +func BenchmarkGenericMatch2(b *testing.B) { + str := Repeat("It's <b>HTML</b>!", 100) + for i := 0; i < b.N; i++ { + htmlUnescaper.Replace(str) + } +} + +func benchmarkSingleString(b *testing.B, pattern, text string) { + r := NewReplacer(pattern, "[match]") + b.SetBytes(int64(len(text))) + b.ResetTimer() + for i := 0; i < b.N; i++ { + r.Replace(text) + } +} + +func BenchmarkSingleMaxSkipping(b *testing.B) { + benchmarkSingleString(b, Repeat("b", 25), Repeat("a", 10000)) +} + +func BenchmarkSingleLongSuffixFail(b *testing.B) { + benchmarkSingleString(b, "b"+Repeat("a", 500), Repeat("a", 1002)) +} + +func BenchmarkSingleMatch(b *testing.B) { + benchmarkSingleString(b, "abcdef", Repeat("abcdefghijklmno", 1000)) +} + +func BenchmarkByteByteNoMatch(b *testing.B) { + str := Repeat("A", 100) + Repeat("B", 100) + for i := 0; i < b.N; i++ { + capitalLetters.Replace(str) + } +} + +func BenchmarkByteByteMatch(b *testing.B) { + str := Repeat("a", 100) + Repeat("b", 100) + for i := 0; i < b.N; i++ { + capitalLetters.Replace(str) + } +} + +func BenchmarkByteStringMatch(b *testing.B) { + str := "<" + Repeat("a", 99) + Repeat("b", 99) + ">" + for i := 0; i < b.N; i++ { + htmlEscaper.Replace(str) + } +} + +func BenchmarkHTMLEscapeNew(b *testing.B) { + str := "I <3 to escape HTML & other text too." + for i := 0; i < b.N; i++ { + htmlEscaper.Replace(str) + } +} + +func BenchmarkHTMLEscapeOld(b *testing.B) { + str := "I <3 to escape HTML & other text too." + for i := 0; i < b.N; i++ { + oldHTMLEscape(str) + } +} + +func BenchmarkByteStringReplacerWriteString(b *testing.B) { + str := Repeat("I <3 to escape HTML & other text too.", 100) + buf := new(bytes.Buffer) + for i := 0; i < b.N; i++ { + htmlEscaper.WriteString(buf, str) + buf.Reset() + } +} + +func BenchmarkByteReplacerWriteString(b *testing.B) { + str := Repeat("abcdefghijklmnopqrstuvwxyz", 100) + buf := new(bytes.Buffer) + for i := 0; i < b.N; i++ { + capitalLetters.WriteString(buf, str) + buf.Reset() + } +} + +// BenchmarkByteByteReplaces compares byteByteImpl against multiple Replaces. +func BenchmarkByteByteReplaces(b *testing.B) { + str := Repeat("a", 100) + Repeat("b", 100) + for i := 0; i < b.N; i++ { + Replace(Replace(str, "a", "A", -1), "b", "B", -1) + } +} + +// BenchmarkByteByteMap compares byteByteImpl against Map. +func BenchmarkByteByteMap(b *testing.B) { + str := Repeat("a", 100) + Repeat("b", 100) + fn := func(r rune) rune { + switch r { + case 'a': + return 'A' + case 'b': + return 'B' + } + return r + } + for i := 0; i < b.N; i++ { + Map(fn, str) + } +} + +var mapdata = []struct{ name, data string }{ + {"ASCII", "a b c d e f g h i j k l m n o p q r s t u v w x y z"}, + {"Greek", "α β γ δ ε ζ η θ ι κ λ μ ν ξ ο π ρ ς σ τ υ φ χ ψ ω"}, +} + +func BenchmarkMap(b *testing.B) { + mapidentity := func(r rune) rune { + return r + } + + b.Run("identity", func(b *testing.B) { + for _, md := range mapdata { + b.Run(md.name, func(b *testing.B) { + for i := 0; i < b.N; i++ { + Map(mapidentity, md.data) + } + }) + } + }) + + mapchange := func(r rune) rune { + if 'a' <= r && r <= 'z' { + return r + 'A' - 'a' + } + if 'α' <= r && r <= 'ω' { + return r + 'Α' - 'α' + } + return r + } + + b.Run("change", func(b *testing.B) { + for _, md := range mapdata { + b.Run(md.name, func(b *testing.B) { + for i := 0; i < b.N; i++ { + Map(mapchange, md.data) + } + }) + } + }) +} diff --git a/src/strings/search.go b/src/strings/search.go new file mode 100644 index 0000000..e5bffbb --- /dev/null +++ b/src/strings/search.go @@ -0,0 +1,124 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package strings + +// stringFinder efficiently finds strings in a source text. It's implemented +// using the Boyer-Moore string search algorithm: +// https://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm +// https://www.cs.utexas.edu/~moore/publications/fstrpos.pdf (note: this aged +// document uses 1-based indexing) +type stringFinder struct { + // pattern is the string that we are searching for in the text. + pattern string + + // badCharSkip[b] contains the distance between the last byte of pattern + // and the rightmost occurrence of b in pattern. If b is not in pattern, + // badCharSkip[b] is len(pattern). + // + // Whenever a mismatch is found with byte b in the text, we can safely + // shift the matching frame at least badCharSkip[b] until the next time + // the matching char could be in alignment. + badCharSkip [256]int + + // goodSuffixSkip[i] defines how far we can shift the matching frame given + // that the suffix pattern[i+1:] matches, but the byte pattern[i] does + // not. There are two cases to consider: + // + // 1. The matched suffix occurs elsewhere in pattern (with a different + // byte preceding it that we might possibly match). In this case, we can + // shift the matching frame to align with the next suffix chunk. For + // example, the pattern "mississi" has the suffix "issi" next occurring + // (in right-to-left order) at index 1, so goodSuffixSkip[3] == + // shift+len(suffix) == 3+4 == 7. + // + // 2. If the matched suffix does not occur elsewhere in pattern, then the + // matching frame may share part of its prefix with the end of the + // matching suffix. In this case, goodSuffixSkip[i] will contain how far + // to shift the frame to align this portion of the prefix to the + // suffix. For example, in the pattern "abcxxxabc", when the first + // mismatch from the back is found to be in position 3, the matching + // suffix "xxabc" is not found elsewhere in the pattern. However, its + // rightmost "abc" (at position 6) is a prefix of the whole pattern, so + // goodSuffixSkip[3] == shift+len(suffix) == 6+5 == 11. + goodSuffixSkip []int +} + +func makeStringFinder(pattern string) *stringFinder { + f := &stringFinder{ + pattern: pattern, + goodSuffixSkip: make([]int, len(pattern)), + } + // last is the index of the last character in the pattern. + last := len(pattern) - 1 + + // Build bad character table. + // Bytes not in the pattern can skip one pattern's length. + for i := range f.badCharSkip { + f.badCharSkip[i] = len(pattern) + } + // The loop condition is < instead of <= so that the last byte does not + // have a zero distance to itself. Finding this byte out of place implies + // that it is not in the last position. + for i := 0; i < last; i++ { + f.badCharSkip[pattern[i]] = last - i + } + + // Build good suffix table. + // First pass: set each value to the next index which starts a prefix of + // pattern. + lastPrefix := last + for i := last; i >= 0; i-- { + if HasPrefix(pattern, pattern[i+1:]) { + lastPrefix = i + 1 + } + // lastPrefix is the shift, and (last-i) is len(suffix). + f.goodSuffixSkip[i] = lastPrefix + last - i + } + // Second pass: find repeats of pattern's suffix starting from the front. + for i := 0; i < last; i++ { + lenSuffix := longestCommonSuffix(pattern, pattern[1:i+1]) + if pattern[i-lenSuffix] != pattern[last-lenSuffix] { + // (last-i) is the shift, and lenSuffix is len(suffix). + f.goodSuffixSkip[last-lenSuffix] = lenSuffix + last - i + } + } + + return f +} + +func longestCommonSuffix(a, b string) (i int) { + for ; i < len(a) && i < len(b); i++ { + if a[len(a)-1-i] != b[len(b)-1-i] { + break + } + } + return +} + +// next returns the index in text of the first occurrence of the pattern. If +// the pattern is not found, it returns -1. +func (f *stringFinder) next(text string) int { + i := len(f.pattern) - 1 + for i < len(text) { + // Compare backwards from the end until the first unmatching character. + j := len(f.pattern) - 1 + for j >= 0 && text[i] == f.pattern[j] { + i-- + j-- + } + if j < 0 { + return i + 1 // match + } + i += max(f.badCharSkip[text[i]], f.goodSuffixSkip[j]) + } + return -1 +} + +func max(a, b int) int { + if a > b { + return a + } + return b +} diff --git a/src/strings/search_test.go b/src/strings/search_test.go new file mode 100644 index 0000000..c01a393 --- /dev/null +++ b/src/strings/search_test.go @@ -0,0 +1,90 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package strings_test + +import ( + "reflect" + . "strings" + "testing" +) + +func TestFinderNext(t *testing.T) { + testCases := []struct { + pat, text string + index int + }{ + {"", "", 0}, + {"", "abc", 0}, + {"abc", "", -1}, + {"abc", "abc", 0}, + {"d", "abcdefg", 3}, + {"nan", "banana", 2}, + {"pan", "anpanman", 2}, + {"nnaaman", "anpanmanam", -1}, + {"abcd", "abc", -1}, + {"abcd", "bcd", -1}, + {"bcd", "abcd", 1}, + {"abc", "acca", -1}, + {"aa", "aaa", 0}, + {"baa", "aaaaa", -1}, + {"at that", "which finally halts. at that point", 22}, + } + + for _, tc := range testCases { + got := StringFind(tc.pat, tc.text) + want := tc.index + if got != want { + t.Errorf("stringFind(%q, %q) got %d, want %d\n", tc.pat, tc.text, got, want) + } + } +} + +func TestFinderCreation(t *testing.T) { + testCases := []struct { + pattern string + bad [256]int + suf []int + }{ + { + "abc", + [256]int{'a': 2, 'b': 1, 'c': 3}, + []int{5, 4, 1}, + }, + { + "mississi", + [256]int{'i': 3, 'm': 7, 's': 1}, + []int{15, 14, 13, 7, 11, 10, 7, 1}, + }, + // From https://www.cs.utexas.edu/~moore/publications/fstrpos.pdf + { + "abcxxxabc", + [256]int{'a': 2, 'b': 1, 'c': 6, 'x': 3}, + []int{14, 13, 12, 11, 10, 9, 11, 10, 1}, + }, + { + "abyxcdeyx", + [256]int{'a': 8, 'b': 7, 'c': 4, 'd': 3, 'e': 2, 'y': 1, 'x': 5}, + []int{17, 16, 15, 14, 13, 12, 7, 10, 1}, + }, + } + + for _, tc := range testCases { + bad, good := DumpTables(tc.pattern) + + for i, got := range bad { + want := tc.bad[i] + if want == 0 { + want = len(tc.pattern) + } + if got != want { + t.Errorf("boyerMoore(%q) bad['%c']: got %d want %d", tc.pattern, i, got, want) + } + } + + if !reflect.DeepEqual(good, tc.suf) { + t.Errorf("boyerMoore(%q) got %v want %v", tc.pattern, good, tc.suf) + } + } +} diff --git a/src/strings/strings.go b/src/strings/strings.go new file mode 100644 index 0000000..646161f --- /dev/null +++ b/src/strings/strings.go @@ -0,0 +1,1289 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package strings implements simple functions to manipulate UTF-8 encoded strings. +// +// For information about UTF-8 strings in Go, see https://blog.golang.org/strings. +package strings + +import ( + "internal/bytealg" + "unicode" + "unicode/utf8" +) + +// explode splits s into a slice of UTF-8 strings, +// one string per Unicode character up to a maximum of n (n < 0 means no limit). +// Invalid UTF-8 bytes are sliced individually. +func explode(s string, n int) []string { + l := utf8.RuneCountInString(s) + if n < 0 || n > l { + n = l + } + a := make([]string, n) + for i := 0; i < n-1; i++ { + _, size := utf8.DecodeRuneInString(s) + a[i] = s[:size] + s = s[size:] + } + if n > 0 { + a[n-1] = s + } + return a +} + +// Count counts the number of non-overlapping instances of substr in s. +// If substr is an empty string, Count returns 1 + the number of Unicode code points in s. +func Count(s, substr string) int { + // special case + if len(substr) == 0 { + return utf8.RuneCountInString(s) + 1 + } + if len(substr) == 1 { + return bytealg.CountString(s, substr[0]) + } + n := 0 + for { + i := Index(s, substr) + if i == -1 { + return n + } + n++ + s = s[i+len(substr):] + } +} + +// Contains reports whether substr is within s. +func Contains(s, substr string) bool { + return Index(s, substr) >= 0 +} + +// ContainsAny reports whether any Unicode code points in chars are within s. +func ContainsAny(s, chars string) bool { + return IndexAny(s, chars) >= 0 +} + +// ContainsRune reports whether the Unicode code point r is within s. +func ContainsRune(s string, r rune) bool { + return IndexRune(s, r) >= 0 +} + +// LastIndex returns the index of the last instance of substr in s, or -1 if substr is not present in s. +func LastIndex(s, substr string) int { + n := len(substr) + switch { + case n == 0: + return len(s) + case n == 1: + return LastIndexByte(s, substr[0]) + case n == len(s): + if substr == s { + return 0 + } + return -1 + case n > len(s): + return -1 + } + // Rabin-Karp search from the end of the string + hashss, pow := bytealg.HashStrRev(substr) + last := len(s) - n + var h uint32 + for i := len(s) - 1; i >= last; i-- { + h = h*bytealg.PrimeRK + uint32(s[i]) + } + if h == hashss && s[last:] == substr { + return last + } + for i := last - 1; i >= 0; i-- { + h *= bytealg.PrimeRK + h += uint32(s[i]) + h -= pow * uint32(s[i+n]) + if h == hashss && s[i:i+n] == substr { + return i + } + } + return -1 +} + +// IndexByte returns the index of the first instance of c in s, or -1 if c is not present in s. +func IndexByte(s string, c byte) int { + return bytealg.IndexByteString(s, c) +} + +// IndexRune returns the index of the first instance of the Unicode code point +// r, or -1 if rune is not present in s. +// If r is utf8.RuneError, it returns the first instance of any +// invalid UTF-8 byte sequence. +func IndexRune(s string, r rune) int { + switch { + case 0 <= r && r < utf8.RuneSelf: + return IndexByte(s, byte(r)) + case r == utf8.RuneError: + for i, r := range s { + if r == utf8.RuneError { + return i + } + } + return -1 + case !utf8.ValidRune(r): + return -1 + default: + return Index(s, string(r)) + } +} + +// IndexAny returns the index of the first instance of any Unicode code point +// from chars in s, or -1 if no Unicode code point from chars is present in s. +func IndexAny(s, chars string) int { + if chars == "" { + // Avoid scanning all of s. + return -1 + } + if len(chars) == 1 { + // Avoid scanning all of s. + r := rune(chars[0]) + if r >= utf8.RuneSelf { + r = utf8.RuneError + } + return IndexRune(s, r) + } + if len(s) > 8 { + if as, isASCII := makeASCIISet(chars); isASCII { + for i := 0; i < len(s); i++ { + if as.contains(s[i]) { + return i + } + } + return -1 + } + } + for i, c := range s { + if IndexRune(chars, c) >= 0 { + return i + } + } + return -1 +} + +// LastIndexAny returns the index of the last instance of any Unicode code +// point from chars in s, or -1 if no Unicode code point from chars is +// present in s. +func LastIndexAny(s, chars string) int { + if chars == "" { + // Avoid scanning all of s. + return -1 + } + if len(s) == 1 { + rc := rune(s[0]) + if rc >= utf8.RuneSelf { + rc = utf8.RuneError + } + if IndexRune(chars, rc) >= 0 { + return 0 + } + return -1 + } + if len(s) > 8 { + if as, isASCII := makeASCIISet(chars); isASCII { + for i := len(s) - 1; i >= 0; i-- { + if as.contains(s[i]) { + return i + } + } + return -1 + } + } + if len(chars) == 1 { + rc := rune(chars[0]) + if rc >= utf8.RuneSelf { + rc = utf8.RuneError + } + for i := len(s); i > 0; { + r, size := utf8.DecodeLastRuneInString(s[:i]) + i -= size + if rc == r { + return i + } + } + return -1 + } + for i := len(s); i > 0; { + r, size := utf8.DecodeLastRuneInString(s[:i]) + i -= size + if IndexRune(chars, r) >= 0 { + return i + } + } + return -1 +} + +// LastIndexByte returns the index of the last instance of c in s, or -1 if c is not present in s. +func LastIndexByte(s string, c byte) int { + for i := len(s) - 1; i >= 0; i-- { + if s[i] == c { + return i + } + } + return -1 +} + +// Generic split: splits after each instance of sep, +// including sepSave bytes of sep in the subarrays. +func genSplit(s, sep string, sepSave, n int) []string { + if n == 0 { + return nil + } + if sep == "" { + return explode(s, n) + } + if n < 0 { + n = Count(s, sep) + 1 + } + + if n > len(s)+1 { + n = len(s) + 1 + } + a := make([]string, n) + n-- + i := 0 + for i < n { + m := Index(s, sep) + if m < 0 { + break + } + a[i] = s[:m+sepSave] + s = s[m+len(sep):] + i++ + } + a[i] = s + return a[:i+1] +} + +// SplitN slices s into substrings separated by sep and returns a slice of +// the substrings between those separators. +// +// The count determines the number of substrings to return: +// +// n > 0: at most n substrings; the last substring will be the unsplit remainder. +// n == 0: the result is nil (zero substrings) +// n < 0: all substrings +// +// Edge cases for s and sep (for example, empty strings) are handled +// as described in the documentation for Split. +// +// To split around the first instance of a separator, see Cut. +func SplitN(s, sep string, n int) []string { return genSplit(s, sep, 0, n) } + +// SplitAfterN slices s into substrings after each instance of sep and +// returns a slice of those substrings. +// +// The count determines the number of substrings to return: +// +// n > 0: at most n substrings; the last substring will be the unsplit remainder. +// n == 0: the result is nil (zero substrings) +// n < 0: all substrings +// +// Edge cases for s and sep (for example, empty strings) are handled +// as described in the documentation for SplitAfter. +func SplitAfterN(s, sep string, n int) []string { + return genSplit(s, sep, len(sep), n) +} + +// Split slices s into all substrings separated by sep and returns a slice of +// the substrings between those separators. +// +// If s does not contain sep and sep is not empty, Split returns a +// slice of length 1 whose only element is s. +// +// If sep is empty, Split splits after each UTF-8 sequence. If both s +// and sep are empty, Split returns an empty slice. +// +// It is equivalent to SplitN with a count of -1. +// +// To split around the first instance of a separator, see Cut. +func Split(s, sep string) []string { return genSplit(s, sep, 0, -1) } + +// SplitAfter slices s into all substrings after each instance of sep and +// returns a slice of those substrings. +// +// If s does not contain sep and sep is not empty, SplitAfter returns +// a slice of length 1 whose only element is s. +// +// If sep is empty, SplitAfter splits after each UTF-8 sequence. If +// both s and sep are empty, SplitAfter returns an empty slice. +// +// It is equivalent to SplitAfterN with a count of -1. +func SplitAfter(s, sep string) []string { + return genSplit(s, sep, len(sep), -1) +} + +var asciiSpace = [256]uint8{'\t': 1, '\n': 1, '\v': 1, '\f': 1, '\r': 1, ' ': 1} + +// Fields splits the string s around each instance of one or more consecutive white space +// characters, as defined by unicode.IsSpace, returning a slice of substrings of s or an +// empty slice if s contains only white space. +func Fields(s string) []string { + // First count the fields. + // This is an exact count if s is ASCII, otherwise it is an approximation. + n := 0 + wasSpace := 1 + // setBits is used to track which bits are set in the bytes of s. + setBits := uint8(0) + for i := 0; i < len(s); i++ { + r := s[i] + setBits |= r + isSpace := int(asciiSpace[r]) + n += wasSpace & ^isSpace + wasSpace = isSpace + } + + if setBits >= utf8.RuneSelf { + // Some runes in the input string are not ASCII. + return FieldsFunc(s, unicode.IsSpace) + } + // ASCII fast path + a := make([]string, n) + na := 0 + fieldStart := 0 + i := 0 + // Skip spaces in the front of the input. + for i < len(s) && asciiSpace[s[i]] != 0 { + i++ + } + fieldStart = i + for i < len(s) { + if asciiSpace[s[i]] == 0 { + i++ + continue + } + a[na] = s[fieldStart:i] + na++ + i++ + // Skip spaces in between fields. + for i < len(s) && asciiSpace[s[i]] != 0 { + i++ + } + fieldStart = i + } + if fieldStart < len(s) { // Last field might end at EOF. + a[na] = s[fieldStart:] + } + return a +} + +// FieldsFunc splits the string s at each run of Unicode code points c satisfying f(c) +// and returns an array of slices of s. If all code points in s satisfy f(c) or the +// string is empty, an empty slice is returned. +// +// FieldsFunc makes no guarantees about the order in which it calls f(c) +// and assumes that f always returns the same value for a given c. +func FieldsFunc(s string, f func(rune) bool) []string { + // A span is used to record a slice of s of the form s[start:end]. + // The start index is inclusive and the end index is exclusive. + type span struct { + start int + end int + } + spans := make([]span, 0, 32) + + // Find the field start and end indices. + // Doing this in a separate pass (rather than slicing the string s + // and collecting the result substrings right away) is significantly + // more efficient, possibly due to cache effects. + start := -1 // valid span start if >= 0 + for end, rune := range s { + if f(rune) { + if start >= 0 { + spans = append(spans, span{start, end}) + // Set start to a negative value. + // Note: using -1 here consistently and reproducibly + // slows down this code by a several percent on amd64. + start = ^start + } + } else { + if start < 0 { + start = end + } + } + } + + // Last field might end at EOF. + if start >= 0 { + spans = append(spans, span{start, len(s)}) + } + + // Create strings from recorded field indices. + a := make([]string, len(spans)) + for i, span := range spans { + a[i] = s[span.start:span.end] + } + + return a +} + +// Join concatenates the elements of its first argument to create a single string. The separator +// string sep is placed between elements in the resulting string. +func Join(elems []string, sep string) string { + switch len(elems) { + case 0: + return "" + case 1: + return elems[0] + } + n := len(sep) * (len(elems) - 1) + for i := 0; i < len(elems); i++ { + n += len(elems[i]) + } + + var b Builder + b.Grow(n) + b.WriteString(elems[0]) + for _, s := range elems[1:] { + b.WriteString(sep) + b.WriteString(s) + } + return b.String() +} + +// HasPrefix tests whether the string s begins with prefix. +func HasPrefix(s, prefix string) bool { + return len(s) >= len(prefix) && s[0:len(prefix)] == prefix +} + +// HasSuffix tests whether the string s ends with suffix. +func HasSuffix(s, suffix string) bool { + return len(s) >= len(suffix) && s[len(s)-len(suffix):] == suffix +} + +// Map returns a copy of the string s with all its characters modified +// according to the mapping function. If mapping returns a negative value, the character is +// dropped from the string with no replacement. +func Map(mapping func(rune) rune, s string) string { + // In the worst case, the string can grow when mapped, making + // things unpleasant. But it's so rare we barge in assuming it's + // fine. It could also shrink but that falls out naturally. + + // The output buffer b is initialized on demand, the first + // time a character differs. + var b Builder + + for i, c := range s { + r := mapping(c) + if r == c && c != utf8.RuneError { + continue + } + + var width int + if c == utf8.RuneError { + c, width = utf8.DecodeRuneInString(s[i:]) + if width != 1 && r == c { + continue + } + } else { + width = utf8.RuneLen(c) + } + + b.Grow(len(s) + utf8.UTFMax) + b.WriteString(s[:i]) + if r >= 0 { + b.WriteRune(r) + } + + s = s[i+width:] + break + } + + // Fast path for unchanged input + if b.Cap() == 0 { // didn't call b.Grow above + return s + } + + for _, c := range s { + r := mapping(c) + + if r >= 0 { + // common case + // Due to inlining, it is more performant to determine if WriteByte should be + // invoked rather than always call WriteRune + if r < utf8.RuneSelf { + b.WriteByte(byte(r)) + } else { + // r is not a ASCII rune. + b.WriteRune(r) + } + } + } + + return b.String() +} + +// Repeat returns a new string consisting of count copies of the string s. +// +// It panics if count is negative or if the result of (len(s) * count) +// overflows. +func Repeat(s string, count int) string { + switch count { + case 0: + return "" + case 1: + return s + } + + // Since we cannot return an error on overflow, + // we should panic if the repeat will generate + // an overflow. + // See golang.org/issue/16237. + if count < 0 { + panic("strings: negative Repeat count") + } else if len(s)*count/count != len(s) { + panic("strings: Repeat count causes overflow") + } + + if len(s) == 0 { + return "" + } + + n := len(s) * count + + // Past a certain chunk size it is counterproductive to use + // larger chunks as the source of the write, as when the source + // is too large we are basically just thrashing the CPU D-cache. + // So if the result length is larger than an empirically-found + // limit (8KB), we stop growing the source string once the limit + // is reached and keep reusing the same source string - that + // should therefore be always resident in the L1 cache - until we + // have completed the construction of the result. + // This yields significant speedups (up to +100%) in cases where + // the result length is large (roughly, over L2 cache size). + const chunkLimit = 8 * 1024 + chunkMax := n + if n > chunkLimit { + chunkMax = chunkLimit / len(s) * len(s) + if chunkMax == 0 { + chunkMax = len(s) + } + } + + var b Builder + b.Grow(n) + b.WriteString(s) + for b.Len() < n { + chunk := n - b.Len() + if chunk > b.Len() { + chunk = b.Len() + } + if chunk > chunkMax { + chunk = chunkMax + } + b.WriteString(b.String()[:chunk]) + } + return b.String() +} + +// ToUpper returns s with all Unicode letters mapped to their upper case. +func ToUpper(s string) string { + isASCII, hasLower := true, false + for i := 0; i < len(s); i++ { + c := s[i] + if c >= utf8.RuneSelf { + isASCII = false + break + } + hasLower = hasLower || ('a' <= c && c <= 'z') + } + + if isASCII { // optimize for ASCII-only strings. + if !hasLower { + return s + } + var ( + b Builder + pos int + ) + b.Grow(len(s)) + for i := 0; i < len(s); i++ { + c := s[i] + if 'a' <= c && c <= 'z' { + c -= 'a' - 'A' + if pos < i { + b.WriteString(s[pos:i]) + } + b.WriteByte(c) + pos = i + 1 + } + } + if pos < len(s) { + b.WriteString(s[pos:]) + } + return b.String() + } + return Map(unicode.ToUpper, s) +} + +// ToLower returns s with all Unicode letters mapped to their lower case. +func ToLower(s string) string { + isASCII, hasUpper := true, false + for i := 0; i < len(s); i++ { + c := s[i] + if c >= utf8.RuneSelf { + isASCII = false + break + } + hasUpper = hasUpper || ('A' <= c && c <= 'Z') + } + + if isASCII { // optimize for ASCII-only strings. + if !hasUpper { + return s + } + var ( + b Builder + pos int + ) + b.Grow(len(s)) + for i := 0; i < len(s); i++ { + c := s[i] + if 'A' <= c && c <= 'Z' { + c += 'a' - 'A' + if pos < i { + b.WriteString(s[pos:i]) + } + b.WriteByte(c) + pos = i + 1 + } + } + if pos < len(s) { + b.WriteString(s[pos:]) + } + return b.String() + } + return Map(unicode.ToLower, s) +} + +// ToTitle returns a copy of the string s with all Unicode letters mapped to +// their Unicode title case. +func ToTitle(s string) string { return Map(unicode.ToTitle, s) } + +// ToUpperSpecial returns a copy of the string s with all Unicode letters mapped to their +// upper case using the case mapping specified by c. +func ToUpperSpecial(c unicode.SpecialCase, s string) string { + return Map(c.ToUpper, s) +} + +// ToLowerSpecial returns a copy of the string s with all Unicode letters mapped to their +// lower case using the case mapping specified by c. +func ToLowerSpecial(c unicode.SpecialCase, s string) string { + return Map(c.ToLower, s) +} + +// ToTitleSpecial returns a copy of the string s with all Unicode letters mapped to their +// Unicode title case, giving priority to the special casing rules. +func ToTitleSpecial(c unicode.SpecialCase, s string) string { + return Map(c.ToTitle, s) +} + +// ToValidUTF8 returns a copy of the string s with each run of invalid UTF-8 byte sequences +// replaced by the replacement string, which may be empty. +func ToValidUTF8(s, replacement string) string { + var b Builder + + for i, c := range s { + if c != utf8.RuneError { + continue + } + + _, wid := utf8.DecodeRuneInString(s[i:]) + if wid == 1 { + b.Grow(len(s) + len(replacement)) + b.WriteString(s[:i]) + s = s[i:] + break + } + } + + // Fast path for unchanged input + if b.Cap() == 0 { // didn't call b.Grow above + return s + } + + invalid := false // previous byte was from an invalid UTF-8 sequence + for i := 0; i < len(s); { + c := s[i] + if c < utf8.RuneSelf { + i++ + invalid = false + b.WriteByte(c) + continue + } + _, wid := utf8.DecodeRuneInString(s[i:]) + if wid == 1 { + i++ + if !invalid { + invalid = true + b.WriteString(replacement) + } + continue + } + invalid = false + b.WriteString(s[i : i+wid]) + i += wid + } + + return b.String() +} + +// isSeparator reports whether the rune could mark a word boundary. +// TODO: update when package unicode captures more of the properties. +func isSeparator(r rune) bool { + // ASCII alphanumerics and underscore are not separators + if r <= 0x7F { + switch { + case '0' <= r && r <= '9': + return false + case 'a' <= r && r <= 'z': + return false + case 'A' <= r && r <= 'Z': + return false + case r == '_': + return false + } + return true + } + // Letters and digits are not separators + if unicode.IsLetter(r) || unicode.IsDigit(r) { + return false + } + // Otherwise, all we can do for now is treat spaces as separators. + return unicode.IsSpace(r) +} + +// Title returns a copy of the string s with all Unicode letters that begin words +// mapped to their Unicode title case. +// +// Deprecated: The rule Title uses for word boundaries does not handle Unicode +// punctuation properly. Use golang.org/x/text/cases instead. +func Title(s string) string { + // Use a closure here to remember state. + // Hackish but effective. Depends on Map scanning in order and calling + // the closure once per rune. + prev := ' ' + return Map( + func(r rune) rune { + if isSeparator(prev) { + prev = r + return unicode.ToTitle(r) + } + prev = r + return r + }, + s) +} + +// TrimLeftFunc returns a slice of the string s with all leading +// Unicode code points c satisfying f(c) removed. +func TrimLeftFunc(s string, f func(rune) bool) string { + i := indexFunc(s, f, false) + if i == -1 { + return "" + } + return s[i:] +} + +// TrimRightFunc returns a slice of the string s with all trailing +// Unicode code points c satisfying f(c) removed. +func TrimRightFunc(s string, f func(rune) bool) string { + i := lastIndexFunc(s, f, false) + if i >= 0 && s[i] >= utf8.RuneSelf { + _, wid := utf8.DecodeRuneInString(s[i:]) + i += wid + } else { + i++ + } + return s[0:i] +} + +// TrimFunc returns a slice of the string s with all leading +// and trailing Unicode code points c satisfying f(c) removed. +func TrimFunc(s string, f func(rune) bool) string { + return TrimRightFunc(TrimLeftFunc(s, f), f) +} + +// IndexFunc returns the index into s of the first Unicode +// code point satisfying f(c), or -1 if none do. +func IndexFunc(s string, f func(rune) bool) int { + return indexFunc(s, f, true) +} + +// LastIndexFunc returns the index into s of the last +// Unicode code point satisfying f(c), or -1 if none do. +func LastIndexFunc(s string, f func(rune) bool) int { + return lastIndexFunc(s, f, true) +} + +// indexFunc is the same as IndexFunc except that if +// truth==false, the sense of the predicate function is +// inverted. +func indexFunc(s string, f func(rune) bool, truth bool) int { + for i, r := range s { + if f(r) == truth { + return i + } + } + return -1 +} + +// lastIndexFunc is the same as LastIndexFunc except that if +// truth==false, the sense of the predicate function is +// inverted. +func lastIndexFunc(s string, f func(rune) bool, truth bool) int { + for i := len(s); i > 0; { + r, size := utf8.DecodeLastRuneInString(s[0:i]) + i -= size + if f(r) == truth { + return i + } + } + return -1 +} + +// asciiSet is a 32-byte value, where each bit represents the presence of a +// given ASCII character in the set. The 128-bits of the lower 16 bytes, +// starting with the least-significant bit of the lowest word to the +// most-significant bit of the highest word, map to the full range of all +// 128 ASCII characters. The 128-bits of the upper 16 bytes will be zeroed, +// ensuring that any non-ASCII character will be reported as not in the set. +// This allocates a total of 32 bytes even though the upper half +// is unused to avoid bounds checks in asciiSet.contains. +type asciiSet [8]uint32 + +// makeASCIISet creates a set of ASCII characters and reports whether all +// characters in chars are ASCII. +func makeASCIISet(chars string) (as asciiSet, ok bool) { + for i := 0; i < len(chars); i++ { + c := chars[i] + if c >= utf8.RuneSelf { + return as, false + } + as[c/32] |= 1 << (c % 32) + } + return as, true +} + +// contains reports whether c is inside the set. +func (as *asciiSet) contains(c byte) bool { + return (as[c/32] & (1 << (c % 32))) != 0 +} + +// Trim returns a slice of the string s with all leading and +// trailing Unicode code points contained in cutset removed. +func Trim(s, cutset string) string { + if s == "" || cutset == "" { + return s + } + if len(cutset) == 1 && cutset[0] < utf8.RuneSelf { + return trimLeftByte(trimRightByte(s, cutset[0]), cutset[0]) + } + if as, ok := makeASCIISet(cutset); ok { + return trimLeftASCII(trimRightASCII(s, &as), &as) + } + return trimLeftUnicode(trimRightUnicode(s, cutset), cutset) +} + +// TrimLeft returns a slice of the string s with all leading +// Unicode code points contained in cutset removed. +// +// To remove a prefix, use TrimPrefix instead. +func TrimLeft(s, cutset string) string { + if s == "" || cutset == "" { + return s + } + if len(cutset) == 1 && cutset[0] < utf8.RuneSelf { + return trimLeftByte(s, cutset[0]) + } + if as, ok := makeASCIISet(cutset); ok { + return trimLeftASCII(s, &as) + } + return trimLeftUnicode(s, cutset) +} + +func trimLeftByte(s string, c byte) string { + for len(s) > 0 && s[0] == c { + s = s[1:] + } + return s +} + +func trimLeftASCII(s string, as *asciiSet) string { + for len(s) > 0 { + if !as.contains(s[0]) { + break + } + s = s[1:] + } + return s +} + +func trimLeftUnicode(s, cutset string) string { + for len(s) > 0 { + r, n := rune(s[0]), 1 + if r >= utf8.RuneSelf { + r, n = utf8.DecodeRuneInString(s) + } + if !ContainsRune(cutset, r) { + break + } + s = s[n:] + } + return s +} + +// TrimRight returns a slice of the string s, with all trailing +// Unicode code points contained in cutset removed. +// +// To remove a suffix, use TrimSuffix instead. +func TrimRight(s, cutset string) string { + if s == "" || cutset == "" { + return s + } + if len(cutset) == 1 && cutset[0] < utf8.RuneSelf { + return trimRightByte(s, cutset[0]) + } + if as, ok := makeASCIISet(cutset); ok { + return trimRightASCII(s, &as) + } + return trimRightUnicode(s, cutset) +} + +func trimRightByte(s string, c byte) string { + for len(s) > 0 && s[len(s)-1] == c { + s = s[:len(s)-1] + } + return s +} + +func trimRightASCII(s string, as *asciiSet) string { + for len(s) > 0 { + if !as.contains(s[len(s)-1]) { + break + } + s = s[:len(s)-1] + } + return s +} + +func trimRightUnicode(s, cutset string) string { + for len(s) > 0 { + r, n := rune(s[len(s)-1]), 1 + if r >= utf8.RuneSelf { + r, n = utf8.DecodeLastRuneInString(s) + } + if !ContainsRune(cutset, r) { + break + } + s = s[:len(s)-n] + } + return s +} + +// TrimSpace returns a slice of the string s, with all leading +// and trailing white space removed, as defined by Unicode. +func TrimSpace(s string) string { + // Fast path for ASCII: look for the first ASCII non-space byte + start := 0 + for ; start < len(s); start++ { + c := s[start] + if c >= utf8.RuneSelf { + // If we run into a non-ASCII byte, fall back to the + // slower unicode-aware method on the remaining bytes + return TrimFunc(s[start:], unicode.IsSpace) + } + if asciiSpace[c] == 0 { + break + } + } + + // Now look for the first ASCII non-space byte from the end + stop := len(s) + for ; stop > start; stop-- { + c := s[stop-1] + if c >= utf8.RuneSelf { + // start has been already trimmed above, should trim end only + return TrimRightFunc(s[start:stop], unicode.IsSpace) + } + if asciiSpace[c] == 0 { + break + } + } + + // At this point s[start:stop] starts and ends with an ASCII + // non-space bytes, so we're done. Non-ASCII cases have already + // been handled above. + return s[start:stop] +} + +// TrimPrefix returns s without the provided leading prefix string. +// If s doesn't start with prefix, s is returned unchanged. +func TrimPrefix(s, prefix string) string { + if HasPrefix(s, prefix) { + return s[len(prefix):] + } + return s +} + +// TrimSuffix returns s without the provided trailing suffix string. +// If s doesn't end with suffix, s is returned unchanged. +func TrimSuffix(s, suffix string) string { + if HasSuffix(s, suffix) { + return s[:len(s)-len(suffix)] + } + return s +} + +// Replace returns a copy of the string s with the first n +// non-overlapping instances of old replaced by new. +// If old is empty, it matches at the beginning of the string +// and after each UTF-8 sequence, yielding up to k+1 replacements +// for a k-rune string. +// If n < 0, there is no limit on the number of replacements. +func Replace(s, old, new string, n int) string { + if old == new || n == 0 { + return s // avoid allocation + } + + // Compute number of replacements. + if m := Count(s, old); m == 0 { + return s // avoid allocation + } else if n < 0 || m < n { + n = m + } + + // Apply replacements to buffer. + var b Builder + b.Grow(len(s) + n*(len(new)-len(old))) + start := 0 + for i := 0; i < n; i++ { + j := start + if len(old) == 0 { + if i > 0 { + _, wid := utf8.DecodeRuneInString(s[start:]) + j += wid + } + } else { + j += Index(s[start:], old) + } + b.WriteString(s[start:j]) + b.WriteString(new) + start = j + len(old) + } + b.WriteString(s[start:]) + return b.String() +} + +// ReplaceAll returns a copy of the string s with all +// non-overlapping instances of old replaced by new. +// If old is empty, it matches at the beginning of the string +// and after each UTF-8 sequence, yielding up to k+1 replacements +// for a k-rune string. +func ReplaceAll(s, old, new string) string { + return Replace(s, old, new, -1) +} + +// EqualFold reports whether s and t, interpreted as UTF-8 strings, +// are equal under simple Unicode case-folding, which is a more general +// form of case-insensitivity. +func EqualFold(s, t string) bool { + // ASCII fast path + i := 0 + for ; i < len(s) && i < len(t); i++ { + sr := s[i] + tr := t[i] + if sr|tr >= utf8.RuneSelf { + goto hasUnicode + } + + // Easy case. + if tr == sr { + continue + } + + // Make sr < tr to simplify what follows. + if tr < sr { + tr, sr = sr, tr + } + // ASCII only, sr/tr must be upper/lower case + if 'A' <= sr && sr <= 'Z' && tr == sr+'a'-'A' { + continue + } + return false + } + // Check if we've exhausted both strings. + return len(s) == len(t) + +hasUnicode: + s = s[i:] + t = t[i:] + for _, sr := range s { + // If t is exhausted the strings are not equal. + if len(t) == 0 { + return false + } + + // Extract first rune from second string. + var tr rune + if t[0] < utf8.RuneSelf { + tr, t = rune(t[0]), t[1:] + } else { + r, size := utf8.DecodeRuneInString(t) + tr, t = r, t[size:] + } + + // If they match, keep going; if not, return false. + + // Easy case. + if tr == sr { + continue + } + + // Make sr < tr to simplify what follows. + if tr < sr { + tr, sr = sr, tr + } + // Fast check for ASCII. + if tr < utf8.RuneSelf { + // ASCII only, sr/tr must be upper/lower case + if 'A' <= sr && sr <= 'Z' && tr == sr+'a'-'A' { + continue + } + return false + } + + // General case. SimpleFold(x) returns the next equivalent rune > x + // or wraps around to smaller values. + r := unicode.SimpleFold(sr) + for r != sr && r < tr { + r = unicode.SimpleFold(r) + } + if r == tr { + continue + } + return false + } + + // First string is empty, so check if the second one is also empty. + return len(t) == 0 +} + +// Index returns the index of the first instance of substr in s, or -1 if substr is not present in s. +func Index(s, substr string) int { + n := len(substr) + switch { + case n == 0: + return 0 + case n == 1: + return IndexByte(s, substr[0]) + case n == len(s): + if substr == s { + return 0 + } + return -1 + case n > len(s): + return -1 + case n <= bytealg.MaxLen: + // Use brute force when s and substr both are small + if len(s) <= bytealg.MaxBruteForce { + return bytealg.IndexString(s, substr) + } + c0 := substr[0] + c1 := substr[1] + i := 0 + t := len(s) - n + 1 + fails := 0 + for i < t { + if s[i] != c0 { + // IndexByte is faster than bytealg.IndexString, so use it as long as + // we're not getting lots of false positives. + o := IndexByte(s[i+1:t], c0) + if o < 0 { + return -1 + } + i += o + 1 + } + if s[i+1] == c1 && s[i:i+n] == substr { + return i + } + fails++ + i++ + // Switch to bytealg.IndexString when IndexByte produces too many false positives. + if fails > bytealg.Cutover(i) { + r := bytealg.IndexString(s[i:], substr) + if r >= 0 { + return r + i + } + return -1 + } + } + return -1 + } + c0 := substr[0] + c1 := substr[1] + i := 0 + t := len(s) - n + 1 + fails := 0 + for i < t { + if s[i] != c0 { + o := IndexByte(s[i+1:t], c0) + if o < 0 { + return -1 + } + i += o + 1 + } + if s[i+1] == c1 && s[i:i+n] == substr { + return i + } + i++ + fails++ + if fails >= 4+i>>4 && i < t { + // See comment in ../bytes/bytes.go. + j := bytealg.IndexRabinKarp(s[i:], substr) + if j < 0 { + return -1 + } + return i + j + } + } + return -1 +} + +// Cut slices s around the first instance of sep, +// returning the text before and after sep. +// The found result reports whether sep appears in s. +// If sep does not appear in s, cut returns s, "", false. +func Cut(s, sep string) (before, after string, found bool) { + if i := Index(s, sep); i >= 0 { + return s[:i], s[i+len(sep):], true + } + return s, "", false +} + +// CutPrefix returns s without the provided leading prefix string +// and reports whether it found the prefix. +// If s doesn't start with prefix, CutPrefix returns s, false. +// If prefix is the empty string, CutPrefix returns s, true. +func CutPrefix(s, prefix string) (after string, found bool) { + if !HasPrefix(s, prefix) { + return s, false + } + return s[len(prefix):], true +} + +// CutSuffix returns s without the provided ending suffix string +// and reports whether it found the suffix. +// If s doesn't end with suffix, CutSuffix returns s, false. +// If suffix is the empty string, CutSuffix returns s, true. +func CutSuffix(s, suffix string) (before string, found bool) { + if !HasSuffix(s, suffix) { + return s, false + } + return s[:len(s)-len(suffix)], true +} diff --git a/src/strings/strings_test.go b/src/strings/strings_test.go new file mode 100644 index 0000000..3991d12 --- /dev/null +++ b/src/strings/strings_test.go @@ -0,0 +1,2042 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package strings_test + +import ( + "bytes" + "fmt" + "io" + "math" + "math/rand" + "reflect" + "strconv" + . "strings" + "testing" + "unicode" + "unicode/utf8" + "unsafe" +) + +func eq(a, b []string) bool { + if len(a) != len(b) { + return false + } + for i := 0; i < len(a); i++ { + if a[i] != b[i] { + return false + } + } + return true +} + +var abcd = "abcd" +var faces = "☺☻☹" +var commas = "1,2,3,4" +var dots = "1....2....3....4" + +type IndexTest struct { + s string + sep string + out int +} + +var indexTests = []IndexTest{ + {"", "", 0}, + {"", "a", -1}, + {"", "foo", -1}, + {"fo", "foo", -1}, + {"foo", "foo", 0}, + {"oofofoofooo", "f", 2}, + {"oofofoofooo", "foo", 4}, + {"barfoobarfoo", "foo", 3}, + {"foo", "", 0}, + {"foo", "o", 1}, + {"abcABCabc", "A", 3}, + {"jrzm6jjhorimglljrea4w3rlgosts0w2gia17hno2td4qd1jz", "jz", 47}, + {"ekkuk5oft4eq0ocpacknhwouic1uua46unx12l37nioq9wbpnocqks6", "ks6", 52}, + {"999f2xmimunbuyew5vrkla9cpwhmxan8o98ec", "98ec", 33}, + {"9lpt9r98i04k8bz6c6dsrthb96bhi", "96bhi", 24}, + {"55u558eqfaod2r2gu42xxsu631xf0zobs5840vl", "5840vl", 33}, + // cases with one byte strings - test special case in Index() + {"", "a", -1}, + {"x", "a", -1}, + {"x", "x", 0}, + {"abc", "a", 0}, + {"abc", "b", 1}, + {"abc", "c", 2}, + {"abc", "x", -1}, + // test special cases in Index() for short strings + {"", "ab", -1}, + {"bc", "ab", -1}, + {"ab", "ab", 0}, + {"xab", "ab", 1}, + {"xab"[:2], "ab", -1}, + {"", "abc", -1}, + {"xbc", "abc", -1}, + {"abc", "abc", 0}, + {"xabc", "abc", 1}, + {"xabc"[:3], "abc", -1}, + {"xabxc", "abc", -1}, + {"", "abcd", -1}, + {"xbcd", "abcd", -1}, + {"abcd", "abcd", 0}, + {"xabcd", "abcd", 1}, + {"xyabcd"[:5], "abcd", -1}, + {"xbcqq", "abcqq", -1}, + {"abcqq", "abcqq", 0}, + {"xabcqq", "abcqq", 1}, + {"xyabcqq"[:6], "abcqq", -1}, + {"xabxcqq", "abcqq", -1}, + {"xabcqxq", "abcqq", -1}, + {"", "01234567", -1}, + {"32145678", "01234567", -1}, + {"01234567", "01234567", 0}, + {"x01234567", "01234567", 1}, + {"x0123456x01234567", "01234567", 9}, + {"xx01234567"[:9], "01234567", -1}, + {"", "0123456789", -1}, + {"3214567844", "0123456789", -1}, + {"0123456789", "0123456789", 0}, + {"x0123456789", "0123456789", 1}, + {"x012345678x0123456789", "0123456789", 11}, + {"xyz0123456789"[:12], "0123456789", -1}, + {"x01234567x89", "0123456789", -1}, + {"", "0123456789012345", -1}, + {"3214567889012345", "0123456789012345", -1}, + {"0123456789012345", "0123456789012345", 0}, + {"x0123456789012345", "0123456789012345", 1}, + {"x012345678901234x0123456789012345", "0123456789012345", 17}, + {"", "01234567890123456789", -1}, + {"32145678890123456789", "01234567890123456789", -1}, + {"01234567890123456789", "01234567890123456789", 0}, + {"x01234567890123456789", "01234567890123456789", 1}, + {"x0123456789012345678x01234567890123456789", "01234567890123456789", 21}, + {"xyz01234567890123456789"[:22], "01234567890123456789", -1}, + {"", "0123456789012345678901234567890", -1}, + {"321456788901234567890123456789012345678911", "0123456789012345678901234567890", -1}, + {"0123456789012345678901234567890", "0123456789012345678901234567890", 0}, + {"x0123456789012345678901234567890", "0123456789012345678901234567890", 1}, + {"x012345678901234567890123456789x0123456789012345678901234567890", "0123456789012345678901234567890", 32}, + {"xyz0123456789012345678901234567890"[:33], "0123456789012345678901234567890", -1}, + {"", "01234567890123456789012345678901", -1}, + {"32145678890123456789012345678901234567890211", "01234567890123456789012345678901", -1}, + {"01234567890123456789012345678901", "01234567890123456789012345678901", 0}, + {"x01234567890123456789012345678901", "01234567890123456789012345678901", 1}, + {"x0123456789012345678901234567890x01234567890123456789012345678901", "01234567890123456789012345678901", 33}, + {"xyz01234567890123456789012345678901"[:34], "01234567890123456789012345678901", -1}, + {"xxxxxx012345678901234567890123456789012345678901234567890123456789012", "012345678901234567890123456789012345678901234567890123456789012", 6}, + {"", "0123456789012345678901234567890123456789", -1}, + {"xx012345678901234567890123456789012345678901234567890123456789012", "0123456789012345678901234567890123456789", 2}, + {"xx012345678901234567890123456789012345678901234567890123456789012"[:41], "0123456789012345678901234567890123456789", -1}, + {"xx012345678901234567890123456789012345678901234567890123456789012", "0123456789012345678901234567890123456xxx", -1}, + {"xx0123456789012345678901234567890123456789012345678901234567890120123456789012345678901234567890123456xxx", "0123456789012345678901234567890123456xxx", 65}, + // test fallback to Rabin-Karp. + {"oxoxoxoxoxoxoxoxoxoxoxoy", "oy", 22}, + {"oxoxoxoxoxoxoxoxoxoxoxox", "oy", -1}, +} + +var lastIndexTests = []IndexTest{ + {"", "", 0}, + {"", "a", -1}, + {"", "foo", -1}, + {"fo", "foo", -1}, + {"foo", "foo", 0}, + {"foo", "f", 0}, + {"oofofoofooo", "f", 7}, + {"oofofoofooo", "foo", 7}, + {"barfoobarfoo", "foo", 9}, + {"foo", "", 3}, + {"foo", "o", 2}, + {"abcABCabc", "A", 3}, + {"abcABCabc", "a", 6}, +} + +var indexAnyTests = []IndexTest{ + {"", "", -1}, + {"", "a", -1}, + {"", "abc", -1}, + {"a", "", -1}, + {"a", "a", 0}, + {"\x80", "\xffb", 0}, + {"aaa", "a", 0}, + {"abc", "xyz", -1}, + {"abc", "xcz", 2}, + {"ab☺c", "x☺yz", 2}, + {"a☺b☻c☹d", "cx", len("a☺b☻")}, + {"a☺b☻c☹d", "uvw☻xyz", len("a☺b")}, + {"aRegExp*", ".(|)*+?^$[]", 7}, + {dots + dots + dots, " ", -1}, + {"012abcba210", "\xffb", 4}, + {"012\x80bcb\x80210", "\xffb", 3}, + {"0123456\xcf\x80abc", "\xcfb\x80", 10}, +} + +var lastIndexAnyTests = []IndexTest{ + {"", "", -1}, + {"", "a", -1}, + {"", "abc", -1}, + {"a", "", -1}, + {"a", "a", 0}, + {"\x80", "\xffb", 0}, + {"aaa", "a", 2}, + {"abc", "xyz", -1}, + {"abc", "ab", 1}, + {"ab☺c", "x☺yz", 2}, + {"a☺b☻c☹d", "cx", len("a☺b☻")}, + {"a☺b☻c☹d", "uvw☻xyz", len("a☺b")}, + {"a.RegExp*", ".(|)*+?^$[]", 8}, + {dots + dots + dots, " ", -1}, + {"012abcba210", "\xffb", 6}, + {"012\x80bcb\x80210", "\xffb", 7}, + {"0123456\xcf\x80abc", "\xcfb\x80", 10}, +} + +// Execute f on each test case. funcName should be the name of f; it's used +// in failure reports. +func runIndexTests(t *testing.T, f func(s, sep string) int, funcName string, testCases []IndexTest) { + for _, test := range testCases { + actual := f(test.s, test.sep) + if actual != test.out { + t.Errorf("%s(%q,%q) = %v; want %v", funcName, test.s, test.sep, actual, test.out) + } + } +} + +func TestIndex(t *testing.T) { runIndexTests(t, Index, "Index", indexTests) } +func TestLastIndex(t *testing.T) { runIndexTests(t, LastIndex, "LastIndex", lastIndexTests) } +func TestIndexAny(t *testing.T) { runIndexTests(t, IndexAny, "IndexAny", indexAnyTests) } +func TestLastIndexAny(t *testing.T) { + runIndexTests(t, LastIndexAny, "LastIndexAny", lastIndexAnyTests) +} + +func TestIndexByte(t *testing.T) { + for _, tt := range indexTests { + if len(tt.sep) != 1 { + continue + } + pos := IndexByte(tt.s, tt.sep[0]) + if pos != tt.out { + t.Errorf(`IndexByte(%q, %q) = %v; want %v`, tt.s, tt.sep[0], pos, tt.out) + } + } +} + +func TestLastIndexByte(t *testing.T) { + testCases := []IndexTest{ + {"", "q", -1}, + {"abcdef", "q", -1}, + {"abcdefabcdef", "a", len("abcdef")}, // something in the middle + {"abcdefabcdef", "f", len("abcdefabcde")}, // last byte + {"zabcdefabcdef", "z", 0}, // first byte + {"a☺b☻c☹d", "b", len("a☺")}, // non-ascii + } + for _, test := range testCases { + actual := LastIndexByte(test.s, test.sep[0]) + if actual != test.out { + t.Errorf("LastIndexByte(%q,%c) = %v; want %v", test.s, test.sep[0], actual, test.out) + } + } +} + +func simpleIndex(s, sep string) int { + n := len(sep) + for i := n; i <= len(s); i++ { + if s[i-n:i] == sep { + return i - n + } + } + return -1 +} + +func TestIndexRandom(t *testing.T) { + const chars = "abcdefghijklmnopqrstuvwxyz0123456789" + for times := 0; times < 10; times++ { + for strLen := 5 + rand.Intn(5); strLen < 140; strLen += 10 { // Arbitrary + s1 := make([]byte, strLen) + for i := range s1 { + s1[i] = chars[rand.Intn(len(chars))] + } + s := string(s1) + for i := 0; i < 50; i++ { + begin := rand.Intn(len(s) + 1) + end := begin + rand.Intn(len(s)+1-begin) + sep := s[begin:end] + if i%4 == 0 { + pos := rand.Intn(len(sep) + 1) + sep = sep[:pos] + "A" + sep[pos:] + } + want := simpleIndex(s, sep) + res := Index(s, sep) + if res != want { + t.Errorf("Index(%s,%s) = %d; want %d", s, sep, res, want) + } + } + } + } +} + +func TestIndexRune(t *testing.T) { + tests := []struct { + in string + rune rune + want int + }{ + {"", 'a', -1}, + {"", '☺', -1}, + {"foo", '☹', -1}, + {"foo", 'o', 1}, + {"foo☺bar", '☺', 3}, + {"foo☺☻☹bar", '☹', 9}, + {"a A x", 'A', 2}, + {"some_text=some_value", '=', 9}, + {"☺a", 'a', 3}, + {"a☻☺b", '☺', 4}, + + // RuneError should match any invalid UTF-8 byte sequence. + {"�", '�', 0}, + {"\xff", '�', 0}, + {"☻x�", '�', len("☻x")}, + {"☻x\xe2\x98", '�', len("☻x")}, + {"☻x\xe2\x98�", '�', len("☻x")}, + {"☻x\xe2\x98x", '�', len("☻x")}, + + // Invalid rune values should never match. + {"a☺b☻c☹d\xe2\x98�\xff�\xed\xa0\x80", -1, -1}, + {"a☺b☻c☹d\xe2\x98�\xff�\xed\xa0\x80", 0xD800, -1}, // Surrogate pair + {"a☺b☻c☹d\xe2\x98�\xff�\xed\xa0\x80", utf8.MaxRune + 1, -1}, + } + for _, tt := range tests { + if got := IndexRune(tt.in, tt.rune); got != tt.want { + t.Errorf("IndexRune(%q, %d) = %v; want %v", tt.in, tt.rune, got, tt.want) + } + } + + haystack := "test世界" + allocs := testing.AllocsPerRun(1000, func() { + if i := IndexRune(haystack, 's'); i != 2 { + t.Fatalf("'s' at %d; want 2", i) + } + if i := IndexRune(haystack, '世'); i != 4 { + t.Fatalf("'世' at %d; want 4", i) + } + }) + if allocs != 0 && testing.CoverMode() == "" { + t.Errorf("expected no allocations, got %f", allocs) + } +} + +const benchmarkString = "some_text=some☺value" + +func BenchmarkIndexRune(b *testing.B) { + if got := IndexRune(benchmarkString, '☺'); got != 14 { + b.Fatalf("wrong index: expected 14, got=%d", got) + } + for i := 0; i < b.N; i++ { + IndexRune(benchmarkString, '☺') + } +} + +var benchmarkLongString = Repeat(" ", 100) + benchmarkString + +func BenchmarkIndexRuneLongString(b *testing.B) { + if got := IndexRune(benchmarkLongString, '☺'); got != 114 { + b.Fatalf("wrong index: expected 114, got=%d", got) + } + for i := 0; i < b.N; i++ { + IndexRune(benchmarkLongString, '☺') + } +} + +func BenchmarkIndexRuneFastPath(b *testing.B) { + if got := IndexRune(benchmarkString, 'v'); got != 17 { + b.Fatalf("wrong index: expected 17, got=%d", got) + } + for i := 0; i < b.N; i++ { + IndexRune(benchmarkString, 'v') + } +} + +func BenchmarkIndex(b *testing.B) { + if got := Index(benchmarkString, "v"); got != 17 { + b.Fatalf("wrong index: expected 17, got=%d", got) + } + for i := 0; i < b.N; i++ { + Index(benchmarkString, "v") + } +} + +func BenchmarkLastIndex(b *testing.B) { + if got := Index(benchmarkString, "v"); got != 17 { + b.Fatalf("wrong index: expected 17, got=%d", got) + } + for i := 0; i < b.N; i++ { + LastIndex(benchmarkString, "v") + } +} + +func BenchmarkIndexByte(b *testing.B) { + if got := IndexByte(benchmarkString, 'v'); got != 17 { + b.Fatalf("wrong index: expected 17, got=%d", got) + } + for i := 0; i < b.N; i++ { + IndexByte(benchmarkString, 'v') + } +} + +type SplitTest struct { + s string + sep string + n int + a []string +} + +var splittests = []SplitTest{ + {"", "", -1, []string{}}, + {abcd, "", 2, []string{"a", "bcd"}}, + {abcd, "", 4, []string{"a", "b", "c", "d"}}, + {abcd, "", -1, []string{"a", "b", "c", "d"}}, + {faces, "", -1, []string{"☺", "☻", "☹"}}, + {faces, "", 3, []string{"☺", "☻", "☹"}}, + {faces, "", 17, []string{"☺", "☻", "☹"}}, + {"☺�☹", "", -1, []string{"☺", "�", "☹"}}, + {abcd, "a", 0, nil}, + {abcd, "a", -1, []string{"", "bcd"}}, + {abcd, "z", -1, []string{"abcd"}}, + {commas, ",", -1, []string{"1", "2", "3", "4"}}, + {dots, "...", -1, []string{"1", ".2", ".3", ".4"}}, + {faces, "☹", -1, []string{"☺☻", ""}}, + {faces, "~", -1, []string{faces}}, + {"1 2 3 4", " ", 3, []string{"1", "2", "3 4"}}, + {"1 2", " ", 3, []string{"1", "2"}}, + {"", "T", math.MaxInt / 4, []string{""}}, + {"\xff-\xff", "", -1, []string{"\xff", "-", "\xff"}}, + {"\xff-\xff", "-", -1, []string{"\xff", "\xff"}}, +} + +func TestSplit(t *testing.T) { + for _, tt := range splittests { + a := SplitN(tt.s, tt.sep, tt.n) + if !eq(a, tt.a) { + t.Errorf("Split(%q, %q, %d) = %v; want %v", tt.s, tt.sep, tt.n, a, tt.a) + continue + } + if tt.n == 0 { + continue + } + s := Join(a, tt.sep) + if s != tt.s { + t.Errorf("Join(Split(%q, %q, %d), %q) = %q", tt.s, tt.sep, tt.n, tt.sep, s) + } + if tt.n < 0 { + b := Split(tt.s, tt.sep) + if !reflect.DeepEqual(a, b) { + t.Errorf("Split disagrees with SplitN(%q, %q, %d) = %v; want %v", tt.s, tt.sep, tt.n, b, a) + } + } + } +} + +var splitaftertests = []SplitTest{ + {abcd, "a", -1, []string{"a", "bcd"}}, + {abcd, "z", -1, []string{"abcd"}}, + {abcd, "", -1, []string{"a", "b", "c", "d"}}, + {commas, ",", -1, []string{"1,", "2,", "3,", "4"}}, + {dots, "...", -1, []string{"1...", ".2...", ".3...", ".4"}}, + {faces, "☹", -1, []string{"☺☻☹", ""}}, + {faces, "~", -1, []string{faces}}, + {faces, "", -1, []string{"☺", "☻", "☹"}}, + {"1 2 3 4", " ", 3, []string{"1 ", "2 ", "3 4"}}, + {"1 2 3", " ", 3, []string{"1 ", "2 ", "3"}}, + {"1 2", " ", 3, []string{"1 ", "2"}}, + {"123", "", 2, []string{"1", "23"}}, + {"123", "", 17, []string{"1", "2", "3"}}, +} + +func TestSplitAfter(t *testing.T) { + for _, tt := range splitaftertests { + a := SplitAfterN(tt.s, tt.sep, tt.n) + if !eq(a, tt.a) { + t.Errorf(`Split(%q, %q, %d) = %v; want %v`, tt.s, tt.sep, tt.n, a, tt.a) + continue + } + s := Join(a, "") + if s != tt.s { + t.Errorf(`Join(Split(%q, %q, %d), %q) = %q`, tt.s, tt.sep, tt.n, tt.sep, s) + } + if tt.n < 0 { + b := SplitAfter(tt.s, tt.sep) + if !reflect.DeepEqual(a, b) { + t.Errorf("SplitAfter disagrees with SplitAfterN(%q, %q, %d) = %v; want %v", tt.s, tt.sep, tt.n, b, a) + } + } + } +} + +type FieldsTest struct { + s string + a []string +} + +var fieldstests = []FieldsTest{ + {"", []string{}}, + {" ", []string{}}, + {" \t ", []string{}}, + {"\u2000", []string{}}, + {" abc ", []string{"abc"}}, + {"1 2 3 4", []string{"1", "2", "3", "4"}}, + {"1 2 3 4", []string{"1", "2", "3", "4"}}, + {"1\t\t2\t\t3\t4", []string{"1", "2", "3", "4"}}, + {"1\u20002\u20013\u20024", []string{"1", "2", "3", "4"}}, + {"\u2000\u2001\u2002", []string{}}, + {"\n™\t™\n", []string{"™", "™"}}, + {"\n\u20001™2\u2000 \u2001 ™", []string{"1™2", "™"}}, + {"\n1\uFFFD \uFFFD2\u20003\uFFFD4", []string{"1\uFFFD", "\uFFFD2", "3\uFFFD4"}}, + {"1\xFF\u2000\xFF2\xFF \xFF", []string{"1\xFF", "\xFF2\xFF", "\xFF"}}, + {faces, []string{faces}}, +} + +func TestFields(t *testing.T) { + for _, tt := range fieldstests { + a := Fields(tt.s) + if !eq(a, tt.a) { + t.Errorf("Fields(%q) = %v; want %v", tt.s, a, tt.a) + continue + } + } +} + +var FieldsFuncTests = []FieldsTest{ + {"", []string{}}, + {"XX", []string{}}, + {"XXhiXXX", []string{"hi"}}, + {"aXXbXXXcX", []string{"a", "b", "c"}}, +} + +func TestFieldsFunc(t *testing.T) { + for _, tt := range fieldstests { + a := FieldsFunc(tt.s, unicode.IsSpace) + if !eq(a, tt.a) { + t.Errorf("FieldsFunc(%q, unicode.IsSpace) = %v; want %v", tt.s, a, tt.a) + continue + } + } + pred := func(c rune) bool { return c == 'X' } + for _, tt := range FieldsFuncTests { + a := FieldsFunc(tt.s, pred) + if !eq(a, tt.a) { + t.Errorf("FieldsFunc(%q) = %v, want %v", tt.s, a, tt.a) + } + } +} + +// Test case for any function which accepts and returns a single string. +type StringTest struct { + in, out string +} + +// Execute f on each test case. funcName should be the name of f; it's used +// in failure reports. +func runStringTests(t *testing.T, f func(string) string, funcName string, testCases []StringTest) { + for _, tc := range testCases { + actual := f(tc.in) + if actual != tc.out { + t.Errorf("%s(%q) = %q; want %q", funcName, tc.in, actual, tc.out) + } + } +} + +var upperTests = []StringTest{ + {"", ""}, + {"ONLYUPPER", "ONLYUPPER"}, + {"abc", "ABC"}, + {"AbC123", "ABC123"}, + {"azAZ09_", "AZAZ09_"}, + {"longStrinGwitHmixofsmaLLandcAps", "LONGSTRINGWITHMIXOFSMALLANDCAPS"}, + {"RENAN BASTOS 93 AOSDAJDJAIDJAIDAJIaidsjjaidijadsjiadjiOOKKO", "RENAN BASTOS 93 AOSDAJDJAIDJAIDAJIAIDSJJAIDIJADSJIADJIOOKKO"}, + {"long\u0250string\u0250with\u0250nonascii\u2C6Fchars", "LONG\u2C6FSTRING\u2C6FWITH\u2C6FNONASCII\u2C6FCHARS"}, + {"\u0250\u0250\u0250\u0250\u0250", "\u2C6F\u2C6F\u2C6F\u2C6F\u2C6F"}, // grows one byte per char + {"a\u0080\U0010FFFF", "A\u0080\U0010FFFF"}, // test utf8.RuneSelf and utf8.MaxRune +} + +var lowerTests = []StringTest{ + {"", ""}, + {"abc", "abc"}, + {"AbC123", "abc123"}, + {"azAZ09_", "azaz09_"}, + {"longStrinGwitHmixofsmaLLandcAps", "longstringwithmixofsmallandcaps"}, + {"renan bastos 93 AOSDAJDJAIDJAIDAJIaidsjjaidijadsjiadjiOOKKO", "renan bastos 93 aosdajdjaidjaidajiaidsjjaidijadsjiadjiookko"}, + {"LONG\u2C6FSTRING\u2C6FWITH\u2C6FNONASCII\u2C6FCHARS", "long\u0250string\u0250with\u0250nonascii\u0250chars"}, + {"\u2C6D\u2C6D\u2C6D\u2C6D\u2C6D", "\u0251\u0251\u0251\u0251\u0251"}, // shrinks one byte per char + {"A\u0080\U0010FFFF", "a\u0080\U0010FFFF"}, // test utf8.RuneSelf and utf8.MaxRune +} + +const space = "\t\v\r\f\n\u0085\u00a0\u2000\u3000" + +var trimSpaceTests = []StringTest{ + {"", ""}, + {"abc", "abc"}, + {space + "abc" + space, "abc"}, + {" ", ""}, + {" \t\r\n \t\t\r\r\n\n ", ""}, + {" \t\r\n x\t\t\r\r\n\n ", "x"}, + {" \u2000\t\r\n x\t\t\r\r\ny\n \u3000", "x\t\t\r\r\ny"}, + {"1 \t\r\n2", "1 \t\r\n2"}, + {" x\x80", "x\x80"}, + {" x\xc0", "x\xc0"}, + {"x \xc0\xc0 ", "x \xc0\xc0"}, + {"x \xc0", "x \xc0"}, + {"x \xc0 ", "x \xc0"}, + {"x \xc0\xc0 ", "x \xc0\xc0"}, + {"x ☺\xc0\xc0 ", "x ☺\xc0\xc0"}, + {"x ☺ ", "x ☺"}, +} + +func tenRunes(ch rune) string { + r := make([]rune, 10) + for i := range r { + r[i] = ch + } + return string(r) +} + +// User-defined self-inverse mapping function +func rot13(r rune) rune { + step := rune(13) + if r >= 'a' && r <= 'z' { + return ((r - 'a' + step) % 26) + 'a' + } + if r >= 'A' && r <= 'Z' { + return ((r - 'A' + step) % 26) + 'A' + } + return r +} + +func TestMap(t *testing.T) { + // Run a couple of awful growth/shrinkage tests + a := tenRunes('a') + // 1. Grow. This triggers two reallocations in Map. + maxRune := func(rune) rune { return unicode.MaxRune } + m := Map(maxRune, a) + expect := tenRunes(unicode.MaxRune) + if m != expect { + t.Errorf("growing: expected %q got %q", expect, m) + } + + // 2. Shrink + minRune := func(rune) rune { return 'a' } + m = Map(minRune, tenRunes(unicode.MaxRune)) + expect = a + if m != expect { + t.Errorf("shrinking: expected %q got %q", expect, m) + } + + // 3. Rot13 + m = Map(rot13, "a to zed") + expect = "n gb mrq" + if m != expect { + t.Errorf("rot13: expected %q got %q", expect, m) + } + + // 4. Rot13^2 + m = Map(rot13, Map(rot13, "a to zed")) + expect = "a to zed" + if m != expect { + t.Errorf("rot13: expected %q got %q", expect, m) + } + + // 5. Drop + dropNotLatin := func(r rune) rune { + if unicode.Is(unicode.Latin, r) { + return r + } + return -1 + } + m = Map(dropNotLatin, "Hello, 세계") + expect = "Hello" + if m != expect { + t.Errorf("drop: expected %q got %q", expect, m) + } + + // 6. Identity + identity := func(r rune) rune { + return r + } + orig := "Input string that we expect not to be copied." + m = Map(identity, orig) + if unsafe.StringData(orig) != unsafe.StringData(m) { + t.Error("unexpected copy during identity map") + } + + // 7. Handle invalid UTF-8 sequence + replaceNotLatin := func(r rune) rune { + if unicode.Is(unicode.Latin, r) { + return r + } + return utf8.RuneError + } + m = Map(replaceNotLatin, "Hello\255World") + expect = "Hello\uFFFDWorld" + if m != expect { + t.Errorf("replace invalid sequence: expected %q got %q", expect, m) + } + + // 8. Check utf8.RuneSelf and utf8.MaxRune encoding + encode := func(r rune) rune { + switch r { + case utf8.RuneSelf: + return unicode.MaxRune + case unicode.MaxRune: + return utf8.RuneSelf + } + return r + } + s := string(rune(utf8.RuneSelf)) + string(utf8.MaxRune) + r := string(utf8.MaxRune) + string(rune(utf8.RuneSelf)) // reverse of s + m = Map(encode, s) + if m != r { + t.Errorf("encoding not handled correctly: expected %q got %q", r, m) + } + m = Map(encode, r) + if m != s { + t.Errorf("encoding not handled correctly: expected %q got %q", s, m) + } + + // 9. Check mapping occurs in the front, middle and back + trimSpaces := func(r rune) rune { + if unicode.IsSpace(r) { + return -1 + } + return r + } + m = Map(trimSpaces, " abc 123 ") + expect = "abc123" + if m != expect { + t.Errorf("trimSpaces: expected %q got %q", expect, m) + } +} + +func TestToUpper(t *testing.T) { runStringTests(t, ToUpper, "ToUpper", upperTests) } + +func TestToLower(t *testing.T) { runStringTests(t, ToLower, "ToLower", lowerTests) } + +var toValidUTF8Tests = []struct { + in string + repl string + out string +}{ + {"", "\uFFFD", ""}, + {"abc", "\uFFFD", "abc"}, + {"\uFDDD", "\uFFFD", "\uFDDD"}, + {"a\xffb", "\uFFFD", "a\uFFFDb"}, + {"a\xffb\uFFFD", "X", "aXb\uFFFD"}, + {"a☺\xffb☺\xC0\xAFc☺\xff", "", "a☺b☺c☺"}, + {"a☺\xffb☺\xC0\xAFc☺\xff", "日本語", "a☺日本語b☺日本語c☺日本語"}, + {"\xC0\xAF", "\uFFFD", "\uFFFD"}, + {"\xE0\x80\xAF", "\uFFFD", "\uFFFD"}, + {"\xed\xa0\x80", "abc", "abc"}, + {"\xed\xbf\xbf", "\uFFFD", "\uFFFD"}, + {"\xF0\x80\x80\xaf", "☺", "☺"}, + {"\xF8\x80\x80\x80\xAF", "\uFFFD", "\uFFFD"}, + {"\xFC\x80\x80\x80\x80\xAF", "\uFFFD", "\uFFFD"}, +} + +func TestToValidUTF8(t *testing.T) { + for _, tc := range toValidUTF8Tests { + got := ToValidUTF8(tc.in, tc.repl) + if got != tc.out { + t.Errorf("ToValidUTF8(%q, %q) = %q; want %q", tc.in, tc.repl, got, tc.out) + } + } +} + +func BenchmarkToUpper(b *testing.B) { + for _, tc := range upperTests { + b.Run(tc.in, func(b *testing.B) { + for i := 0; i < b.N; i++ { + actual := ToUpper(tc.in) + if actual != tc.out { + b.Errorf("ToUpper(%q) = %q; want %q", tc.in, actual, tc.out) + } + } + }) + } +} + +func BenchmarkToLower(b *testing.B) { + for _, tc := range lowerTests { + b.Run(tc.in, func(b *testing.B) { + for i := 0; i < b.N; i++ { + actual := ToLower(tc.in) + if actual != tc.out { + b.Errorf("ToLower(%q) = %q; want %q", tc.in, actual, tc.out) + } + } + }) + } +} + +func BenchmarkMapNoChanges(b *testing.B) { + identity := func(r rune) rune { + return r + } + for i := 0; i < b.N; i++ { + Map(identity, "Some string that won't be modified.") + } +} + +func TestSpecialCase(t *testing.T) { + lower := "abcçdefgğhıijklmnoöprsştuüvyz" + upper := "ABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZ" + u := ToUpperSpecial(unicode.TurkishCase, upper) + if u != upper { + t.Errorf("Upper(upper) is %s not %s", u, upper) + } + u = ToUpperSpecial(unicode.TurkishCase, lower) + if u != upper { + t.Errorf("Upper(lower) is %s not %s", u, upper) + } + l := ToLowerSpecial(unicode.TurkishCase, lower) + if l != lower { + t.Errorf("Lower(lower) is %s not %s", l, lower) + } + l = ToLowerSpecial(unicode.TurkishCase, upper) + if l != lower { + t.Errorf("Lower(upper) is %s not %s", l, lower) + } +} + +func TestTrimSpace(t *testing.T) { runStringTests(t, TrimSpace, "TrimSpace", trimSpaceTests) } + +var trimTests = []struct { + f string + in, arg, out string +}{ + {"Trim", "abba", "a", "bb"}, + {"Trim", "abba", "ab", ""}, + {"TrimLeft", "abba", "ab", ""}, + {"TrimRight", "abba", "ab", ""}, + {"TrimLeft", "abba", "a", "bba"}, + {"TrimLeft", "abba", "b", "abba"}, + {"TrimRight", "abba", "a", "abb"}, + {"TrimRight", "abba", "b", "abba"}, + {"Trim", "", "<>", "tag"}, + {"Trim", "* listitem", " *", "listitem"}, + {"Trim", `"quote"`, `"`, "quote"}, + {"Trim", "\u2C6F\u2C6F\u0250\u0250\u2C6F\u2C6F", "\u2C6F", "\u0250\u0250"}, + {"Trim", "\x80test\xff", "\xff", "test"}, + {"Trim", " Ġ ", " ", "Ġ"}, + {"Trim", " Ġİ0", "0 ", "Ġİ"}, + //empty string tests + {"Trim", "abba", "", "abba"}, + {"Trim", "", "123", ""}, + {"Trim", "", "", ""}, + {"TrimLeft", "abba", "", "abba"}, + {"TrimLeft", "", "123", ""}, + {"TrimLeft", "", "", ""}, + {"TrimRight", "abba", "", "abba"}, + {"TrimRight", "", "123", ""}, + {"TrimRight", "", "", ""}, + {"TrimRight", "☺\xc0", "☺", "☺\xc0"}, + {"TrimPrefix", "aabb", "a", "abb"}, + {"TrimPrefix", "aabb", "b", "aabb"}, + {"TrimSuffix", "aabb", "a", "aabb"}, + {"TrimSuffix", "aabb", "b", "aab"}, +} + +func TestTrim(t *testing.T) { + for _, tc := range trimTests { + name := tc.f + var f func(string, string) string + switch name { + case "Trim": + f = Trim + case "TrimLeft": + f = TrimLeft + case "TrimRight": + f = TrimRight + case "TrimPrefix": + f = TrimPrefix + case "TrimSuffix": + f = TrimSuffix + default: + t.Errorf("Undefined trim function %s", name) + } + actual := f(tc.in, tc.arg) + if actual != tc.out { + t.Errorf("%s(%q, %q) = %q; want %q", name, tc.in, tc.arg, actual, tc.out) + } + } +} + +func BenchmarkTrim(b *testing.B) { + b.ReportAllocs() + + for i := 0; i < b.N; i++ { + for _, tc := range trimTests { + name := tc.f + var f func(string, string) string + switch name { + case "Trim": + f = Trim + case "TrimLeft": + f = TrimLeft + case "TrimRight": + f = TrimRight + case "TrimPrefix": + f = TrimPrefix + case "TrimSuffix": + f = TrimSuffix + default: + b.Errorf("Undefined trim function %s", name) + } + actual := f(tc.in, tc.arg) + if actual != tc.out { + b.Errorf("%s(%q, %q) = %q; want %q", name, tc.in, tc.arg, actual, tc.out) + } + } + } +} + +func BenchmarkToValidUTF8(b *testing.B) { + tests := []struct { + name string + input string + }{ + {"Valid", "typical"}, + {"InvalidASCII", "foo\xffbar"}, + {"InvalidNonASCII", "日本語\xff日本語"}, + } + replacement := "\uFFFD" + b.ResetTimer() + for _, test := range tests { + b.Run(test.name, func(b *testing.B) { + for i := 0; i < b.N; i++ { + ToValidUTF8(test.input, replacement) + } + }) + } +} + +type predicate struct { + f func(rune) bool + name string +} + +var isSpace = predicate{unicode.IsSpace, "IsSpace"} +var isDigit = predicate{unicode.IsDigit, "IsDigit"} +var isUpper = predicate{unicode.IsUpper, "IsUpper"} +var isValidRune = predicate{ + func(r rune) bool { + return r != utf8.RuneError + }, + "IsValidRune", +} + +func not(p predicate) predicate { + return predicate{ + func(r rune) bool { + return !p.f(r) + }, + "not " + p.name, + } +} + +var trimFuncTests = []struct { + f predicate + in string + trimOut string + leftOut string + rightOut string +}{ + {isSpace, space + " hello " + space, + "hello", + "hello " + space, + space + " hello"}, + {isDigit, "\u0e50\u0e5212hello34\u0e50\u0e51", + "hello", + "hello34\u0e50\u0e51", + "\u0e50\u0e5212hello"}, + {isUpper, "\u2C6F\u2C6F\u2C6F\u2C6FABCDhelloEF\u2C6F\u2C6FGH\u2C6F\u2C6F", + "hello", + "helloEF\u2C6F\u2C6FGH\u2C6F\u2C6F", + "\u2C6F\u2C6F\u2C6F\u2C6FABCDhello"}, + {not(isSpace), "hello" + space + "hello", + space, + space + "hello", + "hello" + space}, + {not(isDigit), "hello\u0e50\u0e521234\u0e50\u0e51helo", + "\u0e50\u0e521234\u0e50\u0e51", + "\u0e50\u0e521234\u0e50\u0e51helo", + "hello\u0e50\u0e521234\u0e50\u0e51"}, + {isValidRune, "ab\xc0a\xc0cd", + "\xc0a\xc0", + "\xc0a\xc0cd", + "ab\xc0a\xc0"}, + {not(isValidRune), "\xc0a\xc0", + "a", + "a\xc0", + "\xc0a"}, + {isSpace, "", + "", + "", + ""}, + {isSpace, " ", + "", + "", + ""}, +} + +func TestTrimFunc(t *testing.T) { + for _, tc := range trimFuncTests { + trimmers := []struct { + name string + trim func(s string, f func(r rune) bool) string + out string + }{ + {"TrimFunc", TrimFunc, tc.trimOut}, + {"TrimLeftFunc", TrimLeftFunc, tc.leftOut}, + {"TrimRightFunc", TrimRightFunc, tc.rightOut}, + } + for _, trimmer := range trimmers { + actual := trimmer.trim(tc.in, tc.f.f) + if actual != trimmer.out { + t.Errorf("%s(%q, %q) = %q; want %q", trimmer.name, tc.in, tc.f.name, actual, trimmer.out) + } + } + } +} + +var indexFuncTests = []struct { + in string + f predicate + first, last int +}{ + {"", isValidRune, -1, -1}, + {"abc", isDigit, -1, -1}, + {"0123", isDigit, 0, 3}, + {"a1b", isDigit, 1, 1}, + {space, isSpace, 0, len(space) - 3}, // last rune in space is 3 bytes + {"\u0e50\u0e5212hello34\u0e50\u0e51", isDigit, 0, 18}, + {"\u2C6F\u2C6F\u2C6F\u2C6FABCDhelloEF\u2C6F\u2C6FGH\u2C6F\u2C6F", isUpper, 0, 34}, + {"12\u0e50\u0e52hello34\u0e50\u0e51", not(isDigit), 8, 12}, + + // tests of invalid UTF-8 + {"\x801", isDigit, 1, 1}, + {"\x80abc", isDigit, -1, -1}, + {"\xc0a\xc0", isValidRune, 1, 1}, + {"\xc0a\xc0", not(isValidRune), 0, 2}, + {"\xc0☺\xc0", not(isValidRune), 0, 4}, + {"\xc0☺\xc0\xc0", not(isValidRune), 0, 5}, + {"ab\xc0a\xc0cd", not(isValidRune), 2, 4}, + {"a\xe0\x80cd", not(isValidRune), 1, 2}, + {"\x80\x80\x80\x80", not(isValidRune), 0, 3}, +} + +func TestIndexFunc(t *testing.T) { + for _, tc := range indexFuncTests { + first := IndexFunc(tc.in, tc.f.f) + if first != tc.first { + t.Errorf("IndexFunc(%q, %s) = %d; want %d", tc.in, tc.f.name, first, tc.first) + } + last := LastIndexFunc(tc.in, tc.f.f) + if last != tc.last { + t.Errorf("LastIndexFunc(%q, %s) = %d; want %d", tc.in, tc.f.name, last, tc.last) + } + } +} + +func equal(m string, s1, s2 string, t *testing.T) bool { + if s1 == s2 { + return true + } + e1 := Split(s1, "") + e2 := Split(s2, "") + for i, c1 := range e1 { + if i >= len(e2) { + break + } + r1, _ := utf8.DecodeRuneInString(c1) + r2, _ := utf8.DecodeRuneInString(e2[i]) + if r1 != r2 { + t.Errorf("%s diff at %d: U+%04X U+%04X", m, i, r1, r2) + } + } + return false +} + +func TestCaseConsistency(t *testing.T) { + // Make a string of all the runes. + numRunes := int(unicode.MaxRune + 1) + if testing.Short() { + numRunes = 1000 + } + a := make([]rune, numRunes) + for i := range a { + a[i] = rune(i) + } + s := string(a) + // convert the cases. + upper := ToUpper(s) + lower := ToLower(s) + + // Consistency checks + if n := utf8.RuneCountInString(upper); n != numRunes { + t.Error("rune count wrong in upper:", n) + } + if n := utf8.RuneCountInString(lower); n != numRunes { + t.Error("rune count wrong in lower:", n) + } + if !equal("ToUpper(upper)", ToUpper(upper), upper, t) { + t.Error("ToUpper(upper) consistency fail") + } + if !equal("ToLower(lower)", ToLower(lower), lower, t) { + t.Error("ToLower(lower) consistency fail") + } + /* + These fail because of non-one-to-oneness of the data, such as multiple + upper case 'I' mapping to 'i'. We comment them out but keep them for + interest. + For instance: CAPITAL LETTER I WITH DOT ABOVE: + unicode.ToUpper(unicode.ToLower('\u0130')) != '\u0130' + + if !equal("ToUpper(lower)", ToUpper(lower), upper, t) { + t.Error("ToUpper(lower) consistency fail"); + } + if !equal("ToLower(upper)", ToLower(upper), lower, t) { + t.Error("ToLower(upper) consistency fail"); + } + */ +} + +var longString = "a" + string(make([]byte, 1<<16)) + "z" + +var RepeatTests = []struct { + in, out string + count int +}{ + {"", "", 0}, + {"", "", 1}, + {"", "", 2}, + {"-", "", 0}, + {"-", "-", 1}, + {"-", "----------", 10}, + {"abc ", "abc abc abc ", 3}, + // Tests for results over the chunkLimit + {string(rune(0)), string(make([]byte, 1<<16)), 1 << 16}, + {longString, longString + longString, 2}, +} + +func TestRepeat(t *testing.T) { + for _, tt := range RepeatTests { + a := Repeat(tt.in, tt.count) + if !equal("Repeat(s)", a, tt.out, t) { + t.Errorf("Repeat(%v, %d) = %v; want %v", tt.in, tt.count, a, tt.out) + continue + } + } +} + +func repeat(s string, count int) (err error) { + defer func() { + if r := recover(); r != nil { + switch v := r.(type) { + case error: + err = v + default: + err = fmt.Errorf("%s", v) + } + } + }() + + Repeat(s, count) + + return +} + +// See Issue golang.org/issue/16237 +func TestRepeatCatchesOverflow(t *testing.T) { + tests := [...]struct { + s string + count int + errStr string + }{ + 0: {"--", -2147483647, "negative"}, + 1: {"", int(^uint(0) >> 1), ""}, + 2: {"-", 10, ""}, + 3: {"gopher", 0, ""}, + 4: {"-", -1, "negative"}, + 5: {"--", -102, "negative"}, + 6: {string(make([]byte, 255)), int((^uint(0))/255 + 1), "overflow"}, + } + + for i, tt := range tests { + err := repeat(tt.s, tt.count) + if tt.errStr == "" { + if err != nil { + t.Errorf("#%d panicked %v", i, err) + } + continue + } + + if err == nil || !Contains(err.Error(), tt.errStr) { + t.Errorf("#%d expected %q got %q", i, tt.errStr, err) + } + } +} + +func runesEqual(a, b []rune) bool { + if len(a) != len(b) { + return false + } + for i, r := range a { + if r != b[i] { + return false + } + } + return true +} + +var RunesTests = []struct { + in string + out []rune + lossy bool +}{ + {"", []rune{}, false}, + {" ", []rune{32}, false}, + {"ABC", []rune{65, 66, 67}, false}, + {"abc", []rune{97, 98, 99}, false}, + {"\u65e5\u672c\u8a9e", []rune{26085, 26412, 35486}, false}, + {"ab\x80c", []rune{97, 98, 0xFFFD, 99}, true}, + {"ab\xc0c", []rune{97, 98, 0xFFFD, 99}, true}, +} + +func TestRunes(t *testing.T) { + for _, tt := range RunesTests { + a := []rune(tt.in) + if !runesEqual(a, tt.out) { + t.Errorf("[]rune(%q) = %v; want %v", tt.in, a, tt.out) + continue + } + if !tt.lossy { + // can only test reassembly if we didn't lose information + s := string(a) + if s != tt.in { + t.Errorf("string([]rune(%q)) = %x; want %x", tt.in, s, tt.in) + } + } + } +} + +func TestReadByte(t *testing.T) { + testStrings := []string{"", abcd, faces, commas} + for _, s := range testStrings { + reader := NewReader(s) + if e := reader.UnreadByte(); e == nil { + t.Errorf("Unreading %q at beginning: expected error", s) + } + var res bytes.Buffer + for { + b, e := reader.ReadByte() + if e == io.EOF { + break + } + if e != nil { + t.Errorf("Reading %q: %s", s, e) + break + } + res.WriteByte(b) + // unread and read again + e = reader.UnreadByte() + if e != nil { + t.Errorf("Unreading %q: %s", s, e) + break + } + b1, e := reader.ReadByte() + if e != nil { + t.Errorf("Reading %q after unreading: %s", s, e) + break + } + if b1 != b { + t.Errorf("Reading %q after unreading: want byte %q, got %q", s, b, b1) + break + } + } + if res.String() != s { + t.Errorf("Reader(%q).ReadByte() produced %q", s, res.String()) + } + } +} + +func TestReadRune(t *testing.T) { + testStrings := []string{"", abcd, faces, commas} + for _, s := range testStrings { + reader := NewReader(s) + if e := reader.UnreadRune(); e == nil { + t.Errorf("Unreading %q at beginning: expected error", s) + } + res := "" + for { + r, z, e := reader.ReadRune() + if e == io.EOF { + break + } + if e != nil { + t.Errorf("Reading %q: %s", s, e) + break + } + res += string(r) + // unread and read again + e = reader.UnreadRune() + if e != nil { + t.Errorf("Unreading %q: %s", s, e) + break + } + r1, z1, e := reader.ReadRune() + if e != nil { + t.Errorf("Reading %q after unreading: %s", s, e) + break + } + if r1 != r { + t.Errorf("Reading %q after unreading: want rune %q, got %q", s, r, r1) + break + } + if z1 != z { + t.Errorf("Reading %q after unreading: want size %d, got %d", s, z, z1) + break + } + } + if res != s { + t.Errorf("Reader(%q).ReadRune() produced %q", s, res) + } + } +} + +var UnreadRuneErrorTests = []struct { + name string + f func(*Reader) +}{ + {"Read", func(r *Reader) { r.Read([]byte{0}) }}, + {"ReadByte", func(r *Reader) { r.ReadByte() }}, + {"UnreadRune", func(r *Reader) { r.UnreadRune() }}, + {"Seek", func(r *Reader) { r.Seek(0, io.SeekCurrent) }}, + {"WriteTo", func(r *Reader) { r.WriteTo(&bytes.Buffer{}) }}, +} + +func TestUnreadRuneError(t *testing.T) { + for _, tt := range UnreadRuneErrorTests { + reader := NewReader("0123456789") + if _, _, err := reader.ReadRune(); err != nil { + // should not happen + t.Fatal(err) + } + tt.f(reader) + err := reader.UnreadRune() + if err == nil { + t.Errorf("Unreading after %s: expected error", tt.name) + } + } +} + +var ReplaceTests = []struct { + in string + old, new string + n int + out string +}{ + {"hello", "l", "L", 0, "hello"}, + {"hello", "l", "L", -1, "heLLo"}, + {"hello", "x", "X", -1, "hello"}, + {"", "x", "X", -1, ""}, + {"radar", "r", "", -1, "ada"}, + {"", "", "<>", -1, "<>"}, + {"banana", "a", "<>", -1, "b<>n<>n<>"}, + {"banana", "a", "<>", 1, "b<>nana"}, + {"banana", "a", "<>", 1000, "b<>n<>n<>"}, + {"banana", "an", "<>", -1, "b<><>a"}, + {"banana", "ana", "<>", -1, "b<>na"}, + {"banana", "", "<>", -1, "<>b<>a<>n<>a<>n<>a<>"}, + {"banana", "", "<>", 10, "<>b<>a<>n<>a<>n<>a<>"}, + {"banana", "", "<>", 6, "<>b<>a<>n<>a<>n<>a"}, + {"banana", "", "<>", 5, "<>b<>a<>n<>a<>na"}, + {"banana", "", "<>", 1, "<>banana"}, + {"banana", "a", "a", -1, "banana"}, + {"banana", "a", "a", 1, "banana"}, + {"☺☻☹", "", "<>", -1, "<>☺<>☻<>☹<>"}, +} + +func TestReplace(t *testing.T) { + for _, tt := range ReplaceTests { + if s := Replace(tt.in, tt.old, tt.new, tt.n); s != tt.out { + t.Errorf("Replace(%q, %q, %q, %d) = %q, want %q", tt.in, tt.old, tt.new, tt.n, s, tt.out) + } + if tt.n == -1 { + s := ReplaceAll(tt.in, tt.old, tt.new) + if s != tt.out { + t.Errorf("ReplaceAll(%q, %q, %q) = %q, want %q", tt.in, tt.old, tt.new, s, tt.out) + } + } + } +} + +var TitleTests = []struct { + in, out string +}{ + {"", ""}, + {"a", "A"}, + {" aaa aaa aaa ", " Aaa Aaa Aaa "}, + {" Aaa Aaa Aaa ", " Aaa Aaa Aaa "}, + {"123a456", "123a456"}, + {"double-blind", "Double-Blind"}, + {"ÿøû", "Ÿøû"}, + {"with_underscore", "With_underscore"}, + {"unicode \xe2\x80\xa8 line separator", "Unicode \xe2\x80\xa8 Line Separator"}, +} + +func TestTitle(t *testing.T) { + for _, tt := range TitleTests { + if s := Title(tt.in); s != tt.out { + t.Errorf("Title(%q) = %q, want %q", tt.in, s, tt.out) + } + } +} + +var ContainsTests = []struct { + str, substr string + expected bool +}{ + {"abc", "bc", true}, + {"abc", "bcd", false}, + {"abc", "", true}, + {"", "a", false}, + + // cases to cover code in runtime/asm_amd64.s:indexShortStr + // 2-byte needle + {"xxxxxx", "01", false}, + {"01xxxx", "01", true}, + {"xx01xx", "01", true}, + {"xxxx01", "01", true}, + {"01xxxxx"[1:], "01", false}, + {"xxxxx01"[:6], "01", false}, + // 3-byte needle + {"xxxxxxx", "012", false}, + {"012xxxx", "012", true}, + {"xx012xx", "012", true}, + {"xxxx012", "012", true}, + {"012xxxxx"[1:], "012", false}, + {"xxxxx012"[:7], "012", false}, + // 4-byte needle + {"xxxxxxxx", "0123", false}, + {"0123xxxx", "0123", true}, + {"xx0123xx", "0123", true}, + {"xxxx0123", "0123", true}, + {"0123xxxxx"[1:], "0123", false}, + {"xxxxx0123"[:8], "0123", false}, + // 5-7-byte needle + {"xxxxxxxxx", "01234", false}, + {"01234xxxx", "01234", true}, + {"xx01234xx", "01234", true}, + {"xxxx01234", "01234", true}, + {"01234xxxxx"[1:], "01234", false}, + {"xxxxx01234"[:9], "01234", false}, + // 8-byte needle + {"xxxxxxxxxxxx", "01234567", false}, + {"01234567xxxx", "01234567", true}, + {"xx01234567xx", "01234567", true}, + {"xxxx01234567", "01234567", true}, + {"01234567xxxxx"[1:], "01234567", false}, + {"xxxxx01234567"[:12], "01234567", false}, + // 9-15-byte needle + {"xxxxxxxxxxxxx", "012345678", false}, + {"012345678xxxx", "012345678", true}, + {"xx012345678xx", "012345678", true}, + {"xxxx012345678", "012345678", true}, + {"012345678xxxxx"[1:], "012345678", false}, + {"xxxxx012345678"[:13], "012345678", false}, + // 16-byte needle + {"xxxxxxxxxxxxxxxxxxxx", "0123456789ABCDEF", false}, + {"0123456789ABCDEFxxxx", "0123456789ABCDEF", true}, + {"xx0123456789ABCDEFxx", "0123456789ABCDEF", true}, + {"xxxx0123456789ABCDEF", "0123456789ABCDEF", true}, + {"0123456789ABCDEFxxxxx"[1:], "0123456789ABCDEF", false}, + {"xxxxx0123456789ABCDEF"[:20], "0123456789ABCDEF", false}, + // 17-31-byte needle + {"xxxxxxxxxxxxxxxxxxxxx", "0123456789ABCDEFG", false}, + {"0123456789ABCDEFGxxxx", "0123456789ABCDEFG", true}, + {"xx0123456789ABCDEFGxx", "0123456789ABCDEFG", true}, + {"xxxx0123456789ABCDEFG", "0123456789ABCDEFG", true}, + {"0123456789ABCDEFGxxxxx"[1:], "0123456789ABCDEFG", false}, + {"xxxxx0123456789ABCDEFG"[:21], "0123456789ABCDEFG", false}, + + // partial match cases + {"xx01x", "012", false}, // 3 + {"xx0123x", "01234", false}, // 5-7 + {"xx01234567x", "012345678", false}, // 9-15 + {"xx0123456789ABCDEFx", "0123456789ABCDEFG", false}, // 17-31, issue 15679 +} + +func TestContains(t *testing.T) { + for _, ct := range ContainsTests { + if Contains(ct.str, ct.substr) != ct.expected { + t.Errorf("Contains(%s, %s) = %v, want %v", + ct.str, ct.substr, !ct.expected, ct.expected) + } + } +} + +var ContainsAnyTests = []struct { + str, substr string + expected bool +}{ + {"", "", false}, + {"", "a", false}, + {"", "abc", false}, + {"a", "", false}, + {"a", "a", true}, + {"aaa", "a", true}, + {"abc", "xyz", false}, + {"abc", "xcz", true}, + {"a☺b☻c☹d", "uvw☻xyz", true}, + {"aRegExp*", ".(|)*+?^$[]", true}, + {dots + dots + dots, " ", false}, +} + +func TestContainsAny(t *testing.T) { + for _, ct := range ContainsAnyTests { + if ContainsAny(ct.str, ct.substr) != ct.expected { + t.Errorf("ContainsAny(%s, %s) = %v, want %v", + ct.str, ct.substr, !ct.expected, ct.expected) + } + } +} + +var ContainsRuneTests = []struct { + str string + r rune + expected bool +}{ + {"", 'a', false}, + {"a", 'a', true}, + {"aaa", 'a', true}, + {"abc", 'y', false}, + {"abc", 'c', true}, + {"a☺b☻c☹d", 'x', false}, + {"a☺b☻c☹d", '☻', true}, + {"aRegExp*", '*', true}, +} + +func TestContainsRune(t *testing.T) { + for _, ct := range ContainsRuneTests { + if ContainsRune(ct.str, ct.r) != ct.expected { + t.Errorf("ContainsRune(%q, %q) = %v, want %v", + ct.str, ct.r, !ct.expected, ct.expected) + } + } +} + +var EqualFoldTests = []struct { + s, t string + out bool +}{ + {"abc", "abc", true}, + {"ABcd", "ABcd", true}, + {"123abc", "123ABC", true}, + {"αβδ", "ΑΒΔ", true}, + {"abc", "xyz", false}, + {"abc", "XYZ", false}, + {"abcdefghijk", "abcdefghijX", false}, + {"abcdefghijk", "abcdefghij\u212A", true}, + {"abcdefghijK", "abcdefghij\u212A", true}, + {"abcdefghijkz", "abcdefghij\u212Ay", false}, + {"abcdefghijKz", "abcdefghij\u212Ay", false}, + {"1", "2", false}, + {"utf-8", "US-ASCII", false}, +} + +func TestEqualFold(t *testing.T) { + for _, tt := range EqualFoldTests { + if out := EqualFold(tt.s, tt.t); out != tt.out { + t.Errorf("EqualFold(%#q, %#q) = %v, want %v", tt.s, tt.t, out, tt.out) + } + if out := EqualFold(tt.t, tt.s); out != tt.out { + t.Errorf("EqualFold(%#q, %#q) = %v, want %v", tt.t, tt.s, out, tt.out) + } + } +} + +func BenchmarkEqualFold(b *testing.B) { + b.Run("Tests", func(b *testing.B) { + for i := 0; i < b.N; i++ { + for _, tt := range EqualFoldTests { + if out := EqualFold(tt.s, tt.t); out != tt.out { + b.Fatal("wrong result") + } + } + } + }) + + const s1 = "abcdefghijKz" + const s2 = "abcDefGhijKz" + + b.Run("ASCII", func(b *testing.B) { + for i := 0; i < b.N; i++ { + EqualFold(s1, s2) + } + }) + + b.Run("UnicodePrefix", func(b *testing.B) { + for i := 0; i < b.N; i++ { + EqualFold("αβδ"+s1, "ΑΒΔ"+s2) + } + }) + + b.Run("UnicodeSuffix", func(b *testing.B) { + for i := 0; i < b.N; i++ { + EqualFold(s1+"αβδ", s2+"ΑΒΔ") + } + }) +} + +var CountTests = []struct { + s, sep string + num int +}{ + {"", "", 1}, + {"", "notempty", 0}, + {"notempty", "", 9}, + {"smaller", "not smaller", 0}, + {"12345678987654321", "6", 2}, + {"611161116", "6", 3}, + {"notequal", "NotEqual", 0}, + {"equal", "equal", 1}, + {"abc1231231123q", "123", 3}, + {"11111", "11", 2}, +} + +func TestCount(t *testing.T) { + for _, tt := range CountTests { + if num := Count(tt.s, tt.sep); num != tt.num { + t.Errorf("Count(%q, %q) = %d, want %d", tt.s, tt.sep, num, tt.num) + } + } +} + +var cutTests = []struct { + s, sep string + before, after string + found bool +}{ + {"abc", "b", "a", "c", true}, + {"abc", "a", "", "bc", true}, + {"abc", "c", "ab", "", true}, + {"abc", "abc", "", "", true}, + {"abc", "", "", "abc", true}, + {"abc", "d", "abc", "", false}, + {"", "d", "", "", false}, + {"", "", "", "", true}, +} + +func TestCut(t *testing.T) { + for _, tt := range cutTests { + if before, after, found := Cut(tt.s, tt.sep); before != tt.before || after != tt.after || found != tt.found { + t.Errorf("Cut(%q, %q) = %q, %q, %v, want %q, %q, %v", tt.s, tt.sep, before, after, found, tt.before, tt.after, tt.found) + } + } +} + +var cutPrefixTests = []struct { + s, sep string + after string + found bool +}{ + {"abc", "a", "bc", true}, + {"abc", "abc", "", true}, + {"abc", "", "abc", true}, + {"abc", "d", "abc", false}, + {"", "d", "", false}, + {"", "", "", true}, +} + +func TestCutPrefix(t *testing.T) { + for _, tt := range cutPrefixTests { + if after, found := CutPrefix(tt.s, tt.sep); after != tt.after || found != tt.found { + t.Errorf("CutPrefix(%q, %q) = %q, %v, want %q, %v", tt.s, tt.sep, after, found, tt.after, tt.found) + } + } +} + +var cutSuffixTests = []struct { + s, sep string + after string + found bool +}{ + {"abc", "bc", "a", true}, + {"abc", "abc", "", true}, + {"abc", "", "abc", true}, + {"abc", "d", "abc", false}, + {"", "d", "", false}, + {"", "", "", true}, +} + +func TestCutSuffix(t *testing.T) { + for _, tt := range cutSuffixTests { + if after, found := CutSuffix(tt.s, tt.sep); after != tt.after || found != tt.found { + t.Errorf("CutSuffix(%q, %q) = %q, %v, want %q, %v", tt.s, tt.sep, after, found, tt.after, tt.found) + } + } +} + +func makeBenchInputHard() string { + tokens := [...]string{ + "", "

", "", "", + "", "

", "", "", + "hello", "world", + } + x := make([]byte, 0, 1<<20) + for { + i := rand.Intn(len(tokens)) + if len(x)+len(tokens[i]) >= 1<<20 { + break + } + x = append(x, tokens[i]...) + } + return string(x) +} + +var benchInputHard = makeBenchInputHard() + +func benchmarkIndexHard(b *testing.B, sep string) { + for i := 0; i < b.N; i++ { + Index(benchInputHard, sep) + } +} + +func benchmarkLastIndexHard(b *testing.B, sep string) { + for i := 0; i < b.N; i++ { + LastIndex(benchInputHard, sep) + } +} + +func benchmarkCountHard(b *testing.B, sep string) { + for i := 0; i < b.N; i++ { + Count(benchInputHard, sep) + } +} + +func BenchmarkIndexHard1(b *testing.B) { benchmarkIndexHard(b, "<>") } +func BenchmarkIndexHard2(b *testing.B) { benchmarkIndexHard(b, "") } +func BenchmarkIndexHard3(b *testing.B) { benchmarkIndexHard(b, "hello world") } +func BenchmarkIndexHard4(b *testing.B) { + benchmarkIndexHard(b, "
helloworld
") +} + +func BenchmarkLastIndexHard1(b *testing.B) { benchmarkLastIndexHard(b, "<>") } +func BenchmarkLastIndexHard2(b *testing.B) { benchmarkLastIndexHard(b, "") } +func BenchmarkLastIndexHard3(b *testing.B) { benchmarkLastIndexHard(b, "hello world") } + +func BenchmarkCountHard1(b *testing.B) { benchmarkCountHard(b, "<>") } +func BenchmarkCountHard2(b *testing.B) { benchmarkCountHard(b, "") } +func BenchmarkCountHard3(b *testing.B) { benchmarkCountHard(b, "hello world") } + +var benchInputTorture = Repeat("ABC", 1<<10) + "123" + Repeat("ABC", 1<<10) +var benchNeedleTorture = Repeat("ABC", 1<<10+1) + +func BenchmarkIndexTorture(b *testing.B) { + for i := 0; i < b.N; i++ { + Index(benchInputTorture, benchNeedleTorture) + } +} + +func BenchmarkCountTorture(b *testing.B) { + for i := 0; i < b.N; i++ { + Count(benchInputTorture, benchNeedleTorture) + } +} + +func BenchmarkCountTortureOverlapping(b *testing.B) { + A := Repeat("ABC", 1<<20) + B := Repeat("ABC", 1<<10) + for i := 0; i < b.N; i++ { + Count(A, B) + } +} + +func BenchmarkCountByte(b *testing.B) { + indexSizes := []int{10, 32, 4 << 10, 4 << 20, 64 << 20} + benchStr := Repeat(benchmarkString, + (indexSizes[len(indexSizes)-1]+len(benchmarkString)-1)/len(benchmarkString)) + benchFunc := func(b *testing.B, benchStr string) { + b.SetBytes(int64(len(benchStr))) + for i := 0; i < b.N; i++ { + Count(benchStr, "=") + } + } + for _, size := range indexSizes { + b.Run(fmt.Sprintf("%d", size), func(b *testing.B) { + benchFunc(b, benchStr[:size]) + }) + } + +} + +var makeFieldsInput = func() string { + x := make([]byte, 1<<20) + // Input is ~10% space, ~10% 2-byte UTF-8, rest ASCII non-space. + for i := range x { + switch rand.Intn(10) { + case 0: + x[i] = ' ' + case 1: + if i > 0 && x[i-1] == 'x' { + copy(x[i-1:], "χ") + break + } + fallthrough + default: + x[i] = 'x' + } + } + return string(x) +} + +var makeFieldsInputASCII = func() string { + x := make([]byte, 1<<20) + // Input is ~10% space, rest ASCII non-space. + for i := range x { + if rand.Intn(10) == 0 { + x[i] = ' ' + } else { + x[i] = 'x' + } + } + return string(x) +} + +var stringdata = []struct{ name, data string }{ + {"ASCII", makeFieldsInputASCII()}, + {"Mixed", makeFieldsInput()}, +} + +func BenchmarkFields(b *testing.B) { + for _, sd := range stringdata { + b.Run(sd.name, func(b *testing.B) { + for j := 1 << 4; j <= 1<<20; j <<= 4 { + b.Run(fmt.Sprintf("%d", j), func(b *testing.B) { + b.ReportAllocs() + b.SetBytes(int64(j)) + data := sd.data[:j] + for i := 0; i < b.N; i++ { + Fields(data) + } + }) + } + }) + } +} + +func BenchmarkFieldsFunc(b *testing.B) { + for _, sd := range stringdata { + b.Run(sd.name, func(b *testing.B) { + for j := 1 << 4; j <= 1<<20; j <<= 4 { + b.Run(fmt.Sprintf("%d", j), func(b *testing.B) { + b.ReportAllocs() + b.SetBytes(int64(j)) + data := sd.data[:j] + for i := 0; i < b.N; i++ { + FieldsFunc(data, unicode.IsSpace) + } + }) + } + }) + } +} + +func BenchmarkSplitEmptySeparator(b *testing.B) { + for i := 0; i < b.N; i++ { + Split(benchInputHard, "") + } +} + +func BenchmarkSplitSingleByteSeparator(b *testing.B) { + for i := 0; i < b.N; i++ { + Split(benchInputHard, "/") + } +} + +func BenchmarkSplitMultiByteSeparator(b *testing.B) { + for i := 0; i < b.N; i++ { + Split(benchInputHard, "hello") + } +} + +func BenchmarkSplitNSingleByteSeparator(b *testing.B) { + for i := 0; i < b.N; i++ { + SplitN(benchInputHard, "/", 10) + } +} + +func BenchmarkSplitNMultiByteSeparator(b *testing.B) { + for i := 0; i < b.N; i++ { + SplitN(benchInputHard, "hello", 10) + } +} + +func BenchmarkRepeat(b *testing.B) { + s := "0123456789" + for _, n := range []int{5, 10} { + for _, c := range []int{0, 1, 2, 6} { + b.Run(fmt.Sprintf("%dx%d", n, c), func(b *testing.B) { + for i := 0; i < b.N; i++ { + Repeat(s[:n], c) + } + }) + } + } +} + +func BenchmarkRepeatLarge(b *testing.B) { + s := Repeat("@", 8*1024) + for j := 8; j <= 30; j++ { + for _, k := range []int{1, 16, 4097} { + s := s[:k] + n := (1 << j) / k + if n == 0 { + continue + } + b.Run(fmt.Sprintf("%d/%d", 1<") + } +} -- cgit v1.2.3