summary refs log tree commit diff stats
path: root/src/unicode/utf8
diff options
context:
space:
mode:
author Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-16 19:19:13 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-16 19:19:13 +0000
commit ccd992355df7192993c666236047820244914598 (patch)
tree f00fea65147227b7743083c6148396f74cd66935 /src/unicode/utf8
parent Initial commit. (diff)
download golang-1.21-ccd992355df7192993c666236047820244914598.tar.xz
golang-1.21-ccd992355df7192993c666236047820244914598.zip
Adding upstream version 1.21.8.upstream/1.21.8
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/unicode/utf8')
-rw-r--r-- src/unicode/utf8/example_test.go 226
-rw-r--r-- src/unicode/utf8/utf8.go 583
-rw-r--r-- src/unicode/utf8/utf8_test.go 703
3 files changed, 1512 insertions, 0 deletions
diff --git a/src/unicode/utf8/example_test.go b/src/unicode/utf8/example_test.go
new file mode 100644
index 0000000..fe434c9
--- /dev/null
+++ b/src/unicode/utf8/example_test.go
@@ -0,0 +1,226 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package utf8_test
+
+import (
+ "fmt"
+ "unicode/utf8"
+)
+
+// ExampleDecodeLastRune walks a byte slice from its end, decoding one rune at
+// a time and trimming the decoded bytes off the tail.
+func ExampleDecodeLastRune() {
+	b := []byte("Hello, 世界")
+
+	for len(b) > 0 {
+		r, size := utf8.DecodeLastRune(b)
+		fmt.Printf("%c %v\n", r, size)
+
+		// Drop the rune that was just decoded from the end of the slice.
+		b = b[:len(b)-size]
+	}
+	// The line for the space rune prints as two spaces followed by 1.
+
+	// Output:
+	// 界 3
+	// 世 3
+	//   1
+	// , 1
+	// o 1
+	// l 1
+	// l 1
+	// e 1
+	// H 1
+}
+
+// ExampleDecodeLastRuneInString walks a string from its end, decoding one rune
+// at a time and trimming the decoded bytes off the tail.
+func ExampleDecodeLastRuneInString() {
+	str := "Hello, 世界"
+
+	for len(str) > 0 {
+		r, size := utf8.DecodeLastRuneInString(str)
+		fmt.Printf("%c %v\n", r, size)
+
+		// Drop the rune that was just decoded from the end of the string.
+		str = str[:len(str)-size]
+	}
+	// The line for the space rune prints as two spaces followed by 1.
+
+	// Output:
+	// 界 3
+	// 世 3
+	//   1
+	// , 1
+	// o 1
+	// l 1
+	// l 1
+	// e 1
+	// H 1
+}
+
+// ExampleDecodeRune walks a byte slice from its start, decoding one rune at a
+// time and advancing past the bytes that were consumed.
+func ExampleDecodeRune() {
+	b := []byte("Hello, 世界")
+
+	for len(b) > 0 {
+		r, size := utf8.DecodeRune(b)
+		fmt.Printf("%c %v\n", r, size)
+
+		// Skip the rune that was just decoded.
+		b = b[size:]
+	}
+	// The line for the space rune prints as two spaces followed by 1.
+
+	// Output:
+	// H 1
+	// e 1
+	// l 1
+	// l 1
+	// o 1
+	// , 1
+	//   1
+	// 世 3
+	// 界 3
+}
+
+// ExampleDecodeRuneInString walks a string from its start, decoding one rune
+// at a time and advancing past the bytes that were consumed.
+func ExampleDecodeRuneInString() {
+	str := "Hello, 世界"
+
+	for len(str) > 0 {
+		r, size := utf8.DecodeRuneInString(str)
+		fmt.Printf("%c %v\n", r, size)
+
+		// Skip the rune that was just decoded.
+		str = str[size:]
+	}
+	// The line for the space rune prints as two spaces followed by 1.
+
+	// Output:
+	// H 1
+	// e 1
+	// l 1
+	// l 1
+	// o 1
+	// , 1
+	//   1
+	// 世 3
+	// 界 3
+}
+
+// ExampleEncodeRune encodes a single three-byte rune into a buffer and prints
+// the encoded bytes followed by the number of bytes written.
+func ExampleEncodeRune() {
+	dst := make([]byte, 3)
+	written := utf8.EncodeRune(dst, '世')
+
+	fmt.Println(dst)
+	fmt.Println(written)
+	// Output:
+	// [228 184 150]
+	// 3
+}
+
+// ExampleEncodeRune_outOfRange shows that runes outside the encodable range
+// are written as the encoding of utf8.RuneError.
+func ExampleEncodeRune_outOfRange() {
+	candidates := [...]rune{
+		-1,             // Less than 0, out of range.
+		0x110000,       // Greater than 0x10FFFF, out of range.
+		utf8.RuneError, // The Unicode replacement character.
+	}
+	for idx := 0; idx < len(candidates); idx++ {
+		encoded := make([]byte, 3)
+		n := utf8.EncodeRune(encoded, candidates[idx])
+		fmt.Printf("%d: %d %[2]s %d\n", idx, encoded, n)
+	}
+	// Output:
+	// 0: [239 191 189] � 3
+	// 1: [239 191 189] � 3
+	// 2: [239 191 189] � 3
+}
+
+// ExampleFullRune contrasts a complete three-byte encoding with the same
+// encoding truncated to two bytes.
+func ExampleFullRune() {
+	encoded := []byte{228, 184, 150} // 世
+	whole := utf8.FullRune(encoded)
+	truncated := utf8.FullRune(encoded[:2])
+	fmt.Println(whole)
+	fmt.Println(truncated)
+	// Output:
+	// true
+	// false
+}
+
+func ExampleFullRuneInString() {
+ str := "世"
+ fmt.Println(utf8.FullRuneInString(str))
+ fmt.Println(utf8.FullRuneInString(str[:2]))
+ // Output:
+ // true
+ // false
+}
+
+// ExampleRuneCount compares the byte length of a slice with the number of
+// runes it contains.
+func ExampleRuneCount() {
+	text := []byte("Hello, 世界")
+	byteLen, runeLen := len(text), utf8.RuneCount(text)
+	fmt.Println("bytes =", byteLen)
+	fmt.Println("runes =", runeLen)
+	// Output:
+	// bytes = 13
+	// runes = 9
+}
+
+// ExampleRuneCountInString compares the byte length of a string with the
+// number of runes it contains.
+func ExampleRuneCountInString() {
+	text := "Hello, 世界"
+	byteLen, runeLen := len(text), utf8.RuneCountInString(text)
+	fmt.Println("bytes =", byteLen)
+	fmt.Println("runes =", runeLen)
+	// Output:
+	// bytes = 13
+	// runes = 9
+}
+
+// ExampleRuneLen prints the encoded length of a one-byte and a three-byte rune.
+func ExampleRuneLen() {
+	for _, r := range []rune{'a', '界'} {
+		fmt.Println(utf8.RuneLen(r))
+	}
+	// Output:
+	// 1
+	// 3
+}
+
+// ExampleRuneStart reports, for the first three bytes of "a界", whether each
+// byte could begin an encoded rune.
+func ExampleRuneStart() {
+	encoded := []byte("a界")
+	for i := 0; i < 3; i++ {
+		fmt.Println(utf8.RuneStart(encoded[i]))
+	}
+	// Output:
+	// true
+	// true
+	// false
+}
+
+// ExampleValid checks one well-formed and one malformed byte slice.
+func ExampleValid() {
+	wellFormed := []byte("Hello, 世界")
+	malformed := []byte{0xff, 0xfe, 0xfd}
+
+	okGood := utf8.Valid(wellFormed)
+	okBad := utf8.Valid(malformed)
+	fmt.Println(okGood)
+	fmt.Println(okBad)
+	// Output:
+	// true
+	// false
+}
+
+// ExampleValidRune checks one encodable rune and one that is out of range.
+func ExampleValidRune() {
+	inRange := 'a'
+	outOfRange := rune(0xfffffff)
+
+	okGood := utf8.ValidRune(inRange)
+	okBad := utf8.ValidRune(outOfRange)
+	fmt.Println(okGood)
+	fmt.Println(okBad)
+	// Output:
+	// true
+	// false
+}
+
+// ExampleValidString checks one well-formed and one malformed string.
+func ExampleValidString() {
+	wellFormed := "Hello, 世界"
+	malformed := string([]byte{0xff, 0xfe, 0xfd})
+
+	okGood := utf8.ValidString(wellFormed)
+	okBad := utf8.ValidString(malformed)
+	fmt.Println(okGood)
+	fmt.Println(okBad)
+	// Output:
+	// true
+	// false
+}
+
+func ExampleAppendRune() {
+ buf1 := utf8.AppendRune(nil, 0x10000)
+ buf2 := utf8.AppendRune([]byte("init"), 0x10000)
+ fmt.Println(string(buf1))
+ fmt.Println(string(buf2))
+ // Output:
+ // 𐀀
+ // init𐀀
+}
diff --git a/src/unicode/utf8/utf8.go b/src/unicode/utf8/utf8.go
new file mode 100644
index 0000000..1e9f666
--- /dev/null
+++ b/src/unicode/utf8/utf8.go
@@ -0,0 +1,583 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package utf8 implements functions and constants to support text encoded in
+// UTF-8. It includes functions to translate between runes and UTF-8 byte sequences.
+// See https://en.wikipedia.org/wiki/UTF-8
+package utf8
+
+// The conditions RuneError==unicode.ReplacementChar and
+// MaxRune==unicode.MaxRune are verified in the tests.
+// Defining them locally avoids this package depending on package unicode.
+
+// Numbers fundamental to the encoding.
+// All four are untyped constants, so they can be compared against or combined
+// with rune, byte and int values without conversion.
+const (
+ RuneError = '\uFFFD' // the "error" Rune or "Unicode replacement character"
+ RuneSelf = 0x80 // characters below RuneSelf are represented as themselves in a single byte.
+ MaxRune = '\U0010FFFF' // Maximum valid Unicode code point.
+ UTFMax = 4 // maximum number of bytes of a UTF-8 encoded Unicode character.
+)
+
+// Code points in the surrogate range are not valid for UTF-8.
+// Both bounds are inclusive.
+const (
+ surrogateMin = 0xD800
+ surrogateMax = 0xDFFF
+)
+
+const (
+ // Leading-bit patterns of encoded bytes. tx marks a continuation byte;
+ // t2, t3 and t4 are the prefixes of the first byte of a 2-, 3- and
+ // 4-byte sequence.
+ t1 = 0b00000000
+ tx = 0b10000000
+ t2 = 0b11000000
+ t3 = 0b11100000
+ t4 = 0b11110000
+ t5 = 0b11111000
+
+ // Payload masks: maskx selects the six data bits of a continuation byte;
+ // mask2, mask3 and mask4 select the data bits of the first byte of a
+ // 2-, 3- and 4-byte sequence.
+ maskx = 0b00111111
+ mask2 = 0b00011111
+ mask3 = 0b00001111
+ mask4 = 0b00000111
+
+ // Largest rune values that fit in a 1-, 2- and 3-byte encoding.
+ rune1Max = 1<<7 - 1
+ rune2Max = 1<<11 - 1
+ rune3Max = 1<<16 - 1
+
+ // The default lowest and highest continuation byte.
+ locb = 0b10000000
+ hicb = 0b10111111
+
+ // The names of these constants are chosen to give nice alignment in the
+ // table below. The first nibble is an index into acceptRanges or F for
+ // special one-byte cases. The second nibble is the Rune length or the
+ // Status for the special one-byte case.
+ xx = 0xF1 // invalid: size 1
+ as = 0xF0 // ASCII: size 1
+ s1 = 0x02 // accept 0, size 2
+ s2 = 0x13 // accept 1, size 3
+ s3 = 0x03 // accept 0, size 3
+ s4 = 0x23 // accept 2, size 3
+ s5 = 0x34 // accept 3, size 4
+ s6 = 0x04 // accept 0, size 4
+ s7 = 0x44 // accept 4, size 4
+)
+
+// first is information about the first byte in a UTF-8 sequence.
+// It is indexed by the byte value. For each entry x, x&7 is the sequence
+// length in bytes and x>>4 is the index into acceptRanges that constrains the
+// second byte; the special values as and xx mark ASCII and invalid bytes.
+var first = [256]uint8{
+ // 1 2 3 4 5 6 7 8 9 A B C D E F
+ as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
+ as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
+ as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
+ as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
+ as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
+ as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
+ as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
+ as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
+ // 1 2 3 4 5 6 7 8 9 A B C D E F
+ xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
+ xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
+ xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
+ xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
+ xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
+ s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
+ s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
+ s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
+}
+
+// acceptRange gives the range of valid values for the second byte in a UTF-8
+// sequence.
+// Both bounds are inclusive: users of the type reject a byte b when
+// b < lo || hi < b.
+type acceptRange struct {
+ lo uint8 // lowest value for second byte.
+ hi uint8 // highest value for second byte.
+}
+
+// acceptRanges has size 16 to avoid bounds checks in the code that uses it.
+// It is indexed by the high nibble of an entry of first; only indices 0-4
+// are populated, the remaining entries are the zero value.
+var acceptRanges = [16]acceptRange{
+ 0: {locb, hicb}, // default: any continuation byte
+ 1: {0xA0, hicb}, // used for first byte 0xE0 (s2)
+ 2: {locb, 0x9F}, // used for first byte 0xED (s4)
+ 3: {0x90, hicb}, // used for first byte 0xF0 (s5)
+ 4: {locb, 0x8F}, // used for first byte 0xF4 (s7)
+}
+
+// FullRune reports whether p starts with a complete UTF-8 encoding of a rune.
+// An invalid encoding also counts as a full rune, because it will convert as
+// a width-1 error rune.
+func FullRune(p []byte) bool {
+	if len(p) == 0 {
+		return false
+	}
+	info := first[p[0]]
+	if len(p) >= int(info&7) {
+		return true // ASCII, invalid or valid.
+	}
+	// Fewer bytes than the first byte calls for: the prefix is full only if
+	// one of the bytes already present makes the sequence invalid.
+	rng := acceptRanges[info>>4]
+	switch {
+	case len(p) > 1 && (p[1] < rng.lo || rng.hi < p[1]):
+		return true
+	case len(p) > 2 && (p[2] < locb || hicb < p[2]):
+		return true
+	}
+	return false
+}
+
+// FullRuneInString is like FullRune but its input is a string.
+func FullRuneInString(s string) bool {
+	if len(s) == 0 {
+		return false
+	}
+	info := first[s[0]]
+	if len(s) >= int(info&7) {
+		return true // ASCII, invalid, or valid.
+	}
+	// Fewer bytes than the first byte calls for: the prefix is full only if
+	// one of the bytes already present makes the sequence invalid.
+	rng := acceptRanges[info>>4]
+	switch {
+	case len(s) > 1 && (s[1] < rng.lo || rng.hi < s[1]):
+		return true
+	case len(s) > 2 && (s[2] < locb || hicb < s[2]):
+		return true
+	}
+	return false
+}
+
+// DecodeRune unpacks the first UTF-8 encoding in p and returns the rune and
+// its width in bytes. If p is empty it returns (RuneError, 0). Otherwise, if
+// the encoding is invalid, it returns (RuneError, 1). Both are impossible
+// results for correct, non-empty UTF-8.
+//
+// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
+// out of range, or is not the shortest possible UTF-8 encoding for the
+// value. No other validation is performed.
+func DecodeRune(p []byte) (r rune, size int) {
+ n := len(p)
+ if n < 1 {
+ return RuneError, 0
+ }
+ p0 := p[0]
+ x := first[p0]
+ if x >= as {
+ // The following code simulates an additional check for x == xx and
+ // handling the ASCII and invalid cases accordingly. This mask-and-or
+ // approach prevents an additional branch.
+ // x is as (0xF0) or xx (0xF1) here; shifting its low bit into the sign
+ // position and back yields a rune with all bits clear or all bits set.
+ mask := rune(x) << 31 >> 31 // Create 0x00000000 (ASCII) or 0xFFFFFFFF (invalid).
+ return rune(p[0])&^mask | RuneError&mask, 1
+ }
+ sz := int(x & 7)
+ accept := acceptRanges[x>>4]
+ if n < sz {
+ // Input is shorter than the length announced by the first byte.
+ return RuneError, 1
+ }
+ b1 := p[1]
+ if b1 < accept.lo || accept.hi < b1 {
+ return RuneError, 1
+ }
+ if sz <= 2 { // <= instead of == to help the compiler eliminate some bounds checks
+ return rune(p0&mask2)<<6 | rune(b1&maskx), 2
+ }
+ b2 := p[2]
+ if b2 < locb || hicb < b2 {
+ return RuneError, 1
+ }
+ if sz <= 3 {
+ return rune(p0&mask3)<<12 | rune(b1&maskx)<<6 | rune(b2&maskx), 3
+ }
+ b3 := p[3]
+ if b3 < locb || hicb < b3 {
+ return RuneError, 1
+ }
+ return rune(p0&mask4)<<18 | rune(b1&maskx)<<12 | rune(b2&maskx)<<6 | rune(b3&maskx), 4
+}
+
+// DecodeRuneInString is like DecodeRune but its input is a string. If s is
+// empty it returns (RuneError, 0). Otherwise, if the encoding is invalid, it
+// returns (RuneError, 1). Both are impossible results for correct, non-empty
+// UTF-8.
+//
+// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
+// out of range, or is not the shortest possible UTF-8 encoding for the
+// value. No other validation is performed.
+func DecodeRuneInString(s string) (r rune, size int) {
+ n := len(s)
+ if n < 1 {
+ return RuneError, 0
+ }
+ s0 := s[0]
+ x := first[s0]
+ if x >= as {
+ // The following code simulates an additional check for x == xx and
+ // handling the ASCII and invalid cases accordingly. This mask-and-or
+ // approach prevents an additional branch.
+ // x is as (0xF0) or xx (0xF1) here; shifting its low bit into the sign
+ // position and back yields a rune with all bits clear or all bits set.
+ mask := rune(x) << 31 >> 31 // Create 0x00000000 (ASCII) or 0xFFFFFFFF (invalid).
+ return rune(s[0])&^mask | RuneError&mask, 1
+ }
+ sz := int(x & 7)
+ accept := acceptRanges[x>>4]
+ if n < sz {
+ // Input is shorter than the length announced by the first byte.
+ return RuneError, 1
+ }
+ s1 := s[1]
+ if s1 < accept.lo || accept.hi < s1 {
+ return RuneError, 1
+ }
+ if sz <= 2 { // <= instead of == to help the compiler eliminate some bounds checks
+ return rune(s0&mask2)<<6 | rune(s1&maskx), 2
+ }
+ s2 := s[2]
+ if s2 < locb || hicb < s2 {
+ return RuneError, 1
+ }
+ if sz <= 3 {
+ return rune(s0&mask3)<<12 | rune(s1&maskx)<<6 | rune(s2&maskx), 3
+ }
+ s3 := s[3]
+ if s3 < locb || hicb < s3 {
+ return RuneError, 1
+ }
+ return rune(s0&mask4)<<18 | rune(s1&maskx)<<12 | rune(s2&maskx)<<6 | rune(s3&maskx), 4
+}
+
+// DecodeLastRune unpacks the last UTF-8 encoding in p and returns the rune
+// together with its width in bytes. An empty p yields (RuneError, 0); an
+// invalid encoding yields (RuneError, 1). Neither result can occur for
+// correct, non-empty UTF-8.
+//
+// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
+// out of range, or is not the shortest possible UTF-8 encoding for the
+// value. No other validation is performed.
+func DecodeLastRune(p []byte) (r rune, size int) {
+	n := len(p)
+	if n == 0 {
+		return RuneError, 0
+	}
+	last := n - 1
+	if c := rune(p[last]); c < RuneSelf {
+		return c, 1
+	}
+	// Look back at most UTFMax bytes for the start of the rune. The bound
+	// guards against O(n^2) behavior when traversing backwards through
+	// input with long sequences of invalid UTF-8.
+	floor := n - UTFMax
+	if floor < 0 {
+		floor = 0
+	}
+	begin := last - 1
+	for begin >= floor && !RuneStart(p[begin]) {
+		begin--
+	}
+	if begin < 0 {
+		begin = 0
+	}
+	r, size = DecodeRune(p[begin:n])
+	if begin+size != n {
+		// The decoded rune does not end exactly at the end of p.
+		return RuneError, 1
+	}
+	return r, size
+}
+
+// DecodeLastRuneInString is like DecodeLastRune but its input is a string.
+// An empty s yields (RuneError, 0); an invalid encoding yields
+// (RuneError, 1). Neither result can occur for correct, non-empty UTF-8.
+//
+// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
+// out of range, or is not the shortest possible UTF-8 encoding for the
+// value. No other validation is performed.
+func DecodeLastRuneInString(s string) (r rune, size int) {
+	n := len(s)
+	if n == 0 {
+		return RuneError, 0
+	}
+	last := n - 1
+	if c := rune(s[last]); c < RuneSelf {
+		return c, 1
+	}
+	// Look back at most UTFMax bytes for the start of the rune. The bound
+	// guards against O(n^2) behavior when traversing backwards through
+	// strings with long sequences of invalid UTF-8.
+	floor := n - UTFMax
+	if floor < 0 {
+		floor = 0
+	}
+	begin := last - 1
+	for begin >= floor && !RuneStart(s[begin]) {
+		begin--
+	}
+	if begin < 0 {
+		begin = 0
+	}
+	r, size = DecodeRuneInString(s[begin:n])
+	if begin+size != n {
+		// The decoded rune does not end exactly at the end of s.
+		return RuneError, 1
+	}
+	return r, size
+}
+
+// RuneLen returns the number of bytes required to encode the rune.
+// It returns -1 if the rune is not a valid value to encode in UTF-8:
+// negative, inside the surrogate range, or above MaxRune.
+func RuneLen(r rune) int {
+	if r < 0 || r > MaxRune {
+		return -1
+	}
+	if r <= rune1Max {
+		return 1
+	}
+	if r <= rune2Max {
+		return 2
+	}
+	if surrogateMin <= r && r <= surrogateMax {
+		return -1
+	}
+	if r <= rune3Max {
+		return 3
+	}
+	return 4
+}
+
+// EncodeRune writes into p (which must be large enough) the UTF-8 encoding of the rune.
+// If the rune is out of range, it writes the encoding of RuneError.
+// It returns the number of bytes written.
+func EncodeRune(p []byte, r rune) int {
+ // Negative values are erroneous. Making it unsigned addresses the problem.
+ switch i := uint32(r); {
+ case i <= rune1Max:
+ p[0] = byte(r)
+ return 1
+ case i <= rune2Max:
+ _ = p[1] // eliminate bounds checks
+ p[0] = t2 | byte(r>>6)
+ p[1] = tx | byte(r)&maskx
+ return 2
+ case i > MaxRune, surrogateMin <= i && i <= surrogateMax:
+ // Out of range or surrogate: substitute RuneError and fall through to
+ // the three-byte case, which then encodes the replacement value.
+ r = RuneError
+ fallthrough
+ case i <= rune3Max:
+ _ = p[2] // eliminate bounds checks
+ p[0] = t3 | byte(r>>12)
+ p[1] = tx | byte(r>>6)&maskx
+ p[2] = tx | byte(r)&maskx
+ return 3
+ default:
+ _ = p[3] // eliminate bounds checks
+ p[0] = t4 | byte(r>>18)
+ p[1] = tx | byte(r>>12)&maskx
+ p[2] = tx | byte(r>>6)&maskx
+ p[3] = tx | byte(r)&maskx
+ return 4
+ }
+}
+
+// AppendRune appends the UTF-8 encoding of r to the end of p and
+// returns the extended buffer. If the rune is out of range,
+// it appends the encoding of RuneError.
+func AppendRune(p []byte, r rune) []byte {
+	// This function is inlineable for fast handling of ASCII; everything
+	// else is delegated to appendRuneNonASCII.
+	if uint32(r) > rune1Max {
+		return appendRuneNonASCII(p, r)
+	}
+	return append(p, byte(r))
+}
+
+// appendRuneNonASCII appends the two-, three- or four-byte encoding of r to p
+// and returns the extended slice. Out-of-range values and surrogates are
+// appended as the encoding of RuneError.
+func appendRuneNonASCII(p []byte, r rune) []byte {
+ // Negative values are erroneous. Making it unsigned addresses the problem.
+ switch i := uint32(r); {
+ case i <= rune2Max:
+ return append(p, t2|byte(r>>6), tx|byte(r)&maskx)
+ case i > MaxRune, surrogateMin <= i && i <= surrogateMax:
+ // Substitute RuneError and fall through to the three-byte case.
+ r = RuneError
+ fallthrough
+ case i <= rune3Max:
+ return append(p, t3|byte(r>>12), tx|byte(r>>6)&maskx, tx|byte(r)&maskx)
+ default:
+ return append(p, t4|byte(r>>18), tx|byte(r>>12)&maskx, tx|byte(r>>6)&maskx, tx|byte(r)&maskx)
+ }
+}
+
+// RuneCount returns the number of runes in p. Erroneous and short
+// encodings are treated as single runes of width 1 byte.
+func RuneCount(p []byte) int {
+ np := len(p)
+ var n int
+ for i := 0; i < np; {
+ // Every iteration consumes exactly one rune, valid or not.
+ n++
+ c := p[i]
+ if c < RuneSelf {
+ // ASCII fast path
+ i++
+ continue
+ }
+ x := first[c]
+ if x == xx {
+ i++ // invalid.
+ continue
+ }
+ size := int(x & 7)
+ if i+size > np {
+ i++ // Short or invalid.
+ continue
+ }
+ accept := acceptRanges[x>>4]
+ // Check each continuation byte in turn. The empty branches stop the
+ // chain once all bytes of a 2- or 3-byte sequence have been checked.
+ if c := p[i+1]; c < accept.lo || accept.hi < c {
+ size = 1
+ } else if size == 2 {
+ } else if c := p[i+2]; c < locb || hicb < c {
+ size = 1
+ } else if size == 3 {
+ } else if c := p[i+3]; c < locb || hicb < c {
+ size = 1
+ }
+ i += size
+ }
+ return n
+}
+
+// RuneCountInString is like RuneCount but its input is a string.
+func RuneCountInString(s string) (n int) {
+ ns := len(s)
+ // The post statement n++ counts one rune per iteration, valid or not.
+ for i := 0; i < ns; n++ {
+ c := s[i]
+ if c < RuneSelf {
+ // ASCII fast path
+ i++
+ continue
+ }
+ x := first[c]
+ if x == xx {
+ i++ // invalid.
+ continue
+ }
+ size := int(x & 7)
+ if i+size > ns {
+ i++ // Short or invalid.
+ continue
+ }
+ accept := acceptRanges[x>>4]
+ // Check each continuation byte in turn. The empty branches stop the
+ // chain once all bytes of a 2- or 3-byte sequence have been checked.
+ if c := s[i+1]; c < accept.lo || accept.hi < c {
+ size = 1
+ } else if size == 2 {
+ } else if c := s[i+2]; c < locb || hicb < c {
+ size = 1
+ } else if size == 3 {
+ } else if c := s[i+3]; c < locb || hicb < c {
+ size = 1
+ }
+ i += size
+ }
+ return n
+}
+
+// RuneStart reports whether the byte could be the first byte of an encoded,
+// possibly invalid rune. Second and subsequent bytes always have the top two
+// bits set to 10.
+func RuneStart(b byte) bool { return b>>6 != 0b10 }
+
+// Valid reports whether p consists entirely of valid UTF-8-encoded runes.
+func Valid(p []byte) bool {
+ // This optimization avoids the need to recompute the capacity
+ // when generating code for p[8:], bringing it to parity with
+ // ValidString, which was 20% faster on long ASCII strings.
+ p = p[:len(p):len(p)]
+
+ // Fast path. Check for and skip 8 bytes of ASCII characters per iteration.
+ for len(p) >= 8 {
+ // Combining two 32 bit loads allows the same code to be used
+ // for 32 and 64 bit platforms.
+ // The compiler can generate a 32bit load for first32 and second32
+ // on many platforms. See test/codegen/memcombine.go.
+ first32 := uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
+ second32 := uint32(p[4]) | uint32(p[5])<<8 | uint32(p[6])<<16 | uint32(p[7])<<24
+ if (first32|second32)&0x80808080 != 0 {
+ // Found a non ASCII byte (>= RuneSelf).
+ break
+ }
+ p = p[8:]
+ }
+ // Slow path: validate whatever remains one rune at a time.
+ n := len(p)
+ for i := 0; i < n; {
+ pi := p[i]
+ if pi < RuneSelf {
+ i++
+ continue
+ }
+ x := first[pi]
+ if x == xx {
+ return false // Illegal starter byte.
+ }
+ size := int(x & 7)
+ if i+size > n {
+ return false // Short or invalid.
+ }
+ accept := acceptRanges[x>>4]
+ // Check each continuation byte in turn. The empty branches stop the
+ // chain once all bytes of a 2- or 3-byte sequence have been checked.
+ if c := p[i+1]; c < accept.lo || accept.hi < c {
+ return false
+ } else if size == 2 {
+ } else if c := p[i+2]; c < locb || hicb < c {
+ return false
+ } else if size == 3 {
+ } else if c := p[i+3]; c < locb || hicb < c {
+ return false
+ }
+ i += size
+ }
+ return true
+}
+
+// ValidString reports whether s consists entirely of valid UTF-8-encoded runes.
+func ValidString(s string) bool {
+ // Fast path. Check for and skip 8 bytes of ASCII characters per iteration.
+ for len(s) >= 8 {
+ // Combining two 32 bit loads allows the same code to be used
+ // for 32 and 64 bit platforms.
+ // The compiler can generate a 32bit load for first32 and second32
+ // on many platforms. See test/codegen/memcombine.go.
+ first32 := uint32(s[0]) | uint32(s[1])<<8 | uint32(s[2])<<16 | uint32(s[3])<<24
+ second32 := uint32(s[4]) | uint32(s[5])<<8 | uint32(s[6])<<16 | uint32(s[7])<<24
+ if (first32|second32)&0x80808080 != 0 {
+ // Found a non ASCII byte (>= RuneSelf).
+ break
+ }
+ s = s[8:]
+ }
+ // Slow path: validate whatever remains one rune at a time.
+ n := len(s)
+ for i := 0; i < n; {
+ si := s[i]
+ if si < RuneSelf {
+ i++
+ continue
+ }
+ x := first[si]
+ if x == xx {
+ return false // Illegal starter byte.
+ }
+ size := int(x & 7)
+ if i+size > n {
+ return false // Short or invalid.
+ }
+ accept := acceptRanges[x>>4]
+ // Check each continuation byte in turn. The empty branches stop the
+ // chain once all bytes of a 2- or 3-byte sequence have been checked.
+ if c := s[i+1]; c < accept.lo || accept.hi < c {
+ return false
+ } else if size == 2 {
+ } else if c := s[i+2]; c < locb || hicb < c {
+ return false
+ } else if size == 3 {
+ } else if c := s[i+3]; c < locb || hicb < c {
+ return false
+ }
+ i += size
+ }
+ return true
+}
+
+// ValidRune reports whether r can be legally encoded as UTF-8.
+// Code points that are out of range or a surrogate half are illegal.
+func ValidRune(r rune) bool {
+	if r < 0 || r > MaxRune {
+		return false
+	}
+	return r < surrogateMin || surrogateMax < r
+}
diff --git a/src/unicode/utf8/utf8_test.go b/src/unicode/utf8/utf8_test.go
new file mode 100644
index 0000000..19a04dc
--- /dev/null
+++ b/src/unicode/utf8/utf8_test.go
@@ -0,0 +1,703 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package utf8_test
+
+import (
+ "bytes"
+ "strings"
+ "testing"
+ "unicode"
+ . "unicode/utf8"
+)
+
+// Validate the constants redefined from unicode, panicking at package
+// initialization if either one disagrees.
+func init() {
+	switch {
+	case MaxRune != unicode.MaxRune:
+		panic("utf8.MaxRune is wrong")
+	case RuneError != unicode.ReplacementChar:
+		panic("utf8.RuneError is wrong")
+	}
+}
+
+// TestConstants validates the constants redefined from unicode.
+func TestConstants(t *testing.T) {
+	if got, want := rune(MaxRune), rune(unicode.MaxRune); got != want {
+		t.Errorf("utf8.MaxRune is wrong: %x should be %x", got, want)
+	}
+	if got, want := rune(RuneError), rune(unicode.ReplacementChar); got != want {
+		t.Errorf("utf8.RuneError is wrong: %x should be %x", got, want)
+	}
+}
+
+// Utf8Map pairs a rune with the string holding its expected UTF-8 encoding.
+type Utf8Map struct {
+ r rune
+ str string
+}
+
+// utf8map lists valid runes with their encodings, covering one- to four-byte
+// sequences. It is shared by the encode, decode and full-rune tests.
+var utf8map = []Utf8Map{
+ {0x0000, "\x00"},
+ {0x0001, "\x01"},
+ {0x007e, "\x7e"},
+ {0x007f, "\x7f"},
+ {0x0080, "\xc2\x80"},
+ {0x0081, "\xc2\x81"},
+ {0x00bf, "\xc2\xbf"},
+ {0x00c0, "\xc3\x80"},
+ {0x00c1, "\xc3\x81"},
+ {0x00c8, "\xc3\x88"},
+ {0x00d0, "\xc3\x90"},
+ {0x00e0, "\xc3\xa0"},
+ {0x00f0, "\xc3\xb0"},
+ {0x00f8, "\xc3\xb8"},
+ {0x00ff, "\xc3\xbf"},
+ {0x0100, "\xc4\x80"},
+ {0x07ff, "\xdf\xbf"},
+ {0x0400, "\xd0\x80"},
+ {0x0800, "\xe0\xa0\x80"},
+ {0x0801, "\xe0\xa0\x81"},
+ {0x1000, "\xe1\x80\x80"},
+ {0xd000, "\xed\x80\x80"},
+ {0xd7ff, "\xed\x9f\xbf"}, // last code point before surrogate half.
+ {0xe000, "\xee\x80\x80"}, // first code point after surrogate half.
+ {0xfffe, "\xef\xbf\xbe"},
+ {0xffff, "\xef\xbf\xbf"},
+ {0x10000, "\xf0\x90\x80\x80"},
+ {0x10001, "\xf0\x90\x80\x81"},
+ {0x40000, "\xf1\x80\x80\x80"},
+ {0x10fffe, "\xf4\x8f\xbf\xbe"},
+ {0x10ffff, "\xf4\x8f\xbf\xbf"},
+ {0xFFFD, "\xef\xbf\xbd"},
+}
+
+// surrogateMap holds the byte sequences that would encode the two ends of the
+// surrogate range; decoding them must fail.
+var surrogateMap = []Utf8Map{
+ {0xd800, "\xed\xa0\x80"}, // surrogate min decodes to (RuneError, 1)
+ {0xdfff, "\xed\xbf\xbf"}, // surrogate max decodes to (RuneError, 1)
+}
+
+// testStrings are the sample inputs for the sequencing and runtime-conversion
+// tests: empty, ASCII, multi-byte, mixed-width, and a run of bare
+// continuation bytes.
+var testStrings = []string{
+ "",
+ "abcd",
+ "☺☻☹",
+ "日a本b語ç日ð本Ê語þ日¥本¼語i日©",
+ "日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©",
+ "\x80\x80\x80\x80",
+}
+
+// TestFullRune checks FullRune and FullRuneInString against every entry of
+// utf8map: the complete encoding must be reported as full, and the same
+// encoding with its final byte removed must not. The lone bytes 0xC0 and 0xC1
+// are invalid first bytes and must therefore be reported as full.
+func TestFullRune(t *testing.T) {
+	for _, m := range utf8map {
+		b := []byte(m.str)
+		if !FullRune(b) {
+			t.Errorf("FullRune(%q) (%U) = false, want true", b, m.r)
+		}
+		s := m.str
+		if !FullRuneInString(s) {
+			t.Errorf("FullRuneInString(%q) (%U) = false, want true", s, m.r)
+		}
+		// Drop the final byte; what remains is either empty or a truncated
+		// multi-byte sequence, neither of which is a full rune.
+		b1 := b[0 : len(b)-1]
+		if FullRune(b1) {
+			t.Errorf("FullRune(%q) = true, want false", b1)
+		}
+		s1 := string(b1)
+		if FullRuneInString(s1) {
+			// Name the function that was actually called.
+			t.Errorf("FullRuneInString(%q) = true, want false", s1)
+		}
+	}
+	for _, s := range []string{"\xc0", "\xc1"} {
+		b := []byte(s)
+		if !FullRune(b) {
+			t.Errorf("FullRune(%q) = false, want true", s)
+		}
+		if !FullRuneInString(s) {
+			t.Errorf("FullRuneInString(%q) = false, want true", s)
+		}
+	}
+}
+
+// TestEncodeRune checks that EncodeRune produces the expected bytes for every
+// entry of utf8map.
+func TestEncodeRune(t *testing.T) {
+	for _, m := range utf8map {
+		var scratch [10]byte
+		want := []byte(m.str)
+		got := scratch[:EncodeRune(scratch[:], m.r)]
+		if !bytes.Equal(want, got) {
+			t.Errorf("EncodeRune(%#04x) = %q want %q", m.r, got, want)
+		}
+	}
+}
+
+func TestAppendRune(t *testing.T) {
+ for _, m := range utf8map {
+ if buf := AppendRune(nil, m.r); string(buf) != m.str {
+ t.Errorf("AppendRune(nil, %#04x) = %s, want %s", m.r, buf, m.str)
+ }
+ if buf := AppendRune([]byte("init"), m.r); string(buf) != "init"+m.str {
+ t.Errorf("AppendRune(init, %#04x) = %s, want %s", m.r, buf, "init"+m.str)
+ }
+ }
+}
+
+// TestDecodeRune checks DecodeRune and DecodeRuneInString for every entry of
+// utf8map in four situations: the exact encoding, the encoding followed by
+// extra bytes, the encoding with its final byte missing, and the encoding
+// corrupted in place.
+func TestDecodeRune(t *testing.T) {
+ for _, m := range utf8map {
+ b := []byte(m.str)
+ r, size := DecodeRune(b)
+ if r != m.r || size != len(b) {
+ t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, r, size, m.r, len(b))
+ }
+ s := m.str
+ r, size = DecodeRuneInString(s)
+ if r != m.r || size != len(b) {
+ t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, m.r, len(b))
+ }
+
+ // Re-slice b up to its capacity so that any bytes beyond len(b) trail
+ // the encoding; decoding must still consume only the first rune.
+ r, size = DecodeRune(b[0:cap(b)])
+ if r != m.r || size != len(b) {
+ t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, r, size, m.r, len(b))
+ }
+ s = m.str + "\x00"
+ r, size = DecodeRuneInString(s)
+ if r != m.r || size != len(b) {
+ t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, m.r, len(b))
+ }
+
+ // make sure missing bytes fail
+ // A one-byte encoding becomes empty input, for which the width is 0.
+ wantsize := 1
+ if wantsize >= len(b) {
+ wantsize = 0
+ }
+ r, size = DecodeRune(b[0 : len(b)-1])
+ if r != RuneError || size != wantsize {
+ t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b[0:len(b)-1], r, size, RuneError, wantsize)
+ }
+ s = m.str[0 : len(m.str)-1]
+ r, size = DecodeRuneInString(s)
+ if r != RuneError || size != wantsize {
+ t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, RuneError, wantsize)
+ }
+
+ // make sure bad sequences fail
+ // b is a private copy of m.str, so corrupting it in place is safe.
+ if len(b) == 1 {
+ b[0] = 0x80
+ } else {
+ b[len(b)-1] = 0x7F
+ }
+ r, size = DecodeRune(b)
+ if r != RuneError || size != 1 {
+ t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, r, size, RuneError, 1)
+ }
+ s = string(b)
+ r, size = DecodeRuneInString(s)
+ if r != RuneError || size != 1 {
+ t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, RuneError, 1)
+ }
+
+ }
+}
+
+// TestDecodeSurrogateRune checks that the byte sequences in surrogateMap are
+// rejected by both DecodeRune and DecodeRuneInString with (RuneError, 1).
+func TestDecodeSurrogateRune(t *testing.T) {
+	for _, m := range surrogateMap {
+		b := []byte(m.str)
+		r, size := DecodeRune(b)
+		if r != RuneError || size != 1 {
+			t.Errorf("DecodeRune(%q) = %x, %d want %x, %d", b, r, size, RuneError, 1)
+		}
+		s := m.str
+		r, size = DecodeRuneInString(s)
+		if r != RuneError || size != 1 {
+			// Report the string that was actually decoded, not the byte slice.
+			t.Errorf("DecodeRuneInString(%q) = %x, %d want %x, %d", s, r, size, RuneError, 1)
+		}
+	}
+}
+
+// TestSequencing checks that DecodeRune and DecodeLastRune correspond to
+// the equivalent range loop. Each utf8map encoding is tried after, before,
+// and in the middle of every test string.
+func TestSequencing(t *testing.T) {
+	for _, ts := range testStrings {
+		for _, m := range utf8map {
+			testSequence(t, ts+m.str)
+			testSequence(t, m.str+ts)
+			testSequence(t, ts+m.str+ts)
+		}
+	}
+}
+
+// runtimeRuneCount counts the runes in s through a []rune conversion rather
+// than through this package. The expression is kept in exactly this form
+// because that is what the compiler rewrites, as noted below.
+func runtimeRuneCount(s string) int {
+ return len([]rune(s)) // Replaced by gc with call to runtime.countrunes(s).
+}
+
+// TestRuntimeConversion checks that a range loop, the len([]rune(string))
+// optimization and []rune conversions visit the same runes.
+// Not really a test of this package, but the assumption is used here and
+// it's good to verify.
+func TestRuntimeConversion(t *testing.T) {
+	for _, ts := range testStrings {
+		want := RuneCountInString(ts)
+		if got := runtimeRuneCount(ts); got != want {
+			t.Errorf("%q: len([]rune()) counted %d runes; got %d from RuneCountInString", ts, got, want)
+			break
+		}
+
+		converted := []rune(ts)
+		if got := len(converted); got != want {
+			t.Errorf("%q: []rune() has length %d; got %d from RuneCountInString", ts, got, want)
+			break
+		}
+		// range yields byte offsets, so the rune position is tracked separately.
+		pos := 0
+		for _, r := range ts {
+			if expected := converted[pos]; r != expected {
+				t.Errorf("%q[%d]: expected %c (%U); got %c (%U)", ts, pos, expected, expected, r, r)
+			}
+			pos++
+		}
+	}
+}
+
+// invalidSequenceTests lists four-byte inputs whose first rune must decode as
+// RuneError. The group comments name the entry of the first-byte table
+// (xx, s1..s7) that the leading byte of each input belongs to.
+var invalidSequenceTests = []string{
+ "\xed\xa0\x80\x80", // surrogate min
+ "\xed\xbf\xbf\x80", // surrogate max
+
+ // xx
+ "\x91\x80\x80\x80",
+
+ // s1
+ "\xC2\x7F\x80\x80",
+ "\xC2\xC0\x80\x80",
+ "\xDF\x7F\x80\x80",
+ "\xDF\xC0\x80\x80",
+
+ // s2
+ "\xE0\x9F\xBF\x80",
+ "\xE0\xA0\x7F\x80",
+ "\xE0\xBF\xC0\x80",
+ "\xE0\xC0\x80\x80",
+
+ // s3
+ "\xE1\x7F\xBF\x80",
+ "\xE1\x80\x7F\x80",
+ "\xE1\xBF\xC0\x80",
+ "\xE1\xC0\x80\x80",
+
+ //s4
+ "\xED\x7F\xBF\x80",
+ "\xED\x80\x7F\x80",
+ "\xED\x9F\xC0\x80",
+ "\xED\xA0\x80\x80",
+
+ // s5
+ "\xF0\x8F\xBF\xBF",
+ "\xF0\x90\x7F\xBF",
+ "\xF0\x90\x80\x7F",
+ "\xF0\xBF\xBF\xC0",
+ "\xF0\xBF\xC0\x80",
+ "\xF0\xC0\x80\x80",
+
+ // s6
+ "\xF1\x7F\xBF\xBF",
+ "\xF1\x80\x7F\xBF",
+ "\xF1\x80\x80\x7F",
+ "\xF1\xBF\xBF\xC0",
+ "\xF1\xBF\xC0\x80",
+ "\xF1\xC0\x80\x80",
+
+ // s7
+ "\xF4\x7F\xBF\xBF",
+ "\xF4\x80\x7F\xBF",
+ "\xF4\x80\x80\x7F",
+ "\xF4\x8F\xBF\xC0",
+ "\xF4\x8F\xC0\x80",
+ "\xF4\x90\x80\x80",
+}
+
+// runtimeDecodeRune returns the first rune that a range loop over s yields,
+// or -1 if s is empty. It obtains the rune from the language's string
+// iteration rather than from this package.
+func runtimeDecodeRune(s string) rune {
+ for _, r := range s {
+ return r
+ }
+ return -1
+}
+
+func TestDecodeInvalidSequence(t *testing.T) {
+ for _, s := range invalidSequenceTests {
+ r1, _ := DecodeRune([]byte(s))
+ if want := RuneError; r1 != want {
+ t.Errorf("DecodeRune(%#x) = %#04x, want %#04x", s, r1, want)
+ return
+ }
+ r2, _ := DecodeRuneInString(s)
+ if want := RuneError; r2 != want {
+ t.Errorf("DecodeRuneInString(%q) = %#04x, want %#04x", s, r2, want)
+ return
+ }
+ if r1 != r2 {
+ t.Errorf("DecodeRune(%#x) = %#04x mismatch with DecodeRuneInString(%q) = %#04x", s, r1, s, r2)
+ return
+ }
+ r3 := runtimeDecodeRune(s)
+ if r2 != r3 {
+ t.Errorf("DecodeRuneInString(%q) = %#04x mismatch with runtime.decoderune(%q) = %#04x", s, r2, s, r3)
+ return
+ }
+ }
+}
+
+// testSequence checks that decoding s forwards with DecodeRune and
+// DecodeRuneInString, and backwards with DecodeLastRune and
+// DecodeLastRuneInString, visits the same runes at the same byte offsets as
+// a range loop over s. It reports the first mismatch and returns.
+func testSequence(t *testing.T, s string) {
+ type info struct {
+ index int
+ r rune
+ }
+ // index records, for each rune the range loop yields, its byte offset and
+ // value. len(s) entries are always enough.
+ index := make([]info, len(s))
+ b := []byte(s)
+ si := 0
+ j := 0
+ // Forward pass: si is the offset predicted from the decoded sizes and must
+ // match the offset i reported by range.
+ for i, r := range s {
+ if si != i {
+ t.Errorf("Sequence(%q) mismatched index %d, want %d", s, si, i)
+ return
+ }
+ index[j] = info{i, r}
+ j++
+ r1, size1 := DecodeRune(b[i:])
+ if r != r1 {
+ t.Errorf("DecodeRune(%q) = %#04x, want %#04x", s[i:], r1, r)
+ return
+ }
+ r2, size2 := DecodeRuneInString(s[i:])
+ if r != r2 {
+ t.Errorf("DecodeRuneInString(%q) = %#04x, want %#04x", s[i:], r2, r)
+ return
+ }
+ if size1 != size2 {
+ t.Errorf("DecodeRune/DecodeRuneInString(%q) size mismatch %d/%d", s[i:], size1, size2)
+ return
+ }
+ si += size1
+ }
+ // Backward pass: step j back to the last recorded rune and replay the
+ // recorded runes in reverse.
+ j--
+ for si = len(s); si > 0; {
+ r1, size1 := DecodeLastRune(b[0:si])
+ r2, size2 := DecodeLastRuneInString(s[0:si])
+ if size1 != size2 {
+ t.Errorf("DecodeLastRune/DecodeLastRuneInString(%q, %d) size mismatch %d/%d", s, si, size1, size2)
+ return
+ }
+ if r1 != index[j].r {
+ t.Errorf("DecodeLastRune(%q, %d) = %#04x, want %#04x", s, si, r1, index[j].r)
+ return
+ }
+ if r2 != index[j].r {
+ t.Errorf("DecodeLastRuneInString(%q, %d) = %#04x, want %#04x", s, si, r2, index[j].r)
+ return
+ }
+ si -= size1
+ if si != index[j].index {
+ t.Errorf("DecodeLastRune(%q) index mismatch at %d, want %d", s, si, index[j].index)
+ return
+ }
+ j--
+ }
+ if si != 0 {
+ t.Errorf("DecodeLastRune(%q) finished at %d, not 0", s, si)
+ }
+}
+
+// TestNegativeRune verifies that EncodeRune, given the negative rune
+// -1, writes the same bytes it writes for RuneError (U+FFFD).
+func TestNegativeRune(t *testing.T) {
+	want := make([]byte, UTFMax)
+	want = want[:EncodeRune(want, RuneError)]
+
+	got := make([]byte, UTFMax)
+	got = got[:EncodeRune(got, -1)]
+
+	if bytes.Equal(got, want) {
+		return
+	}
+	t.Errorf("incorrect encoding [% x] for -1; expected [% x]", got, want)
+}
+
+// RuneCountTest pairs an input with the rune count that TestRuneCount
+// expects for it.
+type RuneCountTest struct {
+	in  string // text to count
+	out int    // expected number of runes
+}
+
+// runecounttests is the table driving TestRuneCount. The last three
+// entries contain a 0xE2 byte that does not begin a complete sequence;
+// their expected counts equal their byte lengths.
+var runecounttests = []RuneCountTest{
+	{in: "abcd", out: 4},
+	{in: "☺☻☹", out: 3},
+	{in: "1,2,3,4", out: 7},
+	{in: "\xe2\x00", out: 2},
+	{in: "\xe2\x80", out: 2},
+	{in: "a\xe2\x80", out: 3},
+}
+
+// TestRuneCount runs every runecounttests entry through both
+// RuneCountInString and RuneCount and reports any count that differs
+// from the expected one.
+func TestRuneCount(t *testing.T) {
+	for _, tc := range runecounttests {
+		got := RuneCountInString(tc.in)
+		if got != tc.out {
+			t.Errorf("RuneCountInString(%q) = %d, want %d", tc.in, got, tc.out)
+		}
+
+		got = RuneCount([]byte(tc.in))
+		if got != tc.out {
+			t.Errorf("RuneCount(%q) = %d, want %d", tc.in, got, tc.out)
+		}
+	}
+}
+
+// RuneLenTest pairs a rune with the value TestRuneLen expects RuneLen
+// to return for it.
+type RuneLenTest struct {
+	r    rune // rune to measure
+	size int  // expected RuneLen result
+}
+
+// runelentests is the table driving TestRuneLen. Entries with size -1
+// are runes for which RuneLen is expected to report failure.
+var runelentests = []RuneLenTest{
+	{r: 0, size: 1},
+	{r: 'e', size: 1},
+	{r: 'é', size: 2},
+	{r: '☺', size: 3},
+	{r: RuneError, size: 3},
+	{r: MaxRune, size: 4},
+	{r: 0xD800, size: -1},
+	{r: 0xDFFF, size: -1},
+	{r: MaxRune + 1, size: -1},
+	{r: -1, size: -1},
+}
+
+// TestRuneLen compares RuneLen's result for every runelentests entry
+// with the expected size.
+func TestRuneLen(t *testing.T) {
+	for _, tc := range runelentests {
+		got := RuneLen(tc.r)
+		if got == tc.size {
+			continue
+		}
+		t.Errorf("RuneLen(%#U) = %d, want %d", tc.r, got, tc.size)
+	}
+}
+
+// ValidTest pairs an input with the result TestValid expects from both
+// Valid and ValidString.
+type ValidTest struct {
+	in  string // bytes to validate
+	out bool   // expected validity
+}
+
+// validTests is the table driving TestValid.
+var validTests = []ValidTest{
+	{in: "", out: true},
+	{in: "a", out: true},
+	{in: "abc", out: true},
+	{in: "Ж", out: true},
+	{in: "ЖЖ", out: true},
+	{in: "брэд-ЛГТМ", out: true},
+	{in: "☺☻☹", out: true},
+	{in: "aa\xe2", out: false},
+	{in: string([]byte{66, 250}), out: false},
+	{in: string([]byte{66, 250, 67}), out: false},
+	{in: "a\uFFFDb", out: true},
+	{in: "\xF4\x8F\xBF\xBF", out: true},      // U+10FFFF
+	{in: "\xF4\x90\x80\x80", out: false},     // U+10FFFF+1; out of range
+	{in: "\xF7\xBF\xBF\xBF", out: false},     // 0x1FFFFF; out of range
+	{in: "\xFB\xBF\xBF\xBF\xBF", out: false}, // 0x3FFFFFF; out of range
+	{in: "\xc0\x80", out: false},             // U+0000 encoded in two bytes: incorrect
+	{in: "\xed\xa0\x80", out: false},         // U+D800 high surrogate (sic)
+	{in: "\xed\xbf\xbf", out: false},         // U+DFFF low surrogate (sic)
+}
+
+// TestValid runs every validTests entry through Valid (as a byte
+// slice) and ValidString and reports any result that differs from the
+// expected one.
+func TestValid(t *testing.T) {
+	for _, tc := range validTests {
+		// On a mismatch the boolean result is necessarily !tc.out, so
+		// printing the actual value yields the same message.
+		if got := Valid([]byte(tc.in)); got != tc.out {
+			t.Errorf("Valid(%q) = %v; want %v", tc.in, got, tc.out)
+		}
+		if got := ValidString(tc.in); got != tc.out {
+			t.Errorf("ValidString(%q) = %v; want %v", tc.in, got, tc.out)
+		}
+	}
+}
+
+// ValidRuneTest pairs a rune with the result TestValidRune expects
+// from ValidRune.
+type ValidRuneTest struct {
+	r  rune // rune to check
+	ok bool // expected ValidRune result
+}
+
+// validrunetests is the table driving TestValidRune. It brackets the
+// 0xD800-0xDFFF range with its immediate neighbours 0xD7FF and 0xE000.
+var validrunetests = []ValidRuneTest{
+	{r: 0, ok: true},
+	{r: 'e', ok: true},
+	{r: 'é', ok: true},
+	{r: '☺', ok: true},
+	{r: RuneError, ok: true},
+	{r: MaxRune, ok: true},
+	{r: 0xD7FF, ok: true},
+	{r: 0xD800, ok: false},
+	{r: 0xDFFF, ok: false},
+	{r: 0xE000, ok: true},
+	{r: MaxRune + 1, ok: false},
+	{r: -1, ok: false},
+}
+
+// TestValidRune compares ValidRune's result for every validrunetests
+// entry with the expected value.
+func TestValidRune(t *testing.T) {
+	for _, tc := range validrunetests {
+		got := ValidRune(tc.r)
+		if got == tc.ok {
+			continue
+		}
+		t.Errorf("ValidRune(%#U) = %t, want %t", tc.r, got, tc.ok)
+	}
+}
+
+// BenchmarkRuneCountTenASCIIChars measures RuneCount on a byte slice
+// holding the ten ASCII digits.
+func BenchmarkRuneCountTenASCIIChars(b *testing.B) {
+	const text = "0123456789"
+	input := []byte(text)
+	for i := 0; i < b.N; i++ {
+		RuneCount(input)
+	}
+}
+
+// BenchmarkRuneCountTenJapaneseChars measures RuneCount on a byte
+// slice holding ten Japanese characters.
+func BenchmarkRuneCountTenJapaneseChars(b *testing.B) {
+	const text = "日本語日本語日本語日"
+	input := []byte(text)
+	for i := 0; i < b.N; i++ {
+		RuneCount(input)
+	}
+}
+
+// BenchmarkRuneCountInStringTenASCIIChars measures RuneCountInString
+// on the ten ASCII digits.
+func BenchmarkRuneCountInStringTenASCIIChars(b *testing.B) {
+	const input = "0123456789"
+	for i := 0; i < b.N; i++ {
+		RuneCountInString(input)
+	}
+}
+
+// BenchmarkRuneCountInStringTenJapaneseChars measures
+// RuneCountInString on ten Japanese characters.
+func BenchmarkRuneCountInStringTenJapaneseChars(b *testing.B) {
+	const input = "日本語日本語日本語日"
+	for i := 0; i < b.N; i++ {
+		RuneCountInString(input)
+	}
+}
+
+// ascii100000 is 100,000 bytes of ASCII: the ten digits repeated
+// until that length is reached.
+var ascii100000 = strings.Repeat("0123456789", 100_000/len("0123456789"))
+
+// BenchmarkValidTenASCIIChars measures Valid on a byte slice holding
+// the ten ASCII digits.
+func BenchmarkValidTenASCIIChars(b *testing.B) {
+	const text = "0123456789"
+	input := []byte(text)
+	for i := 0; i < b.N; i++ {
+		Valid(input)
+	}
+}
+
+// BenchmarkValid100KASCIIChars measures Valid on a byte-slice copy of
+// ascii100000.
+func BenchmarkValid100KASCIIChars(b *testing.B) {
+	input := []byte(ascii100000)
+	for i := 0; i < b.N; i++ {
+		Valid(input)
+	}
+}
+
+// BenchmarkValidTenJapaneseChars measures Valid on a byte slice
+// holding ten Japanese characters.
+func BenchmarkValidTenJapaneseChars(b *testing.B) {
+	const text = "日本語日本語日本語日"
+	input := []byte(text)
+	for i := 0; i < b.N; i++ {
+		Valid(input)
+	}
+}
+// BenchmarkValidLongMostlyASCII measures Valid on a byte-slice copy of
+// longStringMostlyASCII.
+func BenchmarkValidLongMostlyASCII(b *testing.B) {
+	input := []byte(longStringMostlyASCII)
+	for i := 0; i < b.N; i++ {
+		Valid(input)
+	}
+}
+
+// BenchmarkValidLongJapanese measures Valid on a byte-slice copy of
+// longStringJapanese.
+func BenchmarkValidLongJapanese(b *testing.B) {
+	input := []byte(longStringJapanese)
+	for i := 0; i < b.N; i++ {
+		Valid(input)
+	}
+}
+
+// BenchmarkValidStringTenASCIIChars measures ValidString on the ten
+// ASCII digits.
+func BenchmarkValidStringTenASCIIChars(b *testing.B) {
+	const input = "0123456789"
+	for i := 0; i < b.N; i++ {
+		ValidString(input)
+	}
+}
+
+// BenchmarkValidString100KASCIIChars measures ValidString on
+// ascii100000.
+func BenchmarkValidString100KASCIIChars(b *testing.B) {
+	input := ascii100000
+	for i := 0; i < b.N; i++ {
+		ValidString(input)
+	}
+}
+
+// BenchmarkValidStringTenJapaneseChars measures ValidString on ten
+// Japanese characters.
+func BenchmarkValidStringTenJapaneseChars(b *testing.B) {
+	const input = "日本語日本語日本語日"
+	for i := 0; i < b.N; i++ {
+		ValidString(input)
+	}
+}
+
+// BenchmarkValidStringLongMostlyASCII measures ValidString on
+// longStringMostlyASCII.
+func BenchmarkValidStringLongMostlyASCII(b *testing.B) {
+	input := longStringMostlyASCII
+	for i := 0; i < b.N; i++ {
+		ValidString(input)
+	}
+}
+
+// BenchmarkValidStringLongJapanese measures ValidString on
+// longStringJapanese.
+func BenchmarkValidStringLongJapanese(b *testing.B) {
+	input := longStringJapanese
+	for i := 0; i < b.N; i++ {
+		ValidString(input)
+	}
+}
+
+// Large benchmark inputs, filled in by init.
+var (
+	longStringMostlyASCII string // ~100KB, ~97% ASCII
+	longStringJapanese    string // ~100KB, non-ASCII
+)
+
+// init builds the two long benchmark strings. longStringMostlyASCII is
+// assembled from ten-digit ASCII chunks, with every hundredth chunk
+// (starting with the first) replaced by the Japanese sample, until it
+// holds at least 100,000 bytes. longStringJapanese repeats the Japanese
+// sample as many whole times as fit in 100,000 bytes.
+func init() {
+	const (
+		japanese = "日本語日本語日本語日"
+		digits   = "0123456789"
+		target   = 100_000
+	)
+
+	var sb strings.Builder
+	for chunk := 0; sb.Len() < target; chunk++ {
+		piece := digits
+		if chunk%100 == 0 {
+			piece = japanese
+		}
+		sb.WriteString(piece)
+	}
+
+	longStringMostlyASCII = sb.String()
+	longStringJapanese = strings.Repeat(japanese, target/len(japanese))
+}
+
+// BenchmarkEncodeASCIIRune measures EncodeRune writing the ASCII rune
+// 'a' into a UTFMax-byte buffer.
+func BenchmarkEncodeASCIIRune(b *testing.B) {
+	const r = 'a'
+	dst := make([]byte, UTFMax)
+	for i := 0; i < b.N; i++ {
+		EncodeRune(dst, r)
+	}
+}
+
+// BenchmarkEncodeJapaneseRune measures EncodeRune writing the rune
+// '本' into a UTFMax-byte buffer.
+func BenchmarkEncodeJapaneseRune(b *testing.B) {
+	const r = '本'
+	dst := make([]byte, UTFMax)
+	for i := 0; i < b.N; i++ {
+		EncodeRune(dst, r)
+	}
+}
+
+// BenchmarkAppendASCIIRune measures AppendRune appending the ASCII
+// rune 'a' to an empty slice backed by a UTFMax-byte buffer.
+func BenchmarkAppendASCIIRune(b *testing.B) {
+	const r = 'a'
+	scratch := make([]byte, UTFMax)
+	for i := 0; i < b.N; i++ {
+		AppendRune(scratch[:0], r)
+	}
+}
+
+// BenchmarkAppendJapaneseRune measures AppendRune appending the rune
+// '本' to an empty slice backed by a UTFMax-byte buffer.
+func BenchmarkAppendJapaneseRune(b *testing.B) {
+	const r = '本'
+	scratch := make([]byte, UTFMax)
+	for i := 0; i < b.N; i++ {
+		AppendRune(scratch[:0], r)
+	}
+}
+
+// BenchmarkDecodeASCIIRune measures DecodeRune on a one-byte slice
+// holding 'a'.
+func BenchmarkDecodeASCIIRune(b *testing.B) {
+	src := []byte{'a'}
+	for i := 0; i < b.N; i++ {
+		DecodeRune(src)
+	}
+}
+
+// BenchmarkDecodeJapaneseRune measures DecodeRune on the bytes of the
+// single character "本".
+func BenchmarkDecodeJapaneseRune(b *testing.B) {
+	const text = "本"
+	src := []byte(text)
+	for i := 0; i < b.N; i++ {
+		DecodeRune(src)
+	}
+}
+
+// boolSink receives the results of benchmarked calls so that the
+// compiler cannot discard those calls as dead code.
+var boolSink bool
+
+// BenchmarkFullRune measures FullRune as one sub-benchmark per input:
+// a single ASCII byte, a three-byte sequence labelled "Incomplete", and
+// a single Japanese character. Each result is stored in boolSink.
+func BenchmarkFullRune(b *testing.B) {
+	type fullRuneCase struct {
+		name string
+		data []byte
+	}
+	cases := []fullRuneCase{
+		{name: "ASCII", data: []byte("a")},
+		{name: "Incomplete", data: []byte("\xf0\x90\x80")},
+		{name: "Japanese", data: []byte("本")},
+	}
+	for _, tc := range cases {
+		b.Run(tc.name, func(sub *testing.B) {
+			for i := 0; i < sub.N; i++ {
+				boolSink = FullRune(tc.data)
+			}
+		})
+	}
+}