diff options
Diffstat (limited to 'src/strconv/quote.go')
-rw-r--r-- | src/strconv/quote.go | 604 |
1 files changed, 604 insertions, 0 deletions
diff --git a/src/strconv/quote.go b/src/strconv/quote.go new file mode 100644 index 0000000..1b5bddf --- /dev/null +++ b/src/strconv/quote.go @@ -0,0 +1,604 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:generate go run makeisprint.go -output isprint.go + +package strconv + +import ( + "unicode/utf8" +) + +const ( + lowerhex = "0123456789abcdef" + upperhex = "0123456789ABCDEF" +) + +// contains reports whether the string contains the byte c. +func contains(s string, c byte) bool { + return index(s, c) != -1 +} + +func quoteWith(s string, quote byte, ASCIIonly, graphicOnly bool) string { + return string(appendQuotedWith(make([]byte, 0, 3*len(s)/2), s, quote, ASCIIonly, graphicOnly)) +} + +func quoteRuneWith(r rune, quote byte, ASCIIonly, graphicOnly bool) string { + return string(appendQuotedRuneWith(nil, r, quote, ASCIIonly, graphicOnly)) +} + +func appendQuotedWith(buf []byte, s string, quote byte, ASCIIonly, graphicOnly bool) []byte { + // Often called with big strings, so preallocate. If there's quoting, + // this is conservative but still helps a lot. + if cap(buf)-len(buf) < len(s) { + nBuf := make([]byte, len(buf), len(buf)+1+len(s)+1) + copy(nBuf, buf) + buf = nBuf + } + buf = append(buf, quote) + for width := 0; len(s) > 0; s = s[width:] { + r := rune(s[0]) + width = 1 + if r >= utf8.RuneSelf { + r, width = utf8.DecodeRuneInString(s) + } + if width == 1 && r == utf8.RuneError { + buf = append(buf, `\x`...) + buf = append(buf, lowerhex[s[0]>>4]) + buf = append(buf, lowerhex[s[0]&0xF]) + continue + } + buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly) + } + buf = append(buf, quote) + return buf +} + +func appendQuotedRuneWith(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte { + buf = append(buf, quote) + if !utf8.ValidRune(r) { + r = utf8.RuneError + } + buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly) + buf = append(buf, quote) + return buf +} + +func appendEscapedRune(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte { + var runeTmp [utf8.UTFMax]byte + if r == rune(quote) || r == '\\' { // always backslashed + buf = append(buf, '\\') + buf = append(buf, byte(r)) + return buf + } + if ASCIIonly { + if r < utf8.RuneSelf && IsPrint(r) { + buf = append(buf, byte(r)) + return buf + } + } else if IsPrint(r) || graphicOnly && isInGraphicList(r) { + n := utf8.EncodeRune(runeTmp[:], r) + buf = append(buf, runeTmp[:n]...) + return buf + } + switch r { + case '\a': + buf = append(buf, `\a`...) + case '\b': + buf = append(buf, `\b`...) + case '\f': + buf = append(buf, `\f`...) + case '\n': + buf = append(buf, `\n`...) + case '\r': + buf = append(buf, `\r`...) + case '\t': + buf = append(buf, `\t`...) + case '\v': + buf = append(buf, `\v`...) + default: + switch { + case r < ' ' || r == 0x7f: + buf = append(buf, `\x`...) + buf = append(buf, lowerhex[byte(r)>>4]) + buf = append(buf, lowerhex[byte(r)&0xF]) + case !utf8.ValidRune(r): + r = 0xFFFD + fallthrough + case r < 0x10000: + buf = append(buf, `\u`...) + for s := 12; s >= 0; s -= 4 { + buf = append(buf, lowerhex[r>>uint(s)&0xF]) + } + default: + buf = append(buf, `\U`...) + for s := 28; s >= 0; s -= 4 { + buf = append(buf, lowerhex[r>>uint(s)&0xF]) + } + } + } + return buf +} + +// Quote returns a double-quoted Go string literal representing s. The +// returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for +// control characters and non-printable characters as defined by +// IsPrint. +func Quote(s string) string { + return quoteWith(s, '"', false, false) +} + +// AppendQuote appends a double-quoted Go string literal representing s, +// as generated by Quote, to dst and returns the extended buffer. +func AppendQuote(dst []byte, s string) []byte { + return appendQuotedWith(dst, s, '"', false, false) +} + +// QuoteToASCII returns a double-quoted Go string literal representing s. +// The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for +// non-ASCII characters and non-printable characters as defined by IsPrint. +func QuoteToASCII(s string) string { + return quoteWith(s, '"', true, false) +} + +// AppendQuoteToASCII appends a double-quoted Go string literal representing s, +// as generated by QuoteToASCII, to dst and returns the extended buffer. +func AppendQuoteToASCII(dst []byte, s string) []byte { + return appendQuotedWith(dst, s, '"', true, false) +} + +// QuoteToGraphic returns a double-quoted Go string literal representing s. +// The returned string leaves Unicode graphic characters, as defined by +// IsGraphic, unchanged and uses Go escape sequences (\t, \n, \xFF, \u0100) +// for non-graphic characters. +func QuoteToGraphic(s string) string { + return quoteWith(s, '"', false, true) +} + +// AppendQuoteToGraphic appends a double-quoted Go string literal representing s, +// as generated by QuoteToGraphic, to dst and returns the extended buffer. +func AppendQuoteToGraphic(dst []byte, s string) []byte { + return appendQuotedWith(dst, s, '"', false, true) +} + +// QuoteRune returns a single-quoted Go character literal representing the +// rune. The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) +// for control characters and non-printable characters as defined by IsPrint. +// If r is not a valid Unicode code point, it is interpreted as the Unicode +// replacement character U+FFFD. +func QuoteRune(r rune) string { + return quoteRuneWith(r, '\'', false, false) +} + +// AppendQuoteRune appends a single-quoted Go character literal representing the rune, +// as generated by QuoteRune, to dst and returns the extended buffer. +func AppendQuoteRune(dst []byte, r rune) []byte { + return appendQuotedRuneWith(dst, r, '\'', false, false) +} + +// QuoteRuneToASCII returns a single-quoted Go character literal representing +// the rune. The returned string uses Go escape sequences (\t, \n, \xFF, +// \u0100) for non-ASCII characters and non-printable characters as defined +// by IsPrint. +// If r is not a valid Unicode code point, it is interpreted as the Unicode +// replacement character U+FFFD. +func QuoteRuneToASCII(r rune) string { + return quoteRuneWith(r, '\'', true, false) +} + +// AppendQuoteRuneToASCII appends a single-quoted Go character literal representing the rune, +// as generated by QuoteRuneToASCII, to dst and returns the extended buffer. +func AppendQuoteRuneToASCII(dst []byte, r rune) []byte { + return appendQuotedRuneWith(dst, r, '\'', true, false) +} + +// QuoteRuneToGraphic returns a single-quoted Go character literal representing +// the rune. If the rune is not a Unicode graphic character, +// as defined by IsGraphic, the returned string will use a Go escape sequence +// (\t, \n, \xFF, \u0100). +// If r is not a valid Unicode code point, it is interpreted as the Unicode +// replacement character U+FFFD. +func QuoteRuneToGraphic(r rune) string { + return quoteRuneWith(r, '\'', false, true) +} + +// AppendQuoteRuneToGraphic appends a single-quoted Go character literal representing the rune, +// as generated by QuoteRuneToGraphic, to dst and returns the extended buffer. +func AppendQuoteRuneToGraphic(dst []byte, r rune) []byte { + return appendQuotedRuneWith(dst, r, '\'', false, true) +} + +// CanBackquote reports whether the string s can be represented +// unchanged as a single-line backquoted string without control +// characters other than tab. +func CanBackquote(s string) bool { + for len(s) > 0 { + r, wid := utf8.DecodeRuneInString(s) + s = s[wid:] + if wid > 1 { + if r == '\ufeff' { + return false // BOMs are invisible and should not be quoted. + } + continue // All other multibyte runes are correctly encoded and assumed printable. + } + if r == utf8.RuneError { + return false + } + if (r < ' ' && r != '\t') || r == '`' || r == '\u007F' { + return false + } + } + return true +} + +func unhex(b byte) (v rune, ok bool) { + c := rune(b) + switch { + case '0' <= c && c <= '9': + return c - '0', true + case 'a' <= c && c <= 'f': + return c - 'a' + 10, true + case 'A' <= c && c <= 'F': + return c - 'A' + 10, true + } + return +} + +// UnquoteChar decodes the first character or byte in the escaped string +// or character literal represented by the string s. +// It returns four values: +// +// 1. value, the decoded Unicode code point or byte value; +// 2. multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation; +// 3. tail, the remainder of the string after the character; and +// 4. an error that will be nil if the character is syntactically valid. +// +// The second argument, quote, specifies the type of literal being parsed +// and therefore which escaped quote character is permitted. +// If set to a single quote, it permits the sequence \' and disallows unescaped '. +// If set to a double quote, it permits \" and disallows unescaped ". +// If set to zero, it does not permit either escape and allows both quote characters to appear unescaped. +func UnquoteChar(s string, quote byte) (value rune, multibyte bool, tail string, err error) { + // easy cases + if len(s) == 0 { + err = ErrSyntax + return + } + switch c := s[0]; { + case c == quote && (quote == '\'' || quote == '"'): + err = ErrSyntax + return + case c >= utf8.RuneSelf: + r, size := utf8.DecodeRuneInString(s) + return r, true, s[size:], nil + case c != '\\': + return rune(s[0]), false, s[1:], nil + } + + // hard case: c is backslash + if len(s) <= 1 { + err = ErrSyntax + return + } + c := s[1] + s = s[2:] + + switch c { + case 'a': + value = '\a' + case 'b': + value = '\b' + case 'f': + value = '\f' + case 'n': + value = '\n' + case 'r': + value = '\r' + case 't': + value = '\t' + case 'v': + value = '\v' + case 'x', 'u', 'U': + n := 0 + switch c { + case 'x': + n = 2 + case 'u': + n = 4 + case 'U': + n = 8 + } + var v rune + if len(s) < n { + err = ErrSyntax + return + } + for j := 0; j < n; j++ { + x, ok := unhex(s[j]) + if !ok { + err = ErrSyntax + return + } + v = v<<4 | x + } + s = s[n:] + if c == 'x' { + // single-byte string, possibly not UTF-8 + value = v + break + } + if !utf8.ValidRune(v) { + err = ErrSyntax + return + } + value = v + multibyte = true + case '0', '1', '2', '3', '4', '5', '6', '7': + v := rune(c) - '0' + if len(s) < 2 { + err = ErrSyntax + return + } + for j := 0; j < 2; j++ { // one digit already; two more + x := rune(s[j]) - '0' + if x < 0 || x > 7 { + err = ErrSyntax + return + } + v = (v << 3) | x + } + s = s[2:] + if v > 255 { + err = ErrSyntax + return + } + value = v + case '\\': + value = '\\' + case '\'', '"': + if c != quote { + err = ErrSyntax + return + } + value = rune(c) + default: + err = ErrSyntax + return + } + tail = s + return +} + +// QuotedPrefix returns the quoted string (as understood by Unquote) at the prefix of s. +// If s does not start with a valid quoted string, QuotedPrefix returns an error. +func QuotedPrefix(s string) (string, error) { + out, _, err := unquote(s, false) + return out, err +} + +// Unquote interprets s as a single-quoted, double-quoted, +// or backquoted Go string literal, returning the string value +// that s quotes. (If s is single-quoted, it would be a Go +// character literal; Unquote returns the corresponding +// one-character string.) +func Unquote(s string) (string, error) { + out, rem, err := unquote(s, true) + if len(rem) > 0 { + return "", ErrSyntax + } + return out, err +} + +// unquote parses a quoted string at the start of the input, +// returning the parsed prefix, the remaining suffix, and any parse errors. +// If unescape is true, the parsed prefix is unescaped, +// otherwise the input prefix is provided verbatim. +func unquote(in string, unescape bool) (out, rem string, err error) { + // Determine the quote form and optimistically find the terminating quote. + if len(in) < 2 { + return "", in, ErrSyntax + } + quote := in[0] + end := index(in[1:], quote) + if end < 0 { + return "", in, ErrSyntax + } + end += 2 // position after terminating quote; may be wrong if escape sequences are present + + switch quote { + case '`': + switch { + case !unescape: + out = in[:end] // include quotes + case !contains(in[:end], '\r'): + out = in[len("`") : end-len("`")] // exclude quotes + default: + // Carriage return characters ('\r') inside raw string literals + // are discarded from the raw string value. + buf := make([]byte, 0, end-len("`")-len("\r")-len("`")) + for i := len("`"); i < end-len("`"); i++ { + if in[i] != '\r' { + buf = append(buf, in[i]) + } + } + out = string(buf) + } + // NOTE: Prior implementations did not verify that raw strings consist + // of valid UTF-8 characters and we continue to not verify it as such. + // The Go specification does not explicitly require valid UTF-8, + // but only mention that it is implicitly valid for Go source code + // (which must be valid UTF-8). + return out, in[end:], nil + case '"', '\'': + // Handle quoted strings without any escape sequences. + if !contains(in[:end], '\\') && !contains(in[:end], '\n') { + var valid bool + switch quote { + case '"': + valid = utf8.ValidString(in[len(`"`) : end-len(`"`)]) + case '\'': + r, n := utf8.DecodeRuneInString(in[len("'") : end-len("'")]) + valid = len("'")+n+len("'") == end && (r != utf8.RuneError || n != 1) + } + if valid { + out = in[:end] + if unescape { + out = out[1 : end-1] // exclude quotes + } + return out, in[end:], nil + } + } + + // Handle quoted strings with escape sequences. + var buf []byte + in0 := in + in = in[1:] // skip starting quote + if unescape { + buf = make([]byte, 0, 3*end/2) // try to avoid more allocations + } + for len(in) > 0 && in[0] != quote { + // Process the next character, + // rejecting any unescaped newline characters which are invalid. + r, multibyte, rem, err := UnquoteChar(in, quote) + if in[0] == '\n' || err != nil { + return "", in0, ErrSyntax + } + in = rem + + // Append the character if unescaping the input. + if unescape { + if r < utf8.RuneSelf || !multibyte { + buf = append(buf, byte(r)) + } else { + var arr [utf8.UTFMax]byte + n := utf8.EncodeRune(arr[:], r) + buf = append(buf, arr[:n]...) + } + } + + // Single quoted strings must be a single character. + if quote == '\'' { + break + } + } + + // Verify that the string ends with a terminating quote. + if !(len(in) > 0 && in[0] == quote) { + return "", in0, ErrSyntax + } + in = in[1:] // skip terminating quote + + if unescape { + return string(buf), in, nil + } + return in0[:len(in0)-len(in)], in, nil + default: + return "", in, ErrSyntax + } +} + +// bsearch16 returns the smallest i such that a[i] >= x. +// If there is no such i, bsearch16 returns len(a). +func bsearch16(a []uint16, x uint16) int { + i, j := 0, len(a) + for i < j { + h := i + (j-i)>>1 + if a[h] < x { + i = h + 1 + } else { + j = h + } + } + return i +} + +// bsearch32 returns the smallest i such that a[i] >= x. +// If there is no such i, bsearch32 returns len(a). +func bsearch32(a []uint32, x uint32) int { + i, j := 0, len(a) + for i < j { + h := i + (j-i)>>1 + if a[h] < x { + i = h + 1 + } else { + j = h + } + } + return i +} + +// TODO: IsPrint is a local implementation of unicode.IsPrint, verified by the tests +// to give the same answer. It allows this package not to depend on unicode, +// and therefore not pull in all the Unicode tables. If the linker were better +// at tossing unused tables, we could get rid of this implementation. +// That would be nice. + +// IsPrint reports whether the rune is defined as printable by Go, with +// the same definition as unicode.IsPrint: letters, numbers, punctuation, +// symbols and ASCII space. +func IsPrint(r rune) bool { + // Fast check for Latin-1 + if r <= 0xFF { + if 0x20 <= r && r <= 0x7E { + // All the ASCII is printable from space through DEL-1. + return true + } + if 0xA1 <= r && r <= 0xFF { + // Similarly for ¡ through ÿ... + return r != 0xAD // ...except for the bizarre soft hyphen. + } + return false + } + + // Same algorithm, either on uint16 or uint32 value. + // First, find first i such that isPrint[i] >= x. + // This is the index of either the start or end of a pair that might span x. + // The start is even (isPrint[i&^1]) and the end is odd (isPrint[i|1]). + // If we find x in a range, make sure x is not in isNotPrint list. + + if 0 <= r && r < 1<<16 { + rr, isPrint, isNotPrint := uint16(r), isPrint16, isNotPrint16 + i := bsearch16(isPrint, rr) + if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr { + return false + } + j := bsearch16(isNotPrint, rr) + return j >= len(isNotPrint) || isNotPrint[j] != rr + } + + rr, isPrint, isNotPrint := uint32(r), isPrint32, isNotPrint32 + i := bsearch32(isPrint, rr) + if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr { + return false + } + if r >= 0x20000 { + return true + } + r -= 0x10000 + j := bsearch16(isNotPrint, uint16(r)) + return j >= len(isNotPrint) || isNotPrint[j] != uint16(r) +} + +// IsGraphic reports whether the rune is defined as a Graphic by Unicode. Such +// characters include letters, marks, numbers, punctuation, symbols, and +// spaces, from categories L, M, N, P, S, and Zs. +func IsGraphic(r rune) bool { + if IsPrint(r) { + return true + } + return isInGraphicList(r) +} + +// isInGraphicList reports whether the rune is in the isGraphic list. This separation +// from IsGraphic allows quoteWith to avoid two calls to IsPrint. +// Should be called only if IsPrint fails. +func isInGraphicList(r rune) bool { + // We know r must fit in 16 bits - see makeisprint.go. + if r > 0xFFFF { + return false + } + rr := uint16(r) + i := bsearch16(isGraphic, rr) + return i < len(isGraphic) && rr == isGraphic[i] +} |