diff options
Diffstat (limited to 'src/html/escape.go')
-rw-r--r-- | src/html/escape.go | 214 |
1 files changed, 214 insertions, 0 deletions
diff --git a/src/html/escape.go b/src/html/escape.go new file mode 100644 index 0000000..1dc1287 --- /dev/null +++ b/src/html/escape.go @@ -0,0 +1,214 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package html provides functions for escaping and unescaping HTML text. +package html + +import ( + "strings" + "unicode/utf8" +) + +// These replacements permit compatibility with old numeric entities that +// assumed Windows-1252 encoding. +// https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state +var replacementTable = [...]rune{ + '\u20AC', // First entry is what 0x80 should be replaced with. + '\u0081', + '\u201A', + '\u0192', + '\u201E', + '\u2026', + '\u2020', + '\u2021', + '\u02C6', + '\u2030', + '\u0160', + '\u2039', + '\u0152', + '\u008D', + '\u017D', + '\u008F', + '\u0090', + '\u2018', + '\u2019', + '\u201C', + '\u201D', + '\u2022', + '\u2013', + '\u2014', + '\u02DC', + '\u2122', + '\u0161', + '\u203A', + '\u0153', + '\u009D', + '\u017E', + '\u0178', // Last entry is 0x9F. + // 0x00->'\uFFFD' is handled programmatically. + // 0x0D->'\u000D' is a no-op. +} + +// unescapeEntity reads an entity like "<" from b[src:] and writes the +// corresponding "<" to b[dst:], returning the incremented dst and src cursors. +// Precondition: b[src] == '&' && dst <= src. +func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) { + const attribute = false + + // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference + + // i starts at 1 because we already know that s[0] == '&'. + i, s := 1, b[src:] + + if len(s) <= 1 { + b[dst] = b[src] + return dst + 1, src + 1 + } + + if s[i] == '#' { + if len(s) <= 3 { // We need to have at least "&#.". + b[dst] = b[src] + return dst + 1, src + 1 + } + i++ + c := s[i] + hex := false + if c == 'x' || c == 'X' { + hex = true + i++ + } + + x := '\x00' + for i < len(s) { + c = s[i] + i++ + if hex { + if '0' <= c && c <= '9' { + x = 16*x + rune(c) - '0' + continue + } else if 'a' <= c && c <= 'f' { + x = 16*x + rune(c) - 'a' + 10 + continue + } else if 'A' <= c && c <= 'F' { + x = 16*x + rune(c) - 'A' + 10 + continue + } + } else if '0' <= c && c <= '9' { + x = 10*x + rune(c) - '0' + continue + } + if c != ';' { + i-- + } + break + } + + if i <= 3 { // No characters matched. + b[dst] = b[src] + return dst + 1, src + 1 + } + + if 0x80 <= x && x <= 0x9F { + // Replace characters from Windows-1252 with UTF-8 equivalents. + x = replacementTable[x-0x80] + } else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF { + // Replace invalid characters with the replacement character. + x = '\uFFFD' + } + + return dst + utf8.EncodeRune(b[dst:], x), src + i + } + + // Consume the maximum number of characters possible, with the + // consumed characters matching one of the named references. + + for i < len(s) { + c := s[i] + i++ + // Lower-cased characters are more common in entities, so we check for them first. + if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' { + continue + } + if c != ';' { + i-- + } + break + } + + entityName := s[1:i] + if len(entityName) == 0 { + // No-op. + } else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' { + // No-op. + } else if x := entity[string(entityName)]; x != 0 { + return dst + utf8.EncodeRune(b[dst:], x), src + i + } else if x := entity2[string(entityName)]; x[0] != 0 { + dst1 := dst + utf8.EncodeRune(b[dst:], x[0]) + return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i + } else if !attribute { + maxLen := len(entityName) - 1 + if maxLen > longestEntityWithoutSemicolon { + maxLen = longestEntityWithoutSemicolon + } + for j := maxLen; j > 1; j-- { + if x := entity[string(entityName[:j])]; x != 0 { + return dst + utf8.EncodeRune(b[dst:], x), src + j + 1 + } + } + } + + dst1, src1 = dst+i, src+i + copy(b[dst:dst1], b[src:src1]) + return dst1, src1 +} + +var htmlEscaper = strings.NewReplacer( + `&`, "&", + `'`, "'", // "'" is shorter than "'" and apos was not in HTML until HTML5. + `<`, "<", + `>`, ">", + `"`, """, // """ is shorter than """. +) + +// EscapeString escapes special characters like "<" to become "<". It +// escapes only five such characters: <, >, &, ' and ". +// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't +// always true. +func EscapeString(s string) string { + return htmlEscaper.Replace(s) +} + +// UnescapeString unescapes entities like "<" to become "<". It unescapes a +// larger range of entities than EscapeString escapes. For example, "á" +// unescapes to "รก", as does "á" and "á". +// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't +// always true. +func UnescapeString(s string) string { + populateMapsOnce.Do(populateMaps) + i := strings.IndexByte(s, '&') + + if i < 0 { + return s + } + + b := []byte(s) + dst, src := unescapeEntity(b, i, i) + for len(s[src:]) > 0 { + if s[src] == '&' { + i = 0 + } else { + i = strings.IndexByte(s[src:], '&') + } + if i < 0 { + dst += copy(b[dst:], s[src:]) + break + } + + if i > 0 { + copy(b[dst:], s[src:src+i]) + } + dst, src = unescapeEntity(b, dst+i, src+i) + } + return string(b[:dst]) +} |