summaryrefslogtreecommitdiffstats
path: root/src/syscall/wtf8_windows.go
blob: f166021b7c51041544c7ee2b57b51dcb89574885 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Windows UTF-16 strings can contain unpaired surrogates, which can't be
// decoded into a valid UTF-8 string. This file defines a set of functions
// that can be used to encode and decode potentially ill-formed UTF-16 strings
// by using the [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/).
//
// WTF-8 is a strict superset of UTF-8, i.e. any string that is
// well-formed in UTF-8 is also well-formed in WTF-8 and the content
// is unchanged. Also, the conversion never fails and is lossless.
//
// The benefit of using WTF-8 instead of UTF-8 when decoding a UTF-16 string
// is that the conversion is lossless even for ill-formed UTF-16 strings.
// This property allows to read an ill-formed UTF-16 string, convert it
// to a Go string, and convert it back to the same original UTF-16 string.
//
// See go.dev/issues/59971 for more info.

package syscall

import (
	"unicode/utf16"
	"unicode/utf8"
)

const (
	surr1 = 0xd800
	surr2 = 0xdc00
	surr3 = 0xe000

	tx    = 0b10000000
	t3    = 0b11100000
	maskx = 0b00111111
	mask3 = 0b00001111

	rune1Max = 1<<7 - 1
	rune2Max = 1<<11 - 1
)

// encodeWTF16 returns the potentially ill-formed
// UTF-16 encoding of s.
func encodeWTF16(s string, buf []uint16) []uint16 {
	for i := 0; i < len(s); {
		// Cannot use 'for range s' because it expects valid
		// UTF-8 runes.
		r, size := utf8.DecodeRuneInString(s[i:])
		if r == utf8.RuneError {
			// Check if s[i:] contains a valid WTF-8 encoded surrogate.
			if sc := s[i:]; len(sc) >= 3 && sc[0] == 0xED && 0xA0 <= sc[1] && sc[1] <= 0xBF && 0x80 <= sc[2] && sc[2] <= 0xBF {
				r = rune(sc[0]&mask3)<<12 + rune(sc[1]&maskx)<<6 + rune(sc[2]&maskx)
				buf = append(buf, uint16(r))
				i += 3
				continue
			}
		}
		i += size
		buf = utf16.AppendRune(buf, r)
	}
	return buf
}

// decodeWTF16 returns the WTF-8 encoding of
// the potentially ill-formed UTF-16 s.
func decodeWTF16(s []uint16, buf []byte) []byte {
	for i := 0; i < len(s); i++ {
		var ar rune
		switch r := s[i]; {
		case r < surr1, surr3 <= r:
			// normal rune
			ar = rune(r)
		case surr1 <= r && r < surr2 && i+1 < len(s) &&
			surr2 <= s[i+1] && s[i+1] < surr3:
			// valid surrogate sequence
			ar = utf16.DecodeRune(rune(r), rune(s[i+1]))
			i++
		default:
			// WTF-8 fallback.
			// This only handles the 3-byte case of utf8.AppendRune,
			// as surrogates always fall in that case.
			ar = rune(r)
			if ar > utf8.MaxRune {
				ar = utf8.RuneError
			}
			buf = append(buf, t3|byte(ar>>12), tx|byte(ar>>6)&maskx, tx|byte(ar)&maskx)
			continue
		}
		buf = utf8.AppendRune(buf, ar)
	}
	return buf
}