summaryrefslogtreecommitdiffstats
path: root/src/libvterm/t/03encoding_utf8.test
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-06 02:44:24 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-06 02:44:24 +0000
commit8baab3c8d7a6f22888bd581cd5c6098fd2e4b5a8 (patch)
tree3537e168b860f2742f6029d70501b5ed7d15d345 /src/libvterm/t/03encoding_utf8.test
parentInitial commit. (diff)
downloadvim-8baab3c8d7a6f22888bd581cd5c6098fd2e4b5a8.tar.xz
vim-8baab3c8d7a6f22888bd581cd5c6098fd2e4b5a8.zip
Adding upstream version 2:8.1.0875.upstream/2%8.1.0875upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/libvterm/t/03encoding_utf8.test')
-rw-r--r--src/libvterm/t/03encoding_utf8.test122
1 files changed, 122 insertions, 0 deletions
diff --git a/src/libvterm/t/03encoding_utf8.test b/src/libvterm/t/03encoding_utf8.test
new file mode 100644
index 0000000..7ee16ac
--- /dev/null
+++ b/src/libvterm/t/03encoding_utf8.test
@@ -0,0 +1,122 @@
+INIT
+WANTENCODING
+
+!Low
+ENCIN "123"
+ encout 0x31,0x32,0x33
+
+# We want to prove the UTF-8 parser correctly handles all the sequences.
+# Easy way to do this is to check it does low/high boundary cases, as that
+# leaves only two for each sequence length
+#
+# These ranges are therefore:
+#
+# Two bytes:
+# U+0080 = 000 10000000 => 00010 000000
+# => 11000010 10000000 = C2 80
+# U+07FF = 111 11111111 => 11111 111111
+# => 11011111 10111111 = DF BF
+#
+# Three bytes:
+# U+0800 = 00001000 00000000 => 0000 100000 000000
+# => 11100000 10100000 10000000 = E0 A0 80
+# U+FFFD = 11111111 11111101 => 1111 111111 111101
+# => 11101111 10111111 10111101 = EF BF BD
+# (We avoid U+FFFE and U+FFFF as they're invalid codepoints)
+#
+# Four bytes:
+# U+10000 = 00001 00000000 00000000 => 000 010000 000000 000000
+# => 11110000 10010000 10000000 10000000 = F0 90 80 80
+# U+1FFFFF = 11111 11111111 11111111 => 111 111111 111111 111111
+# => 11110111 10111111 10111111 10111111 = F7 BF BF BF
+
+!2 byte
+ENCIN "\xC2\x80\xDF\xBF"
+ encout 0x0080, 0x07FF
+
+!3 byte
+ENCIN "\xE0\xA0\x80\xEF\xBF\xBD"
+ encout 0x0800,0xFFFD
+
+!4 byte
+ENCIN "\xF0\x90\x80\x80\xF7\xBF\xBF\xBF"
+ encout 0x10000,0x1fffff
+
+# Next up, we check some invalid sequences
+# + Early termination (back to low bytes too soon)
+# + Early restart (another sequence introduction before the previous one was finished)
+
+!Early termination
+ENCIN "\xC2!"
+ encout 0xfffd,0x21
+
+ENCIN "\xE0!\xE0\xA0!"
+ encout 0xfffd,0x21,0xfffd,0x21
+
+ENCIN "\xF0!\xF0\x90!\xF0\x90\x80!"
+ encout 0xfffd,0x21,0xfffd,0x21,0xfffd,0x21
+
+!Early restart
+ENCIN "\xC2\xC2\x90"
+ encout 0xfffd,0x0090
+
+ENCIN "\xE0\xC2\x90\xE0\xA0\xC2\x90"
+ encout 0xfffd,0x0090,0xfffd,0x0090
+
+ENCIN "\xF0\xC2\x90\xF0\x90\xC2\x90\xF0\x90\x80\xC2\x90"
+ encout 0xfffd,0x0090,0xfffd,0x0090,0xfffd,0x0090
+
+# Test the overlong sequences by giving an overlong encoding of U+0000 and
+# an encoding of the highest codepoint still too short
+#
+# Two bytes:
+# U+0000 = C0 80
+# U+007F = 000 01111111 => 00001 111111 =>
+# => 11000001 10111111 => C1 BF
+#
+# Three bytes:
+# U+0000 = E0 80 80
+# U+07FF = 00000111 11111111 => 0000 011111 111111
+# => 11100000 10011111 10111111 = E0 9F BF
+#
+# Four bytes:
+# U+0000 = F0 80 80 80
+# U+FFFF = 11111111 11111111 => 000 001111 111111 111111
+# => 11110000 10001111 10111111 10111111 = F0 8F BF BF
+
+!Overlong
+ENCIN "\xC0\x80\xC1\xBF"
+ encout 0xfffd,0xfffd
+
+ENCIN "\xE0\x80\x80\xE0\x9F\xBF"
+ encout 0xfffd,0xfffd
+
+ENCIN "\xF0\x80\x80\x80\xF0\x8F\xBF\xBF"
+ encout 0xfffd,0xfffd
+
+# UTF-16 surrogates U+D800 and U+DFFF
+!UTF-16 Surrogates
+ENCIN "\xED\xA0\x80\xED\xBF\xBF"
+ encout 0xfffd,0xfffd
+
+!Split write
+ENCIN "\xC2"
+ENCIN "\xA0"
+ encout 0x000A0
+
+ENCIN "\xE0"
+ENCIN "\xA0\x80"
+ encout 0x00800
+ENCIN "\xE0\xA0"
+ENCIN "\x80"
+ encout 0x00800
+
+ENCIN "\xF0"
+ENCIN "\x90\x80\x80"
+ encout 0x10000
+ENCIN "\xF0\x90"
+ENCIN "\x80\x80"
+ encout 0x10000
+ENCIN "\xF0\x90\x80"
+ENCIN "\x80"
+ encout 0x10000