From 8baab3c8d7a6f22888bd581cd5c6098fd2e4b5a8 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Mon, 6 May 2024 04:44:24 +0200 Subject: Adding upstream version 2:8.1.0875. Signed-off-by: Daniel Baumann --- src/libvterm/t/03encoding_utf8.test | 122 ++++++++++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 src/libvterm/t/03encoding_utf8.test (limited to 'src/libvterm/t/03encoding_utf8.test') diff --git a/src/libvterm/t/03encoding_utf8.test b/src/libvterm/t/03encoding_utf8.test new file mode 100644 index 0000000..7ee16ac --- /dev/null +++ b/src/libvterm/t/03encoding_utf8.test @@ -0,0 +1,122 @@ +INIT +WANTENCODING + +!Low +ENCIN "123" + encout 0x31,0x32,0x33 + +# We want to prove the UTF-8 parser correctly handles all the sequences. +# Easy way to do this is to check it does low/high boundary cases, as that +# leaves only two for each sequence length +# +# These ranges are therefore: +# +# Two bytes: +# U+0080 = 000 10000000 => 00010 000000 +# => 11000010 10000000 = C2 80 +# U+07FF = 111 11111111 => 11111 111111 +# => 11011111 10111111 = DF BF +# +# Three bytes: +# U+0800 = 00001000 00000000 => 0000 100000 000000 +# => 11100000 10100000 10000000 = E0 A0 80 +# U+FFFD = 11111111 11111101 => 1111 111111 111101 +# => 11101111 10111111 10111101 = EF BF BD +# (We avoid U+FFFE and U+FFFF as they're invalid codepoints) +# +# Four bytes: +# U+10000 = 00001 00000000 00000000 => 000 010000 000000 000000 +# => 11110000 10010000 10000000 10000000 = F0 90 80 80 +# U+1FFFFF = 11111 11111111 11111111 => 111 111111 111111 111111 +# => 11110111 10111111 10111111 10111111 = F7 BF BF BF + +!2 byte +ENCIN "\xC2\x80\xDF\xBF" + encout 0x0080, 0x07FF + +!3 byte +ENCIN "\xE0\xA0\x80\xEF\xBF\xBD" + encout 0x0800,0xFFFD + +!4 byte +ENCIN "\xF0\x90\x80\x80\xF7\xBF\xBF\xBF" + encout 0x10000,0x1fffff + +# Next up, we check some invalid sequences +# + Early termination (back to low bytes too soon) +# + Early restart (another sequence introduction before the previous one was finished) + +!Early termination +ENCIN "\xC2!" + encout 0xfffd,0x21 + +ENCIN "\xE0!\xE0\xA0!" + encout 0xfffd,0x21,0xfffd,0x21 + +ENCIN "\xF0!\xF0\x90!\xF0\x90\x80!" + encout 0xfffd,0x21,0xfffd,0x21,0xfffd,0x21 + +!Early restart +ENCIN "\xC2\xC2\x90" + encout 0xfffd,0x0090 + +ENCIN "\xE0\xC2\x90\xE0\xA0\xC2\x90" + encout 0xfffd,0x0090,0xfffd,0x0090 + +ENCIN "\xF0\xC2\x90\xF0\x90\xC2\x90\xF0\x90\x80\xC2\x90" + encout 0xfffd,0x0090,0xfffd,0x0090,0xfffd,0x0090 + +# Test the overlong sequences by giving an overlong encoding of U+0000 and +# an encoding of the highest codepoint still too short +# +# Two bytes: +# U+0000 = C0 80 +# U+007F = 000 01111111 => 00001 111111 => +# => 11000001 10111111 => C1 BF +# +# Three bytes: +# U+0000 = E0 80 80 +# U+07FF = 00000111 11111111 => 0000 011111 111111 +# => 11100000 10011111 10111111 = E0 9F BF +# +# Four bytes: +# U+0000 = F0 80 80 80 +# U+FFFF = 11111111 11111111 => 000 001111 111111 111111 +# => 11110000 10001111 10111111 10111111 = F0 8F BF BF + +!Overlong +ENCIN "\xC0\x80\xC1\xBF" + encout 0xfffd,0xfffd + +ENCIN "\xE0\x80\x80\xE0\x9F\xBF" + encout 0xfffd,0xfffd + +ENCIN "\xF0\x80\x80\x80\xF0\x8F\xBF\xBF" + encout 0xfffd,0xfffd + +# UTF-16 surrogates U+D800 and U+DFFF +!UTF-16 Surrogates +ENCIN "\xED\xA0\x80\xED\xBF\xBF" + encout 0xfffd,0xfffd + +!Split write +ENCIN "\xC2" +ENCIN "\xA0" + encout 0x000A0 + +ENCIN "\xE0" +ENCIN "\xA0\x80" + encout 0x00800 +ENCIN "\xE0\xA0" +ENCIN "\x80" + encout 0x00800 + +ENCIN "\xF0" +ENCIN "\x90\x80\x80" + encout 0x10000 +ENCIN "\xF0\x90" +ENCIN "\x80\x80" + encout 0x10000 +ENCIN "\xF0\x90\x80" +ENCIN "\x80" + encout 0x10000 -- cgit v1.2.3