diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 17:44:55 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 17:44:55 +0000 |
commit | 5068d34c08f951a7ea6257d305a1627b09a95817 (patch) | |
tree | 08213e2be853396a3b07ce15dbe222644dcd9a89 /src/ww898/cp_utf8.hpp | |
parent | Initial commit. (diff) | |
download | lnav-5068d34c08f951a7ea6257d305a1627b09a95817.tar.xz lnav-5068d34c08f951a7ea6257d305a1627b09a95817.zip |
Adding upstream version 0.11.1. [upstream/0.11.1] [upstream]
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/ww898/cp_utf8.hpp')
-rw-r--r-- | src/ww898/cp_utf8.hpp | 171 |
1 files changed, 171 insertions, 0 deletions
diff --git a/src/ww898/cp_utf8.hpp b/src/ww898/cp_utf8.hpp new file mode 100644 index 0000000..8eaa133 --- /dev/null +++ b/src/ww898/cp_utf8.hpp @@ -0,0 +1,171 @@ +/* + * MIT License + * + * Copyright (c) 2017-2019 Mikhail Pilin + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#pragma once + +#include <cstdint> +#include <utility> +#include <stdexcept> + +#include "base/result.h" + +namespace ww898 { +namespace utf { + +// Supported combinations: +// 0xxx_xxxx +// 110x_xxxx 10xx_xxxx +// 1110_xxxx 10xx_xxxx 10xx_xxxx +// 1111_0xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx +// 1111_10xx 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx +// 1111_110x 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx +struct utf8 final +{ + static size_t const max_unicode_symbol_size = 4; + static size_t const max_supported_symbol_size = 6; + + static uint32_t const max_supported_code_point = 0x7FFFFFFF; + + using char_type = uint8_t; + + template<typename PeekFn> + static Result<size_t, const char *> char_size(PeekFn && peek_fn) + { + const std::pair<char_type, size_t> peek_res = std::forward<PeekFn>(peek_fn)(); + const auto ch0 = peek_res.first; + const auto remaining = peek_res.second; + size_t retval = 0; + + if (ch0 < 0x80) { // 0xxx_xxxx + retval = 1; + } else if (ch0 < 0xC0) { + return Err("The utf8 first char in sequence is incorrect"); + } else if (ch0 < 0xE0) { // 110x_xxxx 10xx_xxxx + retval = 2; + } else if (ch0 < 0xF0) { // 1110_xxxx 10xx_xxxx 10xx_xxxx + retval = 3; + } else if (ch0 < 0xF8) { // 1111_0xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx + retval = 4; + } else if (ch0 < 0xFC) { // 1111_10xx 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx + retval = 5; + } else if (ch0 < 0xFE) { // 1111_110x 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx + retval = 6; + } else { + return Err("The utf8 first char in sequence is incorrect"); + } + if (retval - 1 > remaining) { + return Err("Truncated utf8 sequence"); + } + return Ok(retval); + } + + template<typename ReadFn> + static uint32_t read(ReadFn && read_fn) + { + char_type const ch0 = read_fn(); + if (ch0 < 0x80) // 0xxx_xxxx + return ch0; + if (ch0 < 0xC0) + throw std::runtime_error("The utf8 first char in sequence is incorrect"); + if (ch0 < 0xE0) // 110x_xxxx 10xx_xxxx + { + char_type const ch1 = read_fn(); if (ch1 >> 6 != 
2) goto _err; + return (ch0 << 6) + ch1 - 0x3080; + } + if (ch0 < 0xF0) // 1110_xxxx 10xx_xxxx 10xx_xxxx + { + char_type const ch1 = read_fn(); if (ch1 >> 6 != 2) goto _err; + char_type const ch2 = read_fn(); if (ch2 >> 6 != 2) goto _err; + return (ch0 << 12) + (ch1 << 6) + ch2 - 0xE2080; + } + if (ch0 < 0xF8) // 1111_0xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx + { + char_type const ch1 = read_fn(); if (ch1 >> 6 != 2) goto _err; + char_type const ch2 = read_fn(); if (ch2 >> 6 != 2) goto _err; + char_type const ch3 = read_fn(); if (ch3 >> 6 != 2) goto _err; + return (ch0 << 18) + (ch1 << 12) + (ch2 << 6) + ch3 - 0x3C82080; + } + if (ch0 < 0xFC) // 1111_10xx 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx + { + char_type const ch1 = read_fn(); if (ch1 >> 6 != 2) goto _err; + char_type const ch2 = read_fn(); if (ch2 >> 6 != 2) goto _err; + char_type const ch3 = read_fn(); if (ch3 >> 6 != 2) goto _err; + char_type const ch4 = read_fn(); if (ch4 >> 6 != 2) goto _err; + return (ch0 << 24) + (ch1 << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - 0xFA082080; + } + if (ch0 < 0xFE) // 1111_110x 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx + { + char_type const ch1 = read_fn(); if (ch1 >> 6 != 2) goto _err; + char_type const ch2 = read_fn(); if (ch2 >> 6 != 2) goto _err; + char_type const ch3 = read_fn(); if (ch3 >> 6 != 2) goto _err; + char_type const ch4 = read_fn(); if (ch4 >> 6 != 2) goto _err; + char_type const ch5 = read_fn(); if (ch5 >> 6 != 2) goto _err; + return (ch0 << 30) + (ch1 << 24) + (ch2 << 18) + (ch3 << 12) + (ch4 << 6) + ch5 - 0x82082080; + } + throw std::runtime_error("The utf8 first char in sequence is incorrect"); + _err: throw std::runtime_error("The utf8 slave char in sequence is incorrect"); + } + + template<typename WriteFn> + static void write(uint32_t const cp, WriteFn && write_fn) + { + if (cp < 0x80) // 0xxx_xxxx + write_fn(static_cast<char_type>(cp)); + else if (cp < 0x800) // 110x_xxxx 10xx_xxxx + { + write_fn(static_cast<char_type>(0xC0 | cp >> 6)); + goto _1; + } 
+ else if (cp < 0x10000) // 1110_xxxx 10xx_xxxx 10xx_xxxx + { + write_fn(static_cast<char_type>(0xE0 | cp >> 12)); + goto _2; + } + else if (cp < 0x200000) // 1111_0xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx + { + write_fn(static_cast<char_type>(0xF0 | cp >> 18)); + goto _3; + } + else if (cp < 0x4000000) // 1111_10xx 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx + { + write_fn(static_cast<char_type>(0xF8 | cp >> 24)); + goto _4; + } + else if (cp < 0x80000000) // 1111_110x 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx + { + write_fn(static_cast<char_type>(0xFC | cp >> 30)); + goto _5; + } + else + throw std::runtime_error("Tool large UTF8 code point"); + return; + _5: write_fn(static_cast<char_type>(0x80 | (cp >> 24 & 0x3F))); + _4: write_fn(static_cast<char_type>(0x80 | (cp >> 18 & 0x3F))); + _3: write_fn(static_cast<char_type>(0x80 | (cp >> 12 & 0x3F))); + _2: write_fn(static_cast<char_type>(0x80 | (cp >> 6 & 0x3F))); + _1: write_fn(static_cast<char_type>(0x80 | (cp & 0x3F))); + } +}; + +}} |