diff options
Diffstat (limited to 'src/third-party/scnlib/include/scn/unicode')
-rw-r--r-- | src/third-party/scnlib/include/scn/unicode/common.h | 139 | ||||
-rw-r--r-- | src/third-party/scnlib/include/scn/unicode/unicode.h | 243 | ||||
-rw-r--r-- | src/third-party/scnlib/include/scn/unicode/utf16.h | 139 | ||||
-rw-r--r-- | src/third-party/scnlib/include/scn/unicode/utf8.h | 297 |
4 files changed, 818 insertions, 0 deletions
diff --git a/src/third-party/scnlib/include/scn/unicode/common.h b/src/third-party/scnlib/include/scn/unicode/common.h new file mode 100644 index 0000000..3807793 --- /dev/null +++ b/src/third-party/scnlib/include/scn/unicode/common.h @@ -0,0 +1,139 @@ +// Copyright 2017 Elias Kosunen +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// This file is a part of scnlib: +// https://github.com/eliaskosunen/scnlib +// +// The contents of this file are based on utfcpp: +// https://github.com/nemtrif/utfcpp +// Copyright (c) 2006 Nemanja Trifunovic +// Distributed under the Boost Software License, version 1.0 + +#ifndef SCN_UNICODE_COMMON_H +#define SCN_UNICODE_COMMON_H + +#include "../detail/fwd.h" + +#include <cstdint> + +namespace scn { + SCN_BEGIN_NAMESPACE + + /** + * A Unicode code point + */ + enum class code_point : uint32_t {}; + + template <typename T> + constexpr bool operator==(code_point a, T b) + { + return static_cast<uint32_t>(a) == static_cast<uint32_t>(b); + } + template <typename T> + constexpr bool operator!=(code_point a, T b) + { + return static_cast<uint32_t>(a) != static_cast<uint32_t>(b); + } + template <typename T> + constexpr bool operator<(code_point a, T b) + { + return static_cast<uint32_t>(a) < static_cast<uint32_t>(b); + } + template <typename T> + constexpr bool operator>(code_point a, T b) + { + return static_cast<uint32_t>(a) > static_cast<uint32_t>(b); + } + template <typename T> + constexpr bool operator<=(code_point a, T b) + { + return static_cast<uint32_t>(a) <= static_cast<uint32_t>(b); + } + template <typename T> + constexpr bool operator>=(code_point a, T b) + { + return static_cast<uint32_t>(a) >= static_cast<uint32_t>(b); + } + + namespace detail { + static constexpr const uint16_t lead_surrogate_min = 0xd800; + static constexpr const uint16_t lead_surrogate_max = 0xdbff; + static constexpr const uint16_t trail_surrogate_min = 0xdc00; + static constexpr const uint16_t trail_surrogate_max = 0xdfff; + static constexpr const uint16_t lead_offset = + lead_surrogate_min - (0x10000u >> 10); + static constexpr const uint32_t surrogate_offset = + 0x10000u - (lead_surrogate_min << 10) - trail_surrogate_min; + static constexpr const uint32_t code_point_max = 0x10ffff; + + template <typename Octet> + constexpr uint8_t mask8(Octet o) + { + return static_cast<uint8_t>(0xff & o); + } + template <typename U16> + constexpr uint16_t mask16(U16 v) + { + return static_cast<uint16_t>(0xffff & v); + } + template <typename U16> + constexpr bool is_lead_surrogate(U16 cp) + { + return cp >= lead_surrogate_min && cp <= lead_surrogate_max; + } + template <typename U16> + constexpr bool is_trail_surrogate(U16 cp) + { + return cp >= trail_surrogate_min && cp <= trail_surrogate_max; + } + template <typename U16> + constexpr bool is_surrogate(U16 cp) + { + return cp >= lead_surrogate_min && cp <= trail_surrogate_max; + } + + constexpr inline bool is_code_point_valid(code_point cp) + { + return cp <= code_point_max && !is_surrogate(cp); + } + } // namespace detail + + template <typename T> + constexpr code_point make_code_point(T ch) + { + return static_cast<code_point>(ch); + } + + /** + * Returns `true`, if `cp` is valid, e.g. is less than or equal to the + * maximum value for a code point (U+10FFFF), and is not a surrogate (U+D800 + * to U+DFFF). + */ + constexpr inline bool is_valid_code_point(code_point cp) + { + return detail::is_code_point_valid(cp); + } + /** + * Returns `true` if `cp` can be encoded in ASCII as-is (is between U+0 and + * U+7F) + */ + constexpr inline bool is_ascii_code_point(code_point cp) + { + return cp <= 0x7f; + } + + SCN_END_NAMESPACE +} // namespace scn + +#endif diff --git a/src/third-party/scnlib/include/scn/unicode/unicode.h b/src/third-party/scnlib/include/scn/unicode/unicode.h new file mode 100644 index 0000000..011b0b9 --- /dev/null +++ b/src/third-party/scnlib/include/scn/unicode/unicode.h @@ -0,0 +1,243 @@ +// Copyright 2017 Elias Kosunen +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// This file is a part of scnlib: +// https://github.com/eliaskosunen/scnlib +// +// The contents of this file are based on utfcpp: +// https://github.com/nemtrif/utfcpp +// Copyright (c) 2006 Nemanja Trifunovic +// Distributed under the Boost Software License, version 1.0 + +#ifndef SCN_UNICODE_UNICODE_H +#define SCN_UNICODE_UNICODE_H + +#include "utf16.h" +#include "utf8.h" + +namespace scn { + SCN_BEGIN_NAMESPACE + + namespace detail { + inline constexpr bool is_wide_multichar() + { + return sizeof(wchar_t) == 2; + } + + inline constexpr bool is_multichar_type(char) + { + return true; + } + inline constexpr bool is_multichar_type(wchar_t) + { + return is_wide_multichar(); + } + + using utf8_tag = std::integral_constant<size_t, 1>; + using utf16_tag = std::integral_constant<size_t, 2>; + using utf32_tag = std::integral_constant<size_t, 4>; + +#define SCN_MAKE_UTF_TAG(CharT) \ + std::integral_constant<size_t, sizeof(CharT)> {} + + template <typename I, typename S> + SCN_CONSTEXPR14 expected<I> parse_code_point(I begin, + S end, + code_point& cp, + utf8_tag) + { + return utf8::parse_code_point(begin, end, cp); + } + template <typename I, typename S> + SCN_CONSTEXPR14 expected<I> parse_code_point(I begin, + S end, + code_point& cp, + utf16_tag) + { + return utf16::parse_code_point(begin, end, cp); + } + template <typename I, typename S> + SCN_CONSTEXPR14 expected<I> parse_code_point(I begin, + S end, + code_point& cp, + utf32_tag) + { + SCN_EXPECT(begin != end); + cp = make_code_point(*begin); + return {++begin}; + } + } // namespace detail + + /** + * Parses a Unicode code point from the range at `[begin, end)`, and writes + * it into `cp`. + * + * The encoding is determined by the size of the value type of the range. + * Let `n = sizeof(typename std::iterator_traits<I>::value_type)`. + * If `n == 1` -> UTF-8. If `n == 2` -> UTF-16. If `n == 4` -> UTF-32. + * + * `begin != end` must be `true`. + * + * On error, `cp` is not written into. + * + * \return On success, returns an iterator one-past the last code unit used + * to parse `cp`. If the code point is encoded incorrectly, returns + * `error::invalid_encoding`. + */ + template <typename I, typename S> + SCN_CONSTEXPR14 expected<I> parse_code_point(I begin, S end, code_point& cp) + { + return detail::parse_code_point( + begin, end, cp, + SCN_MAKE_UTF_TAG(typename std::iterator_traits<I>::value_type)); + } + + namespace detail { + template <typename I, typename S> + SCN_CONSTEXPR14 expected<I> encode_code_point(I begin, + S end, + code_point cp, + utf8_tag) + { + return utf8::encode_code_point(begin, end, cp); + } + template <typename I, typename S> + SCN_CONSTEXPR14 expected<I> encode_code_point(I begin, + S end, + code_point cp, + utf16_tag) + { + return utf16::encode_code_point(begin, end, cp); + } + template <typename I, typename S> + SCN_CONSTEXPR14 expected<I> encode_code_point(I begin, + S end, + code_point cp, + utf32_tag) + { + SCN_EXPECT(begin + 1 >= end); + *begin++ = static_cast<uint32_t>(cp); + return {begin}; + } + } // namespace detail + + /** + * Writes the code point `cp` into `begin`, using the encoding determined by + * the type of `begin`. + * + * For more information on how the encoding is determined, see \ref + * parse_code_point(). + * + * `end` must be reachable from `begin`, and must have enough room to encode + * the code point (4 code units for UTF-8, 2 for UTF-16, and 1 for UTF-32). + * + * \param begin Beginning of the range to write the result to + * \param end End of the range to write the result to + * \param cp Code point to encode + * \return On success, one-past the last code unit written. + * If `cp` was not a valid code point, returns `error::invalid_encoding`. + */ + template <typename I, typename S> + SCN_CONSTEXPR14 expected<I> encode_code_point(I begin, S end, code_point cp) + { + return detail::encode_code_point( + begin, end, cp, + SCN_MAKE_UTF_TAG(typename std::iterator_traits<I>::value_type)); + } + + namespace detail { + template <typename T> + SCN_CONSTEXPR14 int get_sequence_length(T a, utf8_tag) + { + return utf8::get_sequence_length(a); + } + template <typename T> + SCN_CONSTEXPR14 int get_sequence_length(T a, utf16_tag) + { + return utf16::get_sequence_length(a); + } + template <typename T> + SCN_CONSTEXPR14 int get_sequence_length(T, utf32_tag) + { + return 1; + } + } // namespace detail + + /** + * Returns the length of the code point starting from code unit `a` in code + * units. + * + * For information on how the encoding is determined, see \ref + * parse_code_point(). + * + * \param a The first code unit in a code point. + * + * \return Length of the code point starting from `a`, in code units + * If the code point is encoded incorrectly, or this code unit is not the + * first code unit in a code point, returns 0. + */ + template <typename T> + SCN_CONSTEXPR14 int get_sequence_length(T a) + { + return detail::get_sequence_length(a, SCN_MAKE_UTF_TAG(T)); + } + + namespace detail { + template <typename I, typename S> + SCN_CONSTEXPR14 expected<std::ptrdiff_t> code_point_distance(I begin, + S end, + utf8_tag) + { + return utf8::code_point_distance(begin, end); + } + template <typename I, typename S> + SCN_CONSTEXPR14 expected<std::ptrdiff_t> code_point_distance(I begin, + S end, + utf16_tag) + { + return utf16::code_point_distance(begin, end); + } + template <typename I, typename S> + SCN_CONSTEXPR14 expected<std::ptrdiff_t> code_point_distance(I begin, + S end, + utf32_tag) + { + return {end - begin}; + } + } // namespace detail + + /** + * Get the distance between two code points, in code points. + * + * `end >= begin` must be `true`. + * `begin` and `end` must both point to the first code units in a code + * point. + * + * \return The distance between `begin` and `end`, in code points. If the + * string was encoded incorrectly, returns `error::invalid_encoding`. + */ + template <typename I, typename S> + SCN_CONSTEXPR14 expected<std::ptrdiff_t> code_point_distance(I begin, S end) + { + return detail::code_point_distance( + begin, end, + SCN_MAKE_UTF_TAG(typename std::iterator_traits<I>::value_type)); + } + +#undef SCN_MAKE_UTF_TAG + + SCN_END_NAMESPACE +} // namespace scn + +#endif diff --git a/src/third-party/scnlib/include/scn/unicode/utf16.h b/src/third-party/scnlib/include/scn/unicode/utf16.h new file mode 100644 index 0000000..8d8a400 --- /dev/null +++ b/src/third-party/scnlib/include/scn/unicode/utf16.h @@ -0,0 +1,139 @@ +// Copyright 2017 Elias Kosunen +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// This file is a part of scnlib: +// https://github.com/eliaskosunen/scnlib +// +// The contents of this file are based on utfcpp: +// https://github.com/nemtrif/utfcpp +// Copyright (c) 2006 Nemanja Trifunovic +// Distributed under the Boost Software License, version 1.0 + +#ifndef SCN_UNICODE_UTF16_H +#define SCN_UNICODE_UTF16_H + +#include "../detail/error.h" +#include "../util/expected.h" +#include "common.h" + +namespace scn { + SCN_BEGIN_NAMESPACE + + namespace detail { + namespace utf16 { + template <typename U16> + SCN_CONSTEXPR14 int get_sequence_length(U16 ch) + { + uint16_t lead = mask16(ch); + if (is_lead_surrogate(lead)) { + return 2; + } + if (SCN_UNLIKELY(is_trail_surrogate(lead))) { + return 0; + } + return 1; + } + + template <typename I, typename S> + SCN_CONSTEXPR14 error validate_next(I& it, S end, code_point& cp) + { + SCN_EXPECT(it != end); + + uint16_t lead = mask16(*it); + if (is_lead_surrogate(lead)) { + ++it; + if (it == end) { + return {error::invalid_encoding, + "Lone utf16 lead surrogate"}; + } + uint16_t trail = mask16(*it); + if (!is_trail_surrogate(trail)) { + return {error::invalid_encoding, + "Invalid utf16 trail surrogate"}; + } + ++it; + cp = static_cast<code_point>( + static_cast<uint32_t>(lead << 10u) + trail + + surrogate_offset); + return {}; + } + if (is_trail_surrogate(lead)) { + return {error::invalid_encoding, + "Lone utf16 trail surrogate"}; + } + + cp = static_cast<code_point>(*it); + ++it; + return {}; + } + + template <typename I, typename S> + SCN_CONSTEXPR14 expected<I> parse_code_point(I begin, + S end, + code_point& cp) + { + auto e = validate_next(begin, end, cp); + if (!e) { + return e; + } + return {begin}; + } + + template <typename I, typename S> + SCN_CONSTEXPR14 expected<I> encode_code_point(I begin, + S end, + code_point cp) + { + SCN_EXPECT(begin + 2 <= end); + + if (!is_valid_code_point(cp)) { + return error(error::invalid_encoding, + "Invalid code point, cannot encode in UTF-16"); + } + + if (cp > 0xffffu) { + *begin++ = static_cast<uint16_t>( + (static_cast<uint32_t>(cp) >> 10u) + lead_offset); + *begin++ = static_cast<uint16_t>( + (static_cast<uint32_t>(cp) & 0x3ffu) + + trail_surrogate_min); + } + else { + *begin++ = static_cast<uint16_t>(cp); + } + return {begin}; + } + + template <typename I, typename S> + SCN_CONSTEXPR14 expected<std::ptrdiff_t> code_point_distance( + I begin, + S end) + { + std::ptrdiff_t dist{}; + code_point cp{}; + for (; begin < end; ++dist) { + auto e = validate_next(begin, end, cp); + if (!e) { + return e; + } + } + return {dist}; + } + } // namespace utf16 + } // namespace detail + + SCN_END_NAMESPACE +} // namespace scn + +#endif diff --git a/src/third-party/scnlib/include/scn/unicode/utf8.h b/src/third-party/scnlib/include/scn/unicode/utf8.h new file mode 100644 index 0000000..d2ee54d --- /dev/null +++ b/src/third-party/scnlib/include/scn/unicode/utf8.h @@ -0,0 +1,297 @@ +// Copyright 2017 Elias Kosunen +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// This file is a part of scnlib: +// https://github.com/eliaskosunen/scnlib +// +// The contents of this file are based on utfcpp: +// https://github.com/nemtrif/utfcpp +// Copyright (c) 2006 Nemanja Trifunovic +// Distributed under the Boost Software License, version 1.0 + +#ifndef SCN_UNICODE_UTF8_H +#define SCN_UNICODE_UTF8_H + +#include "../detail/error.h" +#include "../util/expected.h" +#include "common.h" + +namespace scn { + SCN_BEGIN_NAMESPACE + + namespace detail { + namespace utf8 { + template <typename Octet> + constexpr bool is_trail(Octet o) + { + return (mask8(o) >> 6) == 2; + } + + template <typename Octet> + SCN_CONSTEXPR14 int get_sequence_length(Octet ch) + { + uint8_t lead = detail::mask8(ch); + if (lead < 0x80) { + return 1; + } + else if ((lead >> 5) == 6) { + return 2; + } + else if ((lead >> 4) == 0xe) { + return 3; + } + else if ((lead >> 3) == 0x1e) { + return 4; + } + return 0; + } + + SCN_CONSTEXPR14 bool is_overlong_sequence(code_point cp, + std::ptrdiff_t len) + { + if (cp < 0x80) { + if (len != 1) { + return true; + } + } + else if (cp < 0x800) { + if (len != 2) { + return true; + } + } + else if (cp < 0x10000) { + if (len != 3) { + return true; + } + } + + return false; + } + + template <typename I, typename S> + SCN_CONSTEXPR14 error increase_safely(I& it, S end) + { + if (++it == end) { + return {error::invalid_encoding, + "Unexpected end of range when decoding utf8 " + "(partial codepoint)"}; + } + if (!is_trail(*it)) { + return {error::invalid_encoding, + "Invalid utf8 codepoint parsed"}; + } + return {}; + } + + template <typename I, typename S> + SCN_CONSTEXPR14 error get_sequence_1(I& it, S end, code_point& cp) + { + SCN_EXPECT(it != end); + cp = make_code_point(mask8(*it)); + return {}; + } + template <typename I, typename S> + SCN_CONSTEXPR14 error get_sequence_2(I& it, S end, code_point& cp) + { + SCN_EXPECT(it != end); + uint32_t c = mask8(*it); + + auto e = increase_safely(it, end); + if (!e) { + return e; + } + + c = static_cast<uint32_t>((c << 6u) & 0x7ffu) + + (static_cast<uint32_t>(*it) & 0x3fu); + cp = make_code_point(c); + + return {}; + } + template <typename I, typename S> + SCN_CONSTEXPR14 error get_sequence_3(I& it, S end, code_point& cp) + { + SCN_EXPECT(it != end); + uint32_t c = mask8(*it); + + auto e = increase_safely(it, end); + if (!e) { + return e; + } + + c = static_cast<uint32_t>((c << 12u) & 0xffffu) + + (static_cast<uint32_t>(mask8(*it) << 6u) & 0xfffu); + + e = increase_safely(it, end); + if (!e) { + return e; + } + + c += static_cast<uint32_t>(*it) & 0x3fu; + cp = make_code_point(c); + + return {}; + } + template <typename I, typename S> + SCN_CONSTEXPR14 error get_sequence_4(I& it, S end, code_point& cp) + { + SCN_EXPECT(it != end); + uint32_t c = mask8(*it); + + auto e = increase_safely(it, end); + if (!e) { + return e; + } + + c = ((c << 18u) & 0x1fffffu) + + (static_cast<uint32_t>(mask8(*it) << 12u) & 0x3ffffu); + + e = increase_safely(it, end); + if (!e) { + return e; + } + + c += static_cast<uint32_t>(mask8(*it) << 6u) & 0xfffu; + + e = increase_safely(it, end); + if (!e) { + return e; + } + + c += static_cast<uint32_t>(*it) & 0x3fu; + cp = make_code_point(c); + + return {}; + } + + template <typename I, typename S> + SCN_CONSTEXPR14 error validate_next(I& it, S end, code_point& cp) + { + SCN_EXPECT(it != end); + + int len = get_sequence_length(*it); + error e{}; + switch (len) { + case 1: + e = get_sequence_1(it, end, cp); + break; + case 2: + e = get_sequence_2(it, end, cp); + break; + case 3: + e = get_sequence_3(it, end, cp); + break; + case 4: + e = get_sequence_4(it, end, cp); + break; + default: + return {error::invalid_encoding, + "Invalid lead byte for utf8"}; + } + + if (!e) { + return e; + } + if (!is_valid_code_point(cp) || is_overlong_sequence(cp, len)) { + return {error::invalid_encoding, "Invalid utf8 code point"}; + } + ++it; + return {}; + } + + template <typename I, typename S> + SCN_CONSTEXPR14 expected<I> parse_code_point(I begin, + S end, + code_point& cp) + { + code_point c{}; + auto e = validate_next(begin, end, c); + if (e) { + cp = c; + return {begin}; + } + return e; + } + + template <typename I> + I append(code_point cp, I it) + { + SCN_EXPECT(is_code_point_valid(cp)); + + if (cp < 0x80) { + *(it++) = static_cast<uint8_t>(cp); + } + else if (cp < 0x800) { + *(it++) = static_cast<uint8_t>( + (static_cast<uint32_t>(cp) >> 6u) | 0xc0u); + *(it++) = static_cast<uint8_t>( + (static_cast<uint32_t>(cp) & 0x3fu) | 0x80u); + } + else if (cp < 0x10000) { + *(it++) = static_cast<uint8_t>( + (static_cast<uint32_t>(cp) >> 12u) | 0xe0u); + *(it++) = static_cast<uint8_t>( + ((static_cast<uint32_t>(cp) >> 6u) & 0x3fu) | 0x80u); + *(it++) = static_cast<uint8_t>( + (static_cast<uint32_t>(cp) & 0x3fu) | 0x80u); + } + else { + *(it++) = static_cast<uint8_t>( + (static_cast<uint32_t>(cp) >> 18u) | 0xf0u); + *(it++) = static_cast<uint8_t>( + ((static_cast<uint32_t>(cp) >> 12u) & 0x3fu) | 0x80u); + *(it++) = static_cast<uint8_t>( + ((static_cast<uint32_t>(cp) >> 6u) & 0x3fu) | 0x80u); + *(it++) = static_cast<uint8_t>( + (static_cast<uint32_t>(cp) & 0x3fu) | 0x80u); + } + return it; + } + + template <typename I, typename S> + SCN_CONSTEXPR14 expected<I> encode_code_point(I begin, + S end, + code_point cp) + { + SCN_EXPECT(begin + 4 <= end); + + if (!is_code_point_valid(cp)) { + return error(error::invalid_encoding, + "Invalid code point, cannot encode in UTF-8"); + } + return {append(cp, begin)}; + } + + template <typename I, typename S> + SCN_CONSTEXPR14 expected<std::ptrdiff_t> code_point_distance( + I begin, + S end) + { + std::ptrdiff_t dist{}; + code_point cp{}; + for (; begin < end; ++dist) { + auto e = validate_next(begin, end, cp); + if (!e) { + return e; + } + } + return {dist}; + } + + } // namespace utf8 + } // namespace detail + + SCN_END_NAMESPACE +} // namespace scn + +#endif |