summaryrefslogtreecommitdiffstats
path: root/src/libixion/utf8.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/libixion/utf8.cpp')
-rw-r--r--src/libixion/utf8.cpp72
1 files changed, 72 insertions, 0 deletions
diff --git a/src/libixion/utf8.cpp b/src/libixion/utf8.cpp
new file mode 100644
index 0000000..a833ada
--- /dev/null
+++ b/src/libixion/utf8.cpp
@@ -0,0 +1,72 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include "utf8.hpp"
+
+#include <ixion/exceptions.hpp>
+#include <sstream>
+#include <limits>
+
+namespace ixion { namespace detail {
+
+namespace {
+
+/**
+ * Sentinel value returned by calc_utf8_byte_length() when the given byte
+ * matches none of the lead-byte patterns it checks.  It is the largest value
+ * a uint8_t can hold, so it cannot collide with a real length (1 to 4).
+ */
+constexpr uint8_t invalid_utf8_byte_length = std::numeric_limits<uint8_t>::max();
+
+/**
+ * Determine the length in bytes of a UTF-8 encoded character from its first
+ * (lead) byte.
+ *
+ * Only the high-bit pattern of the lead byte is inspected; no further
+ * validation is done here (e.g. 0xF5..0xF7 still match the 4-byte pattern).
+ *
+ * @param c1 first byte of the encoded character.
+ *
+ * @return 1, 2, 3 or 4 when c1 matches the pattern 0xxxxxxx, 110xxxxx,
+ *         1110xxxx or 11110xxx respectively; invalid_utf8_byte_length for
+ *         any other byte (i.e. 10xxxxxx and 11111xxx).
+ */
+uint8_t calc_utf8_byte_length(uint8_t c1)
+{
+    if ((c1 & 0x80) == 0x00)
+        // highest bit is not set.
+        return 1;
+
+    if ((c1 & 0xE0) == 0xC0)
+        // highest 3 bits are 110.
+        return 2;
+
+    if ((c1 & 0xF0) == 0xE0)
+        // highest 4 bits are 1110.
+        return 3;
+
+    // The mask must cover exactly the top 5 bits (0xF8).  A 0xFC mask would
+    // additionally require the 6th bit to be clear and thereby reject 0xF4,
+    // the lead byte of code points U+100000..U+10FFFF.
+    if ((c1 & 0xF8) == 0xF0)
+        // highest 5 bits are 11110.
+        return 4;
+
+    return invalid_utf8_byte_length;
+}
+
+}
+
+/**
+ * Compute the byte offset at which each UTF-8 encoded character in a string
+ * starts.
+ *
+ * The string is walked from its first byte; at each step the current offset
+ * is recorded and the cursor is advanced by the character length derived
+ * from the lead byte.  Continuation bytes are skipped without being
+ * inspected.
+ *
+ * @param s string to scan.
+ *
+ * @return byte offsets (relative to the start of s) of every character, in
+ *         ascending order.  Empty when s is empty.
+ *
+ * @throws general_error when a lead byte does not match any recognized
+ *         UTF-8 pattern, or when the last character is truncated, i.e. its
+ *         lead byte announces more bytes than remain in the string.
+ */
+std::vector<std::size_t> calc_utf8_byte_positions(const std::string& s)
+{
+    const char* const p0 = s.data(); // head position
+    const char* const p_end = p0 + s.size();
+
+    std::vector<std::size_t> positions;
+
+    for (const char* p = p0; p < p_end;)
+    {
+        positions.push_back(static_cast<std::size_t>(p - p0));
+
+        // Convert explicitly: plain char may be signed.
+        const uint8_t n = calc_utf8_byte_length(static_cast<uint8_t>(*p));
+
+        if (n == invalid_utf8_byte_length)
+        {
+            std::ostringstream os;
+            os << "invalid utf8 byte length in string '" << s << "'";
+            throw general_error(os.str());
+        }
+
+        // Refuse to step beyond the end of the buffer: forming a pointer
+        // past one-past-the-end is undefined behaviour, and a sequence cut
+        // short is malformed input anyway.
+        if (static_cast<std::size_t>(p_end - p) < n)
+        {
+            std::ostringstream os;
+            os << "truncated utf8 sequence in string '" << s << "'";
+            throw general_error(os.str());
+        }
+
+        p += n;
+    }
+
+    return positions;
+}
+
+}}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */