summaryrefslogtreecommitdiffstats
path: root/src/libixion/utf8.cpp
blob: a833ada92e65debde361ff5763c04a2af5243b60 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 */

#include "utf8.hpp"

#include <ixion/exceptions.hpp>
#include <sstream>
#include <limits>

namespace ixion { namespace detail {

namespace {

/**
 * Sentinel value returned by calc_utf8_byte_length() when the given byte is
 * not recognized as the first byte of a UTF-8 encoded character.
 */
constexpr uint8_t invalid_utf8_byte_length = std::numeric_limits<uint8_t>::max();

/**
 * Determine how many bytes a UTF-8 encoded character occupies by inspecting
 * the bit pattern of its first byte.
 *
 * Only the leading bit pattern is examined; continuation bytes are not seen
 * by this function, and lead bytes 0xF5-0xF7 are reported as 4-byte leads
 * since they share the 11110xxx prefix.
 *
 * @param c1 first byte of the encoded character.
 *
 * @return 1, 2, 3 or 4 when the byte carries the corresponding leading bit
 *         pattern, or invalid_utf8_byte_length otherwise (e.g. for a
 *         continuation byte of the form 10xxxxxx).
 */
uint8_t calc_utf8_byte_length(uint8_t c1)
{
    if ((c1 & 0x80) == 0x00)
        // highest bit is not set.
        return 1;

    if ((c1 & 0xE0) == 0xC0)
        // highest 3 bits are 110.
        return 2;

    if ((c1 & 0xF0) == 0xE0)
        // highest 4 bits are 1110.
        return 3;

    // The mask must cover exactly the top five bits.  A six-bit mask (0xFC)
    // would reject 0xF4, the lead byte for code points U+100000..U+10FFFF.
    if ((c1 & 0xF8) == 0xF0)
        // highest 5 bits are 11110.
        return 4;

    return invalid_utf8_byte_length;
}

}

/**
 * Compute the byte offset at which each UTF-8 encoded character in a string
 * begins.
 *
 * Only the first byte of each character is inspected; continuation bytes are
 * skipped over without being validated.  A multi-byte sequence that is cut
 * short by the end of the string is not reported as an error; the offset of
 * its first byte is still recorded.
 *
 * @param s string whose bytes are interpreted as UTF-8.
 *
 * @return byte offsets, relative to the start of the string, of the first
 *         byte of each character in order of appearance.  The result is
 *         empty when the string is empty.
 *
 * @throw general_error when the byte at a character boundary is not
 *        recognized as the first byte of a UTF-8 encoded character.
 */
std::vector<std::size_t> calc_utf8_byte_positions(const std::string& s)
{
    const std::size_t n_bytes = s.size();

    std::vector<std::size_t> positions;

    // Walk by byte offset rather than by raw pointer: when the last character
    // is truncated, advancing by its declared length overshoots the end of
    // the buffer, which is well-defined for an unsigned offset but undefined
    // behaviour for a pointer.
    std::size_t pos = 0;

    while (pos < n_bytes)
    {
        positions.push_back(pos);

        const uint8_t n = calc_utf8_byte_length(static_cast<uint8_t>(s[pos]));

        if (n == invalid_utf8_byte_length)
        {
            std::ostringstream os;
            os << "invalid utf8 byte length in string '" << s << "'";
            throw general_error(os.str());
        }

        pos += n;
    }

    return positions;
}

}}

/* vim:set shiftwidth=4 softtabstop=4 expandtab: */