summaryrefslogtreecommitdiffstats
path: root/third_party/wasm2c/src/utf8.cc
blob: dd95c8c63dc9a3b9e23f0bc14938204e9bc80de2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
/*
 * Copyright 2017 WebAssembly Community Group participants
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "wabt/utf8.h"

#include <cstdint>

namespace wabt {

namespace {

// clang-format off
const int s_utf8_length[256] = {
 // 0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x00
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x10
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x20
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x30
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x40
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x50
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x60
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x70
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x80
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x90
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xa0
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xb0
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xc0
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xd0
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // 0xe0
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xf0
};
// clang-format on

// Returns true if this is a valid continuation byte.
bool IsCont(uint8_t c) {
  return (c & 0xc0) == 0x80;
}

}  // end anonymous namespace

bool IsValidUtf8(const char* s, size_t s_length) {
  const uint8_t* p = reinterpret_cast<const uint8_t*>(s);
  const uint8_t* end = p + s_length;
  while (p < end) {
    uint8_t cu0 = *p;
    int length = s_utf8_length[cu0];
    if (p + length > end) {
      return false;
    }

    switch (length) {
      case 0:
        return false;

      case 1:
        p++;
        break;

      case 2:
        p++;
        if (!IsCont(*p++)) {
          return false;
        }
        break;

      case 3: {
        p++;
        uint8_t cu1 = *p++;
        uint8_t cu2 = *p++;
        if (!(IsCont(cu1) && IsCont(cu2)) ||
            (cu0 == 0xe0 && cu1 < 0xa0) ||  // Overlong encoding.
            (cu0 == 0xed && cu1 >= 0xa0))   // UTF-16 surrogate halves.
          return false;
        break;
      }

      case 4: {
        p++;
        uint8_t cu1 = *p++;
        uint8_t cu2 = *p++;
        uint8_t cu3 = *p++;
        if (!(IsCont(cu1) && IsCont(cu2) && IsCont(cu3)) ||
            (cu0 == 0xf0 && cu1 < 0x90) ||  // Overlong encoding.
            (cu0 == 0xf4 && cu1 >= 0x90))   // Code point >= 0x11000.
          return false;
        break;
      }
    }
  }
  return true;
}

}  // namespace wabt