diff options
Diffstat (limited to 'third_party/wasm2c/src/utf8.cc')
-rw-r--r-- | third_party/wasm2c/src/utf8.cc | 106 |
1 files changed, 106 insertions, 0 deletions
diff --git a/third_party/wasm2c/src/utf8.cc b/third_party/wasm2c/src/utf8.cc new file mode 100644 index 0000000000..dd95c8c63d --- /dev/null +++ b/third_party/wasm2c/src/utf8.cc @@ -0,0 +1,106 @@ +/* + * Copyright 2017 WebAssembly Community Group participants + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "wabt/utf8.h" + +#include <cstdint> + +namespace wabt { + +namespace { + +// clang-format off +const int s_utf8_length[256] = { + // 0 1 2 3 4 5 6 7 8 9 a b c d e f + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xa0 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xb0 + 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xc0 + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xd0 + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xe0 + 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xf0 +}; +// clang-format on + +// Returns true if this is a valid continuation byte. +bool IsCont(uint8_t c) { + return (c & 0xc0) == 0x80; +} + +} // end anonymous namespace + +bool IsValidUtf8(const char* s, size_t s_length) { + const uint8_t* p = reinterpret_cast<const uint8_t*>(s); + const uint8_t* end = p + s_length; + while (p < end) { + uint8_t cu0 = *p; + int length = s_utf8_length[cu0]; + if (p + length > end) { + return false; + } + + switch (length) { + case 0: + return false; + + case 1: + p++; + break; + + case 2: + p++; + if (!IsCont(*p++)) { + return false; + } + break; + + case 3: { + p++; + uint8_t cu1 = *p++; + uint8_t cu2 = *p++; + if (!(IsCont(cu1) && IsCont(cu2)) || + (cu0 == 0xe0 && cu1 < 0xa0) || // Overlong encoding. + (cu0 == 0xed && cu1 >= 0xa0)) // UTF-16 surrogate halves. + return false; + break; + } + + case 4: { + p++; + uint8_t cu1 = *p++; + uint8_t cu2 = *p++; + uint8_t cu3 = *p++; + if (!(IsCont(cu1) && IsCont(cu2) && IsCont(cu3)) || + (cu0 == 0xf0 && cu1 < 0x90) || // Overlong encoding. + (cu0 == 0xf4 && cu1 >= 0x90)) // Code point >= 0x11000. + return false; + break; + } + } + } + return true; +} + +} // namespace wabt |