/*
 * Copyright 2017 WebAssembly Community Group participants
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/utf8.h"

#include <cstdint>

namespace wabt {

namespace {

// Total length in bytes of the UTF-8 sequence introduced by each possible
// first byte. An entry of 0 marks a byte that can never start a sequence:
//   0x80..0xbf  continuation bytes,
//   0xc0..0xc1  lead bytes that could only produce overlong 2-byte forms,
//   0xf5..0xff  lead bytes that would encode values above U+10FFFF.
const int s_utf8_length[256] = {
    // 0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x00
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x10
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x20
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x30
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x40
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x50
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x60
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x70
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x80
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x90
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xa0
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xb0
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xc0
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xd0
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // 0xe0
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xf0
};

// Returns true if this is a valid continuation byte, i.e. its top two bits
// are 10 (0x80..0xbf).
bool IsCont(uint8_t c) {
  return (c & 0xc0) == 0x80;
}

}  // end anonymous namespace

// Returns true if the `s_length` bytes starting at `s` form well-formed UTF-8.
//
// The buffer is treated as raw bytes of the given length, not as a
// NUL-terminated string: a 0x00 byte is an ordinary one-byte sequence. An
// empty buffer is valid. Rejected inputs are: bytes that cannot start a
// sequence, sequences truncated by the end of the buffer, bad continuation
// bytes, overlong 3- and 4-byte encodings, UTF-16 surrogate halves
// (U+D800..U+DFFF) and code points above U+10FFFF.
bool IsValidUtf8(const char* s, size_t s_length) {
  const uint8_t* p = reinterpret_cast<const uint8_t*>(s);
  const uint8_t* end = p + s_length;

  while (p < end) {
    uint8_t cu0 = *p;
    int length = s_utf8_length[cu0];

    // Compare against the number of bytes left instead of computing
    // `p + length`: for a truncated sequence that pointer would lie beyond
    // one-past-the-end of the buffer, which is undefined behaviour.
    if (length > end - p) {
      return false;
    }

    switch (length) {
      case 0:
        // Not a legal first byte.
        return false;

      case 1:
        p++;
        break;

      case 2:
        p++;
        if (!IsCont(*p++)) {
          return false;
        }
        break;

      case 3: {
        p++;
        uint8_t cu1 = *p++;
        uint8_t cu2 = *p++;
        if (!(IsCont(cu1) && IsCont(cu2)) ||
            (cu0 == 0xe0 && cu1 < 0xa0) ||  // Overlong encoding.
            (cu0 == 0xed && cu1 >= 0xa0)) {  // UTF-16 surrogate halves.
          return false;
        }
        break;
      }

      case 4: {
        p++;
        uint8_t cu1 = *p++;
        uint8_t cu2 = *p++;
        uint8_t cu3 = *p++;
        if (!(IsCont(cu1) && IsCont(cu2) && IsCont(cu3)) ||
            (cu0 == 0xf0 && cu1 < 0x90) ||  // Overlong encoding.
            (cu0 == 0xf4 && cu1 >= 0x90)) {  // Code point >= 0x110000.
          return false;
        }
        break;
      }
    }
  }

  return true;
}

}  // namespace wabt