diff options
Diffstat (limited to 'third_party/wasm2c/src/wast-lexer.cc')
-rw-r--r-- | third_party/wasm2c/src/wast-lexer.cc | 626 |
1 files changed, 626 insertions, 0 deletions
diff --git a/third_party/wasm2c/src/wast-lexer.cc b/third_party/wasm2c/src/wast-lexer.cc new file mode 100644 index 0000000000..1f89c3ff47 --- /dev/null +++ b/third_party/wasm2c/src/wast-lexer.cc @@ -0,0 +1,626 @@ +/* + * Copyright 2016 WebAssembly Community Group participants + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "wabt/wast-lexer.h" + +#include <cassert> +#include <cstdio> + +#include "wabt/config.h" + +#include "wabt/lexer-source.h" + +#define ERROR(...) Error(GetLocation(), __VA_ARGS__) + +namespace wabt { + +namespace { + +#include "prebuilt/lexer-keywords.cc" + +} // namespace + +WastLexer::WastLexer(std::unique_ptr<LexerSource> source, + std::string_view filename, + Errors* errors) + : source_(std::move(source)), + filename_(filename), + line_(1), + buffer_(static_cast<const char*>(source_->data())), + buffer_end_(buffer_ + source_->size()), + line_start_(buffer_), + token_start_(buffer_), + cursor_(buffer_), + errors_(errors) {} + +// static +std::unique_ptr<WastLexer> WastLexer::CreateBufferLexer( + std::string_view filename, + const void* data, + size_t size, + Errors* errors) { + return std::make_unique<WastLexer>(std::make_unique<LexerSource>(data, size), + filename, errors); +} + +Token WastLexer::GetToken() { + while (true) { + token_start_ = cursor_; + switch (PeekChar()) { + case kEof: + return BareToken(TokenType::Eof); + + case '(': + if (MatchString("(;")) { + if (ReadBlockComment()) { + continue; + } + return BareToken(TokenType::Eof); + } else if (MatchString("(@")) { + GetIdChars(); + // offset=2 to skip the "(@" prefix + return TextToken(TokenType::LparAnn, 2); + } else { + ReadChar(); + return BareToken(TokenType::Lpar); + } + break; + + case ')': + ReadChar(); + return BareToken(TokenType::Rpar); + + case ';': + if (MatchString(";;")) { + if (ReadLineComment()) { + continue; + } + return BareToken(TokenType::Eof); + } else { + ReadChar(); + ERROR("unexpected char"); + continue; + } + break; + + case ' ': + case '\t': + case '\r': + case '\n': + ReadWhitespace(); + continue; + + case '"': + return GetStringToken(); + + case '+': + case '-': + ReadChar(); + switch (PeekChar()) { + case 'i': + return GetInfToken(); + + case 'n': + return GetNanToken(); + + case '0': + return MatchString("0x") ? GetHexNumberToken(TokenType::Int) + : GetNumberToken(TokenType::Int); + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + return GetNumberToken(TokenType::Int); + + default: + return GetReservedToken(); + } + break; + + case '0': + return MatchString("0x") ? GetHexNumberToken(TokenType::Nat) + : GetNumberToken(TokenType::Nat); + + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + return GetNumberToken(TokenType::Nat); + + case '$': + return GetIdChars(); // Initial $ is idchar, so this produces id token + + case 'a': + return GetNameEqNumToken("align=", TokenType::AlignEqNat); + + case 'i': + return GetInfToken(); + + case 'n': + return GetNanToken(); + + case 'o': + return GetNameEqNumToken("offset=", TokenType::OffsetEqNat); + + default: + if (IsKeyword(PeekChar())) { + return GetKeywordToken(); + } else if (IsIdChar(PeekChar())) { + return GetReservedToken(); + } else { + ReadChar(); + ERROR("unexpected char"); + continue; + } + } + } +} + +Location WastLexer::GetLocation() { + auto column = [=](const char* p) { + return std::max(1, static_cast<int>(p - line_start_ + 1)); + }; + return Location(filename_, line_, column(token_start_), column(cursor_)); +} + +std::string_view WastLexer::GetText(size_t offset) { + // Bounds checks are necessary because token_start may have been moved + // (e.g. if GetStringToken found a newline and reset token_start to + // point at it). + + if (token_start_ + offset >= buffer_end_) + return {}; + + if (cursor_ <= token_start_ + offset) + return {}; + + return std::string_view(token_start_ + offset, + (cursor_ - token_start_) - offset); +} + +Token WastLexer::BareToken(TokenType token_type) { + return Token(GetLocation(), token_type); +} + +Token WastLexer::LiteralToken(TokenType token_type, LiteralType literal_type) { + return Token(GetLocation(), token_type, Literal(literal_type, GetText())); +} + +Token WastLexer::TextToken(TokenType token_type, size_t offset) { + return Token(GetLocation(), token_type, GetText(offset)); +} + +int WastLexer::PeekChar() { + return cursor_ < buffer_end_ ? static_cast<uint8_t>(*cursor_) : kEof; +} + +int WastLexer::ReadChar() { + return cursor_ < buffer_end_ ? static_cast<uint8_t>(*cursor_++) : kEof; +} + +bool WastLexer::MatchChar(char c) { + if (PeekChar() == c) { + ReadChar(); + return true; + } + return false; +} + +bool WastLexer::MatchString(std::string_view s) { + const char* saved_cursor = cursor_; + for (char c : s) { + if (ReadChar() != c) { + cursor_ = saved_cursor; + return false; + } + } + return true; +} + +void WastLexer::Newline() { + line_++; + line_start_ = cursor_; +} + +bool WastLexer::ReadBlockComment() { + int nesting = 1; + while (true) { + switch (ReadChar()) { + case kEof: + ERROR("EOF in block comment"); + return false; + + case ';': + if (MatchChar(')') && --nesting == 0) { + return true; + } + break; + + case '(': + if (MatchChar(';')) { + nesting++; + } + break; + + case '\n': + Newline(); + break; + } + } +} + +bool WastLexer::ReadLineComment() { + while (true) { + switch (ReadChar()) { + case kEof: + return false; + + case '\n': + Newline(); + return true; + } + } +} + +void WastLexer::ReadWhitespace() { + while (true) { + switch (PeekChar()) { + case ' ': + case '\t': + case '\r': + ReadChar(); + break; + + case '\n': + ReadChar(); + Newline(); + break; + + default: + return; + } + } +} + +Token WastLexer::GetStringToken() { + const char* saved_token_start = token_start_; + bool has_error = false; + bool in_string = true; + ReadChar(); + while (in_string) { + switch (ReadChar()) { + case kEof: + return BareToken(TokenType::Eof); + + case '\n': + token_start_ = cursor_ - 1; + ERROR("newline in string"); + has_error = true; + Newline(); + continue; + + case '"': + if (PeekChar() == '"') { + ERROR("invalid string token"); + has_error = true; + } + in_string = false; + break; + + case '\\': { + switch (ReadChar()) { + case 't': + case 'n': + case 'r': + case '"': + case '\'': + case '\\': + // Valid escape. + break; + + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + case 'a': + case 'b': + case 'c': + case 'd': + case 'e': + case 'f': + case 'A': + case 'B': + case 'C': + case 'D': + case 'E': + case 'F': // Hex byte escape. + if (IsHexDigit(PeekChar())) { + ReadChar(); + } else { + token_start_ = cursor_ - 2; + goto error; + } + break; + + case 'u': { + token_start_ = cursor_ - 2; + if (ReadChar() != '{') { + goto error; + } + + // Value must be a valid unicode scalar value. + uint32_t digit; + uint32_t scalar_value = 0; + + while (IsHexDigit(PeekChar())) { + ParseHexdigit(*cursor_++, &digit); + + scalar_value = (scalar_value << 4) | digit; + // Maximum value of a unicode code point. + if (scalar_value >= 0x110000) { + goto error; + } + } + + if (PeekChar() != '}') { + goto error; + } + + // Scalars between 0xd800 and 0xdfff are not allowed. + if ((scalar_value >= 0xd800 && scalar_value < 0xe000) || + token_start_ == cursor_ - 3) { + ReadChar(); + goto error; + } + break; + } + + default: + token_start_ = cursor_ - 2; + goto error; + + error: + ERROR("bad escape \"%.*s\"", + static_cast<int>(cursor_ - token_start_), token_start_); + has_error = true; + break; + } + break; + } + } + } + token_start_ = saved_token_start; + if (has_error) { + return Token(GetLocation(), TokenType::Invalid); + } + + return TextToken(TokenType::Text); +} + +// static +bool WastLexer::IsCharClass(int c, CharClass bit) { + // Generated by the following python script: + // + // def Range(c, lo, hi): return lo <= c <= hi + // def IsDigit(c): return Range(c, '0', '9') + // def IsHexDigit(c): return IsDigit(c) or Range(c.lower(), 'a', 'f') + // def IsKeyword(c): return Range(c, 'a', 'z') + // def IsIdChar(c): return Range(c, '!', '~') and c not in '"(),;[]{}' + // + // print ([0] + [ + // (8 if IsDigit(c) else 0) | + // (4 if IsHexDigit(c) else 0) | + // (2 if IsKeyword(c) else 0) | + // (1 if IsIdChar(c) else 0) + // for c in map(chr, range(0, 127)) + // ]) + static const char kCharClasses[257] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, + 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 1, 0, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, + 1, 1, 1, 7, 7, 7, 7, 7, 7, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 1, 0, 1, + }; + + assert(c >= -1 && c < 256); + return (kCharClasses[c + 1] & static_cast<int>(bit)) != 0; +} + +bool WastLexer::ReadNum() { + if (IsDigit(PeekChar())) { + ReadChar(); + return MatchChar('_') || IsDigit(PeekChar()) ? ReadNum() : true; + } + return false; +} + +bool WastLexer::ReadHexNum() { + if (IsHexDigit(PeekChar())) { + ReadChar(); + return MatchChar('_') || IsHexDigit(PeekChar()) ? ReadHexNum() : true; + } + return false; +} + +WastLexer::ReservedChars WastLexer::ReadReservedChars() { + ReservedChars ret{ReservedChars::None}; + while (true) { + auto peek = PeekChar(); + if (IsIdChar(peek)) { + ReadChar(); + if (ret == ReservedChars::None) { + ret = ReservedChars::Id; + } + } else if (peek == '"') { + GetStringToken(); + ret = ReservedChars::Some; + } else { + break; + } + } + return ret; +} + +void WastLexer::ReadSign() { + if (PeekChar() == '+' || PeekChar() == '-') { + ReadChar(); + } +} + +Token WastLexer::GetNumberToken(TokenType token_type) { + if (ReadNum()) { + if (MatchChar('.')) { + token_type = TokenType::Float; + if (IsDigit(PeekChar()) && !ReadNum()) { + return GetReservedToken(); + } + } + if (MatchChar('e') || MatchChar('E')) { + token_type = TokenType::Float; + ReadSign(); + if (!ReadNum()) { + return GetReservedToken(); + } + } + if (NoTrailingReservedChars()) { + if (token_type == TokenType::Float) { + return LiteralToken(token_type, LiteralType::Float); + } else { + return LiteralToken(token_type, LiteralType::Int); + } + } + } + return GetReservedToken(); +} + +Token WastLexer::GetHexNumberToken(TokenType token_type) { + if (ReadHexNum()) { + if (MatchChar('.')) { + token_type = TokenType::Float; + if (IsHexDigit(PeekChar()) && !ReadHexNum()) { + return GetReservedToken(); + } + } + if (MatchChar('p') || MatchChar('P')) { + token_type = TokenType::Float; + ReadSign(); + if (!ReadNum()) { + return GetReservedToken(); + } + } + if (NoTrailingReservedChars()) { + if (token_type == TokenType::Float) { + return LiteralToken(token_type, LiteralType::Hexfloat); + } else { + return LiteralToken(token_type, LiteralType::Int); + } + } + } + return GetReservedToken(); +} + +Token WastLexer::GetInfToken() { + if (MatchString("inf")) { + if (NoTrailingReservedChars()) { + return LiteralToken(TokenType::Float, LiteralType::Infinity); + } + return GetReservedToken(); + } + return GetKeywordToken(); +} + +Token WastLexer::GetNanToken() { + if (MatchString("nan")) { + if (MatchChar(':')) { + if (MatchString("0x") && ReadHexNum() && NoTrailingReservedChars()) { + return LiteralToken(TokenType::Float, LiteralType::Nan); + } + } else if (NoTrailingReservedChars()) { + return LiteralToken(TokenType::Float, LiteralType::Nan); + } + } + return GetKeywordToken(); +} + +Token WastLexer::GetNameEqNumToken(std::string_view name, + TokenType token_type) { + if (MatchString(name)) { + if (MatchString("0x")) { + if (ReadHexNum() && NoTrailingReservedChars()) { + return TextToken(token_type, name.size()); + } + } else if (ReadNum() && NoTrailingReservedChars()) { + return TextToken(token_type, name.size()); + } + } + return GetKeywordToken(); +} + +Token WastLexer::GetIdChars() { + if (ReadReservedChars() == ReservedChars::Id) { + return TextToken(TokenType::Var); + } + + return TextToken(TokenType::Reserved); +} + +Token WastLexer::GetKeywordToken() { + ReadReservedChars(); + TokenInfo* info = + Perfect_Hash::InWordSet(token_start_, cursor_ - token_start_); + if (!info) { + return TextToken(TokenType::Reserved); + } + if (IsTokenTypeBare(info->token_type)) { + return BareToken(info->token_type); + } else if (IsTokenTypeType(info->token_type) || + IsTokenTypeRefKind(info->token_type)) { + return Token(GetLocation(), info->token_type, info->value_type); + } else { + assert(IsTokenTypeOpcode(info->token_type)); + return Token(GetLocation(), info->token_type, info->opcode); + } +} + +Token WastLexer::GetReservedToken() { + ReadReservedChars(); + return TextToken(TokenType::Reserved); +} + +void WastLexer::Error(Location loc, const char* format, ...) { + WABT_SNPRINTF_ALLOCA(buffer, length, format); + errors_->emplace_back(ErrorLevel::Error, loc, buffer); +} + +} // namespace wabt |