/* * Copyright 2016 WebAssembly Community Group participants * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "wabt/wast-lexer.h" #include <cassert> #include <cstdio> #include "wabt/config.h" #include "wabt/lexer-source.h" #define ERROR(...) Error(GetLocation(), __VA_ARGS__) namespace wabt { namespace { #include "prebuilt/lexer-keywords.cc" } // namespace WastLexer::WastLexer(std::unique_ptr<LexerSource> source, std::string_view filename, Errors* errors) : source_(std::move(source)), filename_(filename), line_(1), buffer_(static_cast<const char*>(source_->data())), buffer_end_(buffer_ + source_->size()), line_start_(buffer_), token_start_(buffer_), cursor_(buffer_), errors_(errors) {} // static std::unique_ptr<WastLexer> WastLexer::CreateBufferLexer( std::string_view filename, const void* data, size_t size, Errors* errors) { return std::make_unique<WastLexer>(std::make_unique<LexerSource>(data, size), filename, errors); } Token WastLexer::GetToken() { while (true) { token_start_ = cursor_; switch (PeekChar()) { case kEof: return BareToken(TokenType::Eof); case '(': if (MatchString("(;")) { if (ReadBlockComment()) { continue; } return BareToken(TokenType::Eof); } else if (MatchString("(@")) { GetIdChars(); // offset=2 to skip the "(@" prefix return TextToken(TokenType::LparAnn, 2); } else { ReadChar(); return BareToken(TokenType::Lpar); } break; case ')': ReadChar(); return BareToken(TokenType::Rpar); case ';': if (MatchString(";;")) { if (ReadLineComment()) { continue; } return BareToken(TokenType::Eof); } else { ReadChar(); ERROR("unexpected char"); continue; } break; case ' ': case '\t': case '\r': case '\n': ReadWhitespace(); continue; case '"': return GetStringToken(); case '+': case '-': ReadChar(); switch (PeekChar()) { case 'i': return GetInfToken(); case 'n': return GetNanToken(); case '0': return MatchString("0x") ? GetHexNumberToken(TokenType::Int) : GetNumberToken(TokenType::Int); case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': return GetNumberToken(TokenType::Int); default: return GetReservedToken(); } break; case '0': return MatchString("0x") ? GetHexNumberToken(TokenType::Nat) : GetNumberToken(TokenType::Nat); case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': return GetNumberToken(TokenType::Nat); case '$': return GetIdChars(); // Initial $ is idchar, so this produces id token case 'a': return GetNameEqNumToken("align=", TokenType::AlignEqNat); case 'i': return GetInfToken(); case 'n': return GetNanToken(); case 'o': return GetNameEqNumToken("offset=", TokenType::OffsetEqNat); default: if (IsKeyword(PeekChar())) { return GetKeywordToken(); } else if (IsIdChar(PeekChar())) { return GetReservedToken(); } else { ReadChar(); ERROR("unexpected char"); continue; } } } } Location WastLexer::GetLocation() { auto column = [=](const char* p) { return std::max(1, static_cast<int>(p - line_start_ + 1)); }; return Location(filename_, line_, column(token_start_), column(cursor_)); } std::string_view WastLexer::GetText(size_t offset) { // Bounds checks are necessary because token_start may have been moved // (e.g. if GetStringToken found a newline and reset token_start to // point at it). if (token_start_ + offset >= buffer_end_) return {}; if (cursor_ <= token_start_ + offset) return {}; return std::string_view(token_start_ + offset, (cursor_ - token_start_) - offset); } Token WastLexer::BareToken(TokenType token_type) { return Token(GetLocation(), token_type); } Token WastLexer::LiteralToken(TokenType token_type, LiteralType literal_type) { return Token(GetLocation(), token_type, Literal(literal_type, GetText())); } Token WastLexer::TextToken(TokenType token_type, size_t offset) { return Token(GetLocation(), token_type, GetText(offset)); } int WastLexer::PeekChar() { return cursor_ < buffer_end_ ? static_cast<uint8_t>(*cursor_) : kEof; } int WastLexer::ReadChar() { return cursor_ < buffer_end_ ? static_cast<uint8_t>(*cursor_++) : kEof; } bool WastLexer::MatchChar(char c) { if (PeekChar() == c) { ReadChar(); return true; } return false; } bool WastLexer::MatchString(std::string_view s) { const char* saved_cursor = cursor_; for (char c : s) { if (ReadChar() != c) { cursor_ = saved_cursor; return false; } } return true; } void WastLexer::Newline() { line_++; line_start_ = cursor_; } bool WastLexer::ReadBlockComment() { int nesting = 1; while (true) { switch (ReadChar()) { case kEof: ERROR("EOF in block comment"); return false; case ';': if (MatchChar(')') && --nesting == 0) { return true; } break; case '(': if (MatchChar(';')) { nesting++; } break; case '\n': Newline(); break; } } } bool WastLexer::ReadLineComment() { while (true) { switch (ReadChar()) { case kEof: return false; case '\n': Newline(); return true; } } } void WastLexer::ReadWhitespace() { while (true) { switch (PeekChar()) { case ' ': case '\t': case '\r': ReadChar(); break; case '\n': ReadChar(); Newline(); break; default: return; } } } Token WastLexer::GetStringToken() { const char* saved_token_start = token_start_; bool has_error = false; bool in_string = true; ReadChar(); while (in_string) { switch (ReadChar()) { case kEof: return BareToken(TokenType::Eof); case '\n': token_start_ = cursor_ - 1; ERROR("newline in string"); has_error = true; Newline(); continue; case '"': if (PeekChar() == '"') { ERROR("invalid string token"); has_error = true; } in_string = false; break; case '\\': { switch (ReadChar()) { case 't': case 'n': case 'r': case '"': case '\'': case '\\': // Valid escape. break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': // Hex byte escape. if (IsHexDigit(PeekChar())) { ReadChar(); } else { token_start_ = cursor_ - 2; goto error; } break; case 'u': { token_start_ = cursor_ - 2; if (ReadChar() != '{') { goto error; } // Value must be a valid unicode scalar value. uint32_t digit; uint32_t scalar_value = 0; while (IsHexDigit(PeekChar())) { ParseHexdigit(*cursor_++, &digit); scalar_value = (scalar_value << 4) | digit; // Maximum value of a unicode code point. if (scalar_value >= 0x110000) { goto error; } } if (PeekChar() != '}') { goto error; } // Scalars between 0xd800 and 0xdfff are not allowed. if ((scalar_value >= 0xd800 && scalar_value < 0xe000) || token_start_ == cursor_ - 3) { ReadChar(); goto error; } break; } default: token_start_ = cursor_ - 2; goto error; error: ERROR("bad escape \"%.*s\"", static_cast<int>(cursor_ - token_start_), token_start_); has_error = true; break; } break; } } } token_start_ = saved_token_start; if (has_error) { return Token(GetLocation(), TokenType::Invalid); } return TextToken(TokenType::Text); } // static bool WastLexer::IsCharClass(int c, CharClass bit) { // Generated by the following python script: // // def Range(c, lo, hi): return lo <= c <= hi // def IsDigit(c): return Range(c, '0', '9') // def IsHexDigit(c): return IsDigit(c) or Range(c.lower(), 'a', 'f') // def IsKeyword(c): return Range(c, 'a', 'z') // def IsIdChar(c): return Range(c, '!', '~') and c not in '"(),;[]{}' // // print ([0] + [ // (8 if IsDigit(c) else 0) | // (4 if IsHexDigit(c) else 0) | // (2 if IsKeyword(c) else 0) | // (1 if IsIdChar(c) else 0) // for c in map(chr, range(0, 127)) // ]) static const char kCharClasses[257] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 1, 0, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 7, 7, 7, 7, 7, 7, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 1, 0, 1, }; assert(c >= -1 && c < 256); return (kCharClasses[c + 1] & static_cast<int>(bit)) != 0; } bool WastLexer::ReadNum() { if (IsDigit(PeekChar())) { ReadChar(); return MatchChar('_') || IsDigit(PeekChar()) ? ReadNum() : true; } return false; } bool WastLexer::ReadHexNum() { if (IsHexDigit(PeekChar())) { ReadChar(); return MatchChar('_') || IsHexDigit(PeekChar()) ? ReadHexNum() : true; } return false; } WastLexer::ReservedChars WastLexer::ReadReservedChars() { ReservedChars ret{ReservedChars::None}; while (true) { auto peek = PeekChar(); if (IsIdChar(peek)) { ReadChar(); if (ret == ReservedChars::None) { ret = ReservedChars::Id; } } else if (peek == '"') { GetStringToken(); ret = ReservedChars::Some; } else { break; } } return ret; } void WastLexer::ReadSign() { if (PeekChar() == '+' || PeekChar() == '-') { ReadChar(); } } Token WastLexer::GetNumberToken(TokenType token_type) { if (ReadNum()) { if (MatchChar('.')) { token_type = TokenType::Float; if (IsDigit(PeekChar()) && !ReadNum()) { return GetReservedToken(); } } if (MatchChar('e') || MatchChar('E')) { token_type = TokenType::Float; ReadSign(); if (!ReadNum()) { return GetReservedToken(); } } if (NoTrailingReservedChars()) { if (token_type == TokenType::Float) { return LiteralToken(token_type, LiteralType::Float); } else { return LiteralToken(token_type, LiteralType::Int); } } } return GetReservedToken(); } Token WastLexer::GetHexNumberToken(TokenType token_type) { if (ReadHexNum()) { if (MatchChar('.')) { token_type = TokenType::Float; if (IsHexDigit(PeekChar()) && !ReadHexNum()) { return GetReservedToken(); } } if (MatchChar('p') || MatchChar('P')) { token_type = TokenType::Float; ReadSign(); if (!ReadNum()) { return GetReservedToken(); } } if (NoTrailingReservedChars()) { if (token_type == TokenType::Float) { return LiteralToken(token_type, LiteralType::Hexfloat); } else { return LiteralToken(token_type, LiteralType::Int); } } } return GetReservedToken(); } Token WastLexer::GetInfToken() { if (MatchString("inf")) { if (NoTrailingReservedChars()) { return LiteralToken(TokenType::Float, LiteralType::Infinity); } return GetReservedToken(); } return GetKeywordToken(); } Token WastLexer::GetNanToken() { if (MatchString("nan")) { if (MatchChar(':')) { if (MatchString("0x") && ReadHexNum() && NoTrailingReservedChars()) { return LiteralToken(TokenType::Float, LiteralType::Nan); } } else if (NoTrailingReservedChars()) { return LiteralToken(TokenType::Float, LiteralType::Nan); } } return GetKeywordToken(); } Token WastLexer::GetNameEqNumToken(std::string_view name, TokenType token_type) { if (MatchString(name)) { if (MatchString("0x")) { if (ReadHexNum() && NoTrailingReservedChars()) { return TextToken(token_type, name.size()); } } else if (ReadNum() && NoTrailingReservedChars()) { return TextToken(token_type, name.size()); } } return GetKeywordToken(); } Token WastLexer::GetIdChars() { if (ReadReservedChars() == ReservedChars::Id) { return TextToken(TokenType::Var); } return TextToken(TokenType::Reserved); } Token WastLexer::GetKeywordToken() { ReadReservedChars(); TokenInfo* info = Perfect_Hash::InWordSet(token_start_, cursor_ - token_start_); if (!info) { return TextToken(TokenType::Reserved); } if (IsTokenTypeBare(info->token_type)) { return BareToken(info->token_type); } else if (IsTokenTypeType(info->token_type) || IsTokenTypeRefKind(info->token_type)) { return Token(GetLocation(), info->token_type, info->value_type); } else { assert(IsTokenTypeOpcode(info->token_type)); return Token(GetLocation(), info->token_type, info->opcode); } } Token WastLexer::GetReservedToken() { ReadReservedChars(); return TextToken(TokenType::Reserved); } void WastLexer::Error(Location loc, const char* format, ...) { WABT_SNPRINTF_ALLOCA(buffer, length, format); errors_->emplace_back(ErrorLevel::Error, loc, buffer); } } // namespace wabt