//! Definition of a lexer for the WebAssembly text format. //! //! This module provides a [`Lexer`][] type which is an iterate over the raw //! tokens of a WebAssembly text file. A [`Lexer`][] accounts for every single //! byte in a WebAssembly text field, returning tokens even for comments and //! whitespace. Typically you'll ignore comments and whitespace, however. //! //! If you'd like to iterate over the tokens in a file you can do so via: //! //! ``` //! # fn foo() -> Result<(), wast::Error> { //! use wast::lexer::Lexer; //! //! let wat = "(module (func $foo))"; //! for token in Lexer::new(wat) { //! println!("{:?}", token?); //! } //! # Ok(()) //! # } //! ``` //! //! Note that you'll typically not use this module but will rather use //! [`ParseBuffer`](crate::parser::ParseBuffer) instead. //! //! [`Lexer`]: crate::lexer::Lexer use crate::token::Span; use crate::Error; use std::borrow::Cow; use std::char; use std::fmt; use std::str; /// A structure used to lex the s-expression syntax of WAT files. /// /// This structure is used to generate [`Token`] items, which should account for /// every single byte of the input as we iterate over it. A [`LexError`] is /// returned for any non-lexable text. #[derive(Clone)] pub struct Lexer<'a> { remaining: &'a str, input: &'a str, allow_confusing_unicode: bool, } /// A fragment of source lex'd from an input string. /// /// This enumeration contains all kinds of fragments, including comments and /// whitespace. For most cases you'll probably ignore these and simply look at /// tokens. #[derive(Debug, PartialEq)] pub enum Token<'a> { /// A line comment, preceded with `;;` LineComment(&'a str), /// A block comment, surrounded by `(;` and `;)`. Note that these can be /// nested. BlockComment(&'a str), /// A fragment of source that represents whitespace. Whitespace(&'a str), /// A left-parenthesis, including the source text for where it comes from. LParen(&'a str), /// A right-parenthesis, including the source text for where it comes from. RParen(&'a str), /// A string literal, which is actually a list of bytes. String(WasmString<'a>), /// An identifier (like `$foo`). /// /// All identifiers start with `$` and the payload here is the original /// source text. Id(&'a str), /// A keyword, or something that starts with an alphabetic character. /// /// The payload here is the original source text. Keyword(&'a str), /// A reserved series of `idchar` symbols. Unknown what this is meant to be /// used for, you'll probably generate an error about an unexpected token. Reserved(&'a str), /// An integer. Integer(Integer<'a>), /// A float. Float(Float<'a>), } enum ReservedKind<'a> { String(Cow<'a, [u8]>), Idchars, Reserved, } /// Errors that can be generated while lexing. /// /// All lexing errors have line/colum/position information as well as a /// `LexError` indicating what kind of error happened while lexing. #[derive(Debug, Clone, PartialEq, Eq)] #[non_exhaustive] pub enum LexError { /// A dangling block comment was found with an unbalanced `(;` which was /// never terminated in the file. DanglingBlockComment, /// An unexpected character was encountered when generally parsing and /// looking for something else. Unexpected(char), /// An invalid `char` in a string literal was found. InvalidStringElement(char), /// An invalid string escape letter was found (the thing after the `\` in /// string literals) InvalidStringEscape(char), /// An invalid hexadecimal digit was found. InvalidHexDigit(char), /// An invalid base-10 digit was found. InvalidDigit(char), /// Parsing expected `wanted` but ended up finding `found` instead where the /// two characters aren't the same. Expected { /// The character that was expected to be found wanted: char, /// The character that was actually found found: char, }, /// We needed to parse more but EOF (or end of the string) was encountered. UnexpectedEof, /// A number failed to parse because it was too big to fit within the target /// type. NumberTooBig, /// An invalid unicode value was found in a `\u{...}` escape in a string, /// only valid unicode scalars can be escaped that way. InvalidUnicodeValue(u32), /// A lone underscore was found when parsing a number, since underscores /// should always be preceded and succeeded with a digit of some form. LoneUnderscore, /// A "confusing" unicode character is present in a comment or a string /// literal, such as a character that changes the direction text is /// typically displayed in editors. This could cause the human-read /// version to behave differently than the compiler-visible version, so /// these are simply rejected for now. ConfusingUnicode(char), } /// A sign token for an integer. #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum SignToken { /// Plus sign: "+", Plus, /// Minus sign: "-", Minus, } /// A parsed integer, signed or unsigned. /// /// Methods can be use to access the value of the integer. #[derive(Debug, PartialEq)] pub struct Integer<'a>(Box>); #[derive(Debug, PartialEq)] struct IntegerInner<'a> { sign: Option, src: &'a str, val: Cow<'a, str>, hex: bool, } /// A parsed float. /// /// Methods can be use to access the value of the float. #[derive(Debug, PartialEq)] pub struct Float<'a>(Box>); #[derive(Debug, PartialEq)] struct FloatInner<'a> { src: &'a str, val: FloatVal<'a>, } /// A parsed string. #[derive(Debug, PartialEq)] pub struct WasmString<'a>(Box>); #[derive(Debug, PartialEq)] struct WasmStringInner<'a> { src: &'a str, val: Cow<'a, [u8]>, } /// Possible parsed float values #[derive(Debug, PartialEq, Eq)] pub enum FloatVal<'a> { /// A float `NaN` representation Nan { /// The specific bits to encode for this float, optionally val: Option, /// Whether or not this is a negative `NaN` or not. negative: bool, }, /// An float infinite representation, Inf { #[allow(missing_docs)] negative: bool, }, /// A parsed and separated floating point value Val { /// Whether or not the `integral` and `decimal` are specified in hex hex: bool, /// The float parts before the `.` integral: Cow<'a, str>, /// The float parts after the `.` decimal: Option>, /// The exponent to multiple this `integral.decimal` portion of the /// float by. If `hex` is true this is `2^exponent` and otherwise it's /// `10^exponent` exponent: Option>, }, } // https://webassembly.github.io/spec/core/text/values.html#text-idchar macro_rules! idchars { () => { b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z' | b'!' | b'#' | b'$' | b'%' | b'&' | b'\'' | b'*' | b'+' | b'-' | b'.' | b'/' | b':' | b'<' | b'=' | b'>' | b'?' | b'@' | b'\\' | b'^' | b'_' | b'`' | b'|' | b'~' } } impl<'a> Lexer<'a> { /// Creates a new lexer which will lex the `input` source string. pub fn new(input: &str) -> Lexer<'_> { Lexer { remaining: input, input, allow_confusing_unicode: false, } } /// Returns the original source input that we're lexing. pub fn input(&self) -> &'a str { self.input } /// Configures whether "confusing" unicode characters are allowed while /// lexing. /// /// If allowed then no error will happen if these characters are found, but /// otherwise if disallowed a lex error will be produced when these /// characters are found. Confusing characters are denied by default. /// /// For now "confusing characters" are primarily related to the "trojan /// source" problem where it refers to characters which cause humans to read /// text differently than this lexer, such as characters that alter the /// left-to-right display of the source code. pub fn allow_confusing_unicode(&mut self, allow: bool) -> &mut Self { self.allow_confusing_unicode = allow; self } /// Lexes the next token in the input. /// /// Returns `Some` if a token is found or `None` if we're at EOF. /// /// # Errors /// /// Returns an error if the input is malformed. pub fn parse(&mut self) -> Result>, Error> { let pos = self.cur(); // This `match` generally parses the grammar specified at // // https://webassembly.github.io/spec/core/text/lexical.html#text-token let byte = match self.remaining.as_bytes().first() { Some(b) => b, None => return Ok(None), }; match byte { // Open-parens check the next character to see if this is the start // of a block comment, otherwise it's just a bland left-paren // token. b'(' => match self.remaining.as_bytes().get(1) { Some(b';') => { let mut level = 1; // Note that we're doing a byte-level search here for the // close-delimiter of `;)`. The actual source text is utf-8 // encode in `self.remaining` but due to how utf-8 works we // can safely search for an ASCII byte since it'll never // otherwise appear in the middle of a codepoint and if we // find it then it's guaranteed to be the right byte. // // Mainly we're avoiding the overhead of decoding utf-8 // characters into a Rust `char` since it's otherwise // unnecessary work. let mut iter = self.remaining.as_bytes()[2..].iter(); while let Some(ch) = iter.next() { match ch { b'(' => { if let Some(b';') = iter.as_slice().first() { level += 1; iter.next(); } } b';' => { if let Some(b')') = iter.as_slice().first() { level -= 1; iter.next(); if level == 0 { let len = self.remaining.len() - iter.as_slice().len(); let (comment, remaining) = self.remaining.split_at(len); self.remaining = remaining; self.check_confusing_comment(comment)?; return Ok(Some(Token::BlockComment(comment))); } } } _ => {} } } Err(self.error(pos, LexError::DanglingBlockComment)) } _ => Ok(Some(Token::LParen(self.split_first_byte()))), }, b')' => Ok(Some(Token::RParen(self.split_first_byte()))), // https://webassembly.github.io/spec/core/text/lexical.html#white-space b' ' | b'\n' | b'\r' | b'\t' => Ok(Some(Token::Whitespace(self.split_ws()))), c @ (idchars!() | b'"') => { let (kind, src) = self.split_reserved()?; match kind { // If the reserved token was simply a single string then // that is converted to a standalone string token ReservedKind::String(val) => { return Ok(Some(Token::String(WasmString(Box::new(WasmStringInner { val, src, }))))); } // If only idchars were consumed then this could be a // specific kind of standalone token we're interested in. ReservedKind::Idchars => { // https://webassembly.github.io/spec/core/text/values.html#integers if let Some(number) = self.number(src) { return Ok(Some(number)); // https://webassembly.github.io/spec/core/text/values.html#text-id } else if *c == b'$' && src.len() > 1 { return Ok(Some(Token::Id(src))); // https://webassembly.github.io/spec/core/text/lexical.html#text-keyword } else if b'a' <= *c && *c <= b'z' { return Ok(Some(Token::Keyword(src))); } } // ... otherwise this was a conglomeration of idchars, // strings, or just idchars that don't match a prior rule, // meaning this falls through to the fallback `Reserved` // token. ReservedKind::Reserved => {} } Ok(Some(Token::Reserved(src))) } // This could be a line comment, otherwise `;` is a reserved token. // The second byte is checked to see if it's a `;;` line comment // // Note that this character being considered as part of a // `reserved` token is part of the annotations proposal. b';' => match self.remaining.as_bytes().get(1) { Some(b';') => { let comment = self.split_until(b'\n'); self.check_confusing_comment(comment)?; Ok(Some(Token::LineComment(comment))) } _ => Ok(Some(Token::Reserved(self.split_first_byte()))), }, // Other known reserved tokens other than `;` // // Note that these characters being considered as part of a // `reserved` token is part of the annotations proposal. b',' | b'[' | b']' | b'{' | b'}' => Ok(Some(Token::Reserved(self.split_first_byte()))), _ => { let ch = self.remaining.chars().next().unwrap(); Err(self.error(pos, LexError::Unexpected(ch))) } } } fn split_first_byte(&mut self) -> &'a str { let (token, remaining) = self.remaining.split_at(1); self.remaining = remaining; token } fn split_until(&mut self, byte: u8) -> &'a str { let pos = memchr::memchr(byte, self.remaining.as_bytes()).unwrap_or(self.remaining.len()); let (ret, remaining) = self.remaining.split_at(pos); self.remaining = remaining; ret } fn split_ws(&mut self) -> &'a str { // This table is a byte lookup table to determine whether a byte is a // whitespace byte. There are only 4 whitespace bytes for the `*.wat` // format right now which are ' ', '\t', '\r', and '\n'. These 4 bytes // have a '1' in the table below. // // Due to how utf-8 works (our input is guaranteed to be utf-8) it is // known that if these bytes are found they're guaranteed to be the // whitespace byte, so they can be safely skipped and we don't have to // do full utf-8 decoding. This means that the goal of this function is // to find the first non-whitespace byte in `self.remaining`. // // For now this lookup table seems to be the fastest, but projects like // https://github.com/lemire/despacer show other simd algorithms which // can possibly accelerate this even more. Note that `*.wat` files often // have a lot of whitespace so this function is typically quite hot when // parsing inputs. #[rustfmt::skip] const WS: [u8; 256] = [ // \t \n \r /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ' ' /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xa0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xb0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xc0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xd0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xe0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xf0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]; let pos = self .remaining .as_bytes() .iter() .position(|b| WS[*b as usize] != 1) .unwrap_or(self.remaining.len()); let (ret, remaining) = self.remaining.split_at(pos); self.remaining = remaining; ret } /// Splits off a "reserved" token which is then further processed later on /// to figure out which kind of token it is `depending on `ReservedKind`. /// /// For more information on this method see the clarification at /// https://github.com/WebAssembly/spec/pull/1499 but the general gist is /// that this is parsing the grammar: /// /// ```text /// reserved := (idchar | string)+ /// ``` /// /// which means that it is eating any number of adjacent string/idchar /// tokens (e.g. `a"b"c`) and returning the classification of what was /// eaten. The classification assists in determining what the actual token /// here eaten looks like. fn split_reserved(&mut self) -> Result<(ReservedKind<'a>, &'a str), Error> { let mut idchars = false; let mut strings = 0u32; let mut last_string_val = None; let mut pos = 0; while let Some(byte) = self.remaining.as_bytes().get(pos) { match byte { // Normal `idchars` production which appends to the reserved // token that's being produced. idchars!() => { idchars = true; pos += 1; } // https://webassembly.github.io/spec/core/text/values.html#text-string b'"' => { strings += 1; pos += 1; let mut it = self.remaining[pos..].chars(); let result = Lexer::parse_str(&mut it, self.allow_confusing_unicode); pos = self.remaining.len() - it.as_str().len(); match result { Ok(s) => last_string_val = Some(s), Err(e) => { let start = self.input.len() - self.remaining.len(); self.remaining = &self.remaining[pos..]; let err_pos = match &e { LexError::UnexpectedEof => self.input.len(), _ => { self.input[..start + pos] .char_indices() .next_back() .unwrap() .0 } }; return Err(self.error(err_pos, e)); } } } // Nothing else is considered part of a reserved token _ => break, } } let (ret, remaining) = self.remaining.split_at(pos); self.remaining = remaining; Ok(match (idchars, strings) { (false, 0) => unreachable!(), (false, 1) => (ReservedKind::String(last_string_val.unwrap()), ret), (true, 0) => (ReservedKind::Idchars, ret), _ => (ReservedKind::Reserved, ret), }) } fn number(&self, src: &'a str) -> Option> { let (sign, num) = if let Some(stripped) = src.strip_prefix('+') { (Some(SignToken::Plus), stripped) } else if let Some(stripped) = src.strip_prefix('-') { (Some(SignToken::Minus), stripped) } else { (None, src) }; let negative = sign == Some(SignToken::Minus); // Handle `inf` and `nan` which are special numbers here if num == "inf" { return Some(Token::Float(Float(Box::new(FloatInner { src, val: FloatVal::Inf { negative }, })))); } else if num == "nan" { return Some(Token::Float(Float(Box::new(FloatInner { src, val: FloatVal::Nan { val: None, negative, }, })))); } else if let Some(stripped) = num.strip_prefix("nan:0x") { let mut it = stripped.chars(); let to_parse = skip_undescores(&mut it, false, char::is_ascii_hexdigit)?; if it.next().is_some() { return None; } let n = u64::from_str_radix(&to_parse, 16).ok()?; return Some(Token::Float(Float(Box::new(FloatInner { src, val: FloatVal::Nan { val: Some(n), negative, }, })))); } // Figure out if we're a hex number or not let (mut it, hex, test_valid) = if let Some(stripped) = num.strip_prefix("0x") { ( stripped.chars(), true, char::is_ascii_hexdigit as fn(&char) -> bool, ) } else { ( num.chars(), false, char::is_ascii_digit as fn(&char) -> bool, ) }; // Evaluate the first part, moving out all underscores let val = skip_undescores(&mut it, negative, test_valid)?; match it.clone().next() { // If we're followed by something this may be a float so keep going. Some(_) => {} // Otherwise this is a valid integer literal! None => { return Some(Token::Integer(Integer(Box::new(IntegerInner { sign, src, val, hex, })))) } } // A number can optionally be after the decimal so only actually try to // parse one if it's there. let decimal = if it.clone().next() == Some('.') { it.next(); match it.clone().next() { Some(c) if test_valid(&c) => Some(skip_undescores(&mut it, false, test_valid)?), Some(_) | None => None, } } else { None }; // Figure out if there's an exponential part here to make a float, and // if so parse it but defer its actual calculation until later. let exponent = match (hex, it.next()) { (true, Some('p')) | (true, Some('P')) | (false, Some('e')) | (false, Some('E')) => { let negative = match it.clone().next() { Some('-') => { it.next(); true } Some('+') => { it.next(); false } _ => false, }; Some(skip_undescores(&mut it, negative, char::is_ascii_digit)?) } (_, None) => None, _ => return None, }; // We should have eaten everything by now, if not then this is surely // not a float or integer literal. if it.next().is_some() { return None; } return Some(Token::Float(Float(Box::new(FloatInner { src, val: FloatVal::Val { hex, integral: val, exponent, decimal, }, })))); fn skip_undescores<'a>( it: &mut str::Chars<'a>, negative: bool, good: fn(&char) -> bool, ) -> Option> { enum State { Raw, Collecting(String), } let mut last_underscore = false; let mut state = if negative { State::Collecting("-".to_string()) } else { State::Raw }; let input = it.as_str(); let first = it.next()?; if !good(&first) { return None; } if let State::Collecting(s) = &mut state { s.push(first); } let mut last = 1; while let Some(c) = it.clone().next() { if c == '_' && !last_underscore { if let State::Raw = state { state = State::Collecting(input[..last].to_string()); } it.next(); last_underscore = true; continue; } if !good(&c) { break; } if let State::Collecting(s) = &mut state { s.push(c); } last_underscore = false; it.next(); last += 1; } if last_underscore { return None; } Some(match state { State::Raw => input[..last].into(), State::Collecting(s) => s.into(), }) } } /// Verifies that `comment`, which is about to be returned, has a "confusing /// unicode character" in it and should instead be transformed into an /// error. fn check_confusing_comment(&self, comment: &str) -> Result<(), Error> { if self.allow_confusing_unicode { return Ok(()); } // In an effort to avoid utf-8 decoding the entire `comment` the search // here is a bit more optimized. This checks for the `0xe2` byte because // in the utf-8 encoding that's the leading encoding byte for all // "confusing characters". Each instance of 0xe2 is checked to see if it // starts a confusing character, and if so that's returned. // // Also note that 0xe2 will never be found in the middle of a codepoint, // it's always the start of a codepoint. This means that if our special // characters show up they're guaranteed to start with 0xe2 bytes. let bytes = comment.as_bytes(); for pos in memchr::Memchr::new(0xe2, bytes) { if let Some(c) = comment[pos..].chars().next() { if is_confusing_unicode(c) { // Note that `self.cur()` accounts for already having // parsed `comment`, so we move backwards to where // `comment` started and then add the index within // `comment`. let pos = self.cur() - comment.len() + pos; return Err(self.error(pos, LexError::ConfusingUnicode(c))); } } } Ok(()) } fn parse_str( it: &mut str::Chars<'a>, allow_confusing_unicode: bool, ) -> Result, LexError> { enum State { Start, String(Vec), } let orig = it.as_str(); let mut state = State::Start; loop { match it.next().ok_or(LexError::UnexpectedEof)? { '"' => break, '\\' => { match state { State::String(_) => {} State::Start => { let pos = orig.len() - it.as_str().len() - 1; state = State::String(orig[..pos].as_bytes().to_vec()); } } let buf = match &mut state { State::String(b) => b, State::Start => unreachable!(), }; match it.next().ok_or(LexError::UnexpectedEof)? { '"' => buf.push(b'"'), '\'' => buf.push(b'\''), 't' => buf.push(b'\t'), 'n' => buf.push(b'\n'), 'r' => buf.push(b'\r'), '\\' => buf.push(b'\\'), 'u' => { Lexer::must_eat_char(it, '{')?; let n = Lexer::hexnum(it)?; let c = char::from_u32(n).ok_or(LexError::InvalidUnicodeValue(n))?; buf.extend(c.encode_utf8(&mut [0; 4]).as_bytes()); Lexer::must_eat_char(it, '}')?; } c1 if c1.is_ascii_hexdigit() => { let c2 = Lexer::hexdigit(it)?; buf.push(to_hex(c1) * 16 + c2); } c => return Err(LexError::InvalidStringEscape(c)), } } c if (c as u32) < 0x20 || c as u32 == 0x7f => { return Err(LexError::InvalidStringElement(c)) } c if !allow_confusing_unicode && is_confusing_unicode(c) => { return Err(LexError::ConfusingUnicode(c)) } c => match &mut state { State::Start => {} State::String(v) => { v.extend(c.encode_utf8(&mut [0; 4]).as_bytes()); } }, } } match state { State::Start => Ok(orig[..orig.len() - it.as_str().len() - 1].as_bytes().into()), State::String(s) => Ok(s.into()), } } fn hexnum(it: &mut str::Chars<'_>) -> Result { let n = Lexer::hexdigit(it)?; let mut last_underscore = false; let mut n = n as u32; while let Some(c) = it.clone().next() { if c == '_' { it.next(); last_underscore = true; continue; } if !c.is_ascii_hexdigit() { break; } last_underscore = false; it.next(); n = n .checked_mul(16) .and_then(|n| n.checked_add(to_hex(c) as u32)) .ok_or(LexError::NumberTooBig)?; } if last_underscore { return Err(LexError::LoneUnderscore); } Ok(n) } /// Reads a hexidecimal digit from the input stream, returning where it's /// defined and the hex value. Returns an error on EOF or an invalid hex /// digit. fn hexdigit(it: &mut str::Chars<'_>) -> Result { let ch = Lexer::must_char(it)?; if ch.is_ascii_hexdigit() { Ok(to_hex(ch)) } else { Err(LexError::InvalidHexDigit(ch)) } } /// Reads the next character from the input string and where it's located, /// returning an error if the input stream is empty. fn must_char(it: &mut str::Chars<'_>) -> Result { it.next().ok_or(LexError::UnexpectedEof) } /// Expects that a specific character must be read next fn must_eat_char(it: &mut str::Chars<'_>, wanted: char) -> Result<(), LexError> { let found = Lexer::must_char(it)?; if wanted == found { Ok(()) } else { Err(LexError::Expected { wanted, found }) } } /// Returns the current position of our iterator through the input string fn cur(&self) -> usize { self.input.len() - self.remaining.len() } /// Creates an error at `pos` with the specified `kind` fn error(&self, pos: usize, kind: LexError) -> Error { Error::lex(Span { offset: pos }, self.input, kind) } } impl<'a> Iterator for Lexer<'a> { type Item = Result, Error>; fn next(&mut self) -> Option { self.parse().transpose() } } impl<'a> Token<'a> { /// Returns the original source text for this token. pub fn src(&self) -> &'a str { match self { Token::Whitespace(s) => s, Token::BlockComment(s) => s, Token::LineComment(s) => s, Token::LParen(s) => s, Token::RParen(s) => s, Token::String(s) => s.src(), Token::Id(s) => s, Token::Keyword(s) => s, Token::Reserved(s) => s, Token::Integer(i) => i.src(), Token::Float(f) => f.src(), } } } impl<'a> Integer<'a> { /// Returns the sign token for this integer. pub fn sign(&self) -> Option { self.0.sign } /// Returns the original source text for this integer. pub fn src(&self) -> &'a str { self.0.src } /// Returns the value string that can be parsed for this integer, as well as /// the base that it should be parsed in pub fn val(&self) -> (&str, u32) { (&self.0.val, if self.0.hex { 16 } else { 10 }) } } impl<'a> Float<'a> { /// Returns the original source text for this integer. pub fn src(&self) -> &'a str { self.0.src } /// Returns a parsed value of this float with all of the components still /// listed as strings. pub fn val(&self) -> &FloatVal<'a> { &self.0.val } } impl<'a> WasmString<'a> { /// Returns the original source text for this string. pub fn src(&self) -> &'a str { self.0.src } /// Returns a parsed value, as a list of bytes, for this string. pub fn val(&self) -> &[u8] { &self.0.val } } fn to_hex(c: char) -> u8 { match c { 'a'..='f' => c as u8 - b'a' + 10, 'A'..='F' => c as u8 - b'A' + 10, _ => c as u8 - b'0', } } impl fmt::Display for LexError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use LexError::*; match self { DanglingBlockComment => f.write_str("unterminated block comment")?, Unexpected(c) => write!(f, "unexpected character '{}'", escape_char(*c))?, InvalidStringElement(c) => { write!(f, "invalid character in string '{}'", escape_char(*c))? } InvalidStringEscape(c) => write!(f, "invalid string escape '{}'", escape_char(*c))?, InvalidHexDigit(c) => write!(f, "invalid hex digit '{}'", escape_char(*c))?, InvalidDigit(c) => write!(f, "invalid decimal digit '{}'", escape_char(*c))?, Expected { wanted, found } => write!( f, "expected '{}' but found '{}'", escape_char(*wanted), escape_char(*found) )?, UnexpectedEof => write!(f, "unexpected end-of-file")?, NumberTooBig => f.write_str("number is too big to parse")?, InvalidUnicodeValue(c) => write!(f, "invalid unicode scalar value 0x{:x}", c)?, LoneUnderscore => write!(f, "bare underscore in numeric literal")?, ConfusingUnicode(c) => write!(f, "likely-confusing unicode character found {:?}", c)?, } Ok(()) } } fn escape_char(c: char) -> String { match c { '\t' => String::from("\\t"), '\r' => String::from("\\r"), '\n' => String::from("\\n"), '\\' => String::from("\\\\"), '\'' => String::from("\\\'"), '\"' => String::from("\""), '\x20'..='\x7e' => String::from(c), _ => c.escape_unicode().to_string(), } } /// This is an attempt to protect agains the "trojan source" [1] problem where /// unicode characters can cause editors to render source code differently /// for humans than the compiler itself sees. /// /// To mitigate this issue, and because it's relatively rare in practice, /// this simply rejects characters of that form. /// /// [1]: https://www.trojansource.codes/ fn is_confusing_unicode(ch: char) -> bool { matches!( ch, '\u{202a}' | '\u{202b}' | '\u{202d}' | '\u{202e}' | '\u{2066}' | '\u{2067}' | '\u{2068}' | '\u{206c}' | '\u{2069}' ) } #[cfg(test)] mod tests { use super::*; #[test] fn ws_smoke() { fn get_whitespace(input: &str) -> &str { match Lexer::new(input).parse().expect("no first token") { Some(Token::Whitespace(s)) => s, other => panic!("unexpected {:?}", other), } } assert_eq!(get_whitespace(" "), " "); assert_eq!(get_whitespace(" "), " "); assert_eq!(get_whitespace(" \n "), " \n "); assert_eq!(get_whitespace(" x"), " "); assert_eq!(get_whitespace(" ;"), " "); } #[test] fn line_comment_smoke() { fn get_line_comment(input: &str) -> &str { match Lexer::new(input).parse().expect("no first token") { Some(Token::LineComment(s)) => s, other => panic!("unexpected {:?}", other), } } assert_eq!(get_line_comment(";;"), ";;"); assert_eq!(get_line_comment(";; xyz"), ";; xyz"); assert_eq!(get_line_comment(";; xyz\nabc"), ";; xyz"); assert_eq!(get_line_comment(";;\nabc"), ";;"); assert_eq!(get_line_comment(";; \nabc"), ";; "); } #[test] fn block_comment_smoke() { fn get_block_comment(input: &str) -> &str { match Lexer::new(input).parse().expect("no first token") { Some(Token::BlockComment(s)) => s, other => panic!("unexpected {:?}", other), } } assert_eq!(get_block_comment("(;;)"), "(;;)"); assert_eq!(get_block_comment("(; ;)"), "(; ;)"); assert_eq!(get_block_comment("(; (;;) ;)"), "(; (;;) ;)"); } fn get_token(input: &str) -> Token<'_> { Lexer::new(input) .parse() .expect("no first token") .expect("no token") } #[test] fn lparen() { assert_eq!(get_token("(("), Token::LParen("(")); } #[test] fn rparen() { assert_eq!(get_token(")("), Token::RParen(")")); } #[test] fn strings() { fn get_string(input: &str) -> Vec { match get_token(input) { Token::String(s) => { assert_eq!(input, s.src()); s.val().to_vec() } other => panic!("not string {:?}", other), } } assert_eq!(&*get_string("\"\""), b""); assert_eq!(&*get_string("\"a\""), b"a"); assert_eq!(&*get_string("\"a b c d\""), b"a b c d"); assert_eq!(&*get_string("\"\\\"\""), b"\""); assert_eq!(&*get_string("\"\\'\""), b"'"); assert_eq!(&*get_string("\"\\n\""), b"\n"); assert_eq!(&*get_string("\"\\t\""), b"\t"); assert_eq!(&*get_string("\"\\r\""), b"\r"); assert_eq!(&*get_string("\"\\\\\""), b"\\"); assert_eq!(&*get_string("\"\\01\""), &[1]); assert_eq!(&*get_string("\"\\u{1}\""), &[1]); assert_eq!( &*get_string("\"\\u{0f3}\""), '\u{0f3}'.encode_utf8(&mut [0; 4]).as_bytes() ); assert_eq!( &*get_string("\"\\u{0_f_3}\""), '\u{0f3}'.encode_utf8(&mut [0; 4]).as_bytes() ); for i in 0..=255i32 { let s = format!("\"\\{:02x}\"", i); assert_eq!(&*get_string(&s), &[i as u8]); } } #[test] fn id() { fn get_id(input: &str) -> &str { match get_token(input) { Token::Id(s) => s, other => panic!("not id {:?}", other), } } assert_eq!(get_id("$x"), "$x"); assert_eq!(get_id("$xyz"), "$xyz"); assert_eq!(get_id("$x_z"), "$x_z"); assert_eq!(get_id("$0^"), "$0^"); assert_eq!(get_id("$0^;;"), "$0^"); assert_eq!(get_id("$0^ ;;"), "$0^"); } #[test] fn keyword() { fn get_keyword(input: &str) -> &str { match get_token(input) { Token::Keyword(s) => s, other => panic!("not id {:?}", other), } } assert_eq!(get_keyword("x"), "x"); assert_eq!(get_keyword("xyz"), "xyz"); assert_eq!(get_keyword("x_z"), "x_z"); assert_eq!(get_keyword("x_z "), "x_z"); assert_eq!(get_keyword("x_z "), "x_z"); } #[test] fn reserved() { fn get_reserved(input: &str) -> &str { match get_token(input) { Token::Reserved(s) => s, other => panic!("not reserved {:?}", other), } } assert_eq!(get_reserved("$ "), "$"); assert_eq!(get_reserved("^_x "), "^_x"); } #[test] fn integer() { fn get_integer(input: &str) -> String { match get_token(input) { Token::Integer(i) => { assert_eq!(input, i.src()); i.val().0.to_string() } other => panic!("not integer {:?}", other), } } assert_eq!(get_integer("1"), "1"); assert_eq!(get_integer("0"), "0"); assert_eq!(get_integer("-1"), "-1"); assert_eq!(get_integer("+1"), "1"); assert_eq!(get_integer("+1_000"), "1000"); assert_eq!(get_integer("+1_0_0_0"), "1000"); assert_eq!(get_integer("+0x10"), "10"); assert_eq!(get_integer("-0x10"), "-10"); assert_eq!(get_integer("0x10"), "10"); } #[test] fn float() { fn get_float(input: &str) -> FloatVal<'_> { match get_token(input) { Token::Float(i) => { assert_eq!(input, i.src()); i.0.val } other => panic!("not reserved {:?}", other), } } assert_eq!( get_float("nan"), FloatVal::Nan { val: None, negative: false }, ); assert_eq!( get_float("-nan"), FloatVal::Nan { val: None, negative: true, }, ); assert_eq!( get_float("+nan"), FloatVal::Nan { val: None, negative: false, }, ); assert_eq!( get_float("+nan:0x1"), FloatVal::Nan { val: Some(1), negative: false, }, ); assert_eq!( get_float("nan:0x7f_ffff"), FloatVal::Nan { val: Some(0x7fffff), negative: false, }, ); assert_eq!(get_float("inf"), FloatVal::Inf { negative: false }); assert_eq!(get_float("-inf"), FloatVal::Inf { negative: true }); assert_eq!(get_float("+inf"), FloatVal::Inf { negative: false }); assert_eq!( get_float("1.2"), FloatVal::Val { integral: "1".into(), decimal: Some("2".into()), exponent: None, hex: false, }, ); assert_eq!( get_float("1.2e3"), FloatVal::Val { integral: "1".into(), decimal: Some("2".into()), exponent: Some("3".into()), hex: false, }, ); assert_eq!( get_float("-1_2.1_1E+0_1"), FloatVal::Val { integral: "-12".into(), decimal: Some("11".into()), exponent: Some("01".into()), hex: false, }, ); assert_eq!( get_float("+1_2.1_1E-0_1"), FloatVal::Val { integral: "12".into(), decimal: Some("11".into()), exponent: Some("-01".into()), hex: false, }, ); assert_eq!( get_float("0x1_2.3_4p5_6"), FloatVal::Val { integral: "12".into(), decimal: Some("34".into()), exponent: Some("56".into()), hex: true, }, ); assert_eq!( get_float("+0x1_2.3_4P-5_6"), FloatVal::Val { integral: "12".into(), decimal: Some("34".into()), exponent: Some("-56".into()), hex: true, }, ); assert_eq!( get_float("1."), FloatVal::Val { integral: "1".into(), decimal: None, exponent: None, hex: false, }, ); assert_eq!( get_float("0x1p-24"), FloatVal::Val { integral: "1".into(), decimal: None, exponent: Some("-24".into()), hex: true, }, ); } }