diff options
Diffstat (limited to 'third_party/rust/wast/src/lexer.rs')
-rw-r--r-- | third_party/rust/wast/src/lexer.rs | 1334 |
1 files changed, 1334 insertions, 0 deletions
diff --git a/third_party/rust/wast/src/lexer.rs b/third_party/rust/wast/src/lexer.rs new file mode 100644 index 0000000000..a4f8f128c7 --- /dev/null +++ b/third_party/rust/wast/src/lexer.rs @@ -0,0 +1,1334 @@ +//! Definition of a lexer for the WebAssembly text format. +//! +//! This module provides a [`Lexer`][] type which is an iterate over the raw +//! tokens of a WebAssembly text file. A [`Lexer`][] accounts for every single +//! byte in a WebAssembly text field, returning tokens even for comments and +//! whitespace. Typically you'll ignore comments and whitespace, however. +//! +//! If you'd like to iterate over the tokens in a file you can do so via: +//! +//! ``` +//! # fn foo() -> Result<(), wast::Error> { +//! use wast::lexer::Lexer; +//! +//! let wat = "(module (func $foo))"; +//! for token in Lexer::new(wat) { +//! println!("{:?}", token?); +//! } +//! # Ok(()) +//! # } +//! ``` +//! +//! Note that you'll typically not use this module but will rather use +//! [`ParseBuffer`](crate::parser::ParseBuffer) instead. +//! +//! [`Lexer`]: crate::lexer::Lexer + +use crate::token::Span; +use crate::Error; +use std::borrow::Cow; +use std::char; +use std::fmt; +use std::str; + +/// A structure used to lex the s-expression syntax of WAT files. +/// +/// This structure is used to generate [`Token`] items, which should account for +/// every single byte of the input as we iterate over it. A [`LexError`] is +/// returned for any non-lexable text. +#[derive(Clone)] +pub struct Lexer<'a> { + remaining: &'a str, + input: &'a str, + allow_confusing_unicode: bool, +} + +/// A fragment of source lex'd from an input string. +/// +/// This enumeration contains all kinds of fragments, including comments and +/// whitespace. For most cases you'll probably ignore these and simply look at +/// tokens. +#[derive(Debug, PartialEq)] +pub enum Token<'a> { + /// A line comment, preceded with `;;` + LineComment(&'a str), + + /// A block comment, surrounded by `(;` and `;)`. 
Note that these can be + /// nested. + BlockComment(&'a str), + + /// A fragment of source that represents whitespace. + Whitespace(&'a str), + + /// A left-parenthesis, including the source text for where it comes from. + LParen(&'a str), + /// A right-parenthesis, including the source text for where it comes from. + RParen(&'a str), + + /// A string literal, which is actually a list of bytes. + String(WasmString<'a>), + + /// An identifier (like `$foo`). + /// + /// All identifiers start with `$` and the payload here is the original + /// source text. + Id(&'a str), + + /// A keyword, or something that starts with an alphabetic character. + /// + /// The payload here is the original source text. + Keyword(&'a str), + + /// A reserved series of `idchar` symbols. Unknown what this is meant to be + /// used for, you'll probably generate an error about an unexpected token. + Reserved(&'a str), + + /// An integer. + Integer(Integer<'a>), + + /// A float. + Float(Float<'a>), +} + +enum ReservedKind<'a> { + String(Cow<'a, [u8]>), + Idchars, + Reserved, +} + +/// Errors that can be generated while lexing. +/// +/// All lexing errors have line/colum/position information as well as a +/// `LexError` indicating what kind of error happened while lexing. +#[derive(Debug, Clone, PartialEq, Eq)] +#[non_exhaustive] +pub enum LexError { + /// A dangling block comment was found with an unbalanced `(;` which was + /// never terminated in the file. + DanglingBlockComment, + + /// An unexpected character was encountered when generally parsing and + /// looking for something else. + Unexpected(char), + + /// An invalid `char` in a string literal was found. + InvalidStringElement(char), + + /// An invalid string escape letter was found (the thing after the `\` in + /// string literals) + InvalidStringEscape(char), + + /// An invalid hexadecimal digit was found. + InvalidHexDigit(char), + + /// An invalid base-10 digit was found. 
+ InvalidDigit(char), + + /// Parsing expected `wanted` but ended up finding `found` instead where the + /// two characters aren't the same. + Expected { + /// The character that was expected to be found + wanted: char, + /// The character that was actually found + found: char, + }, + + /// We needed to parse more but EOF (or end of the string) was encountered. + UnexpectedEof, + + /// A number failed to parse because it was too big to fit within the target + /// type. + NumberTooBig, + + /// An invalid unicode value was found in a `\u{...}` escape in a string, + /// only valid unicode scalars can be escaped that way. + InvalidUnicodeValue(u32), + + /// A lone underscore was found when parsing a number, since underscores + /// should always be preceded and succeeded with a digit of some form. + LoneUnderscore, + + /// A "confusing" unicode character is present in a comment or a string + /// literal, such as a character that changes the direction text is + /// typically displayed in editors. This could cause the human-read + /// version to behave differently than the compiler-visible version, so + /// these are simply rejected for now. + ConfusingUnicode(char), +} + +/// A sign token for an integer. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum SignToken { + /// Plus sign: "+", + Plus, + /// Minus sign: "-", + Minus, +} + +/// A parsed integer, signed or unsigned. +/// +/// Methods can be use to access the value of the integer. +#[derive(Debug, PartialEq)] +pub struct Integer<'a>(Box<IntegerInner<'a>>); + +#[derive(Debug, PartialEq)] +struct IntegerInner<'a> { + sign: Option<SignToken>, + src: &'a str, + val: Cow<'a, str>, + hex: bool, +} + +/// A parsed float. +/// +/// Methods can be use to access the value of the float. +#[derive(Debug, PartialEq)] +pub struct Float<'a>(Box<FloatInner<'a>>); + +#[derive(Debug, PartialEq)] +struct FloatInner<'a> { + src: &'a str, + val: FloatVal<'a>, +} + +/// A parsed string. 
+#[derive(Debug, PartialEq)] +pub struct WasmString<'a>(Box<WasmStringInner<'a>>); + +#[derive(Debug, PartialEq)] +struct WasmStringInner<'a> { + src: &'a str, + val: Cow<'a, [u8]>, +} + +/// Possible parsed float values +#[derive(Debug, PartialEq, Eq)] +pub enum FloatVal<'a> { + /// A float `NaN` representation + Nan { + /// The specific bits to encode for this float, optionally + val: Option<u64>, + /// Whether or not this is a negative `NaN` or not. + negative: bool, + }, + /// An float infinite representation, + Inf { + #[allow(missing_docs)] + negative: bool, + }, + /// A parsed and separated floating point value + Val { + /// Whether or not the `integral` and `decimal` are specified in hex + hex: bool, + /// The float parts before the `.` + integral: Cow<'a, str>, + /// The float parts after the `.` + decimal: Option<Cow<'a, str>>, + /// The exponent to multiple this `integral.decimal` portion of the + /// float by. If `hex` is true this is `2^exponent` and otherwise it's + /// `10^exponent` + exponent: Option<Cow<'a, str>>, + }, +} + +// https://webassembly.github.io/spec/core/text/values.html#text-idchar +macro_rules! idchars { + () => { + b'0'..=b'9' + | b'A'..=b'Z' + | b'a'..=b'z' + | b'!' + | b'#' + | b'$' + | b'%' + | b'&' + | b'\'' + | b'*' + | b'+' + | b'-' + | b'.' + | b'/' + | b':' + | b'<' + | b'=' + | b'>' + | b'?' + | b'@' + | b'\\' + | b'^' + | b'_' + | b'`' + | b'|' + | b'~' + } +} + +impl<'a> Lexer<'a> { + /// Creates a new lexer which will lex the `input` source string. + pub fn new(input: &str) -> Lexer<'_> { + Lexer { + remaining: input, + input, + allow_confusing_unicode: false, + } + } + + /// Returns the original source input that we're lexing. + pub fn input(&self) -> &'a str { + self.input + } + + /// Configures whether "confusing" unicode characters are allowed while + /// lexing. 
+ /// + /// If allowed then no error will happen if these characters are found, but + /// otherwise if disallowed a lex error will be produced when these + /// characters are found. Confusing characters are denied by default. + /// + /// For now "confusing characters" are primarily related to the "trojan + /// source" problem where it refers to characters which cause humans to read + /// text differently than this lexer, such as characters that alter the + /// left-to-right display of the source code. + pub fn allow_confusing_unicode(&mut self, allow: bool) -> &mut Self { + self.allow_confusing_unicode = allow; + self + } + + /// Lexes the next token in the input. + /// + /// Returns `Some` if a token is found or `None` if we're at EOF. + /// + /// # Errors + /// + /// Returns an error if the input is malformed. + pub fn parse(&mut self) -> Result<Option<Token<'a>>, Error> { + let pos = self.cur(); + // This `match` generally parses the grammar specified at + // + // https://webassembly.github.io/spec/core/text/lexical.html#text-token + let byte = match self.remaining.as_bytes().first() { + Some(b) => b, + None => return Ok(None), + }; + + match byte { + // Open-parens check the next character to see if this is the start + // of a block comment, otherwise it's just a bland left-paren + // token. + b'(' => match self.remaining.as_bytes().get(1) { + Some(b';') => { + let mut level = 1; + // Note that we're doing a byte-level search here for the + // close-delimiter of `;)`. The actual source text is utf-8 + // encode in `self.remaining` but due to how utf-8 works we + // can safely search for an ASCII byte since it'll never + // otherwise appear in the middle of a codepoint and if we + // find it then it's guaranteed to be the right byte. + // + // Mainly we're avoiding the overhead of decoding utf-8 + // characters into a Rust `char` since it's otherwise + // unnecessary work. 
+ let mut iter = self.remaining.as_bytes()[2..].iter(); + while let Some(ch) = iter.next() { + match ch { + b'(' => { + if let Some(b';') = iter.as_slice().first() { + level += 1; + iter.next(); + } + } + b';' => { + if let Some(b')') = iter.as_slice().first() { + level -= 1; + iter.next(); + if level == 0 { + let len = self.remaining.len() - iter.as_slice().len(); + let (comment, remaining) = self.remaining.split_at(len); + self.remaining = remaining; + self.check_confusing_comment(comment)?; + return Ok(Some(Token::BlockComment(comment))); + } + } + } + _ => {} + } + } + Err(self.error(pos, LexError::DanglingBlockComment)) + } + _ => Ok(Some(Token::LParen(self.split_first_byte()))), + }, + + b')' => Ok(Some(Token::RParen(self.split_first_byte()))), + + // https://webassembly.github.io/spec/core/text/lexical.html#white-space + b' ' | b'\n' | b'\r' | b'\t' => Ok(Some(Token::Whitespace(self.split_ws()))), + + c @ (idchars!() | b'"') => { + let (kind, src) = self.split_reserved()?; + match kind { + // If the reserved token was simply a single string then + // that is converted to a standalone string token + ReservedKind::String(val) => { + return Ok(Some(Token::String(WasmString(Box::new(WasmStringInner { + val, + src, + }))))); + } + + // If only idchars were consumed then this could be a + // specific kind of standalone token we're interested in. + ReservedKind::Idchars => { + // https://webassembly.github.io/spec/core/text/values.html#integers + if let Some(number) = self.number(src) { + return Ok(Some(number)); + // https://webassembly.github.io/spec/core/text/values.html#text-id + } else if *c == b'$' && src.len() > 1 { + return Ok(Some(Token::Id(src))); + // https://webassembly.github.io/spec/core/text/lexical.html#text-keyword + } else if b'a' <= *c && *c <= b'z' { + return Ok(Some(Token::Keyword(src))); + } + } + + // ... 
otherwise this was a conglomeration of idchars, + // strings, or just idchars that don't match a prior rule, + // meaning this falls through to the fallback `Reserved` + // token. + ReservedKind::Reserved => {} + } + + Ok(Some(Token::Reserved(src))) + } + + // This could be a line comment, otherwise `;` is a reserved token. + // The second byte is checked to see if it's a `;;` line comment + // + // Note that this character being considered as part of a + // `reserved` token is part of the annotations proposal. + b';' => match self.remaining.as_bytes().get(1) { + Some(b';') => { + let comment = self.split_until(b'\n'); + self.check_confusing_comment(comment)?; + Ok(Some(Token::LineComment(comment))) + } + _ => Ok(Some(Token::Reserved(self.split_first_byte()))), + }, + + // Other known reserved tokens other than `;` + // + // Note that these characters being considered as part of a + // `reserved` token is part of the annotations proposal. + b',' | b'[' | b']' | b'{' | b'}' => Ok(Some(Token::Reserved(self.split_first_byte()))), + + _ => { + let ch = self.remaining.chars().next().unwrap(); + Err(self.error(pos, LexError::Unexpected(ch))) + } + } + } + + fn split_first_byte(&mut self) -> &'a str { + let (token, remaining) = self.remaining.split_at(1); + self.remaining = remaining; + token + } + + fn split_until(&mut self, byte: u8) -> &'a str { + let pos = memchr::memchr(byte, self.remaining.as_bytes()).unwrap_or(self.remaining.len()); + let (ret, remaining) = self.remaining.split_at(pos); + self.remaining = remaining; + ret + } + + fn split_ws(&mut self) -> &'a str { + // This table is a byte lookup table to determine whether a byte is a + // whitespace byte. There are only 4 whitespace bytes for the `*.wat` + // format right now which are ' ', '\t', '\r', and '\n'. These 4 bytes + // have a '1' in the table below. 
+ // + // Due to how utf-8 works (our input is guaranteed to be utf-8) it is + // known that if these bytes are found they're guaranteed to be the + // whitespace byte, so they can be safely skipped and we don't have to + // do full utf-8 decoding. This means that the goal of this function is + // to find the first non-whitespace byte in `self.remaining`. + // + // For now this lookup table seems to be the fastest, but projects like + // https://github.com/lemire/despacer show other simd algorithms which + // can possibly accelerate this even more. Note that `*.wat` files often + // have a lot of whitespace so this function is typically quite hot when + // parsing inputs. + #[rustfmt::skip] + const WS: [u8; 256] = [ + // \t \n \r + /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, + /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // ' ' + /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xa0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xb0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xc0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xd0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xe0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xf0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ]; + let pos = self + .remaining + .as_bytes() + .iter() + .position(|b| WS[*b as usize] != 1) + .unwrap_or(self.remaining.len()); + let (ret, remaining) = self.remaining.split_at(pos); + self.remaining = remaining; + ret + } + + /// Splits off a "reserved" token which is 
then further processed later on + /// to figure out which kind of token it is `depending on `ReservedKind`. + /// + /// For more information on this method see the clarification at + /// https://github.com/WebAssembly/spec/pull/1499 but the general gist is + /// that this is parsing the grammar: + /// + /// ```text + /// reserved := (idchar | string)+ + /// ``` + /// + /// which means that it is eating any number of adjacent string/idchar + /// tokens (e.g. `a"b"c`) and returning the classification of what was + /// eaten. The classification assists in determining what the actual token + /// here eaten looks like. + fn split_reserved(&mut self) -> Result<(ReservedKind<'a>, &'a str), Error> { + let mut idchars = false; + let mut strings = 0u32; + let mut last_string_val = None; + let mut pos = 0; + while let Some(byte) = self.remaining.as_bytes().get(pos) { + match byte { + // Normal `idchars` production which appends to the reserved + // token that's being produced. + idchars!() => { + idchars = true; + pos += 1; + } + + // https://webassembly.github.io/spec/core/text/values.html#text-string + b'"' => { + strings += 1; + pos += 1; + let mut it = self.remaining[pos..].chars(); + let result = Lexer::parse_str(&mut it, self.allow_confusing_unicode); + pos = self.remaining.len() - it.as_str().len(); + match result { + Ok(s) => last_string_val = Some(s), + Err(e) => { + let start = self.input.len() - self.remaining.len(); + self.remaining = &self.remaining[pos..]; + let err_pos = match &e { + LexError::UnexpectedEof => self.input.len(), + _ => { + self.input[..start + pos] + .char_indices() + .next_back() + .unwrap() + .0 + } + }; + return Err(self.error(err_pos, e)); + } + } + } + + // Nothing else is considered part of a reserved token + _ => break, + } + } + let (ret, remaining) = self.remaining.split_at(pos); + self.remaining = remaining; + Ok(match (idchars, strings) { + (false, 0) => unreachable!(), + (false, 1) => (ReservedKind::String(last_string_val.unwrap()), 
ret), + (true, 0) => (ReservedKind::Idchars, ret), + _ => (ReservedKind::Reserved, ret), + }) + } + + fn number(&self, src: &'a str) -> Option<Token<'a>> { + let (sign, num) = if let Some(stripped) = src.strip_prefix('+') { + (Some(SignToken::Plus), stripped) + } else if let Some(stripped) = src.strip_prefix('-') { + (Some(SignToken::Minus), stripped) + } else { + (None, src) + }; + + let negative = sign == Some(SignToken::Minus); + + // Handle `inf` and `nan` which are special numbers here + if num == "inf" { + return Some(Token::Float(Float(Box::new(FloatInner { + src, + val: FloatVal::Inf { negative }, + })))); + } else if num == "nan" { + return Some(Token::Float(Float(Box::new(FloatInner { + src, + val: FloatVal::Nan { + val: None, + negative, + }, + })))); + } else if let Some(stripped) = num.strip_prefix("nan:0x") { + let mut it = stripped.chars(); + let to_parse = skip_undescores(&mut it, false, char::is_ascii_hexdigit)?; + if it.next().is_some() { + return None; + } + let n = u64::from_str_radix(&to_parse, 16).ok()?; + return Some(Token::Float(Float(Box::new(FloatInner { + src, + val: FloatVal::Nan { + val: Some(n), + negative, + }, + })))); + } + + // Figure out if we're a hex number or not + let (mut it, hex, test_valid) = if let Some(stripped) = num.strip_prefix("0x") { + ( + stripped.chars(), + true, + char::is_ascii_hexdigit as fn(&char) -> bool, + ) + } else { + ( + num.chars(), + false, + char::is_ascii_digit as fn(&char) -> bool, + ) + }; + + // Evaluate the first part, moving out all underscores + let val = skip_undescores(&mut it, negative, test_valid)?; + + match it.clone().next() { + // If we're followed by something this may be a float so keep going. + Some(_) => {} + + // Otherwise this is a valid integer literal! + None => { + return Some(Token::Integer(Integer(Box::new(IntegerInner { + sign, + src, + val, + hex, + })))) + } + } + + // A number can optionally be after the decimal so only actually try to + // parse one if it's there. 
+ let decimal = if it.clone().next() == Some('.') { + it.next(); + match it.clone().next() { + Some(c) if test_valid(&c) => Some(skip_undescores(&mut it, false, test_valid)?), + Some(_) | None => None, + } + } else { + None + }; + + // Figure out if there's an exponential part here to make a float, and + // if so parse it but defer its actual calculation until later. + let exponent = match (hex, it.next()) { + (true, Some('p')) | (true, Some('P')) | (false, Some('e')) | (false, Some('E')) => { + let negative = match it.clone().next() { + Some('-') => { + it.next(); + true + } + Some('+') => { + it.next(); + false + } + _ => false, + }; + Some(skip_undescores(&mut it, negative, char::is_ascii_digit)?) + } + (_, None) => None, + _ => return None, + }; + + // We should have eaten everything by now, if not then this is surely + // not a float or integer literal. + if it.next().is_some() { + return None; + } + + return Some(Token::Float(Float(Box::new(FloatInner { + src, + val: FloatVal::Val { + hex, + integral: val, + exponent, + decimal, + }, + })))); + + fn skip_undescores<'a>( + it: &mut str::Chars<'a>, + negative: bool, + good: fn(&char) -> bool, + ) -> Option<Cow<'a, str>> { + enum State { + Raw, + Collecting(String), + } + let mut last_underscore = false; + let mut state = if negative { + State::Collecting("-".to_string()) + } else { + State::Raw + }; + let input = it.as_str(); + let first = it.next()?; + if !good(&first) { + return None; + } + if let State::Collecting(s) = &mut state { + s.push(first); + } + let mut last = 1; + while let Some(c) = it.clone().next() { + if c == '_' && !last_underscore { + if let State::Raw = state { + state = State::Collecting(input[..last].to_string()); + } + it.next(); + last_underscore = true; + continue; + } + if !good(&c) { + break; + } + if let State::Collecting(s) = &mut state { + s.push(c); + } + last_underscore = false; + it.next(); + last += 1; + } + if last_underscore { + return None; + } + Some(match state { + 
State::Raw => input[..last].into(), + State::Collecting(s) => s.into(), + }) + } + } + + /// Verifies that `comment`, which is about to be returned, has a "confusing + /// unicode character" in it and should instead be transformed into an + /// error. + fn check_confusing_comment(&self, comment: &str) -> Result<(), Error> { + if self.allow_confusing_unicode { + return Ok(()); + } + + // In an effort to avoid utf-8 decoding the entire `comment` the search + // here is a bit more optimized. This checks for the `0xe2` byte because + // in the utf-8 encoding that's the leading encoding byte for all + // "confusing characters". Each instance of 0xe2 is checked to see if it + // starts a confusing character, and if so that's returned. + // + // Also note that 0xe2 will never be found in the middle of a codepoint, + // it's always the start of a codepoint. This means that if our special + // characters show up they're guaranteed to start with 0xe2 bytes. + let bytes = comment.as_bytes(); + for pos in memchr::Memchr::new(0xe2, bytes) { + if let Some(c) = comment[pos..].chars().next() { + if is_confusing_unicode(c) { + // Note that `self.cur()` accounts for already having + // parsed `comment`, so we move backwards to where + // `comment` started and then add the index within + // `comment`. + let pos = self.cur() - comment.len() + pos; + return Err(self.error(pos, LexError::ConfusingUnicode(c))); + } + } + } + + Ok(()) + } + + fn parse_str( + it: &mut str::Chars<'a>, + allow_confusing_unicode: bool, + ) -> Result<Cow<'a, [u8]>, LexError> { + enum State { + Start, + String(Vec<u8>), + } + let orig = it.as_str(); + let mut state = State::Start; + loop { + match it.next().ok_or(LexError::UnexpectedEof)? 
{ + '"' => break, + '\\' => { + match state { + State::String(_) => {} + State::Start => { + let pos = orig.len() - it.as_str().len() - 1; + state = State::String(orig[..pos].as_bytes().to_vec()); + } + } + let buf = match &mut state { + State::String(b) => b, + State::Start => unreachable!(), + }; + match it.next().ok_or(LexError::UnexpectedEof)? { + '"' => buf.push(b'"'), + '\'' => buf.push(b'\''), + 't' => buf.push(b'\t'), + 'n' => buf.push(b'\n'), + 'r' => buf.push(b'\r'), + '\\' => buf.push(b'\\'), + 'u' => { + Lexer::must_eat_char(it, '{')?; + let n = Lexer::hexnum(it)?; + let c = char::from_u32(n).ok_or(LexError::InvalidUnicodeValue(n))?; + buf.extend(c.encode_utf8(&mut [0; 4]).as_bytes()); + Lexer::must_eat_char(it, '}')?; + } + c1 if c1.is_ascii_hexdigit() => { + let c2 = Lexer::hexdigit(it)?; + buf.push(to_hex(c1) * 16 + c2); + } + c => return Err(LexError::InvalidStringEscape(c)), + } + } + c if (c as u32) < 0x20 || c as u32 == 0x7f => { + return Err(LexError::InvalidStringElement(c)) + } + c if !allow_confusing_unicode && is_confusing_unicode(c) => { + return Err(LexError::ConfusingUnicode(c)) + } + c => match &mut state { + State::Start => {} + State::String(v) => { + v.extend(c.encode_utf8(&mut [0; 4]).as_bytes()); + } + }, + } + } + match state { + State::Start => Ok(orig[..orig.len() - it.as_str().len() - 1].as_bytes().into()), + State::String(s) => Ok(s.into()), + } + } + + fn hexnum(it: &mut str::Chars<'_>) -> Result<u32, LexError> { + let n = Lexer::hexdigit(it)?; + let mut last_underscore = false; + let mut n = n as u32; + while let Some(c) = it.clone().next() { + if c == '_' { + it.next(); + last_underscore = true; + continue; + } + if !c.is_ascii_hexdigit() { + break; + } + last_underscore = false; + it.next(); + n = n + .checked_mul(16) + .and_then(|n| n.checked_add(to_hex(c) as u32)) + .ok_or(LexError::NumberTooBig)?; + } + if last_underscore { + return Err(LexError::LoneUnderscore); + } + Ok(n) + } + + /// Reads a hexidecimal digit from the 
input stream, returning where it's + /// defined and the hex value. Returns an error on EOF or an invalid hex + /// digit. + fn hexdigit(it: &mut str::Chars<'_>) -> Result<u8, LexError> { + let ch = Lexer::must_char(it)?; + if ch.is_ascii_hexdigit() { + Ok(to_hex(ch)) + } else { + Err(LexError::InvalidHexDigit(ch)) + } + } + + /// Reads the next character from the input string and where it's located, + /// returning an error if the input stream is empty. + fn must_char(it: &mut str::Chars<'_>) -> Result<char, LexError> { + it.next().ok_or(LexError::UnexpectedEof) + } + + /// Expects that a specific character must be read next + fn must_eat_char(it: &mut str::Chars<'_>, wanted: char) -> Result<(), LexError> { + let found = Lexer::must_char(it)?; + if wanted == found { + Ok(()) + } else { + Err(LexError::Expected { wanted, found }) + } + } + + /// Returns the current position of our iterator through the input string + fn cur(&self) -> usize { + self.input.len() - self.remaining.len() + } + + /// Creates an error at `pos` with the specified `kind` + fn error(&self, pos: usize, kind: LexError) -> Error { + Error::lex(Span { offset: pos }, self.input, kind) + } +} + +impl<'a> Iterator for Lexer<'a> { + type Item = Result<Token<'a>, Error>; + + fn next(&mut self) -> Option<Self::Item> { + self.parse().transpose() + } +} + +impl<'a> Token<'a> { + /// Returns the original source text for this token. + pub fn src(&self) -> &'a str { + match self { + Token::Whitespace(s) => s, + Token::BlockComment(s) => s, + Token::LineComment(s) => s, + Token::LParen(s) => s, + Token::RParen(s) => s, + Token::String(s) => s.src(), + Token::Id(s) => s, + Token::Keyword(s) => s, + Token::Reserved(s) => s, + Token::Integer(i) => i.src(), + Token::Float(f) => f.src(), + } + } +} + +impl<'a> Integer<'a> { + /// Returns the sign token for this integer. + pub fn sign(&self) -> Option<SignToken> { + self.0.sign + } + + /// Returns the original source text for this integer. 
+ pub fn src(&self) -> &'a str { + self.0.src + } + + /// Returns the value string that can be parsed for this integer, as well as + /// the base that it should be parsed in + pub fn val(&self) -> (&str, u32) { + (&self.0.val, if self.0.hex { 16 } else { 10 }) + } +} + +impl<'a> Float<'a> { + /// Returns the original source text for this integer. + pub fn src(&self) -> &'a str { + self.0.src + } + + /// Returns a parsed value of this float with all of the components still + /// listed as strings. + pub fn val(&self) -> &FloatVal<'a> { + &self.0.val + } +} + +impl<'a> WasmString<'a> { + /// Returns the original source text for this string. + pub fn src(&self) -> &'a str { + self.0.src + } + + /// Returns a parsed value, as a list of bytes, for this string. + pub fn val(&self) -> &[u8] { + &self.0.val + } +} + +fn to_hex(c: char) -> u8 { + match c { + 'a'..='f' => c as u8 - b'a' + 10, + 'A'..='F' => c as u8 - b'A' + 10, + _ => c as u8 - b'0', + } +} + +impl fmt::Display for LexError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use LexError::*; + match self { + DanglingBlockComment => f.write_str("unterminated block comment")?, + Unexpected(c) => write!(f, "unexpected character '{}'", escape_char(*c))?, + InvalidStringElement(c) => { + write!(f, "invalid character in string '{}'", escape_char(*c))? 
+ } + InvalidStringEscape(c) => write!(f, "invalid string escape '{}'", escape_char(*c))?, + InvalidHexDigit(c) => write!(f, "invalid hex digit '{}'", escape_char(*c))?, + InvalidDigit(c) => write!(f, "invalid decimal digit '{}'", escape_char(*c))?, + Expected { wanted, found } => write!( + f, + "expected '{}' but found '{}'", + escape_char(*wanted), + escape_char(*found) + )?, + UnexpectedEof => write!(f, "unexpected end-of-file")?, + NumberTooBig => f.write_str("number is too big to parse")?, + InvalidUnicodeValue(c) => write!(f, "invalid unicode scalar value 0x{:x}", c)?, + LoneUnderscore => write!(f, "bare underscore in numeric literal")?, + ConfusingUnicode(c) => write!(f, "likely-confusing unicode character found {:?}", c)?, + } + Ok(()) + } +} + +fn escape_char(c: char) -> String { + match c { + '\t' => String::from("\\t"), + '\r' => String::from("\\r"), + '\n' => String::from("\\n"), + '\\' => String::from("\\\\"), + '\'' => String::from("\\\'"), + '\"' => String::from("\""), + '\x20'..='\x7e' => String::from(c), + _ => c.escape_unicode().to_string(), + } +} + +/// This is an attempt to protect agains the "trojan source" [1] problem where +/// unicode characters can cause editors to render source code differently +/// for humans than the compiler itself sees. +/// +/// To mitigate this issue, and because it's relatively rare in practice, +/// this simply rejects characters of that form. 
+/// +/// [1]: https://www.trojansource.codes/ +fn is_confusing_unicode(ch: char) -> bool { + matches!( + ch, + '\u{202a}' + | '\u{202b}' + | '\u{202d}' + | '\u{202e}' + | '\u{2066}' + | '\u{2067}' + | '\u{2068}' + | '\u{206c}' + | '\u{2069}' + ) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn ws_smoke() { + fn get_whitespace(input: &str) -> &str { + match Lexer::new(input).parse().expect("no first token") { + Some(Token::Whitespace(s)) => s, + other => panic!("unexpected {:?}", other), + } + } + assert_eq!(get_whitespace(" "), " "); + assert_eq!(get_whitespace(" "), " "); + assert_eq!(get_whitespace(" \n "), " \n "); + assert_eq!(get_whitespace(" x"), " "); + assert_eq!(get_whitespace(" ;"), " "); + } + + #[test] + fn line_comment_smoke() { + fn get_line_comment(input: &str) -> &str { + match Lexer::new(input).parse().expect("no first token") { + Some(Token::LineComment(s)) => s, + other => panic!("unexpected {:?}", other), + } + } + assert_eq!(get_line_comment(";;"), ";;"); + assert_eq!(get_line_comment(";; xyz"), ";; xyz"); + assert_eq!(get_line_comment(";; xyz\nabc"), ";; xyz"); + assert_eq!(get_line_comment(";;\nabc"), ";;"); + assert_eq!(get_line_comment(";; \nabc"), ";; "); + } + + #[test] + fn block_comment_smoke() { + fn get_block_comment(input: &str) -> &str { + match Lexer::new(input).parse().expect("no first token") { + Some(Token::BlockComment(s)) => s, + other => panic!("unexpected {:?}", other), + } + } + assert_eq!(get_block_comment("(;;)"), "(;;)"); + assert_eq!(get_block_comment("(; ;)"), "(; ;)"); + assert_eq!(get_block_comment("(; (;;) ;)"), "(; (;;) ;)"); + } + + fn get_token(input: &str) -> Token<'_> { + Lexer::new(input) + .parse() + .expect("no first token") + .expect("no token") + } + + #[test] + fn lparen() { + assert_eq!(get_token("(("), Token::LParen("(")); + } + + #[test] + fn rparen() { + assert_eq!(get_token(")("), Token::RParen(")")); + } + + #[test] + fn strings() { + fn get_string(input: &str) -> Vec<u8> { + match 
get_token(input) {
+                Token::String(s) => {
+                    // The string token's source text must be the whole input.
+                    assert_eq!(input, s.src());
+                    s.val().to_vec()
+                }
+                other => panic!("not string {:?}", other),
+            }
+        }
+        // Plain contents and single-character escapes.
+        assert_eq!(&*get_string("\"\""), b"");
+        assert_eq!(&*get_string("\"a\""), b"a");
+        assert_eq!(&*get_string("\"a b c d\""), b"a b c d");
+        assert_eq!(&*get_string("\"\\\"\""), b"\"");
+        assert_eq!(&*get_string("\"\\'\""), b"'");
+        assert_eq!(&*get_string("\"\\n\""), b"\n");
+        assert_eq!(&*get_string("\"\\t\""), b"\t");
+        assert_eq!(&*get_string("\"\\r\""), b"\r");
+        assert_eq!(&*get_string("\"\\\\\""), b"\\");
+        // Two-digit hex escapes and `\u{...}` escapes; the latter decode to the
+        // UTF-8 encoding of the code point, and `_` separators are accepted.
+        assert_eq!(&*get_string("\"\\01\""), &[1]);
+        assert_eq!(&*get_string("\"\\u{1}\""), &[1]);
+        assert_eq!(
+            &*get_string("\"\\u{0f3}\""),
+            '\u{0f3}'.encode_utf8(&mut [0; 4]).as_bytes()
+        );
+        assert_eq!(
+            &*get_string("\"\\u{0_f_3}\""),
+            '\u{0f3}'.encode_utf8(&mut [0; 4]).as_bytes()
+        );
+
+        // Every two-digit (lowercase) hex escape decodes to that single byte.
+        for i in 0..=255i32 {
+            let s = format!("\"\\{:02x}\"", i);
+            assert_eq!(&*get_string(&s), &[i as u8]);
+        }
+    }
+
+    /// Identifiers: the lexed text keeps its `$` prefix and stops before
+    /// whitespace or a `;;` sequence.
+    #[test]
+    fn id() {
+        /// Lexes `input` via `get_token` and returns the text of the
+        /// `Token::Id` it yields, panicking on any other token kind.
+        fn get_id(input: &str) -> &str {
+            match get_token(input) {
+                Token::Id(s) => s,
+                other => panic!("not id {:?}", other),
+            }
+        }
+        assert_eq!(get_id("$x"), "$x");
+        assert_eq!(get_id("$xyz"), "$xyz");
+        assert_eq!(get_id("$x_z"), "$x_z");
+        assert_eq!(get_id("$0^"), "$0^");
+        assert_eq!(get_id("$0^;;"), "$0^");
+        assert_eq!(get_id("$0^ ;;"), "$0^");
+    }
+
+    /// Keywords: the lexed text excludes trailing whitespace.
+    #[test]
+    fn keyword() {
+        /// Lexes `input` via `get_token` and returns the text of the
+        /// `Token::Keyword` it yields, panicking on any other token kind.
+        fn get_keyword(input: &str) -> &str {
+            match get_token(input) {
+                Token::Keyword(s) => s,
+                other => panic!("not keyword {:?}", other),
+            }
+        }
+        assert_eq!(get_keyword("x"), "x");
+        assert_eq!(get_keyword("xyz"), "xyz");
+        assert_eq!(get_keyword("x_z"), "x_z");
+        assert_eq!(get_keyword("x_z "), "x_z");
+        // NOTE(review): same input as the assertion above; kept as-is.
+        assert_eq!(get_keyword("x_z "), "x_z");
+    }
+
+    /// Reserved tokens: a bare `$` and text starting with `^` lex as
+    /// `Token::Reserved`, excluding trailing whitespace.
+    #[test]
+    fn reserved() {
+        /// Lexes `input` via `get_token` and returns the text of the
+        /// `Token::Reserved` it yields, panicking on any other token kind.
+        fn get_reserved(input: &str) -> &str {
+            match get_token(input) {
+                Token::Reserved(s) => s,
+                other => panic!("not reserved {:?}", other),
+            }
+        }
+        assert_eq!(get_reserved("$ "), "$");
+        assert_eq!(get_reserved("^_x "), "^_x");
+    }
+
+    /// Integers: the digit text drops a leading `+`, `_` separators and the
+    /// `0x` prefix, while a leading `-` is kept.
+    #[test]
+    fn integer() {
+        /// Lexes `input` via `get_token`, checks that the integer token's
+        /// source text is the whole input, and returns the first field of
+        /// `val()` as an owned string. Panics on any other token kind.
+        fn get_integer(input: &str) -> String {
+            match get_token(input) {
+                Token::Integer(i) => {
+                    assert_eq!(input, i.src());
+                    i.val().0.to_string()
+                }
+                other => panic!("not integer {:?}", other),
+            }
+        }
+        assert_eq!(get_integer("1"), "1");
+        assert_eq!(get_integer("0"), "0");
+        assert_eq!(get_integer("-1"), "-1");
+        assert_eq!(get_integer("+1"), "1");
+        assert_eq!(get_integer("+1_000"), "1000");
+        assert_eq!(get_integer("+1_0_0_0"), "1000");
+        assert_eq!(get_integer("+0x10"), "10");
+        assert_eq!(get_integer("-0x10"), "-10");
+        assert_eq!(get_integer("0x10"), "10");
+    }
+
+    /// Floats: NaN (with optional payload), infinities, and decimal/hex
+    /// values split into integral, decimal and exponent parts.
+    #[test]
+    fn float() {
+        /// Lexes `input` via `get_token`, checks that the float token's
+        /// source text is the whole input, and returns its parsed
+        /// `FloatVal`. Panics on any other token kind.
+        fn get_float(input: &str) -> FloatVal<'_> {
+            match get_token(input) {
+                Token::Float(i) => {
+                    assert_eq!(input, i.src());
+                    i.0.val
+                }
+                other => panic!("not float {:?}", other),
+            }
+        }
+        // NaN: the sign is recorded in `negative`; a `:0x...` payload, if
+        // present, is parsed into `val`.
+        assert_eq!(
+            get_float("nan"),
+            FloatVal::Nan {
+                val: None,
+                negative: false
+            },
+        );
+        assert_eq!(
+            get_float("-nan"),
+            FloatVal::Nan {
+                val: None,
+                negative: true,
+            },
+        );
+        assert_eq!(
+            get_float("+nan"),
+            FloatVal::Nan {
+                val: None,
+                negative: false,
+            },
+        );
+        assert_eq!(
+            get_float("+nan:0x1"),
+            FloatVal::Nan {
+                val: Some(1),
+                negative: false,
+            },
+        );
+        assert_eq!(
+            get_float("nan:0x7f_ffff"),
+            FloatVal::Nan {
+                val: Some(0x7fffff),
+                negative: false,
+            },
+        );
+        // Infinities only carry a sign.
+        assert_eq!(get_float("inf"), FloatVal::Inf { negative: false });
+        assert_eq!(get_float("-inf"), FloatVal::Inf { negative: true });
+        assert_eq!(get_float("+inf"), FloatVal::Inf { negative: false });
+
+        // Finite values: `_` separators and a leading `+` are dropped from
+        // each part, a leading `-` is kept, and `hex` reflects a `0x` prefix.
+        assert_eq!(
+            get_float("1.2"),
+            FloatVal::Val {
+                integral: "1".into(),
+                decimal: Some("2".into()),
+                exponent: None,
+                hex: false,
+            },
+        );
+        assert_eq!(
+            get_float("1.2e3"),
+            FloatVal::Val {
+                integral: "1".into(),
+                decimal: Some("2".into()),
+                exponent: Some("3".into()),
+                hex: false,
+            },
+        );
+        assert_eq!(
+            get_float("-1_2.1_1E+0_1"),
+            FloatVal::Val {
+                integral: "-12".into(),
+                decimal: Some("11".into()),
+                exponent: Some("01".into()),
+                hex: false,
+            },
+        );
+        assert_eq!(
+            get_float("+1_2.1_1E-0_1"),
+            FloatVal::Val {
+                integral: "12".into(),
+                decimal: Some("11".into()),
+                exponent: Some("-01".into()),
+                hex: false,
+            },
+        );
+        assert_eq!(
+            get_float("0x1_2.3_4p5_6"),
+            FloatVal::Val {
+                integral: "12".into(),
+                decimal: Some("34".into()),
+                exponent: Some("56".into()),
+                hex: true,
+            },
+        );
+        assert_eq!(
+            get_float("+0x1_2.3_4P-5_6"),
+            FloatVal::Val {
+                integral: "12".into(),
+                decimal: Some("34".into()),
+                exponent: Some("-56".into()),
+                hex: true,
+            },
+        );
+        // A trailing `.` with no digits yields no decimal part.
+        assert_eq!(
+            get_float("1."),
+            FloatVal::Val {
+                integral: "1".into(),
+                decimal: None,
+                exponent: None,
+                hex: false,
+            },
+        );
+        // A hex float may have an exponent without a decimal part.
+        assert_eq!(
+            get_float("0x1p-24"),
+            FloatVal::Val {
+                integral: "1".into(),
+                decimal: None,
+                exponent: Some("-24".into()),
+                hex: true,
+            },
+        );
+    }
+} |