//! Definition of a lexer for the WebAssembly text format. //! //! This module provides a [`Lexer`][] type which is an iterate over the raw //! tokens of a WebAssembly text file. A [`Lexer`][] accounts for every single //! byte in a WebAssembly text field, returning tokens even for comments and //! whitespace. Typically you'll ignore comments and whitespace, however. //! //! If you'd like to iterate over the tokens in a file you can do so via: //! //! ``` //! # fn foo() -> Result<(), wast::Error> { //! use wast::lexer::Lexer; //! //! let wat = "(module (func $foo))"; //! for token in Lexer::new(wat).iter(0) { //! println!("{:?}", token?); //! } //! # Ok(()) //! # } //! ``` //! //! Note that you'll typically not use this module but will rather use //! [`ParseBuffer`](crate::parser::ParseBuffer) instead. //! //! [`Lexer`]: crate::lexer::Lexer use crate::token::Span; use crate::Error; use std::borrow::Cow; use std::char; use std::fmt; use std::slice; use std::str; /// A structure used to lex the s-expression syntax of WAT files. /// /// This structure is used to generate [`Token`] items, which should account for /// every single byte of the input as we iterate over it. A [`LexError`] is /// returned for any non-lexable text. #[derive(Clone)] pub struct Lexer<'a> { input: &'a str, allow_confusing_unicode: bool, } /// A single token parsed from a `Lexer`. #[derive(Copy, Clone, Debug, PartialEq)] pub struct Token { /// The kind of token this represents, such as whether it's whitespace, a /// keyword, etc. pub kind: TokenKind, /// The byte offset within the original source for where this token came /// from. pub offset: usize, /// The byte length of this token as it resides in the original source. // // NB: this is `u32` to enable packing `Token` into two pointers of size. // This does limit a single token to being at most 4G large, but that seems // probably ok. pub len: u32, } const _: () = { assert!(std::mem::size_of::() <= std::mem::size_of::() * 2); }; /// Classification of what was parsed from the input stream. /// /// This enumeration contains all kinds of fragments, including comments and /// whitespace. #[derive(Copy, Clone, Debug, PartialEq)] pub enum TokenKind { /// A line comment, preceded with `;;` LineComment, /// A block comment, surrounded by `(;` and `;)`. Note that these can be /// nested. BlockComment, /// A fragment of source that represents whitespace. Whitespace, /// A left-parenthesis, including the source text for where it comes from. LParen, /// A right-parenthesis, including the source text for where it comes from. RParen, /// A string literal, which is actually a list of bytes. String, /// An identifier (like `$foo`). /// /// All identifiers start with `$` and the payload here is the original /// source text. Id, /// A keyword, or something that starts with an alphabetic character. /// /// The payload here is the original source text. Keyword, /// A reserved series of `idchar` symbols. Unknown what this is meant to be /// used for, you'll probably generate an error about an unexpected token. Reserved, /// An integer. Integer(IntegerKind), /// A float. Float(FloatKind), } /// Description of the parsed integer from the source. #[derive(Copy, Clone, Debug, PartialEq)] pub struct IntegerKind { sign: Option, has_underscores: bool, hex: bool, } /// Description of a parsed float from the source. #[allow(missing_docs)] #[derive(Copy, Clone, Debug, PartialEq)] pub enum FloatKind { #[doc(hidden)] Inf { negative: bool }, #[doc(hidden)] Nan { negative: bool }, #[doc(hidden)] NanVal { negative: bool, has_underscores: bool, }, #[doc(hidden)] Normal { has_underscores: bool, hex: bool }, } enum ReservedKind { String, Idchars, Reserved, } /// Errors that can be generated while lexing. /// /// All lexing errors have line/colum/position information as well as a /// `LexError` indicating what kind of error happened while lexing. #[derive(Debug, Clone, PartialEq, Eq)] #[non_exhaustive] pub enum LexError { /// A dangling block comment was found with an unbalanced `(;` which was /// never terminated in the file. DanglingBlockComment, /// An unexpected character was encountered when generally parsing and /// looking for something else. Unexpected(char), /// An invalid `char` in a string literal was found. InvalidStringElement(char), /// An invalid string escape letter was found (the thing after the `\` in /// string literals) InvalidStringEscape(char), /// An invalid hexadecimal digit was found. InvalidHexDigit(char), /// An invalid base-10 digit was found. InvalidDigit(char), /// Parsing expected `wanted` but ended up finding `found` instead where the /// two characters aren't the same. Expected { /// The character that was expected to be found wanted: char, /// The character that was actually found found: char, }, /// We needed to parse more but EOF (or end of the string) was encountered. UnexpectedEof, /// A number failed to parse because it was too big to fit within the target /// type. NumberTooBig, /// An invalid unicode value was found in a `\u{...}` escape in a string, /// only valid unicode scalars can be escaped that way. InvalidUnicodeValue(u32), /// A lone underscore was found when parsing a number, since underscores /// should always be preceded and succeeded with a digit of some form. LoneUnderscore, /// A "confusing" unicode character is present in a comment or a string /// literal, such as a character that changes the direction text is /// typically displayed in editors. This could cause the human-read /// version to behave differently than the compiler-visible version, so /// these are simply rejected for now. ConfusingUnicode(char), } /// A sign token for an integer. #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum SignToken { /// Plus sign: "+", Plus, /// Minus sign: "-", Minus, } /// A fully parsed integer from a source string with a payload ready to parse /// into an integral type. #[derive(Debug, PartialEq)] pub struct Integer<'a> { sign: Option, val: Cow<'a, str>, hex: bool, } /// Possible parsed float values #[derive(Debug, PartialEq, Eq)] pub enum Float<'a> { /// A float `NaN` representation Nan { /// The specific bits to encode for this float, optionally val: Option>, /// Whether or not this is a negative `NaN` or not. negative: bool, }, /// An float infinite representation, Inf { #[allow(missing_docs)] negative: bool, }, /// A parsed and separated floating point value Val { /// Whether or not the `integral` and `decimal` are specified in hex hex: bool, /// The float parts before the `.` integral: Cow<'a, str>, /// The float parts after the `.` decimal: Option>, /// The exponent to multiple this `integral.decimal` portion of the /// float by. If `hex` is true this is `2^exponent` and otherwise it's /// `10^exponent` exponent: Option>, }, } // https://webassembly.github.io/spec/core/text/values.html#text-idchar macro_rules! idchars { () => { b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z' | b'!' | b'#' | b'$' | b'%' | b'&' | b'\'' | b'*' | b'+' | b'-' | b'.' | b'/' | b':' | b'<' | b'=' | b'>' | b'?' | b'@' | b'\\' | b'^' | b'_' | b'`' | b'|' | b'~' } } impl<'a> Lexer<'a> { /// Creates a new lexer which will lex the `input` source string. pub fn new(input: &str) -> Lexer<'_> { Lexer { input, allow_confusing_unicode: false, } } /// Returns the original source input that we're lexing. pub fn input(&self) -> &'a str { self.input } /// Configures whether "confusing" unicode characters are allowed while /// lexing. /// /// If allowed then no error will happen if these characters are found, but /// otherwise if disallowed a lex error will be produced when these /// characters are found. Confusing characters are denied by default. /// /// For now "confusing characters" are primarily related to the "trojan /// source" problem where it refers to characters which cause humans to read /// text differently than this lexer, such as characters that alter the /// left-to-right display of the source code. pub fn allow_confusing_unicode(&mut self, allow: bool) -> &mut Self { self.allow_confusing_unicode = allow; self } /// Lexes the next at the byte position `pos` in the input. /// /// Returns `Some` if a token is found or `None` if we're at EOF. /// /// The `pos` argument will be updated to point to the next token on a /// successful parse. /// /// # Errors /// /// Returns an error if the input is malformed. pub fn parse(&self, pos: &mut usize) -> Result, Error> { let offset = *pos; Ok(match self.parse_kind(pos)? { Some(kind) => Some(Token { kind, offset, len: (*pos - offset).try_into().unwrap(), }), None => None, }) } fn parse_kind(&self, pos: &mut usize) -> Result, Error> { let start = *pos; // This `match` generally parses the grammar specified at // // https://webassembly.github.io/spec/core/text/lexical.html#text-token let remaining = &self.input.as_bytes()[start..]; let byte = match remaining.first() { Some(b) => b, None => return Ok(None), }; match byte { // Open-parens check the next character to see if this is the start // of a block comment, otherwise it's just a bland left-paren // token. b'(' => match remaining.get(1) { Some(b';') => { let mut level = 1; // Note that we're doing a byte-level search here for the // close-delimiter of `;)`. The actual source text is utf-8 // encode in `remaining` but due to how utf-8 works we // can safely search for an ASCII byte since it'll never // otherwise appear in the middle of a codepoint and if we // find it then it's guaranteed to be the right byte. // // Mainly we're avoiding the overhead of decoding utf-8 // characters into a Rust `char` since it's otherwise // unnecessary work. let mut iter = remaining[2..].iter(); while let Some(ch) = iter.next() { match ch { b'(' => { if let Some(b';') = iter.as_slice().first() { level += 1; iter.next(); } } b';' => { if let Some(b')') = iter.as_slice().first() { level -= 1; iter.next(); if level == 0 { let len = remaining.len() - iter.as_slice().len(); let comment = &self.input[start..][..len]; *pos += len; self.check_confusing_comment(*pos, comment)?; return Ok(Some(TokenKind::BlockComment)); } } } _ => {} } } Err(self.error(start, LexError::DanglingBlockComment)) } _ => { *pos += 1; Ok(Some(TokenKind::LParen)) } }, b')' => { *pos += 1; Ok(Some(TokenKind::RParen)) } // https://webassembly.github.io/spec/core/text/lexical.html#white-space b' ' | b'\n' | b'\r' | b'\t' => { self.skip_ws(pos); Ok(Some(TokenKind::Whitespace)) } c @ (idchars!() | b'"') => { let (kind, src) = self.parse_reserved(pos)?; match kind { // If the reserved token was simply a single string then // that is converted to a standalone string token ReservedKind::String => return Ok(Some(TokenKind::String)), // If only idchars were consumed then this could be a // specific kind of standalone token we're interested in. ReservedKind::Idchars => { // https://webassembly.github.io/spec/core/text/values.html#integers if let Some(ret) = self.classify_number(src) { return Ok(Some(ret)); // https://webassembly.github.io/spec/core/text/values.html#text-id } else if *c == b'$' && src.len() > 1 { return Ok(Some(TokenKind::Id)); // https://webassembly.github.io/spec/core/text/lexical.html#text-keyword } else if b'a' <= *c && *c <= b'z' { return Ok(Some(TokenKind::Keyword)); } } // ... otherwise this was a conglomeration of idchars, // strings, or just idchars that don't match a prior rule, // meaning this falls through to the fallback `Reserved` // token. ReservedKind::Reserved => {} } Ok(Some(TokenKind::Reserved)) } // This could be a line comment, otherwise `;` is a reserved token. // The second byte is checked to see if it's a `;;` line comment // // Note that this character being considered as part of a // `reserved` token is part of the annotations proposal. b';' => match remaining.get(1) { Some(b';') => { let remaining = &self.input[*pos..]; let byte_pos = memchr::memchr2(b'\n', b'\r', remaining.as_bytes()) .unwrap_or(remaining.len()); *pos += byte_pos; let comment = &remaining[..byte_pos]; self.check_confusing_comment(*pos, comment)?; Ok(Some(TokenKind::LineComment)) } _ => { *pos += 1; Ok(Some(TokenKind::Reserved)) } }, // Other known reserved tokens other than `;` // // Note that these characters being considered as part of a // `reserved` token is part of the annotations proposal. b',' | b'[' | b']' | b'{' | b'}' => { *pos += 1; Ok(Some(TokenKind::Reserved)) } _ => { let ch = self.input[start..].chars().next().unwrap(); Err(self.error(*pos, LexError::Unexpected(ch))) } } } fn skip_ws(&self, pos: &mut usize) { // This table is a byte lookup table to determine whether a byte is a // whitespace byte. There are only 4 whitespace bytes for the `*.wat` // format right now which are ' ', '\t', '\r', and '\n'. These 4 bytes // have a '1' in the table below. // // Due to how utf-8 works (our input is guaranteed to be utf-8) it is // known that if these bytes are found they're guaranteed to be the // whitespace byte, so they can be safely skipped and we don't have to // do full utf-8 decoding. This means that the goal of this function is // to find the first non-whitespace byte in `remaining`. // // For now this lookup table seems to be the fastest, but projects like // https://github.com/lemire/despacer show other simd algorithms which // can possibly accelerate this even more. Note that `*.wat` files often // have a lot of whitespace so this function is typically quite hot when // parsing inputs. #[rustfmt::skip] const WS: [u8; 256] = [ // \t \n \r /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ' ' /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xa0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xb0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xc0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xd0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xe0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xf0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]; let remaining = &self.input[*pos..]; let non_ws_pos = remaining .as_bytes() .iter() .position(|b| WS[*b as usize] != 1) .unwrap_or(remaining.len()); *pos += non_ws_pos; } /// Splits off a "reserved" token which is then further processed later on /// to figure out which kind of token it is `depending on `ReservedKind`. /// /// For more information on this method see the clarification at /// but the general gist is /// that this is parsing the grammar: /// /// ```text /// reserved := (idchar | string)+ /// ``` /// /// which means that it is eating any number of adjacent string/idchar /// tokens (e.g. `a"b"c`) and returning the classification of what was /// eaten. The classification assists in determining what the actual token /// here eaten looks like. fn parse_reserved(&self, pos: &mut usize) -> Result<(ReservedKind, &'a str), Error> { let mut idchars = false; let mut strings = 0u32; let start = *pos; while let Some(byte) = self.input.as_bytes().get(*pos) { match byte { // Normal `idchars` production which appends to the reserved // token that's being produced. idchars!() => { idchars = true; *pos += 1; } // https://webassembly.github.io/spec/core/text/values.html#text-string b'"' => { strings += 1; *pos += 1; let mut it = self.input[*pos..].chars(); let result = Lexer::parse_str(&mut it, self.allow_confusing_unicode); *pos = self.input.len() - it.as_str().len(); match result { Ok(_) => {} Err(e) => { let err_pos = match &e { LexError::UnexpectedEof => self.input.len(), _ => self.input[..*pos].char_indices().next_back().unwrap().0, }; return Err(self.error(err_pos, e)); } } } // Nothing else is considered part of a reserved token _ => break, } } let ret = &self.input[start..*pos]; Ok(match (idchars, strings) { (false, 0) => unreachable!(), (false, 1) => (ReservedKind::String, ret), (true, 0) => (ReservedKind::Idchars, ret), _ => (ReservedKind::Reserved, ret), }) } fn classify_number(&self, src: &str) -> Option { let (sign, num) = if let Some(stripped) = src.strip_prefix('+') { (Some(SignToken::Plus), stripped) } else if let Some(stripped) = src.strip_prefix('-') { (Some(SignToken::Minus), stripped) } else { (None, src) }; let negative = sign == Some(SignToken::Minus); // Handle `inf` and `nan` which are special numbers here if num == "inf" { return Some(TokenKind::Float(FloatKind::Inf { negative })); } else if num == "nan" { return Some(TokenKind::Float(FloatKind::Nan { negative })); } else if let Some(stripped) = num.strip_prefix("nan:0x") { let mut it = stripped.as_bytes().iter(); let has_underscores = skip_underscores(&mut it, |x| char::from(x).is_ascii_hexdigit())?; if it.next().is_some() { return None; } return Some(TokenKind::Float(FloatKind::NanVal { negative, has_underscores, })); } // Figure out if we're a hex number or not let test_valid: fn(u8) -> bool; let (mut it, hex) = if let Some(stripped) = num.strip_prefix("0x") { test_valid = |x: u8| char::from(x).is_ascii_hexdigit(); (stripped.as_bytes().iter(), true) } else { test_valid = |x: u8| char::from(x).is_ascii_digit(); (num.as_bytes().iter(), false) }; // Evaluate the first part, moving out all underscores let mut has_underscores = skip_underscores(&mut it, test_valid)?; match it.clone().next() { // If we're followed by something this may be a float so keep going. Some(_) => {} // Otherwise this is a valid integer literal! None => { return Some(TokenKind::Integer(IntegerKind { has_underscores, sign, hex, })) } } // A number can optionally be after the decimal so only actually try to // parse one if it's there. if it.clone().next() == Some(&b'.') { it.next(); match it.clone().next() { Some(c) if test_valid(*c) => { if skip_underscores(&mut it, test_valid)? { has_underscores = true; } } Some(_) | None => {} } }; // Figure out if there's an exponential part here to make a float, and // if so parse it but defer its actual calculation until later. match (hex, it.next()) { (true, Some(b'p')) | (true, Some(b'P')) | (false, Some(b'e')) | (false, Some(b'E')) => { match it.clone().next() { Some(b'-') => { it.next(); } Some(b'+') => { it.next(); } _ => {} } if skip_underscores(&mut it, |x| char::from(x).is_ascii_digit())? { has_underscores = true; } } (_, None) => {} _ => return None, } // We should have eaten everything by now, if not then this is surely // not a float or integer literal. if it.next().is_some() { return None; } return Some(TokenKind::Float(FloatKind::Normal { has_underscores, hex, })); fn skip_underscores<'a>( it: &mut slice::Iter<'_, u8>, good: fn(u8) -> bool, ) -> Option { let mut last_underscore = false; let mut has_underscores = false; let first = *it.next()?; if !good(first) { return None; } while let Some(c) = it.clone().next() { if *c == b'_' && !last_underscore { has_underscores = true; it.next(); last_underscore = true; continue; } if !good(*c) { break; } last_underscore = false; it.next(); } if last_underscore { return None; } Some(has_underscores) } } /// Verifies that `comment`, which is about to be returned, has a "confusing /// unicode character" in it and should instead be transformed into an /// error. fn check_confusing_comment(&self, end: usize, comment: &str) -> Result<(), Error> { if self.allow_confusing_unicode { return Ok(()); } // In an effort to avoid utf-8 decoding the entire `comment` the search // here is a bit more optimized. This checks for the `0xe2` byte because // in the utf-8 encoding that's the leading encoding byte for all // "confusing characters". Each instance of 0xe2 is checked to see if it // starts a confusing character, and if so that's returned. // // Also note that 0xe2 will never be found in the middle of a codepoint, // it's always the start of a codepoint. This means that if our special // characters show up they're guaranteed to start with 0xe2 bytes. let bytes = comment.as_bytes(); for pos in memchr::Memchr::new(0xe2, bytes) { if let Some(c) = comment[pos..].chars().next() { if is_confusing_unicode(c) { // Note that `self.cur()` accounts for already having // parsed `comment`, so we move backwards to where // `comment` started and then add the index within // `comment`. let pos = end - comment.len() + pos; return Err(self.error(pos, LexError::ConfusingUnicode(c))); } } } Ok(()) } fn parse_str( it: &mut str::Chars<'a>, allow_confusing_unicode: bool, ) -> Result, LexError> { enum State { Start, String(Vec), } let orig = it.as_str(); let mut state = State::Start; loop { match it.next().ok_or(LexError::UnexpectedEof)? { '"' => break, '\\' => { match state { State::String(_) => {} State::Start => { let pos = orig.len() - it.as_str().len() - 1; state = State::String(orig[..pos].as_bytes().to_vec()); } } let buf = match &mut state { State::String(b) => b, State::Start => unreachable!(), }; match it.next().ok_or(LexError::UnexpectedEof)? { '"' => buf.push(b'"'), '\'' => buf.push(b'\''), 't' => buf.push(b'\t'), 'n' => buf.push(b'\n'), 'r' => buf.push(b'\r'), '\\' => buf.push(b'\\'), 'u' => { Lexer::must_eat_char(it, '{')?; let n = Lexer::hexnum(it)?; let c = char::from_u32(n).ok_or(LexError::InvalidUnicodeValue(n))?; buf.extend(c.encode_utf8(&mut [0; 4]).as_bytes()); Lexer::must_eat_char(it, '}')?; } c1 if c1.is_ascii_hexdigit() => { let c2 = Lexer::hexdigit(it)?; buf.push(to_hex(c1) * 16 + c2); } c => return Err(LexError::InvalidStringEscape(c)), } } c if (c as u32) < 0x20 || c as u32 == 0x7f => { return Err(LexError::InvalidStringElement(c)) } c if !allow_confusing_unicode && is_confusing_unicode(c) => { return Err(LexError::ConfusingUnicode(c)) } c => match &mut state { State::Start => {} State::String(v) => { v.extend(c.encode_utf8(&mut [0; 4]).as_bytes()); } }, } } match state { State::Start => Ok(orig[..orig.len() - it.as_str().len() - 1].as_bytes().into()), State::String(s) => Ok(s.into()), } } fn hexnum(it: &mut str::Chars<'_>) -> Result { let n = Lexer::hexdigit(it)?; let mut last_underscore = false; let mut n = n as u32; while let Some(c) = it.clone().next() { if c == '_' { it.next(); last_underscore = true; continue; } if !c.is_ascii_hexdigit() { break; } last_underscore = false; it.next(); n = n .checked_mul(16) .and_then(|n| n.checked_add(to_hex(c) as u32)) .ok_or(LexError::NumberTooBig)?; } if last_underscore { return Err(LexError::LoneUnderscore); } Ok(n) } /// Reads a hexidecimal digit from the input stream, returning where it's /// defined and the hex value. Returns an error on EOF or an invalid hex /// digit. fn hexdigit(it: &mut str::Chars<'_>) -> Result { let ch = Lexer::must_char(it)?; if ch.is_ascii_hexdigit() { Ok(to_hex(ch)) } else { Err(LexError::InvalidHexDigit(ch)) } } /// Reads the next character from the input string and where it's located, /// returning an error if the input stream is empty. fn must_char(it: &mut str::Chars<'_>) -> Result { it.next().ok_or(LexError::UnexpectedEof) } /// Expects that a specific character must be read next fn must_eat_char(it: &mut str::Chars<'_>, wanted: char) -> Result<(), LexError> { let found = Lexer::must_char(it)?; if wanted == found { Ok(()) } else { Err(LexError::Expected { wanted, found }) } } /// Creates an error at `pos` with the specified `kind` fn error(&self, pos: usize, kind: LexError) -> Error { Error::lex(Span { offset: pos }, self.input, kind) } /// Returns an iterator over all tokens in the original source string /// starting at the `pos` specified. pub fn iter(&self, mut pos: usize) -> impl Iterator> + '_ { std::iter::from_fn(move || self.parse(&mut pos).transpose()) } /// Returns whether an annotation is present at `pos` and the name of the /// annotation. pub fn annotation(&self, mut pos: usize) -> Option<&'a str> { let bytes = self.input.as_bytes(); // Quickly reject anything that for sure isn't an annotation since this // method is used every time an lparen is parsed. if bytes.get(pos) != Some(&b'@') { return None; } match self.parse(&mut pos) { Ok(Some(token)) => { match token.kind { TokenKind::Reserved => {} _ => return None, } if token.len == 1 { None // just the `@` character isn't a valid annotation } else { Some(&token.src(self.input)[1..]) } } Ok(None) | Err(_) => None, } } } impl Token { /// Returns the original source text for this token. pub fn src<'a>(&self, s: &'a str) -> &'a str { &s[self.offset..][..self.len.try_into().unwrap()] } /// Returns the identifier, without the leading `$` symbol, that this token /// represents. /// /// Should only be used with `TokenKind::Id`. pub fn id<'a>(&self, s: &'a str) -> &'a str { &self.src(s)[1..] } /// Returns the keyword this token represents. /// /// Should only be used with [`TokenKind::Keyword`]. pub fn keyword<'a>(&self, s: &'a str) -> &'a str { self.src(s) } /// Returns the reserved string this token represents. /// /// Should only be used with [`TokenKind::Reserved`]. pub fn reserved<'a>(&self, s: &'a str) -> &'a str { self.src(s) } /// Returns the parsed string that this token represents. /// /// This returns either a raw byte slice into the source if that's possible /// or an owned representation to handle escaped characters and such. /// /// Should only be used with [`TokenKind::String`]. pub fn string<'a>(&self, s: &'a str) -> Cow<'a, [u8]> { let mut ch = self.src(s).chars(); ch.next().unwrap(); Lexer::parse_str(&mut ch, true).unwrap() } /// Returns the decomposed float token that this represents. /// /// This will slice up the float token into its component parts and return a /// description of the float token in the source. /// /// Should only be used with [`TokenKind::Float`]. pub fn float<'a>(&self, s: &'a str, kind: FloatKind) -> Float<'a> { match kind { FloatKind::Inf { negative } => Float::Inf { negative }, FloatKind::Nan { negative } => Float::Nan { val: None, negative, }, FloatKind::NanVal { negative, has_underscores, } => { let src = self.src(s); let src = if src.starts_with("n") { src } else { &src[1..] }; let mut val = Cow::Borrowed(src.strip_prefix("nan:0x").unwrap()); if has_underscores { *val.to_mut() = val.replace("_", ""); } Float::Nan { val: Some(val), negative, } } FloatKind::Normal { has_underscores, hex, } => { let src = self.src(s); let (integral, decimal, exponent) = match src.find('.') { Some(i) => { let integral = &src[..i]; let rest = &src[i + 1..]; let exponent = if hex { rest.find('p').or_else(|| rest.find('P')) } else { rest.find('e').or_else(|| rest.find('E')) }; match exponent { Some(i) => (integral, Some(&rest[..i]), Some(&rest[i + 1..])), None => (integral, Some(rest), None), } } None => { let exponent = if hex { src.find('p').or_else(|| src.find('P')) } else { src.find('e').or_else(|| src.find('E')) }; match exponent { Some(i) => (&src[..i], None, Some(&src[i + 1..])), None => (src, None, None), } } }; let mut integral = Cow::Borrowed(integral.strip_prefix('+').unwrap_or(integral)); let mut decimal = decimal.and_then(|s| { if s.is_empty() { None } else { Some(Cow::Borrowed(s)) } }); let mut exponent = exponent.map(|s| Cow::Borrowed(s.strip_prefix('+').unwrap_or(s))); if has_underscores { *integral.to_mut() = integral.replace("_", ""); if let Some(decimal) = &mut decimal { *decimal.to_mut() = decimal.replace("_", ""); } if let Some(exponent) = &mut exponent { *exponent.to_mut() = exponent.replace("_", ""); } } if hex { *integral.to_mut() = integral.replace("0x", ""); } Float::Val { hex, integral, decimal, exponent, } } } } /// Returns the decomposed integer token that this represents. /// /// This will slice up the integer token into its component parts and /// return a description of the integer token in the source. /// /// Should only be used with [`TokenKind::Integer`]. pub fn integer<'a>(&self, s: &'a str, kind: IntegerKind) -> Integer<'a> { let src = self.src(s); let val = match kind.sign { Some(SignToken::Plus) => src.strip_prefix('+').unwrap(), Some(SignToken::Minus) => src, None => src, }; let mut val = Cow::Borrowed(val); if kind.has_underscores { *val.to_mut() = val.replace("_", ""); } if kind.hex { *val.to_mut() = val.replace("0x", ""); } Integer { sign: kind.sign, hex: kind.hex, val, } } } impl<'a> Integer<'a> { /// Returns the sign token for this integer. pub fn sign(&self) -> Option { self.sign } /// Returns the value string that can be parsed for this integer, as well /// as the base that it should be parsed in pub fn val(&self) -> (&str, u32) { (&self.val, if self.hex { 16 } else { 10 }) } } fn to_hex(c: char) -> u8 { match c { 'a'..='f' => c as u8 - b'a' + 10, 'A'..='F' => c as u8 - b'A' + 10, _ => c as u8 - b'0', } } impl fmt::Display for LexError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use LexError::*; match self { DanglingBlockComment => f.write_str("unterminated block comment")?, Unexpected(c) => write!(f, "unexpected character '{}'", escape_char(*c))?, InvalidStringElement(c) => { write!(f, "invalid character in string '{}'", escape_char(*c))? } InvalidStringEscape(c) => write!(f, "invalid string escape '{}'", escape_char(*c))?, InvalidHexDigit(c) => write!(f, "invalid hex digit '{}'", escape_char(*c))?, InvalidDigit(c) => write!(f, "invalid decimal digit '{}'", escape_char(*c))?, Expected { wanted, found } => write!( f, "expected '{}' but found '{}'", escape_char(*wanted), escape_char(*found) )?, UnexpectedEof => write!(f, "unexpected end-of-file")?, NumberTooBig => f.write_str("number is too big to parse")?, InvalidUnicodeValue(c) => write!(f, "invalid unicode scalar value 0x{:x}", c)?, LoneUnderscore => write!(f, "bare underscore in numeric literal")?, ConfusingUnicode(c) => write!(f, "likely-confusing unicode character found {:?}", c)?, } Ok(()) } } fn escape_char(c: char) -> String { match c { '\t' => String::from("\\t"), '\r' => String::from("\\r"), '\n' => String::from("\\n"), '\\' => String::from("\\\\"), '\'' => String::from("\\\'"), '\"' => String::from("\""), '\x20'..='\x7e' => String::from(c), _ => c.escape_unicode().to_string(), } } /// This is an attempt to protect agains the "trojan source" [1] problem where /// unicode characters can cause editors to render source code differently /// for humans than the compiler itself sees. /// /// To mitigate this issue, and because it's relatively rare in practice, /// this simply rejects characters of that form. /// /// [1]: https://www.trojansource.codes/ fn is_confusing_unicode(ch: char) -> bool { matches!( ch, '\u{202a}' | '\u{202b}' | '\u{202d}' | '\u{202e}' | '\u{2066}' | '\u{2067}' | '\u{2068}' | '\u{206c}' | '\u{2069}' ) } #[cfg(test)] mod tests { use super::*; #[test] fn ws_smoke() { fn get_whitespace(input: &str) -> &str { let token = get_token(input); match token.kind { TokenKind::Whitespace => token.src(input), other => panic!("unexpected {:?}", other), } } assert_eq!(get_whitespace(" "), " "); assert_eq!(get_whitespace(" "), " "); assert_eq!(get_whitespace(" \n "), " \n "); assert_eq!(get_whitespace(" x"), " "); assert_eq!(get_whitespace(" ;"), " "); } #[test] fn line_comment_smoke() { fn get_line_comment(input: &str) -> &str { let token = get_token(input); match token.kind { TokenKind::LineComment => token.src(input), other => panic!("unexpected {:?}", other), } } assert_eq!(get_line_comment(";;"), ";;"); assert_eq!(get_line_comment(";; xyz"), ";; xyz"); assert_eq!(get_line_comment(";; xyz\nabc"), ";; xyz"); assert_eq!(get_line_comment(";;\nabc"), ";;"); assert_eq!(get_line_comment(";; \nabc"), ";; "); assert_eq!(get_line_comment(";; \rabc"), ";; "); assert_eq!(get_line_comment(";; \r\nabc"), ";; "); } #[test] fn block_comment_smoke() { fn get_block_comment(input: &str) -> &str { let token = get_token(input); match token.kind { TokenKind::BlockComment => token.src(input), other => panic!("unexpected {:?}", other), } } assert_eq!(get_block_comment("(;;)"), "(;;)"); assert_eq!(get_block_comment("(; ;)"), "(; ;)"); assert_eq!(get_block_comment("(; (;;) ;)"), "(; (;;) ;)"); } fn get_token(input: &str) -> Token { Lexer::new(input) .parse(&mut 0) .expect("no first token") .expect("no token") } #[test] fn lparen() { assert_eq!(get_token("((").kind, TokenKind::LParen); } #[test] fn rparen() { assert_eq!(get_token(")(").kind, TokenKind::RParen); } #[test] fn strings() { fn get_string(input: &str) -> Vec { let token = get_token(input); match token.kind { TokenKind::String => token.string(input).to_vec(), other => panic!("not keyword {:?}", other), } } assert_eq!(&*get_string("\"\""), b""); assert_eq!(&*get_string("\"a\""), b"a"); assert_eq!(&*get_string("\"a b c d\""), b"a b c d"); assert_eq!(&*get_string("\"\\\"\""), b"\""); assert_eq!(&*get_string("\"\\'\""), b"'"); assert_eq!(&*get_string("\"\\n\""), b"\n"); assert_eq!(&*get_string("\"\\t\""), b"\t"); assert_eq!(&*get_string("\"\\r\""), b"\r"); assert_eq!(&*get_string("\"\\\\\""), b"\\"); assert_eq!(&*get_string("\"\\01\""), &[1]); assert_eq!(&*get_string("\"\\u{1}\""), &[1]); assert_eq!( &*get_string("\"\\u{0f3}\""), '\u{0f3}'.encode_utf8(&mut [0; 4]).as_bytes() ); assert_eq!( &*get_string("\"\\u{0_f_3}\""), '\u{0f3}'.encode_utf8(&mut [0; 4]).as_bytes() ); for i in 0..=255i32 { let s = format!("\"\\{:02x}\"", i); assert_eq!(&*get_string(&s), &[i as u8]); } } #[test] fn id() { fn get_id(input: &str) -> &str { let token = get_token(input); match token.kind { TokenKind::Id => token.id(input), other => panic!("not id {:?}", other), } } assert_eq!(get_id("$x"), "x"); assert_eq!(get_id("$xyz"), "xyz"); assert_eq!(get_id("$x_z"), "x_z"); assert_eq!(get_id("$0^"), "0^"); assert_eq!(get_id("$0^;;"), "0^"); assert_eq!(get_id("$0^ ;;"), "0^"); } #[test] fn keyword() { fn get_keyword(input: &str) -> &str { let token = get_token(input); match token.kind { TokenKind::Keyword => token.keyword(input), other => panic!("not keyword {:?}", other), } } assert_eq!(get_keyword("x"), "x"); assert_eq!(get_keyword("xyz"), "xyz"); assert_eq!(get_keyword("x_z"), "x_z"); assert_eq!(get_keyword("x_z "), "x_z"); assert_eq!(get_keyword("x_z "), "x_z"); } #[test] fn reserved() { fn get_reserved(input: &str) -> &str { let token = get_token(input); match token.kind { TokenKind::Reserved => token.reserved(input), other => panic!("not reserved {:?}", other), } } assert_eq!(get_reserved("$ "), "$"); assert_eq!(get_reserved("^_x "), "^_x"); } #[test] fn integer() { fn get_integer(input: &str) -> String { let token = get_token(input); match token.kind { TokenKind::Integer(i) => token.integer(input, i).val.to_string(), other => panic!("not integer {:?}", other), } } assert_eq!(get_integer("1"), "1"); assert_eq!(get_integer("0"), "0"); assert_eq!(get_integer("-1"), "-1"); assert_eq!(get_integer("+1"), "1"); assert_eq!(get_integer("+1_000"), "1000"); assert_eq!(get_integer("+1_0_0_0"), "1000"); assert_eq!(get_integer("+0x10"), "10"); assert_eq!(get_integer("-0x10"), "-10"); assert_eq!(get_integer("0x10"), "10"); } #[test] fn float() { fn get_float(input: &str) -> Float<'_> { let token = get_token(input); match token.kind { TokenKind::Float(f) => token.float(input, f), other => panic!("not float {:?}", other), } } assert_eq!( get_float("nan"), Float::Nan { val: None, negative: false }, ); assert_eq!( get_float("-nan"), Float::Nan { val: None, negative: true, }, ); assert_eq!( get_float("+nan"), Float::Nan { val: None, negative: false, }, ); assert_eq!( get_float("+nan:0x1"), Float::Nan { val: Some("1".into()), negative: false, }, ); assert_eq!( get_float("nan:0x7f_ffff"), Float::Nan { val: Some("7fffff".into()), negative: false, }, ); assert_eq!(get_float("inf"), Float::Inf { negative: false }); assert_eq!(get_float("-inf"), Float::Inf { negative: true }); assert_eq!(get_float("+inf"), Float::Inf { negative: false }); assert_eq!( get_float("1.2"), Float::Val { integral: "1".into(), decimal: Some("2".into()), exponent: None, hex: false, }, ); assert_eq!( get_float("1.2e3"), Float::Val { integral: "1".into(), decimal: Some("2".into()), exponent: Some("3".into()), hex: false, }, ); assert_eq!( get_float("-1_2.1_1E+0_1"), Float::Val { integral: "-12".into(), decimal: Some("11".into()), exponent: Some("01".into()), hex: false, }, ); assert_eq!( get_float("+1_2.1_1E-0_1"), Float::Val { integral: "12".into(), decimal: Some("11".into()), exponent: Some("-01".into()), hex: false, }, ); assert_eq!( get_float("0x1_2.3_4p5_6"), Float::Val { integral: "12".into(), decimal: Some("34".into()), exponent: Some("56".into()), hex: true, }, ); assert_eq!( get_float("+0x1_2.3_4P-5_6"), Float::Val { integral: "12".into(), decimal: Some("34".into()), exponent: Some("-56".into()), hex: true, }, ); assert_eq!( get_float("1."), Float::Val { integral: "1".into(), decimal: None, exponent: None, hex: false, }, ); assert_eq!( get_float("0x1p-24"), Float::Val { integral: "1".into(), decimal: None, exponent: Some("-24".into()), hex: true, }, ); } }