diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-28 14:29:10 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-28 14:29:10 +0000 |
commit | 2aa4a82499d4becd2284cdb482213d541b8804dd (patch) | |
tree | b80bf8bf13c3766139fbacc530efd0dd9d54394c /third_party/rust/wast/src/lexer.rs | |
parent | Initial commit. (diff) | |
download | firefox-upstream.tar.xz firefox-upstream.zip |
Adding upstream version 86.0.1.upstream/86.0.1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/wast/src/lexer.rs')
-rw-r--r-- | third_party/rust/wast/src/lexer.rs | 1125 |
1 files changed, 1125 insertions, 0 deletions
diff --git a/third_party/rust/wast/src/lexer.rs b/third_party/rust/wast/src/lexer.rs new file mode 100644 index 0000000000..99c46239f1 --- /dev/null +++ b/third_party/rust/wast/src/lexer.rs @@ -0,0 +1,1125 @@ +//! Definition of a lexer for the WebAssembly text format. +//! +//! This module provides a [`Lexer`][] type which is an iterate over the raw +//! tokens of a WebAssembly text file. A [`Lexer`][] accounts for every single +//! byte in a WebAssembly text field, returning tokens even for comments and +//! whitespace. Typically you'll ignore comments and whitespace, however. +//! +//! If you'd like to iterate over the tokens in a file you can do so via: +//! +//! ``` +//! # fn foo() -> Result<(), wast::Error> { +//! use wast::lexer::Lexer; +//! +//! let wat = "(module (func $foo))"; +//! for token in Lexer::new(wat) { +//! println!("{:?}", token?); +//! } +//! # Ok(()) +//! # } +//! ``` +//! +//! Note that you'll typically not use this module but will rather use +//! [`ParseBuffer`](crate::parser::ParseBuffer) instead. +//! +//! [`Lexer`]: crate::lexer::Lexer + +use crate::{Error, Span}; +use std::borrow::Cow; +use std::char; +use std::fmt; +use std::iter; +use std::str; + +/// A structure used to lex the s-expression syntax of WAT files. +/// +/// This structure is used to generate [`Source`] items, which should account for +/// every single byte of the input as we iterate over it. A [`LexError`] is +/// returned for any non-lexable text. +#[derive(Clone)] +pub struct Lexer<'a> { + it: iter::Peekable<str::CharIndices<'a>>, + input: &'a str, +} + +/// A fragment of source lex'd from an input string. +/// +/// This enumeration contains all kinds of fragments, including comments and +/// whitespace. For most cases you'll probably ignore these and simply look at +/// tokens. +#[derive(Debug, PartialEq)] +pub enum Token<'a> { + /// A line comment, preceded with `;;` + LineComment(&'a str), + + /// A block comment, surrounded by `(;` and `;)`. Note that these can be + /// nested. + BlockComment(&'a str), + + /// A fragment of source that represents whitespace. + Whitespace(&'a str), + + /// A left-parenthesis, including the source text for where it comes from. + LParen(&'a str), + /// A right-parenthesis, including the source text for where it comes from. + RParen(&'a str), + + /// A string literal, which is actually a list of bytes. + String(WasmString<'a>), + + /// An identifier (like `$foo`). + /// + /// All identifiers start with `$` and the payload here is the original + /// source text. + Id(&'a str), + + /// A keyword, or something that starts with an alphabetic character. + /// + /// The payload here is the original source text. + Keyword(&'a str), + + /// A reserved series of `idchar` symbols. Unknown what this is meant to be + /// used for, you'll probably generate an error about an unexpected token. + Reserved(&'a str), + + /// An integer. + Integer(Integer<'a>), + + /// A float. + Float(Float<'a>), +} + +/// Errors that can be generated while lexing. +/// +/// All lexing errors have line/colum/position information as well as a +/// `LexError` indicating what kind of error happened while lexing. +#[derive(Debug, Clone, PartialEq)] +pub enum LexError { + /// A dangling block comment was found with an unbalanced `(;` which was + /// never terminated in the file. + DanglingBlockComment, + + /// An unexpected character was encountered when generally parsing and + /// looking for something else. + Unexpected(char), + + /// An invalid `char` in a string literal was found. + InvalidStringElement(char), + + /// An invalid string escape letter was found (the thing after the `\` in + /// string literals) + InvalidStringEscape(char), + + /// An invalid hexadecimal digit was found. + InvalidHexDigit(char), + + /// An invalid base-10 digit was found. + InvalidDigit(char), + + /// Parsing expected `wanted` but ended up finding `found` instead where the + /// two characters aren't the same. + Expected { + /// The character that was expected to be found + wanted: char, + /// The character that was actually found + found: char, + }, + + /// We needed to parse more but EOF (or end of the string) was encountered. + UnexpectedEof, + + /// A number failed to parse because it was too big to fit within the target + /// type. + NumberTooBig, + + /// An invalid unicode value was found in a `\u{...}` escape in a string, + /// only valid unicode scalars can be escaped that way. + InvalidUnicodeValue(u32), + + /// A lone underscore was found when parsing a number, since underscores + /// should always be preceded and succeeded with a digit of some form. + LoneUnderscore, + + #[doc(hidden)] + __Nonexhaustive, +} + +/// A sign token for an integer. +#[derive(Clone, Copy, Debug, PartialEq)] +pub enum SignToken { + /// Plus sign: "+", + Plus, + /// Minus sign: "-", + Minus, +} + +/// A parsed integer, signed or unsigned. +/// +/// Methods can be use to access the value of the integer. +#[derive(Debug, PartialEq)] +pub struct Integer<'a>(Box<IntegerInner<'a>>); + +#[derive(Debug, PartialEq)] +struct IntegerInner<'a> { + sign: Option<SignToken>, + src: &'a str, + val: Cow<'a, str>, + hex: bool, +} + +/// A parsed float. +/// +/// Methods can be use to access the value of the float. +#[derive(Debug, PartialEq)] +pub struct Float<'a>(Box<FloatInner<'a>>); + +#[derive(Debug, PartialEq)] +struct FloatInner<'a> { + src: &'a str, + val: FloatVal<'a>, +} + +/// A parsed string. +#[derive(Debug, PartialEq)] +pub struct WasmString<'a>(Box<WasmStringInner<'a>>); + +#[derive(Debug, PartialEq)] +struct WasmStringInner<'a> { + src: &'a str, + val: Cow<'a, [u8]>, +} + +/// Possible parsed float values +#[derive(Debug, PartialEq)] +pub enum FloatVal<'a> { + /// A float `NaN` representation + Nan { + /// The specific bits to encode for this float, optionally + val: Option<u64>, + /// Whether or not this is a negative `NaN` or not. + negative: bool, + }, + /// An float infinite representation, + Inf { + #[allow(missing_docs)] + negative: bool, + }, + /// A parsed and separated floating point value + Val { + /// Whether or not the `integral` and `decimal` are specified in hex + hex: bool, + /// The float parts before the `.` + integral: Cow<'a, str>, + /// The float parts after the `.` + decimal: Option<Cow<'a, str>>, + /// The exponent to multiple this `integral.decimal` portion of the + /// float by. If `hex` is true this is `2^exponent` and otherwise it's + /// `10^exponent` + exponent: Option<Cow<'a, str>>, + }, +} + +impl<'a> Lexer<'a> { + /// Creates a new lexer which will lex the `input` source string. + pub fn new(input: &str) -> Lexer<'_> { + Lexer { + it: input.char_indices().peekable(), + input, + } + } + + /// Returns the original source input that we're lexing. + pub fn input(&self) -> &'a str { + self.input + } + + /// Lexes the next token in the input. + /// + /// Returns `Some` if a token is found or `None` if we're at EOF. + /// + /// # Errors + /// + /// Returns an error if the input is malformed. + pub fn parse(&mut self) -> Result<Option<Token<'a>>, Error> { + if let Some(ws) = self.ws() { + return Ok(Some(Token::Whitespace(ws))); + } + if let Some(comment) = self.comment()? { + return Ok(Some(comment)); + } + if let Some(token) = self.token()? { + return Ok(Some(token)); + } + match self.it.next() { + Some((i, ch)) => Err(self.error(i, LexError::Unexpected(ch))), + None => Ok(None), + } + } + + fn token(&mut self) -> Result<Option<Token<'a>>, Error> { + // First two are easy, they're just parens + if let Some(pos) = self.eat_char('(') { + return Ok(Some(Token::LParen(&self.input[pos..pos + 1]))); + } + if let Some(pos) = self.eat_char(')') { + return Ok(Some(Token::RParen(&self.input[pos..pos + 1]))); + } + + // Strings are also pretty easy, leading `"` is a dead giveaway + if let Some(pos) = self.eat_char('"') { + let val = self.string()?; + let src = &self.input[pos..self.cur()]; + return Ok(Some(Token::String(WasmString(Box::new(WasmStringInner { + val, + src, + }))))); + } + + let (start, prefix) = match self.it.peek().cloned() { + Some((i, ch)) if is_idchar(ch) => (i, ch), + Some((i, ch)) if is_reserved_extra(ch) => { + self.it.next(); + return Ok(Some(Token::Reserved(&self.input[i..self.cur()]))); + } + Some((i, ch)) => return Err(self.error(i, LexError::Unexpected(ch))), + None => return Ok(None), + }; + + while let Some((_, ch)) = self.it.peek().cloned() { + if is_idchar(ch) { + self.it.next(); + } else { + break; + } + } + + let reserved = &self.input[start..self.cur()]; + if let Some(number) = self.number(reserved) { + Ok(Some(number)) + } else if prefix == '$' && reserved.len() > 1 { + Ok(Some(Token::Id(reserved))) + } else if 'a' <= prefix && prefix <= 'z' { + Ok(Some(Token::Keyword(reserved))) + } else { + Ok(Some(Token::Reserved(reserved))) + } + } + + fn number(&self, src: &'a str) -> Option<Token<'a>> { + let (sign, num) = if src.starts_with('+') { + (Some(SignToken::Plus), &src[1..]) + } else if src.starts_with('-') { + (Some(SignToken::Minus), &src[1..]) + } else { + (None, src) + }; + + let negative = sign == Some(SignToken::Minus); + + // Handle `inf` and `nan` which are special numbers here + if num == "inf" { + return Some(Token::Float(Float(Box::new(FloatInner { + src, + val: FloatVal::Inf { negative }, + })))); + } else if num == "nan" { + return Some(Token::Float(Float(Box::new(FloatInner { + src, + val: FloatVal::Nan { + val: None, + negative, + }, + })))); + } else if num.starts_with("nan:0x") { + let mut it = num[6..].chars(); + let to_parse = skip_undescores(&mut it, false, char::is_ascii_hexdigit)?; + if it.next().is_some() { + return None; + } + let n = u64::from_str_radix(&to_parse, 16).ok()?; + return Some(Token::Float(Float(Box::new(FloatInner { + src, + val: FloatVal::Nan { + val: Some(n), + negative, + }, + })))); + } + + // Figure out if we're a hex number or not + let (mut it, hex, test_valid) = if num.starts_with("0x") { + ( + num[2..].chars(), + true, + char::is_ascii_hexdigit as fn(&char) -> bool, + ) + } else { + ( + num.chars(), + false, + char::is_ascii_digit as fn(&char) -> bool, + ) + }; + + // Evaluate the first part, moving out all underscores + let val = skip_undescores(&mut it, negative, test_valid)?; + + match it.clone().next() { + // If we're followed by something this may be a float so keep going. + Some(_) => {} + + // Otherwise this is a valid integer literal! + None => { + return Some(Token::Integer(Integer(Box::new(IntegerInner { + sign, + src, + val, + hex, + })))) + } + } + + // A number can optionally be after the decimal so only actually try to + // parse one if it's there. + let decimal = if it.clone().next() == Some('.') { + it.next(); + match it.clone().next() { + Some(c) if test_valid(&c) => Some(skip_undescores(&mut it, false, test_valid)?), + Some(_) | None => None, + } + } else { + None + }; + + // Figure out if there's an exponential part here to make a float, and + // if so parse it but defer its actual calculation until later. + let exponent = match (hex, it.next()) { + (true, Some('p')) | (true, Some('P')) | (false, Some('e')) | (false, Some('E')) => { + let negative = match it.clone().next() { + Some('-') => { + it.next(); + true + } + Some('+') => { + it.next(); + false + } + _ => false, + }; + Some(skip_undescores(&mut it, negative, char::is_ascii_digit)?) + } + (_, None) => None, + _ => return None, + }; + + // We should have eaten everything by now, if not then this is surely + // not a float or integer literal. + if it.next().is_some() { + return None; + } + + return Some(Token::Float(Float(Box::new(FloatInner { + src, + val: FloatVal::Val { + hex, + integral: val, + exponent, + decimal, + }, + })))); + + fn skip_undescores<'a>( + it: &mut str::Chars<'a>, + negative: bool, + good: fn(&char) -> bool, + ) -> Option<Cow<'a, str>> { + enum State { + Raw, + Collecting(String), + } + let mut last_underscore = false; + let mut state = if negative { + State::Collecting("-".to_string()) + } else { + State::Raw + }; + let input = it.as_str(); + let first = it.next()?; + if !good(&first) { + return None; + } + if let State::Collecting(s) = &mut state { + s.push(first); + } + let mut last = 1; + while let Some(c) = it.clone().next() { + if c == '_' && !last_underscore { + if let State::Raw = state { + state = State::Collecting(input[..last].to_string()); + } + it.next(); + last_underscore = true; + continue; + } + if !good(&c) { + break; + } + if let State::Collecting(s) = &mut state { + s.push(c); + } + last_underscore = false; + it.next(); + last += 1; + } + if last_underscore { + return None; + } + Some(match state { + State::Raw => input[..last].into(), + State::Collecting(s) => s.into(), + }) + } + } + + /// Attempts to consume whitespace from the input stream, returning `None` + /// if there's no whitespace to consume + fn ws(&mut self) -> Option<&'a str> { + let start = self.cur(); + loop { + match self.it.peek() { + Some((_, ' ')) | Some((_, '\n')) | Some((_, '\r')) | Some((_, '\t')) => { + drop(self.it.next()) + } + _ => break, + } + } + let end = self.cur(); + if start != end { + Some(&self.input[start..end]) + } else { + None + } + } + + /// Attempts to read a comment from the input stream + fn comment(&mut self) -> Result<Option<Token<'a>>, Error> { + if let Some(start) = self.eat_str(";;") { + loop { + match self.it.peek() { + None | Some((_, '\n')) => break, + _ => drop(self.it.next()), + } + } + let end = self.cur(); + return Ok(Some(Token::LineComment(&self.input[start..end]))); + } + if let Some(start) = self.eat_str("(;") { + let mut level = 1; + while let Some((_, ch)) = self.it.next() { + if ch == '(' && self.eat_char(';').is_some() { + level += 1; + } + if ch == ';' && self.eat_char(')').is_some() { + level -= 1; + if level == 0 { + let end = self.cur(); + return Ok(Some(Token::BlockComment(&self.input[start..end]))); + } + } + } + + return Err(self.error(start, LexError::DanglingBlockComment)); + } + Ok(None) + } + + /// Reads everything for a literal string except the leading `"`. Returns + /// the string value that has been read. + fn string(&mut self) -> Result<Cow<'a, [u8]>, Error> { + enum State { + Start(usize), + String(Vec<u8>), + } + let mut state = State::Start(self.cur()); + loop { + match self.it.next() { + Some((i, '\\')) => { + match state { + State::String(_) => {} + State::Start(start) => { + state = State::String(self.input[start..i].as_bytes().to_vec()); + } + } + let buf = match &mut state { + State::String(b) => b, + State::Start(_) => unreachable!(), + }; + match self.it.next() { + Some((_, '"')) => buf.push(b'"'), + Some((_, '\'')) => buf.push(b'\''), + Some((_, 't')) => buf.push(b'\t'), + Some((_, 'n')) => buf.push(b'\n'), + Some((_, 'r')) => buf.push(b'\r'), + Some((_, '\\')) => buf.push(b'\\'), + Some((i, 'u')) => { + self.must_eat_char('{')?; + let n = self.hexnum()?; + let c = char::from_u32(n) + .ok_or_else(|| self.error(i, LexError::InvalidUnicodeValue(n)))?; + buf.extend(c.encode_utf8(&mut [0; 4]).as_bytes()); + self.must_eat_char('}')?; + } + Some((_, c1)) if c1.is_ascii_hexdigit() => { + let (_, c2) = self.hexdigit()?; + buf.push(to_hex(c1) * 16 + c2); + } + Some((i, c)) => return Err(self.error(i, LexError::InvalidStringEscape(c))), + None => return Err(self.error(self.input.len(), LexError::UnexpectedEof)), + } + } + Some((_, '"')) => break, + Some((i, c)) => { + if (c as u32) < 0x20 || c as u32 == 0x7f { + return Err(self.error(i, LexError::InvalidStringElement(c))); + } + match &mut state { + State::Start(_) => {} + State::String(v) => { + v.extend(c.encode_utf8(&mut [0; 4]).as_bytes()); + } + } + } + None => return Err(self.error(self.input.len(), LexError::UnexpectedEof)), + } + } + match state { + State::Start(pos) => Ok(self.input[pos..self.cur() - 1].as_bytes().into()), + State::String(s) => Ok(s.into()), + } + } + + fn hexnum(&mut self) -> Result<u32, Error> { + let (_, n) = self.hexdigit()?; + let mut last_underscore = false; + let mut n = n as u32; + while let Some((i, c)) = self.it.peek().cloned() { + if c == '_' { + self.it.next(); + last_underscore = true; + continue; + } + if !c.is_ascii_hexdigit() { + break; + } + last_underscore = false; + self.it.next(); + n = n + .checked_mul(16) + .and_then(|n| n.checked_add(to_hex(c) as u32)) + .ok_or_else(|| self.error(i, LexError::NumberTooBig))?; + } + if last_underscore { + let cur = self.cur(); + return Err(self.error(cur - 1, LexError::LoneUnderscore)); + } + Ok(n) + } + + /// Reads a hexidecimal digit from the input stream, returning where it's + /// defined and the hex value. Returns an error on EOF or an invalid hex + /// digit. + fn hexdigit(&mut self) -> Result<(usize, u8), Error> { + let (i, ch) = self.must_char()?; + if ch.is_ascii_hexdigit() { + Ok((i, to_hex(ch))) + } else { + Err(self.error(i, LexError::InvalidHexDigit(ch))) + } + } + + /// Returns where the match started, if any + fn eat_str(&mut self, s: &str) -> Option<usize> { + if !self.cur_str().starts_with(s) { + return None; + } + let ret = self.cur(); + for _ in s.chars() { + self.it.next(); + } + Some(ret) + } + + /// Returns where the match happened, if any + fn eat_char(&mut self, needle: char) -> Option<usize> { + match self.it.peek() { + Some((i, c)) if *c == needle => { + let ret = *i; + self.it.next(); + Some(ret) + } + _ => None, + } + } + + /// Reads the next character from the input string and where it's located, + /// returning an error if the input stream is empty. + fn must_char(&mut self) -> Result<(usize, char), Error> { + self.it + .next() + .ok_or_else(|| self.error(self.input.len(), LexError::UnexpectedEof)) + } + + /// Expects that a specific character must be read next + fn must_eat_char(&mut self, wanted: char) -> Result<usize, Error> { + let (pos, found) = self.must_char()?; + if wanted == found { + Ok(pos) + } else { + Err(self.error(pos, LexError::Expected { wanted, found })) + } + } + + /// Returns the current position of our iterator through the input string + fn cur(&mut self) -> usize { + self.it.peek().map(|p| p.0).unwrap_or(self.input.len()) + } + + /// Returns the remaining string that we have left to parse + fn cur_str(&mut self) -> &'a str { + &self.input[self.cur()..] + } + + /// Creates an error at `pos` with the specified `kind` + fn error(&self, pos: usize, kind: LexError) -> Error { + Error::lex(Span { offset: pos }, self.input, kind) + } +} + +impl<'a> Iterator for Lexer<'a> { + type Item = Result<Token<'a>, Error>; + + fn next(&mut self) -> Option<Self::Item> { + self.parse().transpose() + } +} + +impl<'a> Token<'a> { + /// Returns the original source text for this token. + pub fn src(&self) -> &'a str { + match self { + Token::Whitespace(s) => s, + Token::BlockComment(s) => s, + Token::LineComment(s) => s, + Token::LParen(s) => s, + Token::RParen(s) => s, + Token::String(s) => s.src(), + Token::Id(s) => s, + Token::Keyword(s) => s, + Token::Reserved(s) => s, + Token::Integer(i) => i.src(), + Token::Float(f) => f.src(), + } + } +} + +impl<'a> Integer<'a> { + /// Returns the sign token for this integer. + pub fn sign(&self) -> Option<SignToken> { + self.0.sign + } + + /// Returns the original source text for this integer. + pub fn src(&self) -> &'a str { + self.0.src + } + + /// Returns the value string that can be parsed for this integer, as well as + /// the base that it should be parsed in + pub fn val(&self) -> (&str, u32) { + (&self.0.val, if self.0.hex { 16 } else { 10 }) + } +} + +impl<'a> Float<'a> { + /// Returns the original source text for this integer. + pub fn src(&self) -> &'a str { + self.0.src + } + + /// Returns a parsed value of this float with all of the components still + /// listed as strings. + pub fn val(&self) -> &FloatVal<'a> { + &self.0.val + } +} + +impl<'a> WasmString<'a> { + /// Returns the original source text for this string. + pub fn src(&self) -> &'a str { + self.0.src + } + + /// Returns a parsed value, as a list of bytes, for this string. + pub fn val(&self) -> &[u8] { + &self.0.val + } +} + +fn to_hex(c: char) -> u8 { + match c { + 'a'..='f' => c as u8 - b'a' + 10, + 'A'..='F' => c as u8 - b'A' + 10, + _ => c as u8 - b'0', + } +} + +fn is_idchar(c: char) -> bool { + match c { + '0'..='9' + | 'a'..='z' + | 'A'..='Z' + | '!' + | '#' + | '$' + | '%' + | '&' + | '\'' + | '*' + | '+' + | '-' + | '.' + | '/' + | ':' + | '<' + | '=' + | '>' + | '?' + | '@' + | '\\' + | '^' + | '_' + | '`' + | '|' + | '~' => true, + _ => false, + } +} + +fn is_reserved_extra(c: char) -> bool { + match c { + ',' | ';' | '[' | ']' | '{' | '}' => true, + _ => false, + } +} + +impl fmt::Display for LexError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use LexError::*; + match self { + DanglingBlockComment => f.write_str("unterminated block comment")?, + Unexpected(c) => write!(f, "unexpected character {:?}", c)?, + InvalidStringElement(c) => write!(f, "invalid character in string {:?}", c)?, + InvalidStringEscape(c) => write!(f, "invalid string escape {:?}", c)?, + InvalidHexDigit(c) => write!(f, "invalid hex digit {:?}", c)?, + InvalidDigit(c) => write!(f, "invalid decimal digit {:?}", c)?, + Expected { wanted, found } => write!(f, "expected {:?} but found {:?}", wanted, found)?, + UnexpectedEof => write!(f, "unexpected end-of-file")?, + NumberTooBig => f.write_str("number is too big to parse")?, + InvalidUnicodeValue(c) => write!(f, "invalid unicode scalar value 0x{:x}", c)?, + LoneUnderscore => write!(f, "bare underscore in numeric literal")?, + __Nonexhaustive => unreachable!(), + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn ws_smoke() { + fn get_whitespace(input: &str) -> &str { + match Lexer::new(input).parse().expect("no first token") { + Some(Token::Whitespace(s)) => s, + other => panic!("unexpected {:?}", other), + } + } + assert_eq!(get_whitespace(" "), " "); + assert_eq!(get_whitespace(" "), " "); + assert_eq!(get_whitespace(" \n "), " \n "); + assert_eq!(get_whitespace(" x"), " "); + assert_eq!(get_whitespace(" ;"), " "); + } + + #[test] + fn line_comment_smoke() { + fn get_line_comment(input: &str) -> &str { + match Lexer::new(input).parse().expect("no first token") { + Some(Token::LineComment(s)) => s, + other => panic!("unexpected {:?}", other), + } + } + assert_eq!(get_line_comment(";;"), ";;"); + assert_eq!(get_line_comment(";; xyz"), ";; xyz"); + assert_eq!(get_line_comment(";; xyz\nabc"), ";; xyz"); + assert_eq!(get_line_comment(";;\nabc"), ";;"); + assert_eq!(get_line_comment(";; \nabc"), ";; "); + } + + #[test] + fn block_comment_smoke() { + fn get_block_comment(input: &str) -> &str { + match Lexer::new(input).parse().expect("no first token") { + Some(Token::BlockComment(s)) => s, + other => panic!("unexpected {:?}", other), + } + } + assert_eq!(get_block_comment("(;;)"), "(;;)"); + assert_eq!(get_block_comment("(; ;)"), "(; ;)"); + assert_eq!(get_block_comment("(; (;;) ;)"), "(; (;;) ;)"); + } + + fn get_token(input: &str) -> Token<'_> { + Lexer::new(input) + .parse() + .expect("no first token") + .expect("no token") + } + + #[test] + fn lparen() { + assert_eq!(get_token("(("), Token::LParen("(")); + } + + #[test] + fn rparen() { + assert_eq!(get_token(")("), Token::RParen(")")); + } + + #[test] + fn strings() { + fn get_string(input: &str) -> Vec<u8> { + match get_token(input) { + Token::String(s) => { + assert_eq!(input, s.src()); + s.val().to_vec() + } + other => panic!("not string {:?}", other), + } + } + assert_eq!(&*get_string("\"\""), b""); + assert_eq!(&*get_string("\"a\""), b"a"); + assert_eq!(&*get_string("\"a b c d\""), b"a b c d"); + assert_eq!(&*get_string("\"\\\"\""), b"\""); + assert_eq!(&*get_string("\"\\'\""), b"'"); + assert_eq!(&*get_string("\"\\n\""), b"\n"); + assert_eq!(&*get_string("\"\\t\""), b"\t"); + assert_eq!(&*get_string("\"\\r\""), b"\r"); + assert_eq!(&*get_string("\"\\\\\""), b"\\"); + assert_eq!(&*get_string("\"\\01\""), &[1]); + assert_eq!(&*get_string("\"\\u{1}\""), &[1]); + assert_eq!( + &*get_string("\"\\u{0f3}\""), + '\u{0f3}'.encode_utf8(&mut [0; 4]).as_bytes() + ); + assert_eq!( + &*get_string("\"\\u{0_f_3}\""), + '\u{0f3}'.encode_utf8(&mut [0; 4]).as_bytes() + ); + + for i in 0..=255i32 { + let s = format!("\"\\{:02x}\"", i); + assert_eq!(&*get_string(&s), &[i as u8]); + } + } + + #[test] + fn id() { + fn get_id(input: &str) -> &str { + match get_token(input) { + Token::Id(s) => s, + other => panic!("not id {:?}", other), + } + } + assert_eq!(get_id("$x"), "$x"); + assert_eq!(get_id("$xyz"), "$xyz"); + assert_eq!(get_id("$x_z"), "$x_z"); + assert_eq!(get_id("$0^"), "$0^"); + assert_eq!(get_id("$0^;;"), "$0^"); + assert_eq!(get_id("$0^ ;;"), "$0^"); + } + + #[test] + fn keyword() { + fn get_keyword(input: &str) -> &str { + match get_token(input) { + Token::Keyword(s) => s, + other => panic!("not id {:?}", other), + } + } + assert_eq!(get_keyword("x"), "x"); + assert_eq!(get_keyword("xyz"), "xyz"); + assert_eq!(get_keyword("x_z"), "x_z"); + assert_eq!(get_keyword("x_z "), "x_z"); + assert_eq!(get_keyword("x_z "), "x_z"); + } + + #[test] + fn reserved() { + fn get_reserved(input: &str) -> &str { + match get_token(input) { + Token::Reserved(s) => s, + other => panic!("not reserved {:?}", other), + } + } + assert_eq!(get_reserved("$ "), "$"); + assert_eq!(get_reserved("^_x "), "^_x"); + } + + #[test] + fn integer() { + fn get_integer(input: &str) -> String { + match get_token(input) { + Token::Integer(i) => { + assert_eq!(input, i.src()); + i.val().0.to_string() + } + other => panic!("not integer {:?}", other), + } + } + assert_eq!(get_integer("1"), "1"); + assert_eq!(get_integer("0"), "0"); + assert_eq!(get_integer("-1"), "-1"); + assert_eq!(get_integer("+1"), "1"); + assert_eq!(get_integer("+1_000"), "1000"); + assert_eq!(get_integer("+1_0_0_0"), "1000"); + assert_eq!(get_integer("+0x10"), "10"); + assert_eq!(get_integer("-0x10"), "-10"); + assert_eq!(get_integer("0x10"), "10"); + } + + #[test] + fn float() { + fn get_float(input: &str) -> FloatVal<'_> { + match get_token(input) { + Token::Float(i) => { + assert_eq!(input, i.src()); + i.0.val + } + other => panic!("not reserved {:?}", other), + } + } + assert_eq!( + get_float("nan"), + FloatVal::Nan { + val: None, + negative: false + }, + ); + assert_eq!( + get_float("-nan"), + FloatVal::Nan { + val: None, + negative: true, + }, + ); + assert_eq!( + get_float("+nan"), + FloatVal::Nan { + val: None, + negative: false, + }, + ); + assert_eq!( + get_float("+nan:0x1"), + FloatVal::Nan { + val: Some(1), + negative: false, + }, + ); + assert_eq!( + get_float("nan:0x7f_ffff"), + FloatVal::Nan { + val: Some(0x7fffff), + negative: false, + }, + ); + assert_eq!(get_float("inf"), FloatVal::Inf { negative: false }); + assert_eq!(get_float("-inf"), FloatVal::Inf { negative: true }); + assert_eq!(get_float("+inf"), FloatVal::Inf { negative: false }); + + assert_eq!( + get_float("1.2"), + FloatVal::Val { + integral: "1".into(), + decimal: Some("2".into()), + exponent: None, + hex: false, + }, + ); + assert_eq!( + get_float("1.2e3"), + FloatVal::Val { + integral: "1".into(), + decimal: Some("2".into()), + exponent: Some("3".into()), + hex: false, + }, + ); + assert_eq!( + get_float("-1_2.1_1E+0_1"), + FloatVal::Val { + integral: "-12".into(), + decimal: Some("11".into()), + exponent: Some("01".into()), + hex: false, + }, + ); + assert_eq!( + get_float("+1_2.1_1E-0_1"), + FloatVal::Val { + integral: "12".into(), + decimal: Some("11".into()), + exponent: Some("-01".into()), + hex: false, + }, + ); + assert_eq!( + get_float("0x1_2.3_4p5_6"), + FloatVal::Val { + integral: "12".into(), + decimal: Some("34".into()), + exponent: Some("56".into()), + hex: true, + }, + ); + assert_eq!( + get_float("+0x1_2.3_4P-5_6"), + FloatVal::Val { + integral: "12".into(), + decimal: Some("34".into()), + exponent: Some("-56".into()), + hex: true, + }, + ); + assert_eq!( + get_float("1."), + FloatVal::Val { + integral: "1".into(), + decimal: None, + exponent: None, + hex: false, + }, + ); + assert_eq!( + get_float("0x1p-24"), + FloatVal::Val { + integral: "1".into(), + decimal: None, + exponent: Some("-24".into()), + hex: true, + }, + ); + } +} |