//! Simple hand-written ungrammar lexer use crate::error::{bail, Result}; #[derive(Debug, Eq, PartialEq)] pub(crate) enum TokenKind { Node(String), Token(String), Eq, Star, Pipe, QMark, Colon, LParen, RParen, } #[derive(Debug)] pub(crate) struct Token { pub(crate) kind: TokenKind, pub(crate) loc: Location, } #[derive(Copy, Clone, Default, Debug)] pub(crate) struct Location { pub(crate) line: usize, pub(crate) column: usize, } impl Location { fn advance(&mut self, text: &str) { match text.rfind('\n') { Some(idx) => { self.line += text.chars().filter(|&it| it == '\n').count(); self.column = text[idx + 1..].chars().count(); } None => self.column += text.chars().count(), } } } pub(crate) fn tokenize(mut input: &str) -> Result> { let mut res = Vec::new(); let mut loc = Location::default(); while !input.is_empty() { let old_input = input; skip_ws(&mut input); skip_comment(&mut input); if old_input.len() == input.len() { match advance(&mut input) { Ok(kind) => { res.push(Token { kind, loc }); } Err(err) => return Err(err.with_location(loc)), } } let consumed = old_input.len() - input.len(); loc.advance(&old_input[..consumed]); } Ok(res) } fn skip_ws(input: &mut &str) { *input = input.trim_start_matches(is_whitespace) } fn skip_comment(input: &mut &str) { if input.starts_with("//") { let idx = input.find('\n').map_or(input.len(), |it| it + 1); *input = &input[idx..] } } fn advance(input: &mut &str) -> Result { let mut chars = input.chars(); let c = chars.next().unwrap(); let res = match c { '=' => TokenKind::Eq, '*' => TokenKind::Star, '?' => TokenKind::QMark, '(' => TokenKind::LParen, ')' => TokenKind::RParen, '|' => TokenKind::Pipe, ':' => TokenKind::Colon, '\'' => { let mut buf = String::new(); loop { match chars.next() { None => bail!("unclosed token literal"), Some('\\') => match chars.next() { Some(c) if is_escapable(c) => buf.push(c), _ => bail!("invalid escape in token literal"), }, Some('\'') => break, Some(c) => buf.push(c), } } TokenKind::Token(buf) } c if is_ident_char(c) => { let mut buf = String::new(); buf.push(c); loop { match chars.clone().next() { Some(c) if is_ident_char(c) => { chars.next(); buf.push(c); } _ => break, } } TokenKind::Node(buf) } '\r' => bail!("unexpected `\\r`, only Unix-style line endings allowed"), c => bail!("unexpected character: `{}`", c), }; *input = chars.as_str(); Ok(res) } fn is_escapable(c: char) -> bool { matches!(c, '\\' | '\'') } fn is_whitespace(c: char) -> bool { matches!(c, ' ' | '\t' | '\n') } fn is_ident_char(c: char) -> bool { matches!(c, 'a'..='z' | 'A'..='Z' | '_') }