// pest. The Elegant Parser // Copyright (c) 2018 DragoČ™ Tiselice // // Licensed under the Apache License, Version 2.0 // or the MIT // license , at your // option. All files in the project carrying such notice may not be copied, // modified, or distributed except according to those terms. //! Helpers for validating pest grammars that could help with debugging //! and provide a more user-friendly error message. use once_cell::sync::Lazy; use std::collections::{HashMap, HashSet}; use pest::error::{Error, ErrorVariant, InputLocation}; use pest::iterators::Pairs; use pest::unicode::unicode_property_names; use pest::Span; use crate::parser::{ParserExpr, ParserNode, ParserRule, Rule}; static RUST_KEYWORDS: Lazy> = Lazy::new(|| { [ "abstract", "alignof", "as", "become", "box", "break", "const", "continue", "crate", "do", "else", "enum", "extern", "false", "final", "fn", "for", "if", "impl", "in", "let", "loop", "macro", "match", "mod", "move", "mut", "offsetof", "override", "priv", "proc", "pure", "pub", "ref", "return", "Self", "self", "sizeof", "static", "struct", "super", "trait", "true", "type", "typeof", "unsafe", "unsized", "use", "virtual", "where", "while", "yield", ] .iter() .cloned() .collect() }); static PEST_KEYWORDS: Lazy> = Lazy::new(|| { [ "_", "ANY", "DROP", "EOI", "PEEK", "PEEK_ALL", "POP", "POP_ALL", "PUSH", "SOI", ] .iter() .cloned() .collect() }); static BUILTINS: Lazy> = Lazy::new(|| { [ "ANY", "DROP", "EOI", "PEEK", "PEEK_ALL", "POP", "POP_ALL", "SOI", "ASCII_DIGIT", "ASCII_NONZERO_DIGIT", "ASCII_BIN_DIGIT", "ASCII_OCT_DIGIT", "ASCII_HEX_DIGIT", "ASCII_ALPHA_LOWER", "ASCII_ALPHA_UPPER", "ASCII_ALPHA", "ASCII_ALPHANUMERIC", "ASCII", "NEWLINE", ] .iter() .cloned() .chain(unicode_property_names()) .collect::>() }); /// It checks the parsed grammar for common mistakes: /// - using Pest keywords /// - duplicate rules /// - undefined rules /// /// It returns a `Result` with a `Vec` of `Error`s if any of the above is found. /// If no errors are found, it returns the vector of names of used builtin rules. pub fn validate_pairs(pairs: Pairs<'_, Rule>) -> Result, Vec>> { let definitions: Vec<_> = pairs .clone() .filter(|pair| pair.as_rule() == Rule::grammar_rule) .map(|pair| pair.into_inner().next().unwrap()) .filter(|pair| pair.as_rule() != Rule::line_doc) .map(|pair| pair.as_span()) .collect(); let called_rules: Vec<_> = pairs .clone() .filter(|pair| pair.as_rule() == Rule::grammar_rule) .flat_map(|pair| { pair.into_inner() .flatten() .skip(1) .filter(|pair| pair.as_rule() == Rule::identifier) .map(|pair| pair.as_span()) }) .collect(); let mut errors = vec![]; errors.extend(validate_pest_keywords(&definitions)); errors.extend(validate_already_defined(&definitions)); errors.extend(validate_undefined(&definitions, &called_rules)); if !errors.is_empty() { return Err(errors); } let definitions: HashSet<_> = definitions.iter().map(|span| span.as_str()).collect(); let called_rules: HashSet<_> = called_rules.iter().map(|span| span.as_str()).collect(); let defaults = called_rules.difference(&definitions); Ok(defaults.cloned().collect()) } /// Validates that the given `definitions` do not contain any Rust keywords. #[allow(clippy::ptr_arg)] #[deprecated = "Rust keywords are no longer restricted from the pest grammar"] pub fn validate_rust_keywords(definitions: &Vec>) -> Vec> { let mut errors = vec![]; for definition in definitions { let name = definition.as_str(); if RUST_KEYWORDS.contains(name) { errors.push(Error::new_from_span( ErrorVariant::CustomError { message: format!("{} is a rust keyword", name), }, *definition, )) } } errors } /// Validates that the given `definitions` do not contain any Pest keywords. #[allow(clippy::ptr_arg)] pub fn validate_pest_keywords(definitions: &Vec>) -> Vec> { let mut errors = vec![]; for definition in definitions { let name = definition.as_str(); if PEST_KEYWORDS.contains(name) { errors.push(Error::new_from_span( ErrorVariant::CustomError { message: format!("{} is a pest keyword", name), }, *definition, )) } } errors } /// Validates that the given `definitions` do not contain any duplicate rules. #[allow(clippy::ptr_arg)] pub fn validate_already_defined(definitions: &Vec>) -> Vec> { let mut errors = vec![]; let mut defined = HashSet::new(); for definition in definitions { let name = definition.as_str(); if defined.contains(&name) { errors.push(Error::new_from_span( ErrorVariant::CustomError { message: format!("rule {} already defined", name), }, *definition, )) } else { defined.insert(name); } } errors } /// Validates that the given `definitions` do not contain any undefined rules. #[allow(clippy::ptr_arg)] pub fn validate_undefined<'i>( definitions: &Vec>, called_rules: &Vec>, ) -> Vec> { let mut errors = vec![]; let definitions: HashSet<_> = definitions.iter().map(|span| span.as_str()).collect(); for rule in called_rules { let name = rule.as_str(); if !definitions.contains(name) && !BUILTINS.contains(name) { errors.push(Error::new_from_span( ErrorVariant::CustomError { message: format!("rule {} is undefined", name), }, *rule, )) } } errors } /// Validates the abstract syntax tree for common mistakes: /// - infinite repetitions /// - choices that cannot be reached /// - left recursion #[allow(clippy::ptr_arg)] pub fn validate_ast<'a, 'i: 'a>(rules: &'a Vec>) -> Vec> { let mut errors = vec![]; errors.extend(validate_repetition(rules)); errors.extend(validate_choices(rules)); errors.extend(validate_whitespace_comment(rules)); errors.extend(validate_left_recursion(rules)); errors.sort_by_key(|error| match error.location { InputLocation::Span(span) => span, _ => unreachable!(), }); errors } fn is_non_progressing<'i>( expr: &ParserExpr<'i>, rules: &HashMap>, trace: &mut Vec, ) -> bool { match *expr { ParserExpr::Str(ref string) => string.is_empty(), ParserExpr::Ident(ref ident) => { if ident == "soi" || ident == "eoi" { return true; } if !trace.contains(ident) { if let Some(node) = rules.get(ident) { trace.push(ident.clone()); let result = is_non_progressing(&node.expr, rules, trace); trace.pop().unwrap(); return result; } } false } ParserExpr::PosPred(_) => true, ParserExpr::NegPred(_) => true, ParserExpr::Seq(ref lhs, ref rhs) => { is_non_progressing(&lhs.expr, rules, trace) && is_non_progressing(&rhs.expr, rules, trace) } ParserExpr::Choice(ref lhs, ref rhs) => { is_non_progressing(&lhs.expr, rules, trace) || is_non_progressing(&rhs.expr, rules, trace) } _ => false, } } fn is_non_failing<'i>( expr: &ParserExpr<'i>, rules: &HashMap>, trace: &mut Vec, ) -> bool { match *expr { ParserExpr::Str(ref string) => string.is_empty(), ParserExpr::Ident(ref ident) => { if !trace.contains(ident) { if let Some(node) = rules.get(ident) { trace.push(ident.clone()); let result = is_non_failing(&node.expr, rules, trace); trace.pop().unwrap(); return result; } } false } ParserExpr::Opt(_) => true, ParserExpr::Rep(_) => true, ParserExpr::Seq(ref lhs, ref rhs) => { is_non_failing(&lhs.expr, rules, trace) && is_non_failing(&rhs.expr, rules, trace) } ParserExpr::Choice(ref lhs, ref rhs) => { is_non_failing(&lhs.expr, rules, trace) || is_non_failing(&rhs.expr, rules, trace) } _ => false, } } fn validate_repetition<'a, 'i: 'a>(rules: &'a [ParserRule<'i>]) -> Vec> { let mut result = vec![]; let map = to_hash_map(rules); for rule in rules { let mut errors = rule.node .clone() .filter_map_top_down(|node| match node.expr { ParserExpr::Rep(ref other) | ParserExpr::RepOnce(ref other) | ParserExpr::RepMin(ref other, _) => { if is_non_failing(&other.expr, &map, &mut vec![]) { Some(Error::new_from_span( ErrorVariant::CustomError { message: "expression inside repetition cannot fail and will repeat \ infinitely" .to_owned() }, node.span )) } else if is_non_progressing(&other.expr, &map, &mut vec![]) { Some(Error::new_from_span( ErrorVariant::CustomError { message: "expression inside repetition is non-progressing and will repeat \ infinitely" .to_owned(), }, node.span )) } else { None } } _ => None }); result.append(&mut errors); } result } fn validate_choices<'a, 'i: 'a>(rules: &'a [ParserRule<'i>]) -> Vec> { let mut result = vec![]; let map = to_hash_map(rules); for rule in rules { let mut errors = rule .node .clone() .filter_map_top_down(|node| match node.expr { ParserExpr::Choice(ref lhs, _) => { let node = match lhs.expr { ParserExpr::Choice(_, ref rhs) => rhs, _ => lhs, }; if is_non_failing(&node.expr, &map, &mut vec![]) { Some(Error::new_from_span( ErrorVariant::CustomError { message: "expression cannot fail; following choices cannot be reached" .to_owned(), }, node.span, )) } else { None } } _ => None, }); result.append(&mut errors); } result } fn validate_whitespace_comment<'a, 'i: 'a>(rules: &'a [ParserRule<'i>]) -> Vec> { let map = to_hash_map(rules); rules .iter() .filter_map(|rule| { if rule.name == "WHITESPACE" || rule.name == "COMMENT" { if is_non_failing(&rule.node.expr, &map, &mut vec![]) { Some(Error::new_from_span( ErrorVariant::CustomError { message: format!( "{} cannot fail and will repeat infinitely", &rule.name ), }, rule.node.span, )) } else if is_non_progressing(&rule.node.expr, &map, &mut vec![]) { Some(Error::new_from_span( ErrorVariant::CustomError { message: format!( "{} is non-progressing and will repeat infinitely", &rule.name ), }, rule.node.span, )) } else { None } } else { None } }) .collect() } fn validate_left_recursion<'a, 'i: 'a>(rules: &'a [ParserRule<'i>]) -> Vec> { left_recursion(to_hash_map(rules)) } fn to_hash_map<'a, 'i: 'a>(rules: &'a [ParserRule<'i>]) -> HashMap> { rules.iter().map(|r| (r.name.clone(), &r.node)).collect() } fn left_recursion<'a, 'i: 'a>(rules: HashMap>) -> Vec> { fn check_expr<'a, 'i: 'a>( node: &'a ParserNode<'i>, rules: &'a HashMap>, trace: &mut Vec, ) -> Option> { match node.expr.clone() { ParserExpr::Ident(other) => { if trace[0] == other { trace.push(other); let chain = trace .iter() .map(|ident| ident.as_ref()) .collect::>() .join(" -> "); return Some(Error::new_from_span( ErrorVariant::CustomError { message: format!( "rule {} is left-recursive ({}); pest::pratt_parser might be useful \ in this case", node.span.as_str(), chain ) }, node.span )); } if !trace.contains(&other) { if let Some(node) = rules.get(&other) { trace.push(other); let result = check_expr(node, rules, trace); trace.pop().unwrap(); return result; } } None } ParserExpr::Seq(ref lhs, ref rhs) => { if is_non_failing(&lhs.expr, rules, &mut vec![trace.last().unwrap().clone()]) { check_expr(rhs, rules, trace) } else { check_expr(lhs, rules, trace) } } ParserExpr::Choice(ref lhs, ref rhs) => { check_expr(lhs, rules, trace).or_else(|| check_expr(rhs, rules, trace)) } ParserExpr::Rep(ref node) => check_expr(node, rules, trace), ParserExpr::RepOnce(ref node) => check_expr(node, rules, trace), ParserExpr::Opt(ref node) => check_expr(node, rules, trace), ParserExpr::PosPred(ref node) => check_expr(node, rules, trace), ParserExpr::NegPred(ref node) => check_expr(node, rules, trace), ParserExpr::Push(ref node) => check_expr(node, rules, trace), _ => None, } } let mut errors = vec![]; for (name, node) in &rules { let name = name.clone(); if let Some(error) = check_expr(node, &rules, &mut vec![name]) { errors.push(error); } } errors } #[cfg(test)] mod tests { use super::super::parser::{consume_rules, PestParser}; use super::super::unwrap_or_report; use super::*; use pest::Parser; #[test] #[should_panic(expected = "grammar error --> 1:1 | 1 | ANY = { \"a\" } | ^-^ | = ANY is a pest keyword")] fn pest_keyword() { let input = "ANY = { \"a\" }"; unwrap_or_report(validate_pairs( PestParser::parse(Rule::grammar_rules, input).unwrap(), )); } #[test] #[should_panic(expected = "grammar error --> 1:13 | 1 | a = { \"a\" } a = { \"a\" } | ^ | = rule a already defined")] fn already_defined() { let input = "a = { \"a\" } a = { \"a\" }"; unwrap_or_report(validate_pairs( PestParser::parse(Rule::grammar_rules, input).unwrap(), )); } #[test] #[should_panic(expected = "grammar error --> 1:7 | 1 | a = { b } | ^ | = rule b is undefined")] fn undefined() { let input = "a = { b }"; unwrap_or_report(validate_pairs( PestParser::parse(Rule::grammar_rules, input).unwrap(), )); } #[test] fn valid_recursion() { let input = "a = { \"\" ~ \"a\"? ~ \"a\"* ~ (\"a\" | \"b\") ~ a }"; unwrap_or_report(consume_rules( PestParser::parse(Rule::grammar_rules, input).unwrap(), )); } #[test] #[should_panic(expected = "grammar error --> 1:16 | 1 | WHITESPACE = { \"\" } | ^^ | = WHITESPACE cannot fail and will repeat infinitely")] fn non_failing_whitespace() { let input = "WHITESPACE = { \"\" }"; unwrap_or_report(consume_rules( PestParser::parse(Rule::grammar_rules, input).unwrap(), )); } #[test] #[should_panic(expected = "grammar error --> 1:13 | 1 | COMMENT = { soi } | ^-^ | = COMMENT is non-progressing and will repeat infinitely")] fn non_progressing_comment() { let input = "COMMENT = { soi }"; unwrap_or_report(consume_rules( PestParser::parse(Rule::grammar_rules, input).unwrap(), )); } #[test] #[should_panic(expected = "grammar error --> 1:7 | 1 | a = { (\"\")* } | ^---^ | = expression inside repetition cannot fail and will repeat infinitely")] fn non_failing_repetition() { let input = "a = { (\"\")* }"; unwrap_or_report(consume_rules( PestParser::parse(Rule::grammar_rules, input).unwrap(), )); } #[test] #[should_panic(expected = "grammar error --> 1:18 | 1 | a = { \"\" } b = { a* } | ^^ | = expression inside repetition cannot fail and will repeat infinitely")] fn indirect_non_failing_repetition() { let input = "a = { \"\" } b = { a* }"; unwrap_or_report(consume_rules( PestParser::parse(Rule::grammar_rules, input).unwrap(), )); } #[test] #[should_panic(expected = "grammar error --> 1:20 | 1 | a = { \"a\" ~ (\"b\" ~ (\"\")*) } | ^---^ | = expression inside repetition cannot fail and will repeat infinitely")] fn deep_non_failing_repetition() { let input = "a = { \"a\" ~ (\"b\" ~ (\"\")*) }"; unwrap_or_report(consume_rules( PestParser::parse(Rule::grammar_rules, input).unwrap(), )); } #[test] #[should_panic(expected = "grammar error --> 1:7 | 1 | a = { (\"\" ~ &\"a\" ~ !\"a\" ~ (soi | eoi))* } | ^-------------------------------^ | = expression inside repetition is non-progressing and will repeat infinitely")] fn non_progressing_repetition() { let input = "a = { (\"\" ~ &\"a\" ~ !\"a\" ~ (soi | eoi))* }"; unwrap_or_report(consume_rules( PestParser::parse(Rule::grammar_rules, input).unwrap(), )); } #[test] #[should_panic(expected = "grammar error --> 1:20 | 1 | a = { !\"a\" } b = { a* } | ^^ | = expression inside repetition is non-progressing and will repeat infinitely")] fn indirect_non_progressing_repetition() { let input = "a = { !\"a\" } b = { a* }"; unwrap_or_report(consume_rules( PestParser::parse(Rule::grammar_rules, input).unwrap(), )); } #[test] #[should_panic(expected = "grammar error --> 1:7 | 1 | a = { a } | ^ | = rule a is left-recursive (a -> a); pest::pratt_parser might be useful in this case")] fn simple_left_recursion() { let input = "a = { a }"; unwrap_or_report(consume_rules( PestParser::parse(Rule::grammar_rules, input).unwrap(), )); } #[test] #[should_panic(expected = "grammar error --> 1:7 | 1 | a = { b } b = { a } | ^ | = rule b is left-recursive (b -> a -> b); pest::pratt_parser might be useful in this case --> 1:17 | 1 | a = { b } b = { a } | ^ | = rule a is left-recursive (a -> b -> a); pest::pratt_parser might be useful in this case")] fn indirect_left_recursion() { let input = "a = { b } b = { a }"; unwrap_or_report(consume_rules( PestParser::parse(Rule::grammar_rules, input).unwrap(), )); } #[test] #[should_panic(expected = "grammar error --> 1:39 | 1 | a = { \"\" ~ \"a\"? ~ \"a\"* ~ (\"a\" | \"\") ~ a } | ^ | = rule a is left-recursive (a -> a); pest::pratt_parser might be useful in this case")] fn non_failing_left_recursion() { let input = "a = { \"\" ~ \"a\"? ~ \"a\"* ~ (\"a\" | \"\") ~ a }"; unwrap_or_report(consume_rules( PestParser::parse(Rule::grammar_rules, input).unwrap(), )); } #[test] #[should_panic(expected = "grammar error --> 1:13 | 1 | a = { \"a\" | a } | ^ | = rule a is left-recursive (a -> a); pest::pratt_parser might be useful in this case")] fn non_primary_choice_left_recursion() { let input = "a = { \"a\" | a }"; unwrap_or_report(consume_rules( PestParser::parse(Rule::grammar_rules, input).unwrap(), )); } #[test] #[should_panic(expected = "grammar error --> 1:7 | 1 | a = { \"a\"* | \"a\" | \"b\" } | ^--^ | = expression cannot fail; following choices cannot be reached")] fn lhs_non_failing_choice() { let input = "a = { \"a\"* | \"a\" | \"b\" }"; unwrap_or_report(consume_rules( PestParser::parse(Rule::grammar_rules, input).unwrap(), )); } #[test] #[should_panic(expected = "grammar error --> 1:13 | 1 | a = { \"a\" | \"a\"* | \"b\" } | ^--^ | = expression cannot fail; following choices cannot be reached")] fn lhs_non_failing_choice_middle() { let input = "a = { \"a\" | \"a\"* | \"b\" }"; unwrap_or_report(consume_rules( PestParser::parse(Rule::grammar_rules, input).unwrap(), )); } #[test] #[should_panic(expected = "grammar error --> 1:7 | 1 | a = { b | \"a\" } b = { \"b\"* | \"c\" } | ^ | = expression cannot fail; following choices cannot be reached --> 1:23 | 1 | a = { b | \"a\" } b = { \"b\"* | \"c\" } | ^--^ | = expression cannot fail; following choices cannot be reached")] fn lhs_non_failing_nested_choices() { let input = "a = { b | \"a\" } b = { \"b\"* | \"c\" }"; unwrap_or_report(consume_rules( PestParser::parse(Rule::grammar_rules, input).unwrap(), )); } #[test] fn skip_can_be_defined() { let input = "skip = { \"\" }"; unwrap_or_report(consume_rules( PestParser::parse(Rule::grammar_rules, input).unwrap(), )); } }