// (C) Copyright 2016 Jethro G. Beekman
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! Parsing C literals from byte slices.
//!
//! This will parse a representation of a C literal into a Rust type.
//!
//! # characters
//! Character literals are stored into the `CChar` type, which can hold values
//! that are not valid Unicode code points. ASCII characters are represented as
//! `char`, literal bytes with the high bit set are converted into the raw
//! representation. Escape sequences are supported. If hex and octal escapes
//! map to an ASCII character, that is used, otherwise, the raw encoding is
//! used, including for values over 255. Unicode escapes are checked for
//! validity and mapped to `char`. Character sequences are not supported. Width
//! prefixes are ignored.
//!
//! # strings
//! Strings are interpreted as byte vectors. Escape sequences are supported. If
//! hex and octal escapes map onto multi-byte characters, they are truncated to
//! one 8-bit character. Unicode escapes are converted into their UTF-8
//! encoding. Width prefixes are ignored.
//!
//! # integers
//! Integers are read into `i64`. Binary, octal, decimal and hexadecimal are
//! all supported. If the literal value is between `i64::MAX` and `u64::MAX`,
//! it is bit-cast to `i64`. Values over `u64::MAX` cannot be parsed. Width and
//! sign suffixes are ignored. Sign prefixes are not supported.
//!
//! # real numbers
//! Reals are read into `f64`. Width suffixes are ignored. Sign prefixes are
//! not supported in the significand. Hexadecimal floating points are not
//! supported.
use std::char; use std::str::{self, FromStr}; use nom::branch::alt; use nom::bytes::complete::is_not; use nom::bytes::complete::tag; use nom::character::complete::{char, one_of}; use nom::combinator::{complete, map, map_opt, opt, recognize}; use nom::multi::{fold_many0, many0, many1, many_m_n}; use nom::sequence::{delimited, pair, preceded, terminated, tuple}; use nom::*; use crate::expr::EvalResult; use crate::ToCexprResult; #[derive(Debug, Copy, Clone, PartialEq, Eq)] /// Representation of a C character pub enum CChar { /// A character that can be represented as a `char` Char(char), /// Any other character (8-bit characters, unicode surrogates, etc.) Raw(u64), } impl From for CChar { fn from(i: u8) -> CChar { match i { 0..=0x7f => CChar::Char(i as u8 as char), _ => CChar::Raw(i as u64), } } } // A non-allocating version of this would be nice... impl std::convert::Into> for CChar { fn into(self) -> Vec { match self { CChar::Char(c) => { let mut s = String::with_capacity(4); s.extend(&[c]); s.into_bytes() } CChar::Raw(i) => { let mut v = Vec::with_capacity(1); v.push(i as u8); v } } } } /// ensures the child parser consumes the whole input pub fn full( f: F, ) -> impl Fn(I) -> nom::IResult where I: nom::InputLength, F: Fn(I) -> nom::IResult, { move |input| { let res = f(input); match res { Ok((i, o)) => { if i.input_len() == 0 { Ok((i, o)) } else { Err(nom::Err::Error(nom::error::Error::new(i, nom::error::ErrorKind::Complete))) } } r => r, } } } // ================================= // ======== matching digits ======== // ================================= macro_rules! 
byte { ($($p: pat)|* ) => {{ fn parser(i: &[u8]) -> crate::nom::IResult<&[u8], u8> { match i.split_first() { $(Some((&c @ $p,rest)))|* => Ok((rest,c)), Some(_) => Err(nom::Err::Error(nom::error::Error::new(i, nom::error::ErrorKind::OneOf))), None => Err(nom::Err::Incomplete(Needed::new(1))), } } parser }} } fn binary(i: &[u8]) -> nom::IResult<&[u8], u8> { byte!(b'0'..=b'1')(i) } fn octal(i: &[u8]) -> nom::IResult<&[u8], u8> { byte!(b'0'..=b'7')(i) } fn decimal(i: &[u8]) -> nom::IResult<&[u8], u8> { byte!(b'0'..=b'9')(i) } fn hexadecimal(i: &[u8]) -> nom::IResult<&[u8], u8> { byte!(b'0' ..= b'9' | b'a' ..= b'f' | b'A' ..= b'F')(i) } // ======================================== // ======== characters and strings ======== // ======================================== fn escape2char(c: char) -> CChar { CChar::Char(match c { 'a' => '\x07', 'b' => '\x08', 'f' => '\x0c', 'n' => '\n', 'r' => '\r', 't' => '\t', 'v' => '\x0b', _ => unreachable!("invalid escape {}", c), }) } fn c_raw_escape(n: Vec, radix: u32) -> Option { str::from_utf8(&n) .ok() .and_then(|i| u64::from_str_radix(i, radix).ok()) .map(|i| match i { 0..=0x7f => CChar::Char(i as u8 as char), _ => CChar::Raw(i), }) } fn c_unicode_escape(n: Vec) -> Option { str::from_utf8(&n) .ok() .and_then(|i| u32::from_str_radix(i, 16).ok()) .and_then(char::from_u32) .map(CChar::Char) } fn escaped_char(i: &[u8]) -> nom::IResult<&[u8], CChar> { preceded( char('\\'), alt(( map(one_of(r#"'"?\"#), CChar::Char), map(one_of("abfnrtv"), escape2char), map_opt(many_m_n(1, 3, octal), |v| c_raw_escape(v, 8)), map_opt(preceded(char('x'), many1(hexadecimal)), |v| { c_raw_escape(v, 16) }), map_opt( preceded(char('u'), many_m_n(4, 4, hexadecimal)), c_unicode_escape, ), map_opt( preceded(char('U'), many_m_n(8, 8, hexadecimal)), c_unicode_escape, ), )), )(i) } fn c_width_prefix(i: &[u8]) -> nom::IResult<&[u8], &[u8]> { alt((tag("u8"), tag("u"), tag("U"), tag("L")))(i) } fn c_char(i: &[u8]) -> nom::IResult<&[u8], CChar> { delimited( 
terminated(opt(c_width_prefix), char('\'')), alt(( escaped_char, map(byte!(0 ..= 91 /* \=92 */ | 93 ..= 255), CChar::from), )), char('\''), )(i) } fn c_string(i: &[u8]) -> nom::IResult<&[u8], Vec> { delimited( alt((preceded(c_width_prefix, char('"')), char('"'))), fold_many0( alt(( map(escaped_char, |c: CChar| c.into()), map(is_not([b'\\', b'"']), |c: &[u8]| c.into()), )), Vec::new, |mut v: Vec, res: Vec| { v.extend_from_slice(&res); v }, ), char('"'), )(i) } // ================================ // ======== parse integers ======== // ================================ fn c_int_radix(n: Vec, radix: u32) -> Option { str::from_utf8(&n) .ok() .and_then(|i| u64::from_str_radix(i, radix).ok()) } fn take_ul(input: &[u8]) -> IResult<&[u8], &[u8]> { let r = input.split_at_position(|c| c != b'u' && c != b'U' && c != b'l' && c != b'L'); match r { Err(Err::Incomplete(_)) => Ok((&input[input.len()..], input)), res => res, } } fn c_int(i: &[u8]) -> nom::IResult<&[u8], i64> { map( terminated( alt(( map_opt(preceded(tag("0x"), many1(complete(hexadecimal))), |v| { c_int_radix(v, 16) }), map_opt(preceded(tag("0X"), many1(complete(hexadecimal))), |v| { c_int_radix(v, 16) }), map_opt(preceded(tag("0b"), many1(complete(binary))), |v| { c_int_radix(v, 2) }), map_opt(preceded(tag("0B"), many1(complete(binary))), |v| { c_int_radix(v, 2) }), map_opt(preceded(char('0'), many1(complete(octal))), |v| { c_int_radix(v, 8) }), map_opt(many1(complete(decimal)), |v| c_int_radix(v, 10)), |input| Err(crate::nom::Err::Error(nom::error::Error::new(input, crate::nom::ErrorKind::Fix))), )), opt(take_ul), ), |i| i as i64, )(i) } // ============================== // ======== parse floats ======== // ============================== fn float_width(i: &[u8]) -> nom::IResult<&[u8], u8> { nom::combinator::complete(byte!(b'f' | b'l' | b'F' | b'L'))(i) } fn float_exp(i: &[u8]) -> nom::IResult<&[u8], (Option, Vec)> { preceded( byte!(b'e' | b'E'), pair(opt(byte!(b'-' | b'+')), many1(complete(decimal))), )(i) } fn 
c_float(i: &[u8]) -> nom::IResult<&[u8], f64> { map_opt( alt(( terminated( recognize(tuple(( many1(complete(decimal)), byte!(b'.'), many0(complete(decimal)), ))), opt(float_width), ), terminated( recognize(tuple(( many0(complete(decimal)), byte!(b'.'), many1(complete(decimal)), ))), opt(float_width), ), terminated( recognize(tuple(( many0(complete(decimal)), opt(byte!(b'.')), many1(complete(decimal)), float_exp, ))), opt(float_width), ), terminated( recognize(tuple(( many1(complete(decimal)), opt(byte!(b'.')), many0(complete(decimal)), float_exp, ))), opt(float_width), ), terminated(recognize(many1(complete(decimal))), float_width), )), |v| str::from_utf8(v).ok().and_then(|i| f64::from_str(i).ok()), )(i) } // ================================ // ======== main interface ======== // ================================ fn one_literal(input: &[u8]) -> nom::IResult<&[u8], EvalResult, crate::Error<&[u8]>> { alt(( map(full(c_char), EvalResult::Char), map(full(c_int), |i| EvalResult::Int(::std::num::Wrapping(i))), map(full(c_float), EvalResult::Float), map(full(c_string), EvalResult::Str), ))(input) .to_cexpr_result() } /// Parse a C literal. /// /// The input must contain exactly the representation of a single literal /// token, and in particular no whitespace or sign prefixes. pub fn parse(input: &[u8]) -> IResult<&[u8], EvalResult, crate::Error<&[u8]>> { crate::assert_full_parse(one_literal(input)) }