diff options
Diffstat (limited to '')
-rw-r--r-- | third_party/rust/cexpr/src/literal.rs | 361 |
1 files changed, 361 insertions, 0 deletions
diff --git a/third_party/rust/cexpr/src/literal.rs b/third_party/rust/cexpr/src/literal.rs new file mode 100644 index 0000000000..68e85c7dad --- /dev/null +++ b/third_party/rust/cexpr/src/literal.rs @@ -0,0 +1,361 @@ +// (C) Copyright 2016 Jethro G. Beekman +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. +//! Parsing C literals from byte slices. +//! +//! This will parse a representation of a C literal into a Rust type. +//! +//! # characters +//! Character literals are stored into the `CChar` type, which can hold values +//! that are not valid Unicode code points. ASCII characters are represented as +//! `char`, literal bytes with the high byte set are converted into the raw +//! representation. Escape sequences are supported. If hex and octal escapes +//! map to an ASCII character, that is used, otherwise, the raw encoding is +//! used, including for values over 255. Unicode escapes are checked for +//! validity and mapped to `char`. Character sequences are not supported. Width +//! prefixes are ignored. +//! +//! # strings +//! Strings are interpreted as byte vectors. Escape sequences are supported. If +//! hex and octal escapes map onto multi-byte characters, they are truncated to +//! one 8-bit character. Unicode escapes are converted into their UTF-8 +//! encoding. Width prefixes are ignored. +//! +//! # integers +//! Integers are read into `i64`. Binary, octal, decimal and hexadecimal are +//! all supported. If the literal value is between `i64::MAX` and `u64::MAX`, +//! it is bit-cast to `i64`. Values over `u64::MAX` cannot be parsed. Width and +//! sign suffixes are ignored. Sign prefixes are not supported. +//! +//! # real numbers +//! Reals are read into `f64`. Width suffixes are ignored. Sign prefixes are +//! not supported in the significand. Hexadecimal floating points are not +//! supported. + +use std::char; +use std::str::{self, FromStr}; + +use nom::branch::alt; +use nom::bytes::complete::is_not; +use nom::bytes::complete::tag; +use nom::character::complete::{char, one_of}; +use nom::combinator::{complete, map, map_opt, opt, recognize}; +use nom::multi::{fold_many0, many0, many1, many_m_n}; +use nom::sequence::{delimited, pair, preceded, terminated, tuple}; +use nom::*; + +use crate::expr::EvalResult; +use crate::ToCexprResult; + +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +/// Representation of a C character +pub enum CChar { + /// A character that can be represented as a `char` + Char(char), + /// Any other character (8-bit characters, unicode surrogates, etc.) + Raw(u64), +} + +impl From<u8> for CChar { + fn from(i: u8) -> CChar { + match i { + 0..=0x7f => CChar::Char(i as u8 as char), + _ => CChar::Raw(i as u64), + } + } +} + +// A non-allocating version of this would be nice... +impl std::convert::Into<Vec<u8>> for CChar { + fn into(self) -> Vec<u8> { + match self { + CChar::Char(c) => { + let mut s = String::with_capacity(4); + s.extend(&[c]); + s.into_bytes() + } + CChar::Raw(i) => { + let mut v = Vec::with_capacity(1); + v.push(i as u8); + v + } + } + } +} + +/// ensures the child parser consumes the whole input +pub fn full<I: Clone, O, F>( + f: F, +) -> impl Fn(I) -> nom::IResult<I, O> +where + I: nom::InputLength, + F: Fn(I) -> nom::IResult<I, O>, +{ + move |input| { + let res = f(input); + match res { + Ok((i, o)) => { + if i.input_len() == 0 { + Ok((i, o)) + } else { + Err(nom::Err::Error(nom::error::Error::new(i, nom::error::ErrorKind::Complete))) + } + } + r => r, + } + } +} + +// ================================= +// ======== matching digits ======== +// ================================= + +macro_rules! byte { + ($($p: pat)|* ) => {{ + fn parser(i: &[u8]) -> crate::nom::IResult<&[u8], u8> { + match i.split_first() { + $(Some((&c @ $p,rest)))|* => Ok((rest,c)), + Some(_) => Err(nom::Err::Error(nom::error::Error::new(i, nom::error::ErrorKind::OneOf))), + None => Err(nom::Err::Incomplete(Needed::new(1))), + } + } + + parser + }} +} + +fn binary(i: &[u8]) -> nom::IResult<&[u8], u8> { + byte!(b'0'..=b'1')(i) +} + +fn octal(i: &[u8]) -> nom::IResult<&[u8], u8> { + byte!(b'0'..=b'7')(i) +} + +fn decimal(i: &[u8]) -> nom::IResult<&[u8], u8> { + byte!(b'0'..=b'9')(i) +} + +fn hexadecimal(i: &[u8]) -> nom::IResult<&[u8], u8> { + byte!(b'0' ..= b'9' | b'a' ..= b'f' | b'A' ..= b'F')(i) +} + +// ======================================== +// ======== characters and strings ======== +// ======================================== + +fn escape2char(c: char) -> CChar { + CChar::Char(match c { + 'a' => '\x07', + 'b' => '\x08', + 'f' => '\x0c', + 'n' => '\n', + 'r' => '\r', + 't' => '\t', + 'v' => '\x0b', + _ => unreachable!("invalid escape {}", c), + }) +} + +fn c_raw_escape(n: Vec<u8>, radix: u32) -> Option<CChar> { + str::from_utf8(&n) + .ok() + .and_then(|i| u64::from_str_radix(i, radix).ok()) + .map(|i| match i { + 0..=0x7f => CChar::Char(i as u8 as char), + _ => CChar::Raw(i), + }) +} + +fn c_unicode_escape(n: Vec<u8>) -> Option<CChar> { + str::from_utf8(&n) + .ok() + .and_then(|i| u32::from_str_radix(i, 16).ok()) + .and_then(char::from_u32) + .map(CChar::Char) +} + +fn escaped_char(i: &[u8]) -> nom::IResult<&[u8], CChar> { + preceded( + char('\\'), + alt(( + map(one_of(r#"'"?\"#), CChar::Char), + map(one_of("abfnrtv"), escape2char), + map_opt(many_m_n(1, 3, octal), |v| c_raw_escape(v, 8)), + map_opt(preceded(char('x'), many1(hexadecimal)), |v| { + c_raw_escape(v, 16) + }), + map_opt( + preceded(char('u'), many_m_n(4, 4, hexadecimal)), + c_unicode_escape, + ), + map_opt( + preceded(char('U'), many_m_n(8, 8, hexadecimal)), + c_unicode_escape, + ), + )), + )(i) +} + +fn c_width_prefix(i: &[u8]) -> nom::IResult<&[u8], &[u8]> { + alt((tag("u8"), tag("u"), tag("U"), tag("L")))(i) +} + +fn c_char(i: &[u8]) -> nom::IResult<&[u8], CChar> { + delimited( + terminated(opt(c_width_prefix), char('\'')), + alt(( + escaped_char, + map(byte!(0 ..= 91 /* \=92 */ | 93 ..= 255), CChar::from), + )), + char('\''), + )(i) +} + +fn c_string(i: &[u8]) -> nom::IResult<&[u8], Vec<u8>> { + delimited( + alt((preceded(c_width_prefix, char('"')), char('"'))), + fold_many0( + alt(( + map(escaped_char, |c: CChar| c.into()), + map(is_not([b'\\', b'"']), |c: &[u8]| c.into()), + )), + Vec::new, + |mut v: Vec<u8>, res: Vec<u8>| { + v.extend_from_slice(&res); + v + }, + ), + char('"'), + )(i) +} + +// ================================ +// ======== parse integers ======== +// ================================ + +fn c_int_radix(n: Vec<u8>, radix: u32) -> Option<u64> { + str::from_utf8(&n) + .ok() + .and_then(|i| u64::from_str_radix(i, radix).ok()) +} + +fn take_ul(input: &[u8]) -> IResult<&[u8], &[u8]> { + let r = input.split_at_position(|c| c != b'u' && c != b'U' && c != b'l' && c != b'L'); + match r { + Err(Err::Incomplete(_)) => Ok((&input[input.len()..], input)), + res => res, + } +} + +fn c_int(i: &[u8]) -> nom::IResult<&[u8], i64> { + map( + terminated( + alt(( + map_opt(preceded(tag("0x"), many1(complete(hexadecimal))), |v| { + c_int_radix(v, 16) + }), + map_opt(preceded(tag("0X"), many1(complete(hexadecimal))), |v| { + c_int_radix(v, 16) + }), + map_opt(preceded(tag("0b"), many1(complete(binary))), |v| { + c_int_radix(v, 2) + }), + map_opt(preceded(tag("0B"), many1(complete(binary))), |v| { + c_int_radix(v, 2) + }), + map_opt(preceded(char('0'), many1(complete(octal))), |v| { + c_int_radix(v, 8) + }), + map_opt(many1(complete(decimal)), |v| c_int_radix(v, 10)), + |input| Err(crate::nom::Err::Error(nom::error::Error::new(input, crate::nom::ErrorKind::Fix))), + )), + opt(take_ul), + ), + |i| i as i64, + )(i) +} + +// ============================== +// ======== parse floats ======== +// ============================== + +fn float_width(i: &[u8]) -> nom::IResult<&[u8], u8> { + nom::combinator::complete(byte!(b'f' | b'l' | b'F' | b'L'))(i) +} + +fn float_exp(i: &[u8]) -> nom::IResult<&[u8], (Option<u8>, Vec<u8>)> { + preceded( + byte!(b'e' | b'E'), + pair(opt(byte!(b'-' | b'+')), many1(complete(decimal))), + )(i) +} + +fn c_float(i: &[u8]) -> nom::IResult<&[u8], f64> { + map_opt( + alt(( + terminated( + recognize(tuple(( + many1(complete(decimal)), + byte!(b'.'), + many0(complete(decimal)), + ))), + opt(float_width), + ), + terminated( + recognize(tuple(( + many0(complete(decimal)), + byte!(b'.'), + many1(complete(decimal)), + ))), + opt(float_width), + ), + terminated( + recognize(tuple(( + many0(complete(decimal)), + opt(byte!(b'.')), + many1(complete(decimal)), + float_exp, + ))), + opt(float_width), + ), + terminated( + recognize(tuple(( + many1(complete(decimal)), + opt(byte!(b'.')), + many0(complete(decimal)), + float_exp, + ))), + opt(float_width), + ), + terminated(recognize(many1(complete(decimal))), float_width), + )), + |v| str::from_utf8(v).ok().and_then(|i| f64::from_str(i).ok()), + )(i) +} + +// ================================ +// ======== main interface ======== +// ================================ + +fn one_literal(input: &[u8]) -> nom::IResult<&[u8], EvalResult, crate::Error<&[u8]>> { + alt(( + map(full(c_char), EvalResult::Char), + map(full(c_int), |i| EvalResult::Int(::std::num::Wrapping(i))), + map(full(c_float), EvalResult::Float), + map(full(c_string), EvalResult::Str), + ))(input) + .to_cexpr_result() +} + +/// Parse a C literal. +/// +/// The input must contain exactly the representation of a single literal +/// token, and in particular no whitespace or sign prefixes. +pub fn parse(input: &[u8]) -> IResult<&[u8], EvalResult, crate::Error<&[u8]>> { + crate::assert_full_parse(one_literal(input)) +} |