#[allow(unused, deprecated)] use std::ascii::AsciiExt; use std::error::Error; use std::fmt; use std::iter::Enumerate; use std::str::Bytes; use super::{Mime, Source, ParamSource, Indexed, CHARSET, UTF_8}; #[derive(Debug)] pub enum ParseError { MissingSlash, MissingEqual, MissingQuote, InvalidToken { pos: usize, byte: u8, }, } impl ParseError { fn s(&self) -> &str { use self::ParseError::*; match *self { MissingSlash => "a slash (/) was missing between the type and subtype", MissingEqual => "an equals sign (=) was missing between a parameter and its value", MissingQuote => "a quote (\") was missing from a parameter value", InvalidToken { .. } => "an invalid token was encountered", } } } impl fmt::Display for ParseError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { if let ParseError::InvalidToken { pos, byte } = *self { write!(f, "{}, {:X} at position {}", self.s(), byte, pos) } else { f.write_str(self.s()) } } } impl Error for ParseError { // Minimum Rust is 1.15, Error::description was still required then #[allow(deprecated)] fn description(&self) -> &str { self.s() } } pub fn parse(s: &str) -> Result { if s == "*/*" { return Ok(::STAR_STAR); } let mut iter = s.bytes().enumerate(); // toplevel let mut start; let slash; loop { match iter.next() { Some((_, c)) if is_token(c) => (), Some((i, b'/')) if i > 0 => { slash = i; start = i + 1; break; }, None => return Err(ParseError::MissingSlash), // EOF and no toplevel is no Mime Some((pos, byte)) => return Err(ParseError::InvalidToken { pos: pos, byte: byte, }) }; } // sublevel let mut plus = None; loop { match iter.next() { Some((i, b'+')) if i > start => { plus = Some(i); }, Some((i, b';')) if i > start => { start = i; break; }, Some((_, c)) if is_token(c) => (), None => { return Ok(Mime { source: Source::Dynamic(s.to_ascii_lowercase()), slash: slash, plus: plus, params: ParamSource::None, }); }, Some((pos, byte)) => return Err(ParseError::InvalidToken { pos: pos, byte: byte, }) }; } // params let params = params_from_str(s, &mut iter, start)?; let src = match params { ParamSource::Utf8(_) => s.to_ascii_lowercase(), ParamSource::Custom(semicolon, ref indices) => lower_ascii_with_params(s, semicolon, indices), ParamSource::None => { // Chop off the empty list s[..start].to_ascii_lowercase() } }; Ok(Mime { source: Source::Dynamic(src), slash: slash, plus: plus, params: params, }) } fn params_from_str(s: &str, iter: &mut Enumerate, mut start: usize) -> Result { let semicolon = start; start += 1; let mut params = ParamSource::None; 'params: while start < s.len() { let name; // name 'name: loop { match iter.next() { Some((i, b' ')) if i == start => { start = i + 1; continue 'params; }, Some((_, c)) if is_token(c) => (), Some((i, b'=')) if i > start => { name = Indexed(start, i); start = i + 1; break 'name; }, None => return Err(ParseError::MissingEqual), Some((pos, byte)) => return Err(ParseError::InvalidToken { pos: pos, byte: byte, }), } } let value; // values must be restrict-name-char or "anything goes" let mut is_quoted = false; 'value: loop { if is_quoted { match iter.next() { Some((i, b'"')) if i > start => { value = Indexed(start, i); break 'value; }, Some((_, c)) if is_restricted_quoted_char(c) => (), None => return Err(ParseError::MissingQuote), Some((pos, byte)) => return Err(ParseError::InvalidToken { pos: pos, byte: byte, }), } } else { match iter.next() { Some((i, b'"')) if i == start => { is_quoted = true; start = i + 1; }, Some((_, c)) if is_token(c) => (), Some((i, b';')) if i > start => { value = Indexed(start, i); start = i + 1; break 'value; } None => { value = Indexed(start, s.len()); start = s.len(); break 'value; }, Some((pos, byte)) => return Err(ParseError::InvalidToken { pos: pos, byte: byte, }), } } } if is_quoted { 'ws: loop { match iter.next() { Some((i, b';')) => { // next param start = i + 1; break 'ws; }, Some((_, b' ')) => { // skip whitespace }, None => { // eof start = s.len(); break 'ws; }, Some((pos, byte)) => return Err(ParseError::InvalidToken { pos: pos, byte: byte, }), } } } match params { ParamSource::Utf8(i) => { let i = i + 2; let charset = Indexed(i, "charset".len() + i); let utf8 = Indexed(charset.1 + 1, charset.1 + "utf-8".len() + 1); params = ParamSource::Custom(semicolon, vec![ (charset, utf8), (name, value), ]); }, ParamSource::Custom(_, ref mut vec) => { vec.push((name, value)); }, ParamSource::None => { if semicolon + 2 == name.0 && CHARSET == &s[name.0..name.1] { if UTF_8 == &s[value.0..value.1] { params = ParamSource::Utf8(semicolon); continue 'params; } } params = ParamSource::Custom(semicolon, vec![(name, value)]); }, } } Ok(params) } fn lower_ascii_with_params(s: &str, semi: usize, params: &[(Indexed, Indexed)]) -> String { let mut owned = s.to_owned(); owned[..semi].make_ascii_lowercase(); for &(ref name, ref value) in params { owned[name.0..name.1].make_ascii_lowercase(); // Since we just converted this part of the string to lowercase, // we can skip the `Name == &str` unicase check and do a faster // memcmp instead. if &owned[name.0..name.1] == CHARSET.source { owned[value.0..value.1].make_ascii_lowercase(); } } owned } // From [RFC6838](http://tools.ietf.org/html/rfc6838#section-4.2): // // > All registered media types MUST be assigned top-level type and // > subtype names. The combination of these names serves to uniquely // > identify the media type, and the subtype name facet (or the absence // > of one) identifies the registration tree. Both top-level type and // > subtype names are case-insensitive. // > // > Type and subtype names MUST conform to the following ABNF: // > // > type-name = restricted-name // > subtype-name = restricted-name // > // > restricted-name = restricted-name-first *126restricted-name-chars // > restricted-name-first = ALPHA / DIGIT // > restricted-name-chars = ALPHA / DIGIT / "!" / "#" / // > "$" / "&" / "-" / "^" / "_" // > restricted-name-chars =/ "." ; Characters before first dot always // > ; specify a facet name // > restricted-name-chars =/ "+" ; Characters after last plus always // > ; specify a structured syntax suffix // However, [HTTP](https://tools.ietf.org/html/rfc7231#section-3.1.1.1): // // > media-type = type "/" subtype *( OWS ";" OWS parameter ) // > type = token // > subtype = token // > parameter = token "=" ( token / quoted-string ) // // Where token is defined as: // // > token = 1*tchar // > tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*" / "+" / "-" / "." / // > "^" / "_" / "`" / "|" / "~" / DIGIT / ALPHA // // So, clearly, ¯\_(Ä_/¯ macro_rules! byte_map { ($($flag:expr,)*) => ([ $($flag != 0,)* ]) } static TOKEN_MAP: [bool; 256] = byte_map![ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]; fn is_token(c: u8) -> bool { TOKEN_MAP[c as usize] } fn is_restricted_quoted_char(c: u8) -> bool { c > 31 && c != 127 } #[test] #[allow(warnings)] // ... ranges deprecated fn test_lookup_tables() { for (i, &valid) in TOKEN_MAP.iter().enumerate() { let i = i as u8; let should = match i { b'a'...b'z' | b'A'...b'Z' | b'0'...b'9' | b'!' | b'#' | b'$' | b'%' | b'&' | b'\'' | b'*' | b'+' | b'-' | b'.' | b'^' | b'_' | b'`' | b'|' | b'~' => true, _ => false }; assert_eq!(valid, should, "{:?} ({}) should be {}", i as char, i, should); } }