diff options
Diffstat (limited to 'compiler/rustc_lexer')
-rw-r--r-- | compiler/rustc_lexer/Cargo.toml | 1 | ||||
-rw-r--r-- | compiler/rustc_lexer/src/cursor.rs | 16 | ||||
-rw-r--r-- | compiler/rustc_lexer/src/lib.rs | 76 | ||||
-rw-r--r-- | compiler/rustc_lexer/src/unescape.rs | 27 |
4 files changed, 64 insertions, 56 deletions
diff --git a/compiler/rustc_lexer/Cargo.toml b/compiler/rustc_lexer/Cargo.toml index 35af11053..ad685c2ad 100644 --- a/compiler/rustc_lexer/Cargo.toml +++ b/compiler/rustc_lexer/Cargo.toml @@ -12,7 +12,6 @@ Rust lexer used by rustc. No stability guarantees are provided. # Note: do not remove this blank `[lib]` section. # This will be used when publishing this crate as `rustc-ap-rustc_lexer`. [lib] -doctest = false # Note that this crate purposefully does not depend on other rustc crates [dependencies] diff --git a/compiler/rustc_lexer/src/cursor.rs b/compiler/rustc_lexer/src/cursor.rs index 21557a9c8..eceef5980 100644 --- a/compiler/rustc_lexer/src/cursor.rs +++ b/compiler/rustc_lexer/src/cursor.rs @@ -4,8 +4,8 @@ use std::str::Chars; /// /// Next characters can be peeked via `first` method, /// and position can be shifted forward via `bump` method. -pub(crate) struct Cursor<'a> { - initial_len: usize, +pub struct Cursor<'a> { + len_remaining: usize, /// Iterator over chars. Slightly faster than a &str. chars: Chars<'a>, #[cfg(debug_assertions)] @@ -15,9 +15,9 @@ pub(crate) struct Cursor<'a> { pub(crate) const EOF_CHAR: char = '\0'; impl<'a> Cursor<'a> { - pub(crate) fn new(input: &'a str) -> Cursor<'a> { + pub fn new(input: &'a str) -> Cursor<'a> { Cursor { - initial_len: input.len(), + len_remaining: input.len(), chars: input.chars(), #[cfg(debug_assertions)] prev: EOF_CHAR, @@ -61,13 +61,13 @@ impl<'a> Cursor<'a> { } /// Returns amount of already consumed symbols. - pub(crate) fn len_consumed(&self) -> u32 { - (self.initial_len - self.chars.as_str().len()) as u32 + pub(crate) fn pos_within_token(&self) -> u32 { + (self.len_remaining - self.chars.as_str().len()) as u32 } /// Resets the number of bytes consumed to 0. - pub(crate) fn reset_len_consumed(&mut self) { - self.initial_len = self.chars.as_str().len(); + pub(crate) fn reset_pos_within_token(&mut self) { + self.len_remaining = self.chars.as_str().len(); } /// Moves to the next character. diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs index a79c98264..51515976e 100644 --- a/compiler/rustc_lexer/src/lib.rs +++ b/compiler/rustc_lexer/src/lib.rs @@ -29,9 +29,11 @@ pub mod unescape; #[cfg(test)] mod tests; +pub use crate::cursor::Cursor; + use self::LiteralKind::*; use self::TokenKind::*; -use crate::cursor::{Cursor, EOF_CHAR}; +use crate::cursor::EOF_CHAR; use std::convert::TryFrom; /// Parsed token. @@ -55,29 +57,42 @@ pub enum TokenKind { // Multi-char tokens: /// "// comment" LineComment { doc_style: Option<DocStyle> }, + /// `/* block comment */` /// - /// Block comments can be recursive, so the sequence like `/* /* */` + /// Block comments can be recursive, so a sequence like `/* /* */` /// will not be considered terminated and will result in a parsing error. BlockComment { doc_style: Option<DocStyle>, terminated: bool }, - /// Any whitespace characters sequence. + + /// Any whitespace character sequence. Whitespace, + /// "ident" or "continue" - /// At this step keywords are also considered identifiers. + /// + /// At this step, keywords are also considered identifiers. Ident, + /// Like the above, but containing invalid unicode codepoints. InvalidIdent, + /// "r#ident" RawIdent, - /// An unknown prefix like `foo#`, `foo'`, `foo"`. Note that only the + + /// An unknown prefix, like `foo#`, `foo'`, `foo"`. + /// + /// Note that only the /// prefix (`foo`) is included in the token, not the separator (which is /// lexed as its own distinct token). In Rust 2021 and later, reserved /// prefixes are reported as errors; in earlier editions, they result in a /// (allowed by default) lint, and are treated as regular identifier /// tokens. UnknownPrefix, - /// "12_u8", "1.0e-40", "b"123"". See `LiteralKind` for more details. + + /// Examples: `"12_u8"`, `"1.0e-40"`, `b"123`. + /// + /// See [LiteralKind] for more details. Literal { kind: LiteralKind, suffix_start: u32 }, + /// "'a" Lifetime { starts_with_number: bool }, @@ -139,6 +154,9 @@ pub enum TokenKind { /// Unknown token, not expected by the lexer, e.g. "№" Unknown, + + /// End of input. + Eof, } #[derive(Clone, Copy, Debug, PartialEq, Eq)] @@ -219,13 +237,6 @@ pub fn strip_shebang(input: &str) -> Option<usize> { None } -/// Parses the first token from the provided input string. -#[inline] -pub fn first_token(input: &str) -> Token { - debug_assert!(!input.is_empty()); - Cursor::new(input).advance_token() -} - /// Validates a raw string literal. Used for getting more information about a /// problem with a `RawStr`/`RawByteStr` with a `None` field. #[inline] @@ -243,12 +254,8 @@ pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError> pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ { let mut cursor = Cursor::new(input); std::iter::from_fn(move || { - if cursor.is_eof() { - None - } else { - cursor.reset_len_consumed(); - Some(cursor.advance_token()) - } + let token = cursor.advance_token(); + if token.kind != TokenKind::Eof { Some(token) } else { None } }) } @@ -311,8 +318,11 @@ pub fn is_ident(string: &str) -> bool { impl Cursor<'_> { /// Parses a token from the input string. - fn advance_token(&mut self) -> Token { - let first_char = self.bump().unwrap(); + pub fn advance_token(&mut self) -> Token { + let first_char = match self.bump() { + Some(c) => c, + None => return Token::new(TokenKind::Eof, 0), + }; let token_kind = match first_char { // Slash, comment or block comment. '/' => match self.first() { @@ -329,7 +339,7 @@ impl Cursor<'_> { ('#', c1) if is_id_start(c1) => self.raw_ident(), ('#', _) | ('"', _) => { let res = self.raw_double_quoted_string(1); - let suffix_start = self.len_consumed(); + let suffix_start = self.pos_within_token(); if res.is_ok() { self.eat_literal_suffix(); } @@ -344,7 +354,7 @@ impl Cursor<'_> { ('\'', _) => { self.bump(); let terminated = self.single_quoted_string(); - let suffix_start = self.len_consumed(); + let suffix_start = self.pos_within_token(); if terminated { self.eat_literal_suffix(); } @@ -354,7 +364,7 @@ impl Cursor<'_> { ('"', _) => { self.bump(); let terminated = self.double_quoted_string(); - let suffix_start = self.len_consumed(); + let suffix_start = self.pos_within_token(); if terminated { self.eat_literal_suffix(); } @@ -364,7 +374,7 @@ impl Cursor<'_> { ('r', '"') | ('r', '#') => { self.bump(); let res = self.raw_double_quoted_string(2); - let suffix_start = self.len_consumed(); + let suffix_start = self.pos_within_token(); if res.is_ok() { self.eat_literal_suffix(); } @@ -381,7 +391,7 @@ impl Cursor<'_> { // Numeric literal. c @ '0'..='9' => { let literal_kind = self.number(c); - let suffix_start = self.len_consumed(); + let suffix_start = self.pos_within_token(); self.eat_literal_suffix(); TokenKind::Literal { kind: literal_kind, suffix_start } } @@ -420,7 +430,7 @@ impl Cursor<'_> { // String literal. '"' => { let terminated = self.double_quoted_string(); - let suffix_start = self.len_consumed(); + let suffix_start = self.pos_within_token(); if terminated { self.eat_literal_suffix(); } @@ -433,7 +443,9 @@ impl Cursor<'_> { } _ => Unknown, }; - Token::new(token_kind, self.len_consumed()) + let res = Token::new(token_kind, self.pos_within_token()); + self.reset_pos_within_token(); + res } fn line_comment(&mut self) -> TokenKind { @@ -618,7 +630,7 @@ impl Cursor<'_> { if !can_be_a_lifetime { let terminated = self.single_quoted_string(); - let suffix_start = self.len_consumed(); + let suffix_start = self.pos_within_token(); if terminated { self.eat_literal_suffix(); } @@ -643,7 +655,7 @@ impl Cursor<'_> { if self.first() == '\'' { self.bump(); let kind = Char { terminated: true }; - Literal { kind, suffix_start: self.len_consumed() } + Literal { kind, suffix_start: self.pos_within_token() } } else { Lifetime { starts_with_number } } @@ -724,7 +736,7 @@ impl Cursor<'_> { fn raw_string_unvalidated(&mut self, prefix_len: u32) -> Result<u32, RawStrError> { debug_assert!(self.prev() == 'r'); - let start_pos = self.len_consumed(); + let start_pos = self.pos_within_token(); let mut possible_terminator_offset = None; let mut max_hashes = 0; @@ -778,7 +790,7 @@ impl Cursor<'_> { // Keep track of possible terminators to give a hint about // where there might be a missing terminator possible_terminator_offset = - Some(self.len_consumed() - start_pos - n_end_hashes + prefix_len); + Some(self.pos_within_token() - start_pos - n_end_hashes + prefix_len); max_hashes = n_end_hashes; } } diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index 3da6bc146..8f64b5f51 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -93,7 +93,7 @@ where // NOTE: Raw strings do not perform any explicit character escaping, here we // only translate CRLF to LF and produce errors on bare CR. Mode::RawStr | Mode::RawByteStr => { - unescape_raw_str_or_byte_str(literal_text, mode, callback) + unescape_raw_str_or_raw_byte_str(literal_text, mode, callback) } } } @@ -105,7 +105,7 @@ pub fn unescape_byte_literal<F>(literal_text: &str, mode: Mode, callback: &mut F where F: FnMut(Range<usize>, Result<u8, EscapeError>), { - assert!(mode.is_bytes()); + debug_assert!(mode.is_bytes()); unescape_literal(literal_text, mode, &mut |range, result| { callback(range, result.map(byte_from_char)); }) @@ -129,7 +129,7 @@ pub fn unescape_byte(literal_text: &str) -> Result<u8, (usize, EscapeError)> { } /// What kind of literal do we parse. -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone, Copy, PartialEq)] pub enum Mode { Char, Str, @@ -140,17 +140,13 @@ pub enum Mode { } impl Mode { - pub fn in_single_quotes(self) -> bool { + pub fn in_double_quotes(self) -> bool { match self { - Mode::Char | Mode::Byte => true, - Mode::Str | Mode::ByteStr | Mode::RawStr | Mode::RawByteStr => false, + Mode::Str | Mode::ByteStr | Mode::RawStr | Mode::RawByteStr => true, + Mode::Char | Mode::Byte => false, } } - pub fn in_double_quotes(self) -> bool { - !self.in_single_quotes() - } - pub fn is_bytes(self) -> bool { match self { Mode::Byte | Mode::ByteStr | Mode::RawByteStr => true, @@ -184,7 +180,7 @@ fn scan_escape(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> { let value = hi * 16 + lo; - // For a byte literal verify that it is within ASCII range. + // For a non-byte literal verify that it is within ASCII range. if !mode.is_bytes() && !is_ascii(value) { return Err(EscapeError::OutOfRangeHexEscape); } @@ -263,6 +259,7 @@ fn ascii_check(first_char: char, mode: Mode) -> Result<char, EscapeError> { } fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> { + debug_assert!(mode == Mode::Char || mode == Mode::Byte); let first_char = chars.next().ok_or(EscapeError::ZeroChars)?; let res = match first_char { '\\' => scan_escape(chars, mode), @@ -282,7 +279,7 @@ fn unescape_str_or_byte_str<F>(src: &str, mode: Mode, callback: &mut F) where F: FnMut(Range<usize>, Result<char, EscapeError>), { - assert!(mode.in_double_quotes()); + debug_assert!(mode == Mode::Str || mode == Mode::ByteStr); let initial_len = src.len(); let mut chars = src.chars(); while let Some(first_char) = chars.next() { @@ -344,11 +341,11 @@ where /// sequence of characters or errors. /// NOTE: Raw strings do not perform any explicit character escaping, here we /// only translate CRLF to LF and produce errors on bare CR. -fn unescape_raw_str_or_byte_str<F>(literal_text: &str, mode: Mode, callback: &mut F) +fn unescape_raw_str_or_raw_byte_str<F>(literal_text: &str, mode: Mode, callback: &mut F) where F: FnMut(Range<usize>, Result<char, EscapeError>), { - assert!(mode.in_double_quotes()); + debug_assert!(mode == Mode::RawStr || mode == Mode::RawByteStr); let initial_len = literal_text.len(); let mut chars = literal_text.chars(); @@ -368,7 +365,7 @@ where fn byte_from_char(c: char) -> u8 { let res = c as u32; - assert!(res <= u8::MAX as u32, "guaranteed because of Mode::ByteStr"); + debug_assert!(res <= u8::MAX as u32, "guaranteed because of Mode::ByteStr"); res as u8 } |