4 files changed, 64 insertions, 56 deletions
diff --git a/compiler/rustc_lexer/Cargo.toml b/compiler/rustc_lexer/Cargo.toml
index 35af11053..ad685c2ad 100644
--- a/compiler/rustc_lexer/Cargo.toml
+++ b/compiler/rustc_lexer/Cargo.toml
@@ -12,7 +12,6 @@ Rust lexer used by rustc. No stability guarantees are provided.
 # Note: do not remove this blank `[lib]` section.
 # This will be used when publishing this crate as `rustc-ap-rustc_lexer`.
 [lib]
-doctest = false
 
 # Note that this crate purposefully does not depend on other rustc crates
 [dependencies]
diff --git a/compiler/rustc_lexer/src/cursor.rs b/compiler/rustc_lexer/src/cursor.rs
index 21557a9c8..eceef5980 100644
--- a/compiler/rustc_lexer/src/cursor.rs
+++ b/compiler/rustc_lexer/src/cursor.rs
@@ -4,8 +4,8 @@ use std::str::Chars;
 ///
 /// Next characters can be peeked via `first` method,
 /// and position can be shifted forward via `bump` method.
-pub(crate) struct Cursor<'a> {
-    initial_len: usize,
+pub struct Cursor<'a> {
+    len_remaining: usize,
     /// Iterator over chars. Slightly faster than a &str.
     chars: Chars<'a>,
     #[cfg(debug_assertions)]
@@ -15,9 +15,9 @@ pub(crate) struct Cursor<'a> {
 pub(crate) const EOF_CHAR: char = '\0';
 
 impl<'a> Cursor<'a> {
-    pub(crate) fn new(input: &'a str) -> Cursor<'a> {
+    pub fn new(input: &'a str) -> Cursor<'a> {
         Cursor {
-            initial_len: input.len(),
+            len_remaining: input.len(),
             chars: input.chars(),
             #[cfg(debug_assertions)]
             prev: EOF_CHAR,
@@ -61,13 +61,13 @@ impl<'a> Cursor<'a> {
     }
 
     /// Returns amount of already consumed symbols.
-    pub(crate) fn len_consumed(&self) -> u32 {
-        (self.initial_len - self.chars.as_str().len()) as u32
+    pub(crate) fn pos_within_token(&self) -> u32 {
+        (self.len_remaining - self.chars.as_str().len()) as u32
     }
 
     /// Resets the number of bytes consumed to 0.
-    pub(crate) fn reset_len_consumed(&mut self) {
-        self.initial_len = self.chars.as_str().len();
+    pub(crate) fn reset_pos_within_token(&mut self) {
+        self.len_remaining = self.chars.as_str().len();
     }
 
     /// Moves to the next character.
diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs
index a79c98264..51515976e 100644
--- a/compiler/rustc_lexer/src/lib.rs
+++ b/compiler/rustc_lexer/src/lib.rs
@@ -29,9 +29,11 @@ pub mod unescape;
 #[cfg(test)]
 mod tests;
 
+pub use crate::cursor::Cursor;
+
 use self::LiteralKind::*;
 use self::TokenKind::*;
-use crate::cursor::{Cursor, EOF_CHAR};
+use crate::cursor::EOF_CHAR;
 use std::convert::TryFrom;
 
 /// Parsed token.
@@ -55,29 +57,42 @@ pub enum TokenKind {
     // Multi-char tokens:
     /// "// comment"
     LineComment { doc_style: Option<DocStyle> },
+
     /// `/* block comment */`
     ///
-    /// Block comments can be recursive, so the sequence like `/* /* */`
+    /// Block comments can be recursive, so a sequence like `/* /* */`
     /// will not be considered terminated and will result in a parsing error.
     BlockComment { doc_style: Option<DocStyle>, terminated: bool },
-    /// Any whitespace characters sequence.
+
+    /// Any whitespace character sequence.
     Whitespace,
+
     /// "ident" or "continue"
-    /// At this step keywords are also considered identifiers.
+    ///
+    /// At this step, keywords are also considered identifiers.
     Ident,
+
     /// Like the above, but containing invalid unicode codepoints.
     InvalidIdent,
+
     /// "r#ident"
     RawIdent,
-    /// An unknown prefix like `foo#`, `foo'`, `foo"`. Note that only the
+
+    /// An unknown prefix, like `foo#`, `foo'`, `foo"`.
+    ///
+    /// Note that only the
     /// prefix (`foo`) is included in the token, not the separator (which is
     /// lexed as its own distinct token). In Rust 2021 and later, reserved
     /// prefixes are reported as errors; in earlier editions, they result in a
     /// (allowed by default) lint, and are treated as regular identifier
     /// tokens.
     UnknownPrefix,
-    /// "12_u8", "1.0e-40", "b"123"". See `LiteralKind` for more details.
+
+    /// Examples: `"12_u8"`, `"1.0e-40"`, `b"123`.
+    ///
+    /// See [LiteralKind] for more details.
     Literal { kind: LiteralKind, suffix_start: u32 },
+
     /// "'a"
     Lifetime { starts_with_number: bool },
 
@@ -139,6 +154,9 @@ pub enum TokenKind {
 
     /// Unknown token, not expected by the lexer, e.g. "№"
     Unknown,
+
+    /// End of input.
+    Eof,
 }
 
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
@@ -219,13 +237,6 @@ pub fn strip_shebang(input: &str) -> Option<usize> {
     None
 }
 
-/// Parses the first token from the provided input string.
-#[inline]
-pub fn first_token(input: &str) -> Token {
-    debug_assert!(!input.is_empty());
-    Cursor::new(input).advance_token()
-}
-
 /// Validates a raw string literal. Used for getting more information about a
 /// problem with a `RawStr`/`RawByteStr` with a `None` field.
 #[inline]
@@ -243,12 +254,8 @@ pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError>
 pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
     let mut cursor = Cursor::new(input);
     std::iter::from_fn(move || {
-        if cursor.is_eof() {
-            None
-        } else {
-            cursor.reset_len_consumed();
-            Some(cursor.advance_token())
-        }
+        let token = cursor.advance_token();
+        if token.kind != TokenKind::Eof { Some(token) } else { None }
     })
 }
 
@@ -311,8 +318,11 @@ pub fn is_ident(string: &str) -> bool {
 
 impl Cursor<'_> {
     /// Parses a token from the input string.
-    fn advance_token(&mut self) -> Token {
-        let first_char = self.bump().unwrap();
+    pub fn advance_token(&mut self) -> Token {
+        let first_char = match self.bump() {
+            Some(c) => c,
+            None => return Token::new(TokenKind::Eof, 0),
+        };
         let token_kind = match first_char {
             // Slash, comment or block comment.
             '/' => match self.first() {
@@ -329,7 +339,7 @@ impl Cursor<'_> {
                 ('#', c1) if is_id_start(c1) => self.raw_ident(),
                 ('#', _) | ('"', _) => {
                     let res = self.raw_double_quoted_string(1);
-                    let suffix_start = self.len_consumed();
+                    let suffix_start = self.pos_within_token();
                     if res.is_ok() {
                         self.eat_literal_suffix();
                     }
@@ -344,7 +354,7 @@ impl Cursor<'_> {
                 ('\'', _) => {
                     self.bump();
                     let terminated = self.single_quoted_string();
-                    let suffix_start = self.len_consumed();
+                    let suffix_start = self.pos_within_token();
                     if terminated {
                         self.eat_literal_suffix();
                     }
@@ -354,7 +364,7 @@ impl Cursor<'_> {
                 ('"', _) => {
                     self.bump();
                     let terminated = self.double_quoted_string();
-                    let suffix_start = self.len_consumed();
+                    let suffix_start = self.pos_within_token();
                     if terminated {
                         self.eat_literal_suffix();
                     }
@@ -364,7 +374,7 @@ impl Cursor<'_> {
                 ('r', '"') | ('r', '#') => {
                     self.bump();
                     let res = self.raw_double_quoted_string(2);
-                    let suffix_start = self.len_consumed();
+                    let suffix_start = self.pos_within_token();
                     if res.is_ok() {
                         self.eat_literal_suffix();
                     }
@@ -381,7 +391,7 @@ impl Cursor<'_> {
             // Numeric literal.
             c @ '0'..='9' => {
                 let literal_kind = self.number(c);
-                let suffix_start = self.len_consumed();
+                let suffix_start = self.pos_within_token();
                 self.eat_literal_suffix();
                 TokenKind::Literal { kind: literal_kind, suffix_start }
             }
@@ -420,7 +430,7 @@ impl Cursor<'_> {
             // String literal.
             '"' => {
                 let terminated = self.double_quoted_string();
-                let suffix_start = self.len_consumed();
+                let suffix_start = self.pos_within_token();
                 if terminated {
                     self.eat_literal_suffix();
                 }
@@ -433,7 +443,9 @@ impl Cursor<'_> {
             }
             _ => Unknown,
         };
-        Token::new(token_kind, self.len_consumed())
+        let res = Token::new(token_kind, self.pos_within_token());
+        self.reset_pos_within_token();
+        res
     }
 
     fn line_comment(&mut self) -> TokenKind {
@@ -618,7 +630,7 @@ impl Cursor<'_> {
 
         if !can_be_a_lifetime {
             let terminated = self.single_quoted_string();
-            let suffix_start = self.len_consumed();
+            let suffix_start = self.pos_within_token();
             if terminated {
                 self.eat_literal_suffix();
             }
@@ -643,7 +655,7 @@ impl Cursor<'_> {
         if self.first() == '\'' {
             self.bump();
             let kind = Char { terminated: true };
-            Literal { kind, suffix_start: self.len_consumed() }
+            Literal { kind, suffix_start: self.pos_within_token() }
         } else {
             Lifetime { starts_with_number }
         }
@@ -724,7 +736,7 @@ impl Cursor<'_> {
 
     fn raw_string_unvalidated(&mut self, prefix_len: u32) -> Result<u32, RawStrError> {
         debug_assert!(self.prev() == 'r');
-        let start_pos = self.len_consumed();
+        let start_pos = self.pos_within_token();
         let mut possible_terminator_offset = None;
         let mut max_hashes = 0;
 
@@ -778,7 +790,7 @@ impl Cursor<'_> {
                 // Keep track of possible terminators to give a hint about
                 // where there might be a missing terminator
                 possible_terminator_offset =
-                    Some(self.len_consumed() - start_pos - n_end_hashes + prefix_len);
+                    Some(self.pos_within_token() - start_pos - n_end_hashes + prefix_len);
                 max_hashes = n_end_hashes;
             }
         }
diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs
index 3da6bc146..8f64b5f51 100644
--- a/compiler/rustc_lexer/src/unescape.rs
+++ b/compiler/rustc_lexer/src/unescape.rs
@@ -93,7 +93,7 @@ where
         // NOTE: Raw strings do not perform any explicit character escaping, here we
         // only translate CRLF to LF and produce errors on bare CR.
         Mode::RawStr | Mode::RawByteStr => {
-            unescape_raw_str_or_byte_str(literal_text, mode, callback)
+            unescape_raw_str_or_raw_byte_str(literal_text, mode, callback)
         }
     }
 }
@@ -105,7 +105,7 @@ pub fn unescape_byte_literal<F>(literal_text: &str, mode: Mode, callback: &mut F
 where
     F: FnMut(Range<usize>, Result<u8, EscapeError>),
 {
-    assert!(mode.is_bytes());
+    debug_assert!(mode.is_bytes());
     unescape_literal(literal_text, mode, &mut |range, result| {
         callback(range, result.map(byte_from_char));
     })
@@ -129,7 +129,7 @@ pub fn unescape_byte(literal_text: &str) -> Result<u8, (usize, EscapeError)> {
 }
 
 /// What kind of literal do we parse.
-#[derive(Debug, Clone, Copy)]
+#[derive(Debug, Clone, Copy, PartialEq)]
 pub enum Mode {
     Char,
     Str,
@@ -140,17 +140,13 @@ pub enum Mode {
 }
 
 impl Mode {
-    pub fn in_single_quotes(self) -> bool {
+    pub fn in_double_quotes(self) -> bool {
         match self {
-            Mode::Char | Mode::Byte => true,
-            Mode::Str | Mode::ByteStr | Mode::RawStr | Mode::RawByteStr => false,
+            Mode::Str | Mode::ByteStr | Mode::RawStr | Mode::RawByteStr => true,
+            Mode::Char | Mode::Byte => false,
         }
     }
 
-    pub fn in_double_quotes(self) -> bool {
-        !self.in_single_quotes()
-    }
-
     pub fn is_bytes(self) -> bool {
         match self {
             Mode::Byte | Mode::ByteStr | Mode::RawByteStr => true,
@@ -184,7 +180,7 @@ fn scan_escape(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
 
             let value = hi * 16 + lo;
 
-            // For a byte literal verify that it is within ASCII range.
+            // For a non-byte literal verify that it is within ASCII range.
             if !mode.is_bytes() && !is_ascii(value) {
                 return Err(EscapeError::OutOfRangeHexEscape);
             }
@@ -263,6 +259,7 @@ fn ascii_check(first_char: char, mode: Mode) -> Result<char, EscapeError> {
 }
 
 fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
+    debug_assert!(mode == Mode::Char || mode == Mode::Byte);
     let first_char = chars.next().ok_or(EscapeError::ZeroChars)?;
     let res = match first_char {
         '\\' => scan_escape(chars, mode),
@@ -282,7 +279,7 @@ fn unescape_str_or_byte_str<F>(src: &str, mode: Mode, callback: &mut F)
 where
     F: FnMut(Range<usize>, Result<char, EscapeError>),
 {
-    assert!(mode.in_double_quotes());
+    debug_assert!(mode == Mode::Str || mode == Mode::ByteStr);
     let initial_len = src.len();
     let mut chars = src.chars();
     while let Some(first_char) = chars.next() {
@@ -344,11 +341,11 @@ where
 /// sequence of characters or errors.
 /// NOTE: Raw strings do not perform any explicit character escaping, here we
 /// only translate CRLF to LF and produce errors on bare CR.
-fn unescape_raw_str_or_byte_str<F>(literal_text: &str, mode: Mode, callback: &mut F)
+fn unescape_raw_str_or_raw_byte_str<F>(literal_text: &str, mode: Mode, callback: &mut F)
 where
     F: FnMut(Range<usize>, Result<char, EscapeError>),
 {
-    assert!(mode.in_double_quotes());
+    debug_assert!(mode == Mode::RawStr || mode == Mode::RawByteStr);
     let initial_len = literal_text.len();
 
     let mut chars = literal_text.chars();
@@ -368,7 +365,7 @@ where
 
 fn byte_from_char(c: char) -> u8 {
     let res = c as u32;
-    assert!(res <= u8::MAX as u32, "guaranteed because of Mode::ByteStr");
+    debug_assert!(res <= u8::MAX as u32, "guaranteed because of Mode::ByteStr");
     res as u8
 }