1 files changed, 44 insertions, 32 deletions
diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs
index a79c98264..51515976e 100644
--- a/compiler/rustc_lexer/src/lib.rs
+++ b/compiler/rustc_lexer/src/lib.rs
@@ -29,9 +29,11 @@ pub mod unescape;
 #[cfg(test)]
 mod tests;
 
+pub use crate::cursor::Cursor;
+
 use self::LiteralKind::*;
 use self::TokenKind::*;
-use crate::cursor::{Cursor, EOF_CHAR};
+use crate::cursor::EOF_CHAR;
 use std::convert::TryFrom;
 
 /// Parsed token.
@@ -55,29 +57,42 @@ pub enum TokenKind {
     // Multi-char tokens:
     /// "// comment"
     LineComment { doc_style: Option<DocStyle> },
+
     /// `/* block comment */`
     ///
-    /// Block comments can be recursive, so the sequence like `/* /* */`
+    /// Block comments can be recursive, so a sequence like `/* /* */`
     /// will not be considered terminated and will result in a parsing error.
     BlockComment { doc_style: Option<DocStyle>, terminated: bool },
-    /// Any whitespace characters sequence.
+
+    /// Any whitespace character sequence.
     Whitespace,
+
     /// "ident" or "continue"
-    /// At this step keywords are also considered identifiers.
+    ///
+    /// At this step, keywords are also considered identifiers.
     Ident,
+
     /// Like the above, but containing invalid unicode codepoints.
     InvalidIdent,
+
     /// "r#ident"
     RawIdent,
-    /// An unknown prefix like `foo#`, `foo'`, `foo"`. Note that only the
+
+    /// An unknown prefix, like `foo#`, `foo'`, `foo"`.
+    ///
+    /// Note that only the
     /// prefix (`foo`) is included in the token, not the separator (which is
     /// lexed as its own distinct token). In Rust 2021 and later, reserved
     /// prefixes are reported as errors; in earlier editions, they result in a
     /// (allowed by default) lint, and are treated as regular identifier
     /// tokens.
     UnknownPrefix,
-    /// "12_u8", "1.0e-40", "b"123"". See `LiteralKind` for more details.
+
+    /// Examples: `"12_u8"`, `"1.0e-40"`, `b"123`.
+    ///
+    /// See [LiteralKind] for more details.
     Literal { kind: LiteralKind, suffix_start: u32 },
+
     /// "'a"
     Lifetime { starts_with_number: bool },
 
@@ -139,6 +154,9 @@ pub enum TokenKind {
 
     /// Unknown token, not expected by the lexer, e.g. "№"
     Unknown,
+
+    /// End of input.
+    Eof,
 }
 
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
@@ -219,13 +237,6 @@ pub fn strip_shebang(input: &str) -> Option<usize> {
     None
 }
 
-/// Parses the first token from the provided input string.
-#[inline]
-pub fn first_token(input: &str) -> Token {
-    debug_assert!(!input.is_empty());
-    Cursor::new(input).advance_token()
-}
-
 /// Validates a raw string literal. Used for getting more information about a
 /// problem with a `RawStr`/`RawByteStr` with a `None` field.
 #[inline]
@@ -243,12 +254,8 @@ pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError>
 pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
     let mut cursor = Cursor::new(input);
     std::iter::from_fn(move || {
-        if cursor.is_eof() {
-            None
-        } else {
-            cursor.reset_len_consumed();
-            Some(cursor.advance_token())
-        }
+        let token = cursor.advance_token();
+        if token.kind != TokenKind::Eof { Some(token) } else { None }
     })
 }
 
@@ -311,8 +318,11 @@ pub fn is_ident(string: &str) -> bool {
 
 impl Cursor<'_> {
     /// Parses a token from the input string.
-    fn advance_token(&mut self) -> Token {
-        let first_char = self.bump().unwrap();
+    pub fn advance_token(&mut self) -> Token {
+        let first_char = match self.bump() {
+            Some(c) => c,
+            None => return Token::new(TokenKind::Eof, 0),
+        };
         let token_kind = match first_char {
             // Slash, comment or block comment.
             '/' => match self.first() {
@@ -329,7 +339,7 @@ impl Cursor<'_> {
                 ('#', c1) if is_id_start(c1) => self.raw_ident(),
                 ('#', _) | ('"', _) => {
                     let res = self.raw_double_quoted_string(1);
-                    let suffix_start = self.len_consumed();
+                    let suffix_start = self.pos_within_token();
                     if res.is_ok() {
                         self.eat_literal_suffix();
                     }
@@ -344,7 +354,7 @@ impl Cursor<'_> {
                 ('\'', _) => {
                     self.bump();
                     let terminated = self.single_quoted_string();
-                    let suffix_start = self.len_consumed();
+                    let suffix_start = self.pos_within_token();
                     if terminated {
                         self.eat_literal_suffix();
                     }
@@ -354,7 +364,7 @@ impl Cursor<'_> {
                 ('"', _) => {
                     self.bump();
                     let terminated = self.double_quoted_string();
-                    let suffix_start = self.len_consumed();
+                    let suffix_start = self.pos_within_token();
                     if terminated {
                         self.eat_literal_suffix();
                     }
@@ -364,7 +374,7 @@ impl Cursor<'_> {
                 ('r', '"') | ('r', '#') => {
                     self.bump();
                     let res = self.raw_double_quoted_string(2);
-                    let suffix_start = self.len_consumed();
+                    let suffix_start = self.pos_within_token();
                     if res.is_ok() {
                         self.eat_literal_suffix();
                     }
@@ -381,7 +391,7 @@ impl Cursor<'_> {
             // Numeric literal.
             c @ '0'..='9' => {
                 let literal_kind = self.number(c);
-                let suffix_start = self.len_consumed();
+                let suffix_start = self.pos_within_token();
                 self.eat_literal_suffix();
                 TokenKind::Literal { kind: literal_kind, suffix_start }
             }
@@ -420,7 +430,7 @@ impl Cursor<'_> {
             // String literal.
             '"' => {
                 let terminated = self.double_quoted_string();
-                let suffix_start = self.len_consumed();
+                let suffix_start = self.pos_within_token();
                 if terminated {
                     self.eat_literal_suffix();
                 }
@@ -433,7 +443,9 @@ impl Cursor<'_> {
             }
             _ => Unknown,
         };
-        Token::new(token_kind, self.len_consumed())
+        let res = Token::new(token_kind, self.pos_within_token());
+        self.reset_pos_within_token();
+        res
     }
 
     fn line_comment(&mut self) -> TokenKind {
@@ -618,7 +630,7 @@ impl Cursor<'_> {
 
         if !can_be_a_lifetime {
             let terminated = self.single_quoted_string();
-            let suffix_start = self.len_consumed();
+            let suffix_start = self.pos_within_token();
             if terminated {
                 self.eat_literal_suffix();
             }
@@ -643,7 +655,7 @@ impl Cursor<'_> {
         if self.first() == '\'' {
             self.bump();
             let kind = Char { terminated: true };
-            Literal { kind, suffix_start: self.len_consumed() }
+            Literal { kind, suffix_start: self.pos_within_token() }
         } else {
             Lifetime { starts_with_number }
         }
@@ -724,7 +736,7 @@ impl Cursor<'_> {
 
     fn raw_string_unvalidated(&mut self, prefix_len: u32) -> Result<u32, RawStrError> {
         debug_assert!(self.prev() == 'r');
-        let start_pos = self.len_consumed();
+        let start_pos = self.pos_within_token();
         let mut possible_terminator_offset = None;
         let mut max_hashes = 0;
 
@@ -778,7 +790,7 @@ impl Cursor<'_> {
                 // Keep track of possible terminators to give a hint about
                 // where there might be a missing terminator
                 possible_terminator_offset =
-                    Some(self.len_consumed() - start_pos - n_end_hashes + prefix_len);
+                    Some(self.pos_within_token() - start_pos - n_end_hashes + prefix_len);
                 max_hashes = n_end_hashes;
             }
         }