summary | refs | log | tree | commit | diff | stats
path: root/compiler/rustc_lexer
diff options
context:
space:
mode:
Diffstat (limited to 'compiler/rustc_lexer')
-rw-r--r-- compiler/rustc_lexer/Cargo.toml 1
-rw-r--r-- compiler/rustc_lexer/src/cursor.rs 16
-rw-r--r-- compiler/rustc_lexer/src/lib.rs 76
-rw-r--r-- compiler/rustc_lexer/src/unescape.rs 27
4 files changed, 64 insertions, 56 deletions
diff --git a/compiler/rustc_lexer/Cargo.toml b/compiler/rustc_lexer/Cargo.toml
index 35af11053..ad685c2ad 100644
--- a/compiler/rustc_lexer/Cargo.toml
+++ b/compiler/rustc_lexer/Cargo.toml
@@ -12,7 +12,6 @@ Rust lexer used by rustc. No stability guarantees are provided.
# Note: do not remove this blank `[lib]` section.
# This will be used when publishing this crate as `rustc-ap-rustc_lexer`.
[lib]
-doctest = false
# Note that this crate purposefully does not depend on other rustc crates
[dependencies]
diff --git a/compiler/rustc_lexer/src/cursor.rs b/compiler/rustc_lexer/src/cursor.rs
index 21557a9c8..eceef5980 100644
--- a/compiler/rustc_lexer/src/cursor.rs
+++ b/compiler/rustc_lexer/src/cursor.rs
@@ -4,8 +4,8 @@ use std::str::Chars;
///
/// Next characters can be peeked via `first` method,
/// and position can be shifted forward via `bump` method.
-pub(crate) struct Cursor<'a> {
- initial_len: usize,
+pub struct Cursor<'a> {
+ len_remaining: usize,
/// Iterator over chars. Slightly faster than a &str.
chars: Chars<'a>,
#[cfg(debug_assertions)]
@@ -15,9 +15,9 @@ pub(crate) struct Cursor<'a> {
pub(crate) const EOF_CHAR: char = '\0';
impl<'a> Cursor<'a> {
- pub(crate) fn new(input: &'a str) -> Cursor<'a> {
+ pub fn new(input: &'a str) -> Cursor<'a> {
Cursor {
- initial_len: input.len(),
+ len_remaining: input.len(),
chars: input.chars(),
#[cfg(debug_assertions)]
prev: EOF_CHAR,
@@ -61,13 +61,13 @@ impl<'a> Cursor<'a> {
}
/// Returns amount of already consumed symbols.
- pub(crate) fn len_consumed(&self) -> u32 {
- (self.initial_len - self.chars.as_str().len()) as u32
+ pub(crate) fn pos_within_token(&self) -> u32 {
+ (self.len_remaining - self.chars.as_str().len()) as u32
}
/// Resets the number of bytes consumed to 0.
- pub(crate) fn reset_len_consumed(&mut self) {
- self.initial_len = self.chars.as_str().len();
+ pub(crate) fn reset_pos_within_token(&mut self) {
+ self.len_remaining = self.chars.as_str().len();
}
/// Moves to the next character.
diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs
index a79c98264..51515976e 100644
--- a/compiler/rustc_lexer/src/lib.rs
+++ b/compiler/rustc_lexer/src/lib.rs
@@ -29,9 +29,11 @@ pub mod unescape;
#[cfg(test)]
mod tests;
+pub use crate::cursor::Cursor;
+
use self::LiteralKind::*;
use self::TokenKind::*;
-use crate::cursor::{Cursor, EOF_CHAR};
+use crate::cursor::EOF_CHAR;
use std::convert::TryFrom;
/// Parsed token.
@@ -55,29 +57,42 @@ pub enum TokenKind {
// Multi-char tokens:
/// "// comment"
LineComment { doc_style: Option<DocStyle> },
+
/// `/* block comment */`
///
- /// Block comments can be recursive, so the sequence like `/* /* */`
+ /// Block comments can be recursive, so a sequence like `/* /* */`
/// will not be considered terminated and will result in a parsing error.
BlockComment { doc_style: Option<DocStyle>, terminated: bool },
- /// Any whitespace characters sequence.
+
+ /// Any whitespace character sequence.
Whitespace,
+
/// "ident" or "continue"
- /// At this step keywords are also considered identifiers.
+ ///
+ /// At this step, keywords are also considered identifiers.
Ident,
+
/// Like the above, but containing invalid unicode codepoints.
InvalidIdent,
+
/// "r#ident"
RawIdent,
- /// An unknown prefix like `foo#`, `foo'`, `foo"`. Note that only the
+
+ /// An unknown prefix, like `foo#`, `foo'`, `foo"`.
+ ///
+ /// Note that only the
/// prefix (`foo`) is included in the token, not the separator (which is
/// lexed as its own distinct token). In Rust 2021 and later, reserved
/// prefixes are reported as errors; in earlier editions, they result in a
/// (allowed by default) lint, and are treated as regular identifier
/// tokens.
UnknownPrefix,
- /// "12_u8", "1.0e-40", "b"123"". See `LiteralKind` for more details.
+
+ /// Examples: `"12_u8"`, `"1.0e-40"`, `b"123`.
+ ///
+ /// See [LiteralKind] for more details.
Literal { kind: LiteralKind, suffix_start: u32 },
+
/// "'a"
Lifetime { starts_with_number: bool },
@@ -139,6 +154,9 @@ pub enum TokenKind {
/// Unknown token, not expected by the lexer, e.g. "№"
Unknown,
+
+ /// End of input.
+ Eof,
}
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
@@ -219,13 +237,6 @@ pub fn strip_shebang(input: &str) -> Option<usize> {
None
}
-/// Parses the first token from the provided input string.
-#[inline]
-pub fn first_token(input: &str) -> Token {
- debug_assert!(!input.is_empty());
- Cursor::new(input).advance_token()
-}
-
/// Validates a raw string literal. Used for getting more information about a
/// problem with a `RawStr`/`RawByteStr` with a `None` field.
#[inline]
@@ -243,12 +254,8 @@ pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError>
pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
let mut cursor = Cursor::new(input);
std::iter::from_fn(move || {
- if cursor.is_eof() {
- None
- } else {
- cursor.reset_len_consumed();
- Some(cursor.advance_token())
- }
+ let token = cursor.advance_token();
+ if token.kind != TokenKind::Eof { Some(token) } else { None }
})
}
@@ -311,8 +318,11 @@ pub fn is_ident(string: &str) -> bool {
impl Cursor<'_> {
/// Parses a token from the input string.
- fn advance_token(&mut self) -> Token {
- let first_char = self.bump().unwrap();
+ pub fn advance_token(&mut self) -> Token {
+ let first_char = match self.bump() {
+ Some(c) => c,
+ None => return Token::new(TokenKind::Eof, 0),
+ };
let token_kind = match first_char {
// Slash, comment or block comment.
'/' => match self.first() {
@@ -329,7 +339,7 @@ impl Cursor<'_> {
('#', c1) if is_id_start(c1) => self.raw_ident(),
('#', _) | ('"', _) => {
let res = self.raw_double_quoted_string(1);
- let suffix_start = self.len_consumed();
+ let suffix_start = self.pos_within_token();
if res.is_ok() {
self.eat_literal_suffix();
}
@@ -344,7 +354,7 @@ impl Cursor<'_> {
('\'', _) => {
self.bump();
let terminated = self.single_quoted_string();
- let suffix_start = self.len_consumed();
+ let suffix_start = self.pos_within_token();
if terminated {
self.eat_literal_suffix();
}
@@ -354,7 +364,7 @@ impl Cursor<'_> {
('"', _) => {
self.bump();
let terminated = self.double_quoted_string();
- let suffix_start = self.len_consumed();
+ let suffix_start = self.pos_within_token();
if terminated {
self.eat_literal_suffix();
}
@@ -364,7 +374,7 @@ impl Cursor<'_> {
('r', '"') | ('r', '#') => {
self.bump();
let res = self.raw_double_quoted_string(2);
- let suffix_start = self.len_consumed();
+ let suffix_start = self.pos_within_token();
if res.is_ok() {
self.eat_literal_suffix();
}
@@ -381,7 +391,7 @@ impl Cursor<'_> {
// Numeric literal.
c @ '0'..='9' => {
let literal_kind = self.number(c);
- let suffix_start = self.len_consumed();
+ let suffix_start = self.pos_within_token();
self.eat_literal_suffix();
TokenKind::Literal { kind: literal_kind, suffix_start }
}
@@ -420,7 +430,7 @@ impl Cursor<'_> {
// String literal.
'"' => {
let terminated = self.double_quoted_string();
- let suffix_start = self.len_consumed();
+ let suffix_start = self.pos_within_token();
if terminated {
self.eat_literal_suffix();
}
@@ -433,7 +443,9 @@ impl Cursor<'_> {
}
_ => Unknown,
};
- Token::new(token_kind, self.len_consumed())
+ let res = Token::new(token_kind, self.pos_within_token());
+ self.reset_pos_within_token();
+ res
}
fn line_comment(&mut self) -> TokenKind {
@@ -618,7 +630,7 @@ impl Cursor<'_> {
if !can_be_a_lifetime {
let terminated = self.single_quoted_string();
- let suffix_start = self.len_consumed();
+ let suffix_start = self.pos_within_token();
if terminated {
self.eat_literal_suffix();
}
@@ -643,7 +655,7 @@ impl Cursor<'_> {
if self.first() == '\'' {
self.bump();
let kind = Char { terminated: true };
- Literal { kind, suffix_start: self.len_consumed() }
+ Literal { kind, suffix_start: self.pos_within_token() }
} else {
Lifetime { starts_with_number }
}
@@ -724,7 +736,7 @@ impl Cursor<'_> {
fn raw_string_unvalidated(&mut self, prefix_len: u32) -> Result<u32, RawStrError> {
debug_assert!(self.prev() == 'r');
- let start_pos = self.len_consumed();
+ let start_pos = self.pos_within_token();
let mut possible_terminator_offset = None;
let mut max_hashes = 0;
@@ -778,7 +790,7 @@ impl Cursor<'_> {
// Keep track of possible terminators to give a hint about
// where there might be a missing terminator
possible_terminator_offset =
- Some(self.len_consumed() - start_pos - n_end_hashes + prefix_len);
+ Some(self.pos_within_token() - start_pos - n_end_hashes + prefix_len);
max_hashes = n_end_hashes;
}
}
diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs
index 3da6bc146..8f64b5f51 100644
--- a/compiler/rustc_lexer/src/unescape.rs
+++ b/compiler/rustc_lexer/src/unescape.rs
@@ -93,7 +93,7 @@ where
// NOTE: Raw strings do not perform any explicit character escaping, here we
// only translate CRLF to LF and produce errors on bare CR.
Mode::RawStr | Mode::RawByteStr => {
- unescape_raw_str_or_byte_str(literal_text, mode, callback)
+ unescape_raw_str_or_raw_byte_str(literal_text, mode, callback)
}
}
}
@@ -105,7 +105,7 @@ pub fn unescape_byte_literal<F>(literal_text: &str, mode: Mode, callback: &mut F
where
F: FnMut(Range<usize>, Result<u8, EscapeError>),
{
- assert!(mode.is_bytes());
+ debug_assert!(mode.is_bytes());
unescape_literal(literal_text, mode, &mut |range, result| {
callback(range, result.map(byte_from_char));
})
@@ -129,7 +129,7 @@ pub fn unescape_byte(literal_text: &str) -> Result<u8, (usize, EscapeError)> {
}
/// What kind of literal do we parse.
-#[derive(Debug, Clone, Copy)]
+#[derive(Debug, Clone, Copy, PartialEq)]
pub enum Mode {
Char,
Str,
@@ -140,17 +140,13 @@ pub enum Mode {
}
impl Mode {
- pub fn in_single_quotes(self) -> bool {
+ pub fn in_double_quotes(self) -> bool {
match self {
- Mode::Char | Mode::Byte => true,
- Mode::Str | Mode::ByteStr | Mode::RawStr | Mode::RawByteStr => false,
+ Mode::Str | Mode::ByteStr | Mode::RawStr | Mode::RawByteStr => true,
+ Mode::Char | Mode::Byte => false,
}
}
- pub fn in_double_quotes(self) -> bool {
- !self.in_single_quotes()
- }
-
pub fn is_bytes(self) -> bool {
match self {
Mode::Byte | Mode::ByteStr | Mode::RawByteStr => true,
@@ -184,7 +180,7 @@ fn scan_escape(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
let value = hi * 16 + lo;
- // For a byte literal verify that it is within ASCII range.
+ // For a non-byte literal verify that it is within ASCII range.
if !mode.is_bytes() && !is_ascii(value) {
return Err(EscapeError::OutOfRangeHexEscape);
}
@@ -263,6 +259,7 @@ fn ascii_check(first_char: char, mode: Mode) -> Result<char, EscapeError> {
}
fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
+ debug_assert!(mode == Mode::Char || mode == Mode::Byte);
let first_char = chars.next().ok_or(EscapeError::ZeroChars)?;
let res = match first_char {
'\\' => scan_escape(chars, mode),
@@ -282,7 +279,7 @@ fn unescape_str_or_byte_str<F>(src: &str, mode: Mode, callback: &mut F)
where
F: FnMut(Range<usize>, Result<char, EscapeError>),
{
- assert!(mode.in_double_quotes());
+ debug_assert!(mode == Mode::Str || mode == Mode::ByteStr);
let initial_len = src.len();
let mut chars = src.chars();
while let Some(first_char) = chars.next() {
@@ -344,11 +341,11 @@ where
/// sequence of characters or errors.
/// NOTE: Raw strings do not perform any explicit character escaping, here we
/// only translate CRLF to LF and produce errors on bare CR.
-fn unescape_raw_str_or_byte_str<F>(literal_text: &str, mode: Mode, callback: &mut F)
+fn unescape_raw_str_or_raw_byte_str<F>(literal_text: &str, mode: Mode, callback: &mut F)
where
F: FnMut(Range<usize>, Result<char, EscapeError>),
{
- assert!(mode.in_double_quotes());
+ debug_assert!(mode == Mode::RawStr || mode == Mode::RawByteStr);
let initial_len = literal_text.len();
let mut chars = literal_text.chars();
@@ -368,7 +365,7 @@ where
fn byte_from_char(c: char) -> u8 {
let res = c as u32;
- assert!(res <= u8::MAX as u32, "guaranteed because of Mode::ByteStr");
+ debug_assert!(res <= u8::MAX as u32, "guaranteed because of Mode::ByteStr");
res as u8
}