diff options
Diffstat (limited to 'compiler/rustc_errors/src/markdown/parse.rs')
-rw-r--r-- | compiler/rustc_errors/src/markdown/parse.rs | 588 |
1 files changed, 588 insertions, 0 deletions
diff --git a/compiler/rustc_errors/src/markdown/parse.rs b/compiler/rustc_errors/src/markdown/parse.rs new file mode 100644 index 000000000..362a451fd --- /dev/null +++ b/compiler/rustc_errors/src/markdown/parse.rs @@ -0,0 +1,588 @@ +use crate::markdown::{MdStream, MdTree}; +use std::{iter, mem, str}; + +/// Short aliases that we can use in match patterns. If an end pattern is not +/// included, this type may be variable +const ANC_E: &[u8] = b">"; +const ANC_S: &[u8] = b"<"; +const BRK: &[u8] = b"---"; +const CBK: &[u8] = b"```"; +const CIL: &[u8] = b"`"; +const CMT_E: &[u8] = b"-->"; +const CMT_S: &[u8] = b"<!--"; +const EMP: &[u8] = b"_"; +const HDG: &[u8] = b"#"; +const LNK_CHARS: &str = "$-_.+!*'()/&?=:%"; +const LNK_E: &[u8] = b"]"; +const LNK_S: &[u8] = b"["; +const STG: &[u8] = b"**"; +const STK: &[u8] = b"~~"; +const UL1: &[u8] = b"* "; +const UL2: &[u8] = b"- "; + +/// Pattern replacements +const REPLACEMENTS: &[(&str, &str)] = &[ + ("(c)", "©"), + ("(C)", "©"), + ("(r)", "®"), + ("(R)", "®"), + ("(tm)", "™"), + ("(TM)", "™"), + (":crab:", "🦀"), + ("\n", " "), +]; + +/// `(extracted, remaining)` +type Parsed<'a> = (MdTree<'a>, &'a [u8]); +/// Output of a parse function +type ParseResult<'a> = Option<Parsed<'a>>; + +/// Parsing context +#[derive(Clone, Copy, Debug, PartialEq)] +struct Context { + /// If true, we are at a the topmost level (not recursing a nested tt) + top_block: bool, + /// Previous character + prev: Prev, +} + +/// Character class preceding this one +#[derive(Clone, Copy, Debug, PartialEq)] +enum Prev { + Newline, + /// Whitespace that is not a newline + Whitespace, + Escape, + Any, +} + +impl Default for Context { + /// Most common setting for non top-level parsing: not top block, not at + /// line start (yes leading whitespace, not escaped) + fn default() -> Self { + Self { top_block: false, prev: Prev::Whitespace } + } +} + +/// Flags to simple parser function +#[derive(Clone, Copy, Debug, PartialEq)] +enum ParseOpt { + /// Ignore escapes before closing pattern, trim content + TrimNoEsc, + None, +} + +/// Parse a buffer +pub fn entrypoint(txt: &str) -> MdStream<'_> { + let ctx = Context { top_block: true, prev: Prev::Newline }; + normalize(parse_recursive(txt.trim().as_bytes(), ctx), &mut Vec::new()) +} + +/// Parse a buffer with specified context +fn parse_recursive<'a>(buf: &'a [u8], ctx: Context) -> MdStream<'_> { + use ParseOpt as Po; + use Prev::{Escape, Newline, Whitespace}; + + let mut stream: Vec<MdTree<'a>> = Vec::new(); + let Context { top_block: top_blk, mut prev } = ctx; + + // wip_buf is our entire unprocessed (unpushed) buffer, loop_buf is our to + // check buffer that shrinks with each loop + let mut wip_buf = buf; + let mut loop_buf = wip_buf; + + while !loop_buf.is_empty() { + let next_prev = match loop_buf[0] { + b'\n' => Newline, + b'\\' => Escape, + x if x.is_ascii_whitespace() => Whitespace, + _ => Prev::Any, + }; + + let res: ParseResult<'_> = match (top_blk, prev) { + (_, Newline | Whitespace) if loop_buf.starts_with(CMT_S) => { + parse_simple_pat(loop_buf, CMT_S, CMT_E, Po::TrimNoEsc, MdTree::Comment) + } + (true, Newline) if loop_buf.starts_with(CBK) => Some(parse_codeblock(loop_buf)), + (_, Newline | Whitespace) if loop_buf.starts_with(CIL) => parse_codeinline(loop_buf), + (true, Newline | Whitespace) if loop_buf.starts_with(HDG) => parse_heading(loop_buf), + (true, Newline) if loop_buf.starts_with(BRK) => { + Some((MdTree::HorizontalRule, parse_to_newline(loop_buf).1)) + } + (_, Newline | Whitespace) if loop_buf.starts_with(EMP) => { + parse_simple_pat(loop_buf, EMP, EMP, Po::None, MdTree::Emphasis) + } + (_, Newline | Whitespace) if loop_buf.starts_with(STG) => { + parse_simple_pat(loop_buf, STG, STG, Po::None, MdTree::Strong) + } + (_, Newline | Whitespace) if loop_buf.starts_with(STK) => { + parse_simple_pat(loop_buf, STK, STK, Po::None, MdTree::Strikethrough) + } + (_, Newline | Whitespace) if loop_buf.starts_with(ANC_S) => { + let tt_fn = |link| MdTree::Link { disp: link, link }; + let ret = parse_simple_pat(loop_buf, ANC_S, ANC_E, Po::None, tt_fn); + match ret { + Some((MdTree::Link { disp, .. }, _)) + if disp.chars().all(|ch| LNK_CHARS.contains(ch)) => + { + ret + } + _ => None, + } + } + (_, Newline) if (loop_buf.starts_with(UL1) || loop_buf.starts_with(UL2)) => { + Some(parse_unordered_li(loop_buf)) + } + (_, Newline) if ord_list_start(loop_buf).is_some() => Some(parse_ordered_li(loop_buf)), + (_, Newline | Whitespace) if loop_buf.starts_with(LNK_S) => { + parse_any_link(loop_buf, top_blk && prev == Prev::Newline) + } + (_, Escape | _) => None, + }; + + if let Some((tree, rest)) = res { + // We found something: push our WIP and then push the found tree + let prev_buf = &wip_buf[..(wip_buf.len() - loop_buf.len())]; + if !prev_buf.is_empty() { + let prev_str = str::from_utf8(prev_buf).unwrap(); + stream.push(MdTree::PlainText(prev_str)); + } + stream.push(tree); + + wip_buf = rest; + loop_buf = rest; + } else { + // Just move on to the next character + loop_buf = &loop_buf[1..]; + // If we are at the end and haven't found anything, just push plain text + if loop_buf.is_empty() && !wip_buf.is_empty() { + let final_str = str::from_utf8(wip_buf).unwrap(); + stream.push(MdTree::PlainText(final_str)); + } + }; + + prev = next_prev; + } + + MdStream(stream) +} + +/// The simplest kind of patterns: data within start and end patterns +fn parse_simple_pat<'a, F>( + buf: &'a [u8], + start_pat: &[u8], + end_pat: &[u8], + opts: ParseOpt, + create_tt: F, +) -> ParseResult<'a> +where + F: FnOnce(&'a str) -> MdTree<'a>, +{ + let ignore_esc = matches!(opts, ParseOpt::TrimNoEsc); + let trim = matches!(opts, ParseOpt::TrimNoEsc); + let (txt, rest) = parse_with_end_pat(&buf[start_pat.len()..], end_pat, ignore_esc)?; + let mut txt = str::from_utf8(txt).unwrap(); + if trim { + txt = txt.trim(); + } + Some((create_tt(txt), rest)) +} + +/// Parse backtick-wrapped inline code. Accounts for >1 backtick sets +fn parse_codeinline(buf: &[u8]) -> ParseResult<'_> { + let seps = buf.iter().take_while(|ch| **ch == b'`').count(); + let (txt, rest) = parse_with_end_pat(&buf[seps..], &buf[..seps], true)?; + Some((MdTree::CodeInline(str::from_utf8(txt).unwrap()), rest)) +} + +/// Parse a codeblock. Accounts for >3 backticks and language specification +fn parse_codeblock(buf: &[u8]) -> Parsed<'_> { + // account for ````code```` style + let seps = buf.iter().take_while(|ch| **ch == b'`').count(); + let end_sep = &buf[..seps]; + let mut working = &buf[seps..]; + + // Handle "````rust" style language specifications + let next_ws_idx = working.iter().take_while(|ch| !ch.is_ascii_whitespace()).count(); + + let lang = if next_ws_idx > 0 { + // Munch the lang + let tmp = str::from_utf8(&working[..next_ws_idx]).unwrap(); + working = &working[next_ws_idx..]; + Some(tmp) + } else { + None + }; + + let mut end_pat = vec![b'\n']; + end_pat.extend(end_sep); + + // Find first end pattern with nothing else on its line + let mut found = None; + for idx in (0..working.len()).filter(|idx| working[*idx..].starts_with(&end_pat)) { + let (eol_txt, rest) = parse_to_newline(&working[(idx + end_pat.len())..]); + if !eol_txt.iter().any(u8::is_ascii_whitespace) { + found = Some((&working[..idx], rest)); + break; + } + } + + let (txt, rest) = found.unwrap_or((working, &[])); + let txt = str::from_utf8(txt).unwrap().trim_matches('\n'); + + (MdTree::CodeBlock { txt, lang }, rest) +} + +fn parse_heading(buf: &[u8]) -> ParseResult<'_> { + let level = buf.iter().take_while(|ch| **ch == b'#').count(); + let buf = &buf[level..]; + + if level > 6 || (buf.len() > 1 && !buf[0].is_ascii_whitespace()) { + // Enforce max 6 levels and whitespace following the `##` pattern + return None; + } + + let (txt, rest) = parse_to_newline(&buf[1..]); + let ctx = Context { top_block: false, prev: Prev::Whitespace }; + let stream = parse_recursive(txt, ctx); + + Some((MdTree::Heading(level.try_into().unwrap(), stream), rest)) +} + +/// Bulleted list +fn parse_unordered_li(buf: &[u8]) -> Parsed<'_> { + debug_assert!(buf.starts_with(b"* ") || buf.starts_with(b"- ")); + let (txt, rest) = get_indented_section(&buf[2..]); + let ctx = Context { top_block: false, prev: Prev::Whitespace }; + let stream = parse_recursive(trim_ascii_start(txt), ctx); + (MdTree::UnorderedListItem(stream), rest) +} + +/// Numbered list +fn parse_ordered_li(buf: &[u8]) -> Parsed<'_> { + let (num, pos) = ord_list_start(buf).unwrap(); // success tested in caller + let (txt, rest) = get_indented_section(&buf[pos..]); + let ctx = Context { top_block: false, prev: Prev::Whitespace }; + let stream = parse_recursive(trim_ascii_start(txt), ctx); + (MdTree::OrderedListItem(num, stream), rest) +} + +/// Find first line that isn't empty or doesn't start with whitespace, that will +/// be our contents +fn get_indented_section(buf: &[u8]) -> (&[u8], &[u8]) { + let mut end = buf.len(); + for (idx, window) in buf.windows(2).enumerate() { + let &[ch, next_ch] = window else {unreachable!("always 2 elements")}; + if idx >= buf.len().saturating_sub(2) && next_ch == b'\n' { + // End of stream + end = buf.len().saturating_sub(1); + break; + } else if ch == b'\n' && (!next_ch.is_ascii_whitespace() || next_ch == b'\n') { + end = idx; + break; + } + } + + (&buf[..end], &buf[end..]) +} + +/// Verify a valid ordered list start (e.g. `1.`) and parse it. Returns the +/// parsed number and offset of character after the dot. +fn ord_list_start(buf: &[u8]) -> Option<(u16, usize)> { + let pos = buf.iter().take(10).position(|ch| *ch == b'.')?; + let n = str::from_utf8(&buf[..pos]).ok()?; + if !buf.get(pos + 1)?.is_ascii_whitespace() { + return None; + } + n.parse::<u16>().ok().map(|v| (v, pos + 2)) +} + +/// Parse links. `can_be_def` indicates that a link definition is possible (top +/// level, located at the start of a line) +fn parse_any_link(buf: &[u8], can_be_def: bool) -> ParseResult<'_> { + let (bracketed, rest) = parse_with_end_pat(&buf[1..], LNK_E, true)?; + if rest.is_empty() { + return None; + } + + let disp = str::from_utf8(bracketed).unwrap(); + match (can_be_def, rest[0]) { + (true, b':') => { + let (link, tmp) = parse_to_newline(&rest[1..]); + let link = str::from_utf8(link).unwrap().trim(); + Some((MdTree::LinkDef { id: disp, link }, tmp)) + } + (_, b'(') => parse_simple_pat(rest, b"(", b")", ParseOpt::TrimNoEsc, |link| MdTree::Link { + disp, + link, + }), + (_, b'[') => parse_simple_pat(rest, b"[", b"]", ParseOpt::TrimNoEsc, |id| { + MdTree::RefLink { disp, id: Some(id) } + }), + _ => Some((MdTree::RefLink { disp, id: None }, rest)), + } +} + +/// Find and consume an end pattern, return `(match, residual)` +fn parse_with_end_pat<'a>( + buf: &'a [u8], + end_sep: &[u8], + ignore_esc: bool, +) -> Option<(&'a [u8], &'a [u8])> { + // Find positions that start with the end seperator + for idx in (0..buf.len()).filter(|idx| buf[*idx..].starts_with(end_sep)) { + if !ignore_esc && idx > 0 && buf[idx - 1] == b'\\' { + continue; + } + return Some((&buf[..idx], &buf[idx + end_sep.len()..])); + } + None +} + +/// Resturn `(match, residual)` to end of line. The EOL is returned with the +/// residual. +fn parse_to_newline(buf: &[u8]) -> (&[u8], &[u8]) { + buf.iter().position(|ch| *ch == b'\n').map_or((buf, &[]), |pos| buf.split_at(pos)) +} + +/// Take a parsed stream and fix the little things +fn normalize<'a>(MdStream(stream): MdStream<'a>, linkdefs: &mut Vec<MdTree<'a>>) -> MdStream<'a> { + let mut new_stream = Vec::with_capacity(stream.len()); + let new_defs = stream.iter().filter(|tt| matches!(tt, MdTree::LinkDef { .. })); + linkdefs.extend(new_defs.cloned()); + + // Run plaintest expansions on types that need it, call this function on nested types + for item in stream { + match item { + MdTree::PlainText(txt) => expand_plaintext(txt, &mut new_stream, MdTree::PlainText), + MdTree::Strong(txt) => expand_plaintext(txt, &mut new_stream, MdTree::Strong), + MdTree::Emphasis(txt) => expand_plaintext(txt, &mut new_stream, MdTree::Emphasis), + MdTree::Strikethrough(txt) => { + expand_plaintext(txt, &mut new_stream, MdTree::Strikethrough); + } + MdTree::RefLink { disp, id } => new_stream.push(match_reflink(linkdefs, disp, id)), + MdTree::OrderedListItem(n, st) => { + new_stream.push(MdTree::OrderedListItem(n, normalize(st, linkdefs))); + } + MdTree::UnorderedListItem(st) => { + new_stream.push(MdTree::UnorderedListItem(normalize(st, linkdefs))); + } + MdTree::Heading(n, st) => new_stream.push(MdTree::Heading(n, normalize(st, linkdefs))), + _ => new_stream.push(item), + } + } + + // Remove non printing types, duplicate paragraph breaks, and breaks at start/end + new_stream.retain(|x| !matches!(x, MdTree::Comment(_) | MdTree::LinkDef { .. })); + new_stream.dedup_by(|r, l| matches!((r, l), (MdTree::ParagraphBreak, MdTree::ParagraphBreak))); + + if new_stream.first().is_some_and(is_break_ty) { + new_stream.remove(0); + } + if new_stream.last().is_some_and(is_break_ty) { + new_stream.pop(); + } + + // Remove paragraph breaks that shouldn't be there. w[1] is what will be + // removed in these cases. Note that these are the items to keep, not delete + // (for `retain`) + let to_keep: Vec<bool> = new_stream + .windows(3) + .map(|w| { + !((matches!(&w[1], MdTree::ParagraphBreak) + && matches!(should_break(&w[0], &w[2]), BreakRule::Always(1) | BreakRule::Never)) + || (matches!(&w[1], MdTree::PlainText(txt) if txt.trim().is_empty()) + && matches!( + should_break(&w[0], &w[2]), + BreakRule::Always(_) | BreakRule::Never + ))) + }) + .collect(); + let mut iter = iter::once(true).chain(to_keep).chain(iter::once(true)); + new_stream.retain(|_| iter.next().unwrap()); + + // Insert line or paragraph breaks where there should be some + let mut insertions = 0; + let to_insert: Vec<(usize, MdTree<'_>)> = new_stream + .windows(2) + .enumerate() + .filter_map(|(idx, w)| match should_break(&w[0], &w[1]) { + BreakRule::Always(1) => Some((idx, MdTree::LineBreak)), + BreakRule::Always(2) => Some((idx, MdTree::ParagraphBreak)), + _ => None, + }) + .map(|(idx, tt)| { + insertions += 1; + (idx + insertions, tt) + }) + .collect(); + to_insert.into_iter().for_each(|(idx, tt)| new_stream.insert(idx, tt)); + + MdStream(new_stream) +} + +/// Whether two types should or shouldn't have a paragraph break between them +#[derive(Clone, Copy, Debug, PartialEq)] +enum BreakRule { + Always(u8), + Never, + Optional, +} + +/// Blocks that automatically handle their own text wrapping +fn should_break(left: &MdTree<'_>, right: &MdTree<'_>) -> BreakRule { + use MdTree::*; + + match (left, right) { + // Separate these types with a single line + (HorizontalRule, _) + | (_, HorizontalRule) + | (OrderedListItem(_, _), OrderedListItem(_, _)) + | (UnorderedListItem(_), UnorderedListItem(_)) => BreakRule::Always(1), + // Condensed types shouldn't have an extra break on either side + (Comment(_) | ParagraphBreak | Heading(_, _), _) | (_, Comment(_) | ParagraphBreak) => { + BreakRule::Never + } + // Block types should always be separated by full breaks + (CodeBlock { .. } | OrderedListItem(_, _) | UnorderedListItem(_), _) + | (_, CodeBlock { .. } | Heading(_, _) | OrderedListItem(_, _) | UnorderedListItem(_)) => { + BreakRule::Always(2) + } + // Text types may or may not be separated by a break + ( + CodeInline(_) + | Strong(_) + | Emphasis(_) + | Strikethrough(_) + | PlainText(_) + | Link { .. } + | RefLink { .. } + | LinkDef { .. }, + CodeInline(_) + | Strong(_) + | Emphasis(_) + | Strikethrough(_) + | PlainText(_) + | Link { .. } + | RefLink { .. } + | LinkDef { .. }, + ) => BreakRule::Optional, + (LineBreak, _) | (_, LineBreak) => { + unreachable!("should have been removed during deduplication") + } + } +} + +/// Types that indicate some form of break +fn is_break_ty(val: &MdTree<'_>) -> bool { + matches!(val, MdTree::ParagraphBreak | MdTree::LineBreak) + // >1 break between paragraphs acts as a break + || matches!(val, MdTree::PlainText(txt) if txt.trim().is_empty()) +} + +/// Perform tranformations to text. This splits paragraphs, replaces patterns, +/// and corrects newlines. +/// +/// To avoid allocating strings (and using a different heavier tt type), our +/// replace method means split into three and append each. For this reason, any +/// viewer should treat consecutive `PlainText` types as belonging to the same +/// paragraph. +fn expand_plaintext<'a>( + txt: &'a str, + stream: &mut Vec<MdTree<'a>>, + mut f: fn(&'a str) -> MdTree<'a>, +) { + if txt.is_empty() { + return; + } else if txt == "\n" { + if let Some(tt) = stream.last() { + let tmp = MdTree::PlainText(" "); + if should_break(tt, &tmp) == BreakRule::Optional { + stream.push(tmp); + } + } + return; + } + let mut queue1 = Vec::new(); + let mut queue2 = Vec::new(); + let stream_start_len = stream.len(); + for paragraph in txt.split("\n\n") { + if paragraph.is_empty() { + stream.push(MdTree::ParagraphBreak); + continue; + } + let paragraph = trim_extra_ws(paragraph); + + queue1.clear(); + queue1.push(paragraph); + + for (from, to) in REPLACEMENTS { + queue2.clear(); + for item in &queue1 { + for s in item.split(from) { + queue2.extend(&[s, to]); + } + if queue2.len() > 1 { + let _ = queue2.pop(); // remove last unnecessary intersperse + } + } + mem::swap(&mut queue1, &mut queue2); + } + + // Make sure we don't double whitespace + queue1.retain(|s| !s.is_empty()); + for idx in 0..queue1.len() { + queue1[idx] = trim_extra_ws(queue1[idx]); + if idx < queue1.len() - 1 + && queue1[idx].ends_with(char::is_whitespace) + && queue1[idx + 1].starts_with(char::is_whitespace) + { + queue1[idx] = queue1[idx].trim_end(); + } + } + stream.extend(queue1.iter().copied().filter(|txt| !txt.is_empty()).map(&mut f)); + stream.push(MdTree::ParagraphBreak); + } + + if stream.len() - stream_start_len > 1 { + let _ = stream.pop(); // remove last unnecessary intersperse + } +} + +/// Turn reflinks (links with reference IDs) into normal standalone links using +/// listed link definitions +fn match_reflink<'a>(linkdefs: &[MdTree<'a>], disp: &'a str, match_id: Option<&str>) -> MdTree<'a> { + let to_match = match_id.unwrap_or(disp); // Match with the display name if there isn't an id + for def in linkdefs { + if let MdTree::LinkDef { id, link } = def { + if *id == to_match { + return MdTree::Link { disp, link }; + } + } + } + MdTree::Link { disp, link: "" } // link not found +} + +/// If there is more than one whitespace char at start or end, trim the extras +fn trim_extra_ws(mut txt: &str) -> &str { + let start_ws = + txt.bytes().position(|ch| !ch.is_ascii_whitespace()).unwrap_or(txt.len()).saturating_sub(1); + txt = &txt[start_ws..]; + let end_ws = txt + .bytes() + .rev() + .position(|ch| !ch.is_ascii_whitespace()) + .unwrap_or(txt.len()) + .saturating_sub(1); + &txt[..txt.len() - end_ws] +} + +/// If there is more than one whitespace char at start, trim the extras +fn trim_ascii_start(buf: &[u8]) -> &[u8] { + let count = buf.iter().take_while(|ch| ch.is_ascii_whitespace()).count(); + &buf[count..] +} + +#[cfg(test)] +#[path = "tests/parse.rs"] +mod tests; |