Diffstat (limited to 'vendor/pulldown-cmark/src/firstpass.rs')
-rw-r--r--  vendor/pulldown-cmark/src/firstpass.rs  1927
1 files changed, 1927 insertions, 0 deletions
diff --git a/vendor/pulldown-cmark/src/firstpass.rs b/vendor/pulldown-cmark/src/firstpass.rs
new file mode 100644
index 000000000..cf3cfbf53
--- /dev/null
+++ b/vendor/pulldown-cmark/src/firstpass.rs
@@ -0,0 +1,1927 @@
+//! The first pass resolves all block structure, generating an AST. Within a block, items
+//! are in a linear chain with potential inline markup identified.
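+//!
+//! As an illustrative sketch (not taken from the original sources), an input
+//! like `* foo *bar*` comes out of this pass as roughly the following tree,
+//! with the emphasis left unresolved as `MaybeEmphasis` items for the second
+//! pass to match up:
+//!
+//! ```text
+//! List
+//! └─ ListItem
+//!    ├─ Text("foo ")
+//!    ├─ MaybeEmphasis
+//!    ├─ Text("bar")
+//!    └─ MaybeEmphasis
+//! ```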
+
+use std::cmp::max;
+use std::ops::Range;
+
+use crate::parse::{scan_containers, Allocations, HeadingAttributes, Item, ItemBody, LinkDef};
+use crate::scanners::*;
+use crate::strings::CowStr;
+use crate::tree::{Tree, TreeIndex};
+use crate::Options;
+use crate::{
+ linklabel::{scan_link_label_rest, LinkLabel},
+ HeadingLevel,
+};
+
+use unicase::UniCase;
+
+/// Runs the first pass, which resolves the block structure of the document,
+/// and returns the resulting tree.
+pub(crate) fn run_first_pass(text: &str, options: Options) -> (Tree<Item>, Allocations) {
+ // This is a very naive heuristic for the number of nodes
+ // we'll need.
+ let start_capacity = max(128, text.len() / 32);
+ let lookup_table = &create_lut(&options);
+ let first_pass = FirstPass {
+ text,
+ tree: Tree::with_capacity(start_capacity),
+ begin_list_item: false,
+ last_line_blank: false,
+ allocs: Allocations::new(),
+ options,
+ lookup_table,
+ };
+ first_pass.run()
+}
+
+/// State for the first parsing pass.
+struct FirstPass<'a, 'b> {
+ text: &'a str,
+ tree: Tree<Item>,
+ begin_list_item: bool,
+ last_line_blank: bool,
+ allocs: Allocations<'a>,
+ options: Options,
+ lookup_table: &'b LookupTable,
+}
+
+impl<'a, 'b> FirstPass<'a, 'b> {
+ fn run(mut self) -> (Tree<Item>, Allocations<'a>) {
+ let mut ix = 0;
+ while ix < self.text.len() {
+ ix = self.parse_block(ix);
+ }
+ for _ in 0..self.tree.spine_len() {
+ self.pop(ix);
+ }
+ (self.tree, self.allocs)
+ }
+
+ /// Returns offset after block.
+ fn parse_block(&mut self, mut start_ix: usize) -> usize {
+ let bytes = self.text.as_bytes();
+ let mut line_start = LineStart::new(&bytes[start_ix..]);
+
+ let i = scan_containers(&self.tree, &mut line_start);
+ for _ in i..self.tree.spine_len() {
+ self.pop(start_ix);
+ }
+
+ if self.options.contains(Options::ENABLE_FOOTNOTES) {
+ // finish footnote if it's still open and was preceded by blank line
+ if let Some(node_ix) = self.tree.peek_up() {
+ if let ItemBody::FootnoteDefinition(..) = self.tree[node_ix].item.body {
+ if self.last_line_blank {
+ self.pop(start_ix);
+ }
+ }
+ }
+
+ // Footnote definitions of the form
+ // [^bar]:
+ // * anything really
+ let container_start = start_ix + line_start.bytes_scanned();
+ if let Some(bytecount) = self.parse_footnote(container_start) {
+ start_ix = container_start + bytecount;
+ start_ix += scan_blank_line(&bytes[start_ix..]).unwrap_or(0);
+ line_start = LineStart::new(&bytes[start_ix..]);
+ }
+ }
+
+ // Process new containers
+ loop {
+ let container_start = start_ix + line_start.bytes_scanned();
+ if let Some((ch, index, indent)) = line_start.scan_list_marker() {
+ let after_marker_index = start_ix + line_start.bytes_scanned();
+ self.continue_list(container_start, ch, index);
+ self.tree.append(Item {
+ start: container_start,
+ end: after_marker_index, // will get updated later if item not empty
+ body: ItemBody::ListItem(indent),
+ });
+ self.tree.push();
+ if let Some(n) = scan_blank_line(&bytes[after_marker_index..]) {
+ self.begin_list_item = true;
+ return after_marker_index + n;
+ }
+ if self.options.contains(Options::ENABLE_TASKLISTS) {
+ if let Some(is_checked) = line_start.scan_task_list_marker() {
+ self.tree.append(Item {
+ start: after_marker_index,
+ end: start_ix + line_start.bytes_scanned(),
+ body: ItemBody::TaskListMarker(is_checked),
+ });
+ }
+ }
+ } else if line_start.scan_blockquote_marker() {
+ self.finish_list(start_ix);
+ self.tree.append(Item {
+ start: container_start,
+ end: 0, // will get set later
+ body: ItemBody::BlockQuote,
+ });
+ self.tree.push();
+ } else {
+ break;
+ }
+ }
+
+ let ix = start_ix + line_start.bytes_scanned();
+
+ if let Some(n) = scan_blank_line(&bytes[ix..]) {
+ if let Some(node_ix) = self.tree.peek_up() {
+ match self.tree[node_ix].item.body {
+ ItemBody::BlockQuote => (),
+ _ => {
+ if self.begin_list_item {
+ // A list item can begin with at most one blank line.
+ self.pop(start_ix);
+ }
+ self.last_line_blank = true;
+ }
+ }
+ }
+ return ix + n;
+ }
+
+ self.begin_list_item = false;
+ self.finish_list(start_ix);
+
+ // Save `remaining_space` here to avoid needing to backtrack `line_start` for HTML blocks
+ let remaining_space = line_start.remaining_space();
+
+ let indent = line_start.scan_space_upto(4);
+ if indent == 4 {
+ let ix = start_ix + line_start.bytes_scanned();
+ let remaining_space = line_start.remaining_space();
+ return self.parse_indented_code_block(ix, remaining_space);
+ }
+
+ let ix = start_ix + line_start.bytes_scanned();
+
+ // HTML Blocks
+ if bytes[ix] == b'<' {
+ // Types 1-5 are all detected by one function and all end with the same
+ // pattern
+ if let Some(html_end_tag) = get_html_end_tag(&bytes[(ix + 1)..]) {
+ return self.parse_html_block_type_1_to_5(ix, html_end_tag, remaining_space);
+ }
+
+ // Detect type 6
+ if starts_html_block_type_6(&bytes[(ix + 1)..]) {
+ return self.parse_html_block_type_6_or_7(ix, remaining_space);
+ }
+
+ // Detect type 7
+ if let Some(_html_bytes) = scan_html_type_7(&bytes[ix..]) {
+ return self.parse_html_block_type_6_or_7(ix, remaining_space);
+ }
+ }
+
+ if let Ok(n) = scan_hrule(&bytes[ix..]) {
+ return self.parse_hrule(n, ix);
+ }
+
+ if let Some(atx_size) = scan_atx_heading(&bytes[ix..]) {
+ return self.parse_atx_heading(ix, atx_size);
+ }
+
+ // parse refdef
+ if let Some((bytecount, label, link_def)) = self.parse_refdef_total(ix) {
+ self.allocs.refdefs.0.entry(label).or_insert(link_def);
+ let ix = ix + bytecount;
+ // try to read trailing whitespace or it will register as a completely blank line
+ // TODO: shouldn't we do this for all block level items?
+ return ix + scan_blank_line(&bytes[ix..]).unwrap_or(0);
+ }
+
+ if let Some((n, fence_ch)) = scan_code_fence(&bytes[ix..]) {
+ return self.parse_fenced_code_block(ix, indent, fence_ch, n);
+ }
+ self.parse_paragraph(ix)
+ }
+
+ /// Returns the offset of the first line after the table.
+ /// Assumptions: current focus is a table element and the table header
+ /// matches the separator line (same number of columns).
+ fn parse_table(&mut self, table_cols: usize, head_start: usize, body_start: usize) -> usize {
+ // parse header. this shouldn't fail because we made sure the table header is ok
+ let (_sep_start, thead_ix) = self.parse_table_row_inner(head_start, table_cols);
+ self.tree[thead_ix].item.body = ItemBody::TableHead;
+
+ // parse body
+ let mut ix = body_start;
+ while let Some((next_ix, _row_ix)) = self.parse_table_row(ix, table_cols) {
+ ix = next_ix;
+ }
+
+ self.pop(ix);
+ ix
+ }
+
+ /// Call this when containers are taken care of.
+ /// Returns the offset after the row and the row's tree index.
+ fn parse_table_row_inner(&mut self, mut ix: usize, row_cells: usize) -> (usize, TreeIndex) {
+ let bytes = self.text.as_bytes();
+ let mut cells = 0;
+ let mut final_cell_ix = None;
+
+ let row_ix = self.tree.append(Item {
+ start: ix,
+ end: 0, // set at end of this function
+ body: ItemBody::TableRow,
+ });
+ self.tree.push();
+
+ loop {
+ ix += scan_ch(&bytes[ix..], b'|');
+ let start_ix = ix;
+ ix += scan_whitespace_no_nl(&bytes[ix..]);
+
+ if let Some(eol_bytes) = scan_eol(&bytes[ix..]) {
+ ix += eol_bytes;
+ break;
+ }
+
+ let cell_ix = self.tree.append(Item {
+ start: start_ix,
+ end: ix,
+ body: ItemBody::TableCell,
+ });
+ self.tree.push();
+ let (next_ix, _brk) = self.parse_line(ix, None, TableParseMode::Active);
+
+ if let Some(cur_ix) = self.tree.cur() {
+ let trailing_whitespace = scan_rev_while(&bytes[..next_ix], is_ascii_whitespace);
+ self.tree[cur_ix].item.end -= trailing_whitespace;
+ }
+
+ self.tree[cell_ix].item.end = next_ix;
+ self.tree.pop();
+
+ ix = next_ix;
+ cells += 1;
+
+ if cells == row_cells {
+ final_cell_ix = Some(cell_ix);
+ }
+ }
+
+ // fill empty cells if needed
+ // note: this is where GFM and commonmark-extra diverge. we follow
+ // GFM here
+ for _ in cells..row_cells {
+ self.tree.append(Item {
+ start: ix,
+ end: ix,
+ body: ItemBody::TableCell,
+ });
+ }
+
+ // drop excess cells
+ if let Some(cell_ix) = final_cell_ix {
+ self.tree[cell_ix].next = None;
+ }
+
+ self.pop(ix);
+
+ (ix, row_ix)
+ }
+
+ /// Returns first offset after the row and the tree index of the row.
+ fn parse_table_row(&mut self, mut ix: usize, row_cells: usize) -> Option<(usize, TreeIndex)> {
+ let bytes = self.text.as_bytes();
+ let mut line_start = LineStart::new(&bytes[ix..]);
+ let current_container =
+ scan_containers(&self.tree, &mut line_start) == self.tree.spine_len();
+ if !current_container {
+ return None;
+ }
+ line_start.scan_all_space();
+ ix += line_start.bytes_scanned();
+ if scan_paragraph_interrupt(&bytes[ix..], current_container) {
+ return None;
+ }
+
+ let (ix, row_ix) = self.parse_table_row_inner(ix, row_cells);
+ Some((ix, row_ix))
+ }
+
+ /// Returns offset of line start after paragraph.
+ fn parse_paragraph(&mut self, start_ix: usize) -> usize {
+ let node_ix = self.tree.append(Item {
+ start: start_ix,
+ end: 0, // will get set later
+ body: ItemBody::Paragraph,
+ });
+ self.tree.push();
+ let bytes = self.text.as_bytes();
+
+ let mut ix = start_ix;
+ loop {
+ let scan_mode = if self.options.contains(Options::ENABLE_TABLES) && ix == start_ix {
+ TableParseMode::Scan
+ } else {
+ TableParseMode::Disabled
+ };
+ let (next_ix, brk) = self.parse_line(ix, None, scan_mode);
+
+ // break out when we find a table
+ if let Some(Item {
+ body: ItemBody::Table(alignment_ix),
+ ..
+ }) = brk
+ {
+ let table_cols = self.allocs[alignment_ix].len();
+ self.tree[node_ix].item.body = ItemBody::Table(alignment_ix);
+ // this clears out any stuff we may have appended - but there may
+ // be a cleaner way
+ self.tree[node_ix].child = None;
+ self.tree.pop();
+ self.tree.push();
+ return self.parse_table(table_cols, ix, next_ix);
+ }
+
+ ix = next_ix;
+ let mut line_start = LineStart::new(&bytes[ix..]);
+ let current_container =
+ scan_containers(&self.tree, &mut line_start) == self.tree.spine_len();
+ if !line_start.scan_space(4) {
+ let ix_new = ix + line_start.bytes_scanned();
+ if current_container {
+ let trailing_backslash_pos = match brk {
+ Some(Item {
+ start,
+ body: ItemBody::HardBreak,
+ ..
+ }) if bytes[start] == b'\\' => Some(start),
+ _ => None,
+ };
+ if let Some(ix_setext) =
+ self.parse_setext_heading(ix_new, node_ix, trailing_backslash_pos.is_some())
+ {
+ if let Some(pos) = trailing_backslash_pos {
+ self.tree.append_text(pos, pos + 1);
+ }
+ ix = ix_setext;
+ break;
+ }
+ }
+ // first check for non-empty lists, then for other interrupts
+ let suffix = &bytes[ix_new..];
+ if scan_paragraph_interrupt(suffix, current_container) {
+ break;
+ }
+ }
+ line_start.scan_all_space();
+ if line_start.is_at_eol() {
+ break;
+ }
+ ix = next_ix + line_start.bytes_scanned();
+ if let Some(item) = brk {
+ self.tree.append(item);
+ }
+ }
+
+ self.pop(ix);
+ ix
+ }
+
+ /// Returns end ix of setext_heading on success.
+ fn parse_setext_heading(
+ &mut self,
+ ix: usize,
+ node_ix: TreeIndex,
+ has_trailing_content: bool,
+ ) -> Option<usize> {
+ let bytes = self.text.as_bytes();
+ let (n, level) = scan_setext_heading(&bytes[ix..])?;
+ let mut attrs = None;
+
+ if let Some(cur_ix) = self.tree.cur() {
+ let parent_ix = self.tree.peek_up().unwrap();
+ let header_start = self.tree[parent_ix].item.start;
+ // Note that `self.tree[parent_ix].item.end` might be zero at this point.
+ // Use the end position of the current node (i.e. the last known child
+ // of the parent) instead.
+ let header_end = self.tree[cur_ix].item.end;
+
+ // extract the trailing attribute block
+ let (content_end, attrs_) =
+ self.extract_and_parse_heading_attribute_block(header_start, header_end);
+ attrs = attrs_;
+
+ // strip trailing whitespace
+ let new_end = if has_trailing_content {
+ content_end
+ } else {
+ let trailing_ws =
+ scan_rev_while(&bytes[header_start..content_end], is_ascii_whitespace_no_nl);
+ content_end - trailing_ws
+ };
+
+ if attrs.is_some() {
+ // remove trailing block attributes
+ self.tree.truncate_siblings(self.text.as_bytes(), new_end);
+ }
+
+ if let Some(cur_ix) = self.tree.cur() {
+ self.tree[cur_ix].item.end = new_end;
+ }
+ }
+
+ self.tree[node_ix].item.body = ItemBody::Heading(
+ level,
+ attrs.map(|attrs| self.allocs.allocate_heading(attrs)),
+ );
+
+ Some(ix + n)
+ }
+
+ /// Parse a line of input, appending text and items to tree.
+ ///
+ /// Returns: index after line and an item representing the break.
+ fn parse_line(
+ &mut self,
+ start: usize,
+ end: Option<usize>,
+ mode: TableParseMode,
+ ) -> (usize, Option<Item>) {
+ let bytes = self.text.as_bytes();
+ let bytes = match end {
+ Some(end) => &bytes[..end],
+ None => bytes,
+ };
+ let bytes_len = bytes.len();
+ let mut pipes = 0;
+ let mut last_pipe_ix = start;
+ let mut begin_text = start;
+
+ let (final_ix, brk) = iterate_special_bytes(self.lookup_table, bytes, start, |ix, byte| {
+ match byte {
+ b'\n' | b'\r' => {
+ if let TableParseMode::Active = mode {
+ return LoopInstruction::BreakAtWith(ix, None);
+ }
+
+ let mut i = ix;
+ let eol_bytes = scan_eol(&bytes[ix..]).unwrap();
+ if mode == TableParseMode::Scan && pipes > 0 {
+ // check if we may be parsing a table
+ let next_line_ix = ix + eol_bytes;
+ let mut line_start = LineStart::new(&bytes[next_line_ix..]);
+ if scan_containers(&self.tree, &mut line_start) == self.tree.spine_len() {
+ let table_head_ix = next_line_ix + line_start.bytes_scanned();
+ let (table_head_bytes, alignment) =
+ scan_table_head(&bytes[table_head_ix..]);
+
+ if table_head_bytes > 0 {
+ // computing header count from number of pipes
+ let header_count =
+ count_header_cols(bytes, pipes, start, last_pipe_ix);
+
+ // make sure they match the number of columns we find in separator line
+ if alignment.len() == header_count {
+ let alignment_ix = self.allocs.allocate_alignment(alignment);
+ let end_ix = table_head_ix + table_head_bytes;
+ return LoopInstruction::BreakAtWith(
+ end_ix,
+ Some(Item {
+ start: i,
+ end: end_ix, // must update later
+ body: ItemBody::Table(alignment_ix),
+ }),
+ );
+ }
+ }
+ }
+ }
+
+ let end_ix = ix + eol_bytes;
+ let trailing_backslashes = scan_rev_while(&bytes[..ix], |b| b == b'\\');
+ if trailing_backslashes % 2 == 1 && end_ix < bytes_len {
+ i -= 1;
+ self.tree.append_text(begin_text, i);
+ return LoopInstruction::BreakAtWith(
+ end_ix,
+ Some(Item {
+ start: i,
+ end: end_ix,
+ body: ItemBody::HardBreak,
+ }),
+ );
+ }
+ let trailing_whitespace =
+ scan_rev_while(&bytes[..ix], is_ascii_whitespace_no_nl);
+ if trailing_whitespace >= 2 {
+ i -= trailing_whitespace;
+ self.tree.append_text(begin_text, i);
+ return LoopInstruction::BreakAtWith(
+ end_ix,
+ Some(Item {
+ start: i,
+ end: end_ix,
+ body: ItemBody::HardBreak,
+ }),
+ );
+ }
+
+ self.tree.append_text(begin_text, ix);
+ LoopInstruction::BreakAtWith(
+ end_ix,
+ Some(Item {
+ start: i,
+ end: end_ix,
+ body: ItemBody::SoftBreak,
+ }),
+ )
+ }
+ b'\\' => {
+ if ix + 1 < bytes_len && is_ascii_punctuation(bytes[ix + 1]) {
+ self.tree.append_text(begin_text, ix);
+ if bytes[ix + 1] == b'`' {
+ let count = 1 + scan_ch_repeat(&bytes[(ix + 2)..], b'`');
+ self.tree.append(Item {
+ start: ix + 1,
+ end: ix + count + 1,
+ body: ItemBody::MaybeCode(count, true),
+ });
+ begin_text = ix + 1 + count;
+ LoopInstruction::ContinueAndSkip(count)
+ } else {
+ begin_text = ix + 1;
+ LoopInstruction::ContinueAndSkip(1)
+ }
+ } else {
+ LoopInstruction::ContinueAndSkip(0)
+ }
+ }
+ c @ b'*' | c @ b'_' | c @ b'~' => {
+ let string_suffix = &self.text[ix..];
+ let count = 1 + scan_ch_repeat(&string_suffix.as_bytes()[1..], c);
+ let can_open = delim_run_can_open(self.text, string_suffix, count, ix);
+ let can_close = delim_run_can_close(self.text, string_suffix, count, ix);
+ let is_valid_seq = c != b'~' || count == 2;
+
+ if (can_open || can_close) && is_valid_seq {
+ self.tree.append_text(begin_text, ix);
+ for i in 0..count {
+ self.tree.append(Item {
+ start: ix + i,
+ end: ix + i + 1,
+ body: ItemBody::MaybeEmphasis(count - i, can_open, can_close),
+ });
+ }
+ begin_text = ix + count;
+ }
+ LoopInstruction::ContinueAndSkip(count - 1)
+ }
+ b'`' => {
+ self.tree.append_text(begin_text, ix);
+ let count = 1 + scan_ch_repeat(&bytes[(ix + 1)..], b'`');
+ self.tree.append(Item {
+ start: ix,
+ end: ix + count,
+ body: ItemBody::MaybeCode(count, false),
+ });
+ begin_text = ix + count;
+ LoopInstruction::ContinueAndSkip(count - 1)
+ }
+ b'<' => {
+ // Note: could detect some non-HTML cases and early escape here, but not
+ // clear that's a win.
+ self.tree.append_text(begin_text, ix);
+ self.tree.append(Item {
+ start: ix,
+ end: ix + 1,
+ body: ItemBody::MaybeHtml,
+ });
+ begin_text = ix + 1;
+ LoopInstruction::ContinueAndSkip(0)
+ }
+ b'!' => {
+ if ix + 1 < bytes_len && bytes[ix + 1] == b'[' {
+ self.tree.append_text(begin_text, ix);
+ self.tree.append(Item {
+ start: ix,
+ end: ix + 2,
+ body: ItemBody::MaybeImage,
+ });
+ begin_text = ix + 2;
+ LoopInstruction::ContinueAndSkip(1)
+ } else {
+ LoopInstruction::ContinueAndSkip(0)
+ }
+ }
+ b'[' => {
+ self.tree.append_text(begin_text, ix);
+ self.tree.append(Item {
+ start: ix,
+ end: ix + 1,
+ body: ItemBody::MaybeLinkOpen,
+ });
+ begin_text = ix + 1;
+ LoopInstruction::ContinueAndSkip(0)
+ }
+ b']' => {
+ self.tree.append_text(begin_text, ix);
+ self.tree.append(Item {
+ start: ix,
+ end: ix + 1,
+ body: ItemBody::MaybeLinkClose(true),
+ });
+ begin_text = ix + 1;
+ LoopInstruction::ContinueAndSkip(0)
+ }
+ b'&' => match scan_entity(&bytes[ix..]) {
+ (n, Some(value)) => {
+ self.tree.append_text(begin_text, ix);
+ self.tree.append(Item {
+ start: ix,
+ end: ix + n,
+ body: ItemBody::SynthesizeText(self.allocs.allocate_cow(value)),
+ });
+ begin_text = ix + n;
+ LoopInstruction::ContinueAndSkip(n - 1)
+ }
+ _ => LoopInstruction::ContinueAndSkip(0),
+ },
+ b'|' => {
+ if let TableParseMode::Active = mode {
+ LoopInstruction::BreakAtWith(ix, None)
+ } else {
+ last_pipe_ix = ix;
+ pipes += 1;
+ LoopInstruction::ContinueAndSkip(0)
+ }
+ }
+ b'.' => {
+ if ix + 2 < bytes.len() && bytes[ix + 1] == b'.' && bytes[ix + 2] == b'.' {
+ self.tree.append_text(begin_text, ix);
+ self.tree.append(Item {
+ start: ix,
+ end: ix + 3,
+ body: ItemBody::SynthesizeChar('…'),
+ });
+ begin_text = ix + 3;
+ LoopInstruction::ContinueAndSkip(2)
+ } else {
+ LoopInstruction::ContinueAndSkip(0)
+ }
+ }
+ b'-' => {
+ let count = 1 + scan_ch_repeat(&bytes[(ix + 1)..], b'-');
+ if count == 1 {
+ LoopInstruction::ContinueAndSkip(0)
+ } else {
+ let itembody = if count == 2 {
+ ItemBody::SynthesizeChar('–')
+ } else if count == 3 {
+ ItemBody::SynthesizeChar('—')
+ } else {
+ let (ems, ens) = match count % 6 {
+ 0 | 3 => (count / 3, 0),
+ 2 | 4 => (0, count / 2),
+ 1 => (count / 3 - 1, 2),
+ _ => (count / 3, 1),
+ };
+ // – and — are 3 bytes each in utf8
+ let mut buf = String::with_capacity(3 * (ems + ens));
+ for _ in 0..ems {
+ buf.push('—');
+ }
+ for _ in 0..ens {
+ buf.push('–');
+ }
+ ItemBody::SynthesizeText(self.allocs.allocate_cow(buf.into()))
+ };
+
+ self.tree.append_text(begin_text, ix);
+ self.tree.append(Item {
+ start: ix,
+ end: ix + count,
+ body: itembody,
+ });
+ begin_text = ix + count;
+ LoopInstruction::ContinueAndSkip(count - 1)
+ }
+ }
+ c @ b'\'' | c @ b'"' => {
+ let string_suffix = &self.text[ix..];
+ let can_open = delim_run_can_open(self.text, string_suffix, 1, ix);
+ let can_close = delim_run_can_close(self.text, string_suffix, 1, ix);
+
+ self.tree.append_text(begin_text, ix);
+ self.tree.append(Item {
+ start: ix,
+ end: ix + 1,
+ body: ItemBody::MaybeSmartQuote(c, can_open, can_close),
+ });
+ begin_text = ix + 1;
+
+ LoopInstruction::ContinueAndSkip(0)
+ }
+ _ => LoopInstruction::ContinueAndSkip(0),
+ }
+ });
+
+ if brk.is_none() {
+ // need to close text at eof
+ self.tree.append_text(begin_text, final_ix);
+ }
+ (final_ix, brk)
+ }
+
+ /// When start_ix is at the beginning of an HTML block of type 1 to 5,
+ /// this will find the end of the block, adding the block itself to the
+ /// tree and also keeping track of the lines of HTML within the block.
+ ///
+ /// The html_end_tag is the tag that must be found on a line to end the block.
+ fn parse_html_block_type_1_to_5(
+ &mut self,
+ start_ix: usize,
+ html_end_tag: &str,
+ mut remaining_space: usize,
+ ) -> usize {
+ let bytes = self.text.as_bytes();
+ let mut ix = start_ix;
+ loop {
+ let line_start_ix = ix;
+ ix += scan_nextline(&bytes[ix..]);
+ self.append_html_line(remaining_space, line_start_ix, ix);
+
+ let mut line_start = LineStart::new(&bytes[ix..]);
+ let n_containers = scan_containers(&self.tree, &mut line_start);
+ if n_containers < self.tree.spine_len() {
+ break;
+ }
+
+ if (&self.text[line_start_ix..ix]).contains(html_end_tag) {
+ break;
+ }
+
+ let next_line_ix = ix + line_start.bytes_scanned();
+ if next_line_ix == self.text.len() {
+ break;
+ }
+ ix = next_line_ix;
+ remaining_space = line_start.remaining_space();
+ }
+ ix
+ }
+
+ /// When start_ix is at the beginning of an HTML block of type 6 or 7,
+ /// this will consume lines until there is a blank line and keep track of
+ /// the HTML within the block.
+ fn parse_html_block_type_6_or_7(
+ &mut self,
+ start_ix: usize,
+ mut remaining_space: usize,
+ ) -> usize {
+ let bytes = self.text.as_bytes();
+ let mut ix = start_ix;
+ loop {
+ let line_start_ix = ix;
+ ix += scan_nextline(&bytes[ix..]);
+ self.append_html_line(remaining_space, line_start_ix, ix);
+
+ let mut line_start = LineStart::new(&bytes[ix..]);
+ let n_containers = scan_containers(&self.tree, &mut line_start);
+ if n_containers < self.tree.spine_len() || line_start.is_at_eol() {
+ break;
+ }
+
+ let next_line_ix = ix + line_start.bytes_scanned();
+ if next_line_ix == self.text.len() || scan_blank_line(&bytes[next_line_ix..]).is_some()
+ {
+ break;
+ }
+ ix = next_line_ix;
+ remaining_space = line_start.remaining_space();
+ }
+ ix
+ }
+
+ fn parse_indented_code_block(&mut self, start_ix: usize, mut remaining_space: usize) -> usize {
+ self.tree.append(Item {
+ start: start_ix,
+ end: 0, // will get set later
+ body: ItemBody::IndentCodeBlock,
+ });
+ self.tree.push();
+ let bytes = self.text.as_bytes();
+ let mut last_nonblank_child = None;
+ let mut last_nonblank_ix = 0;
+ let mut end_ix = 0;
+ let mut last_line_blank = false;
+
+ let mut ix = start_ix;
+ loop {
+ let line_start_ix = ix;
+ ix += scan_nextline(&bytes[ix..]);
+ self.append_code_text(remaining_space, line_start_ix, ix);
+ // TODO(spec clarification): should we synthesize newline at EOF?
+
+ if !last_line_blank {
+ last_nonblank_child = self.tree.cur();
+ last_nonblank_ix = ix;
+ end_ix = ix;
+ }
+
+ let mut line_start = LineStart::new(&bytes[ix..]);
+ let n_containers = scan_containers(&self.tree, &mut line_start);
+ if n_containers < self.tree.spine_len()
+ || !(line_start.scan_space(4) || line_start.is_at_eol())
+ {
+ break;
+ }
+ let next_line_ix = ix + line_start.bytes_scanned();
+ if next_line_ix == self.text.len() {
+ break;
+ }
+ ix = next_line_ix;
+ remaining_space = line_start.remaining_space();
+ last_line_blank = scan_blank_line(&bytes[ix..]).is_some();
+ }
+
+ // Trim trailing blank lines.
+ if let Some(child) = last_nonblank_child {
+ self.tree[child].next = None;
+ self.tree[child].item.end = last_nonblank_ix;
+ }
+ self.pop(end_ix);
+ ix
+ }
+
+ fn parse_fenced_code_block(
+ &mut self,
+ start_ix: usize,
+ indent: usize,
+ fence_ch: u8,
+ n_fence_char: usize,
+ ) -> usize {
+ let bytes = self.text.as_bytes();
+ let mut info_start = start_ix + n_fence_char;
+ info_start += scan_whitespace_no_nl(&bytes[info_start..]);
+ // TODO: info strings are typically very short. wouldn't it be faster
+ // to just do a forward scan here?
+ let mut ix = info_start + scan_nextline(&bytes[info_start..]);
+ let info_end = ix - scan_rev_while(&bytes[info_start..ix], is_ascii_whitespace);
+ let info_string = unescape(&self.text[info_start..info_end]);
+ self.tree.append(Item {
+ start: start_ix,
+ end: 0, // will get set later
+ body: ItemBody::FencedCodeBlock(self.allocs.allocate_cow(info_string)),
+ });
+ self.tree.push();
+ loop {
+ let mut line_start = LineStart::new(&bytes[ix..]);
+ let n_containers = scan_containers(&self.tree, &mut line_start);
+ if n_containers < self.tree.spine_len() {
+ break;
+ }
+ line_start.scan_space(indent);
+ let mut close_line_start = line_start.clone();
+ if !close_line_start.scan_space(4) {
+ let close_ix = ix + close_line_start.bytes_scanned();
+ if let Some(n) = scan_closing_code_fence(&bytes[close_ix..], fence_ch, n_fence_char)
+ {
+ ix = close_ix + n;
+ break;
+ }
+ }
+ let remaining_space = line_start.remaining_space();
+ ix += line_start.bytes_scanned();
+ let next_ix = ix + scan_nextline(&bytes[ix..]);
+ self.append_code_text(remaining_space, ix, next_ix);
+ ix = next_ix;
+ }
+
+ self.pop(ix);
+
+ // try to read trailing whitespace or it will register as a completely blank line
+ ix + scan_blank_line(&bytes[ix..]).unwrap_or(0)
+ }
+
+ fn append_code_text(&mut self, remaining_space: usize, start: usize, end: usize) {
+ if remaining_space > 0 {
+ let cow_ix = self.allocs.allocate_cow("   "[..remaining_space].into());
+ self.tree.append(Item {
+ start,
+ end: start,
+ body: ItemBody::SynthesizeText(cow_ix),
+ });
+ }
+ if self.text.as_bytes()[end - 2] == b'\r' {
+ // Normalize CRLF to LF
+ self.tree.append_text(start, end - 2);
+ self.tree.append_text(end - 1, end);
+ } else {
+ self.tree.append_text(start, end);
+ }
+ }
+
+ /// Appends a line of HTML to the tree.
+ fn append_html_line(&mut self, remaining_space: usize, start: usize, end: usize) {
+ if remaining_space > 0 {
+ let cow_ix = self.allocs.allocate_cow("   "[..remaining_space].into());
+ self.tree.append(Item {
+ start,
+ end: start,
+ // TODO: maybe this should synthesize to html rather than text?
+ body: ItemBody::SynthesizeText(cow_ix),
+ });
+ }
+ if self.text.as_bytes()[end - 2] == b'\r' {
+ // Normalize CRLF to LF
+ self.tree.append(Item {
+ start,
+ end: end - 2,
+ body: ItemBody::Html,
+ });
+ self.tree.append(Item {
+ start: end - 1,
+ end,
+ body: ItemBody::Html,
+ });
+ } else {
+ self.tree.append(Item {
+ start,
+ end,
+ body: ItemBody::Html,
+ });
+ }
+ }
+
+ /// Pop a container, setting its end.
+ fn pop(&mut self, ix: usize) {
+ let cur_ix = self.tree.pop().unwrap();
+ self.tree[cur_ix].item.end = ix;
+ if let ItemBody::List(true, _, _) = self.tree[cur_ix].item.body {
+ surgerize_tight_list(&mut self.tree, cur_ix);
+ }
+ }
+
+ /// Close a list if it's open. Also set loose if last line was blank
+ fn finish_list(&mut self, ix: usize) {
+ if let Some(node_ix) = self.tree.peek_up() {
+ if let ItemBody::List(_, _, _) = self.tree[node_ix].item.body {
+ self.pop(ix);
+ }
+ }
+ if self.last_line_blank {
+ if let Some(node_ix) = self.tree.peek_grandparent() {
+ if let ItemBody::List(ref mut is_tight, _, _) = self.tree[node_ix].item.body {
+ *is_tight = false;
+ }
+ }
+ self.last_line_blank = false;
+ }
+ }
+
+ /// Continue an existing list or start a new one if there's not an open
+ /// list that matches.
+ fn continue_list(&mut self, start: usize, ch: u8, index: u64) {
+ if let Some(node_ix) = self.tree.peek_up() {
+ if let ItemBody::List(ref mut is_tight, existing_ch, _) = self.tree[node_ix].item.body {
+ if existing_ch == ch {
+ if self.last_line_blank {
+ *is_tight = false;
+ self.last_line_blank = false;
+ }
+ return;
+ }
+ }
+ // TODO: this is not the best choice for end; maybe get end from last list item.
+ self.finish_list(start);
+ }
+ self.tree.append(Item {
+ start,
+ end: 0, // will get set later
+ body: ItemBody::List(true, ch, index),
+ });
+ self.tree.push();
+ self.last_line_blank = false;
+ }
+
+ /// Parse a thematic break.
+ ///
+ /// Returns index of start of next line.
+ fn parse_hrule(&mut self, hrule_size: usize, ix: usize) -> usize {
+ self.tree.append(Item {
+ start: ix,
+ end: ix + hrule_size,
+ body: ItemBody::Rule,
+ });
+ ix + hrule_size
+ }
+
+ /// Parse an ATX heading.
+ ///
+ /// Returns index of start of next line.
+ fn parse_atx_heading(&mut self, start: usize, atx_level: HeadingLevel) -> usize {
+ let mut ix = start;
+ let heading_ix = self.tree.append(Item {
+ start,
+ end: 0, // set later
+ body: ItemBody::default(), // set later
+ });
+ ix += atx_level as usize;
+ // next char is space or eol (guaranteed by scan_atx_heading)
+ let bytes = self.text.as_bytes();
+ if let Some(eol_bytes) = scan_eol(&bytes[ix..]) {
+ self.tree[heading_ix].item.end = ix + eol_bytes;
+ self.tree[heading_ix].item.body = ItemBody::Heading(atx_level, None);
+ return ix + eol_bytes;
+ }
+ // skip leading spaces
+ let skip_spaces = scan_whitespace_no_nl(&bytes[ix..]);
+ ix += skip_spaces;
+
+ // now handle the header text
+ let header_start = ix;
+ let header_node_idx = self.tree.push(); // so that we can set the endpoint later
+
+ // trim the trailing attribute block before parsing the entire line, if necessary
+ let (end, content_end, attrs) = if self.options.contains(Options::ENABLE_HEADING_ATTRIBUTES)
+ {
+ // the start of the next line is the end of the header since the
+ // header cannot have line breaks
+ let header_end = header_start + scan_nextline(&bytes[header_start..]);
+ let (content_end, attrs) =
+ self.extract_and_parse_heading_attribute_block(header_start, header_end);
+ self.parse_line(ix, Some(content_end), TableParseMode::Disabled);
+ (header_end, content_end, attrs)
+ } else {
+ ix = self.parse_line(ix, None, TableParseMode::Disabled).0;
+ (ix, ix, None)
+ };
+ self.tree[header_node_idx].item.end = end;
+
+ // remove trailing matter from header text
+ if let Some(cur_ix) = self.tree.cur() {
+ // remove closing of the ATX heading
+ let header_text = &bytes[header_start..content_end];
+ let mut limit = header_text
+ .iter()
+ .rposition(|&b| !(b == b'\n' || b == b'\r' || b == b' '))
+ .map_or(0, |i| i + 1);
+ let closer = header_text[..limit]
+ .iter()
+ .rposition(|&b| b != b'#')
+ .map_or(0, |i| i + 1);
+ if closer == 0 {
+ limit = closer;
+ } else {
+ let spaces = scan_rev_while(&header_text[..closer], |b| b == b' ');
+ if spaces > 0 {
+ limit = closer - spaces;
+ }
+ }
+ self.tree[cur_ix].item.end = limit + header_start;
+ }
+
+ self.tree.pop();
+ self.tree[heading_ix].item.body = ItemBody::Heading(
+ atx_level,
+ attrs.map(|attrs| self.allocs.allocate_heading(attrs)),
+ );
+ end
+ }
+
+ /// Returns the number of bytes scanned on success.
+ fn parse_footnote(&mut self, start: usize) -> Option<usize> {
+ let bytes = &self.text.as_bytes()[start..];
+ if !bytes.starts_with(b"[^") {
+ return None;
+ }
+ let (mut i, label) = self.parse_refdef_label(start + 2)?;
+ i += 2;
+ if scan_ch(&bytes[i..], b':') == 0 {
+ return None;
+ }
+ i += 1;
+ self.finish_list(start);
+ self.tree.append(Item {
+ start,
+ end: 0, // will get set later
+ // TODO: check whether the label here is strictly necessary
+ body: ItemBody::FootnoteDefinition(self.allocs.allocate_cow(label)),
+ });
+ self.tree.push();
+ Some(i)
+ }
+
+ /// Tries to parse a reference label, which can be interrupted by new blocks.
+ /// On success, returns the number of bytes of the label and the label itself.
+ fn parse_refdef_label(&self, start: usize) -> Option<(usize, CowStr<'a>)> {
+ scan_link_label_rest(&self.text[start..], &|bytes| {
+ let mut line_start = LineStart::new(bytes);
+ let current_container =
+ scan_containers(&self.tree, &mut line_start) == self.tree.spine_len();
+ let bytes_scanned = line_start.bytes_scanned();
+ let suffix = &bytes[bytes_scanned..];
+ if scan_paragraph_interrupt(suffix, current_container) {
+ None
+ } else {
+ Some(bytes_scanned)
+ }
+ })
+ }
+
+ /// Returns number of bytes scanned, label and definition on success.
+ fn parse_refdef_total(&mut self, start: usize) -> Option<(usize, LinkLabel<'a>, LinkDef<'a>)> {
+ let bytes = &self.text.as_bytes()[start..];
+ if scan_ch(bytes, b'[') == 0 {
+ return None;
+ }
+ let (mut i, label) = self.parse_refdef_label(start + 1)?;
+ i += 1;
+ if scan_ch(&bytes[i..], b':') == 0 {
+ return None;
+ }
+ i += 1;
+ let (bytecount, link_def) = self.scan_refdef(start, start + i)?;
+ Some((bytecount + i, UniCase::new(label), link_def))
+ }
+
+ /// Returns number of bytes and number of newlines
+ fn scan_refdef_space(&self, bytes: &[u8], mut i: usize) -> Option<(usize, usize)> {
+ let mut newlines = 0;
+ loop {
+ let whitespaces = scan_whitespace_no_nl(&bytes[i..]);
+ i += whitespaces;
+ if let Some(eol_bytes) = scan_eol(&bytes[i..]) {
+ i += eol_bytes;
+ newlines += 1;
+ if newlines > 1 {
+ return None;
+ }
+ } else {
+ break;
+ }
+ let mut line_start = LineStart::new(&bytes[i..]);
+ if self.tree.spine_len() != scan_containers(&self.tree, &mut line_start) {
+ return None;
+ }
+ i += line_start.bytes_scanned();
+ }
+ Some((i, newlines))
+ }
+
+ /// Returns # of bytes and definition.
+ /// Assumes the label of the reference including colon has already been scanned.
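+ ///
+ /// An illustrative sketch (not from the original sources): for the input
+ /// `[foo]: /url "title"`, the caller has already consumed `[foo]:` and this
+ /// method scans ` /url "title"`, returning the number of bytes it consumed
+ /// together with a `LinkDef { dest: "/url", title: Some("title"), .. }`.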
+ fn scan_refdef(&self, span_start: usize, start: usize) -> Option<(usize, LinkDef<'a>)> {
+ let bytes = self.text.as_bytes();
+
+ // whitespace between label and url (including up to one newline)
+ let (mut i, _newlines) = self.scan_refdef_space(bytes, start)?;
+
+ // scan link dest
+ let (dest_length, dest) = scan_link_dest(self.text, i, 1)?;
+ if dest_length == 0 {
+ return None;
+ }
+ let dest = unescape(dest);
+ i += dest_length;
+
+ // no title
+ let mut backup = (
+ i - start,
+ LinkDef {
+ dest,
+ title: None,
+ span: span_start..i,
+ },
+ );
+
+ // scan whitespace between dest and label
+ let (mut i, newlines) =
+ if let Some((new_i, mut newlines)) = self.scan_refdef_space(bytes, i) {
+ if i == self.text.len() {
+ newlines += 1;
+ }
+ if new_i == i && newlines == 0 {
+ return None;
+ }
+ if newlines > 1 {
+ return Some(backup);
+ };
+ (new_i, newlines)
+ } else {
+ return Some(backup);
+ };
+
+ // scan title
+ // if this fails but newlines > 0, also return a refdef without a title
+ if let Some((title_length, title)) = scan_refdef_title(&self.text[i..]) {
+ i += title_length;
+ backup.1.span = span_start..i;
+ backup.1.title = Some(unescape(title));
+ } else if newlines > 0 {
+ return Some(backup);
+ } else {
+ return None;
+ };
+
+ // scan EOL
+ if let Some(bytes) = scan_blank_line(&bytes[i..]) {
+ backup.0 = i + bytes - start;
+ Some(backup)
+ } else if newlines > 0 {
+ Some(backup)
+ } else {
+ None
+ }
+ }
+
+ /// Extracts and parses a heading attribute block if one exists.
+ ///
+ /// Returns `(end_offset_of_heading_content, (id, classes))`.
+ ///
+ /// If `header_end` is less than or equal to `header_start`, the given
+ /// input is considered empty.
+ fn extract_and_parse_heading_attribute_block(
+ &mut self,
+ header_start: usize,
+ header_end: usize,
+ ) -> (usize, Option<HeadingAttributes<'a>>) {
+ if !self.options.contains(Options::ENABLE_HEADING_ATTRIBUTES) {
+ return (header_end, None);
+ }
+
+ // extract the trailing attribute block
+ let header_bytes = &self.text.as_bytes()[header_start..header_end];
+ let (content_len, attr_block_range_rel) =
+ extract_attribute_block_content_from_header_text(header_bytes);
+ let content_end = header_start + content_len;
+ let attrs = attr_block_range_rel.and_then(|r| {
+ parse_inside_attribute_block(
+ &self.text[(header_start + r.start)..(header_start + r.end)],
+ )
+ });
+ (content_end, attrs)
+ }
+}
+
+/// Scanning modes for `Parser`'s `parse_line` method.
+#[derive(PartialEq, Eq, Copy, Clone)]
+enum TableParseMode {
+ /// Inside a paragraph, scanning for table headers.
+ Scan,
+ /// Inside a table.
+ Active,
+ /// Inside a paragraph, not scanning for table headers.
+ Disabled,
+}
+
+/// Computes the number of header columns in a table line by computing the number of dividing pipes
+/// that aren't followed or preceded by whitespace.
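+///
+/// For example (an illustrative sketch, not from the original sources): for a
+/// header line `| a | b |` the caller has counted three pipes; the leading
+/// pipe is subtracted and the trailing pipe (followed only by the line end)
+/// adds nothing, giving two columns. For `a | b`, the single pipe plus the
+/// trailing cell also gives two columns.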
+fn count_header_cols(
+ bytes: &[u8],
+ mut pipes: usize,
+ mut start: usize,
+ last_pipe_ix: usize,
+) -> usize {
+ // was first pipe preceded by whitespace? if so, subtract one
+ start += scan_whitespace_no_nl(&bytes[start..]);
+ if bytes[start] == b'|' {
+ pipes -= 1;
+ }
+
+ // was last pipe followed by whitespace? if so, sub one
+ if scan_blank_line(&bytes[(last_pipe_ix + 1)..]).is_some() {
+ pipes
+ } else {
+ pipes + 1
+ }
+}
+
+/// Checks whether we should break a paragraph on the given input.
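+///
+/// For example (illustrative, not from the original sources), assuming the
+/// line belongs to the current container: `# heading`, `> quote`, `---` and
+/// `- item` all interrupt a paragraph, while `2. item` does not, since an
+/// interrupting ordered list must start at index 1.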
+fn scan_paragraph_interrupt(bytes: &[u8], current_container: bool) -> bool {
+ scan_eol(bytes).is_some()
+ || scan_hrule(bytes).is_ok()
+ || scan_atx_heading(bytes).is_some()
+ || scan_code_fence(bytes).is_some()
+ || scan_blockquote_start(bytes).is_some()
+ || scan_listitem(bytes).map_or(false, |(ix, delim, index, _)| {
+ ! current_container ||
+ // we don't allow interruption by either empty lists or
+ // numbered lists starting at an index other than 1
+ (delim == b'*' || delim == b'-' || delim == b'+' || index == 1)
+ && !scan_empty_list(&bytes[ix..])
+ })
+ || bytes.starts_with(b"<")
+ && (get_html_end_tag(&bytes[1..]).is_some() || starts_html_block_type_6(&bytes[1..]))
+}
+
+/// Assumes `text_bytes` is preceded by `<`.
+fn get_html_end_tag(text_bytes: &[u8]) -> Option<&'static str> {
+ static BEGIN_TAGS: &[&[u8]; 4] = &[b"pre", b"style", b"script", b"textarea"];
+ static ST_BEGIN_TAGS: &[&[u8]; 3] = &[b"!--", b"?", b"![CDATA["];
+
+ for (beg_tag, end_tag) in BEGIN_TAGS
+ .iter()
+ .zip(["</pre>", "</style>", "</script>", "</textarea>"].iter())
+ {
+ let tag_len = beg_tag.len();
+
+ if text_bytes.len() < tag_len {
+ // begin tags are increasing in size
+ break;
+ }
+
+ if !text_bytes[..tag_len].eq_ignore_ascii_case(beg_tag) {
+ continue;
+ }
+
+ // Must either be the end of the line...
+ if text_bytes.len() == tag_len {
+ return Some(end_tag);
+ }
+
+ // ...or be followed by whitespace, newline, or '>'.
+ let s = text_bytes[tag_len];
+ if is_ascii_whitespace(s) || s == b'>' {
+ return Some(end_tag);
+ }
+ }
+
+ for (beg_tag, end_tag) in ST_BEGIN_TAGS.iter().zip(["-->", "?>", "]]>"].iter()) {
+ if text_bytes.starts_with(beg_tag) {
+ return Some(end_tag);
+ }
+ }
+
+ if text_bytes.len() > 1
+ && text_bytes[0] == b'!'
+ && text_bytes[1] >= b'A'
+ && text_bytes[1] <= b'Z'
+ {
+ Some(">")
+ } else {
+ None
+ }
+}
+
+// https://english.stackexchange.com/a/285573
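+// In a tight list the wrapping Paragraph node of each list item is removed and
+// the paragraph's children are spliced into the item itself. An illustrative
+// sketch (not from the original sources):
+//
+//   ListItem                    ListItem
+//   └─ Paragraph          =>    ├─ Text("a")
+//      ├─ Text("a")             └─ MaybeCode(..)
+//      └─ MaybeCode(..)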
+fn surgerize_tight_list(tree: &mut Tree<Item>, list_ix: TreeIndex) {
+ let mut list_item = tree[list_ix].child;
+ while let Some(listitem_ix) = list_item {
+ // first child is special, controls how we repoint list_item.child
+ let list_item_firstborn = tree[listitem_ix].child;
+
+ // Check that list item has children - this is not necessarily the case!
+ if let Some(firstborn_ix) = list_item_firstborn {
+ if let ItemBody::Paragraph = tree[firstborn_ix].item.body {
+ tree[listitem_ix].child = tree[firstborn_ix].child;
+ }
+
+ let mut list_item_child = Some(firstborn_ix);
+ let mut node_to_repoint = None;
+ while let Some(child_ix) = list_item_child {
+ // surgerize paragraphs
+ let repoint_ix = if let ItemBody::Paragraph = tree[child_ix].item.body {
+ if let Some(child_firstborn) = tree[child_ix].child {
+ if let Some(repoint_ix) = node_to_repoint {
+ tree[repoint_ix].next = Some(child_firstborn);
+ }
+ let mut child_lastborn = child_firstborn;
+ while let Some(lastborn_next_ix) = tree[child_lastborn].next {
+ child_lastborn = lastborn_next_ix;
+ }
+ child_lastborn
+ } else {
+ child_ix
+ }
+ } else {
+ child_ix
+ };
+
+ node_to_repoint = Some(repoint_ix);
+ tree[repoint_ix].next = tree[child_ix].next;
+ list_item_child = tree[child_ix].next;
+ }
+ }
+
+ list_item = tree[listitem_ix].next;
+ }
+}
+
+/// Determines whether the delimiter run starting at given index is
+/// left-flanking, as defined by the commonmark spec (and isn't intraword
+/// for _ delims).
+/// suffix is &s[ix..], which is passed in as an optimization, since taking
+/// a string subslice is O(n).
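+///
+/// For example (illustrative, not from the original sources): in `foo *bar*`
+/// the first `*` can open (the next char `b` is not whitespace and the
+/// preceding char is whitespace), whereas in `foo* bar` it cannot (the next
+/// char is whitespace).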
+fn delim_run_can_open(s: &str, suffix: &str, run_len: usize, ix: usize) -> bool {
+ let next_char = if let Some(c) = suffix.chars().nth(run_len) {
+ c
+ } else {
+ return false;
+ };
+ if next_char.is_whitespace() {
+ return false;
+ }
+ if ix == 0 {
+ return true;
+ }
+ let delim = suffix.chars().next().unwrap();
+ if delim == '*' && !is_punctuation(next_char) {
+ return true;
+ }
+
+ let prev_char = s[..ix].chars().last().unwrap();
+
+ prev_char.is_whitespace()
+ || is_punctuation(prev_char) && (delim != '\'' || ![']', ')'].contains(&prev_char))
+}
+
+/// Determines whether the delimiter run starting at given index is
+/// right-flanking, as defined by the commonmark spec (and isn't intraword
+/// for _ delims)
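+///
+/// For example (illustrative, not from the original sources): in `*bar* baz`
+/// the second `*` can close (the preceding `r` is not whitespace), whereas in
+/// `foo * bar` the `*` cannot (the preceding char is whitespace).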
+fn delim_run_can_close(s: &str, suffix: &str, run_len: usize, ix: usize) -> bool {
+ if ix == 0 {
+ return false;
+ }
+ let prev_char = s[..ix].chars().last().unwrap();
+ if prev_char.is_whitespace() {
+ return false;
+ }
+ let next_char = if let Some(c) = suffix.chars().nth(run_len) {
+ c
+ } else {
+ return true;
+ };
+ let delim = suffix.chars().next().unwrap();
+ if delim == '*' && !is_punctuation(prev_char) {
+ return true;
+ }
+
+ next_char.is_whitespace() || is_punctuation(next_char)
+}
+
+fn create_lut(options: &Options) -> LookupTable {
+ #[cfg(all(target_arch = "x86_64", feature = "simd"))]
+ {
+ LookupTable {
+ simd: simd::compute_lookup(options),
+ scalar: special_bytes(options),
+ }
+ }
+ #[cfg(not(all(target_arch = "x86_64", feature = "simd")))]
+ {
+ special_bytes(options)
+ }
+}
+
+fn special_bytes(options: &Options) -> [bool; 256] {
+ let mut bytes = [false; 256];
+ let standard_bytes = [
+ b'\n', b'\r', b'*', b'_', b'&', b'\\', b'[', b']', b'<', b'!', b'`',
+ ];
+
+ for &byte in &standard_bytes {
+ bytes[byte as usize] = true;
+ }
+ if options.contains(Options::ENABLE_TABLES) {
+ bytes[b'|' as usize] = true;
+ }
+ if options.contains(Options::ENABLE_STRIKETHROUGH) {
+ bytes[b'~' as usize] = true;
+ }
+ if options.contains(Options::ENABLE_SMART_PUNCTUATION) {
+ for &byte in &[b'.', b'-', b'"', b'\''] {
+ bytes[byte as usize] = true;
+ }
+ }
+
+ bytes
+}
+
+enum LoopInstruction<T> {
+ /// Continue looking for more special bytes, but skip next few bytes.
+ ContinueAndSkip(usize),
+ /// Break looping immediately, returning with the given index and value.
+ BreakAtWith(usize, T),
+}
+
+#[cfg(all(target_arch = "x86_64", feature = "simd"))]
+struct LookupTable {
+ simd: [u8; 16],
+ scalar: [bool; 256],
+}
+
+#[cfg(not(all(target_arch = "x86_64", feature = "simd")))]
+type LookupTable = [bool; 256];
+
+/// This function walks the byte slice from the given index and
+/// calls the callback function on all bytes (and their indices) that are in the following set:
+/// `` ` ``, `\`, `&`, `*`, `_`, `~`, `!`, `<`, `[`, `]`, `|`, `\r`, `\n`
+/// It is guaranteed not to call the callback on other bytes.
+/// Whenever `callback(ix, byte)` returns a `ContinueAndSkip(n)` value, the callback
+/// will not be called with an index that is less than `ix + n + 1`.
+/// When the callback returns a `BreakAtWith(end_ix, opt_val)`, no more callbacks will be
+/// called and the function returns immediately with the return value `(end_ix, opt_val)`.
+/// If `BreakAtWith(..)` is never returned, this function will return the first
+/// index that is outside the byteslice bound and a `None` value.
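+///
+/// A minimal sketch of the contract (illustrative, not from the original
+/// sources):
+///
+/// ```text
+/// bytes = b"ab*cd", ix = 0
+/// callback(2, b'*') -> ContinueAndSkip(0)
+/// => returns (5, None)
+/// ```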
+fn iterate_special_bytes<F, T>(
+ lut: &LookupTable,
+ bytes: &[u8],
+ ix: usize,
+ callback: F,
+) -> (usize, Option<T>)
+where
+ F: FnMut(usize, u8) -> LoopInstruction<Option<T>>,
+{
+ #[cfg(all(target_arch = "x86_64", feature = "simd"))]
+ {
+ simd::iterate_special_bytes(lut, bytes, ix, callback)
+ }
+ #[cfg(not(all(target_arch = "x86_64", feature = "simd")))]
+ {
+ scalar_iterate_special_bytes(lut, bytes, ix, callback)
+ }
+}
+
+fn scalar_iterate_special_bytes<F, T>(
+ lut: &[bool; 256],
+ bytes: &[u8],
+ mut ix: usize,
+ mut callback: F,
+) -> (usize, Option<T>)
+where
+ F: FnMut(usize, u8) -> LoopInstruction<Option<T>>,
+{
+ while ix < bytes.len() {
+ let b = bytes[ix];
+ if lut[b as usize] {
+ match callback(ix, b) {
+ LoopInstruction::ContinueAndSkip(skip) => {
+ ix += skip;
+ }
+ LoopInstruction::BreakAtWith(ix, val) => {
+ return (ix, val);
+ }
+ }
+ }
+ ix += 1;
+ }
+
+ (ix, None)
+}
+
+/// Split the usual heading content range and the content inside the trailing attribute block.
+///
+/// Returns `(leading_content_len, Option<trailing_attr_block_range>)`.
+///
+/// Note that `trailing_attr_block_range` will be an empty range when the block
+/// is `{}`, since the range covers only the content inside the wrapping `{` and `}`.
+///
+/// The closing `}` of an attribute block can have trailing whitespace,
+/// which is automatically trimmed when the attribute block is searched for.
+///
+/// However, this method does not trim trailing whitespace from the heading content.
+/// It is the caller's responsibility to trim it if necessary.
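+///
+/// An illustrative sketch (not from the original sources):
+///
+/// ```text
+/// input:  b"My heading {#id .cls}"
+/// output: (11, Some(12..20))   // content "My heading ", block content "#id .cls"
+/// ```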
+fn extract_attribute_block_content_from_header_text(
+ heading: &[u8],
+) -> (usize, Option<Range<usize>>) {
+ let heading_len = heading.len();
+ let mut ix = heading_len;
+ ix -= scan_rev_while(heading, |b| {
+ b == b'\n' || b == b'\r' || b == b' ' || b == b'\t'
+ });
+ if ix == 0 {
+ return (heading_len, None);
+ }
+
+ let attr_block_close = ix - 1;
+ if heading.get(attr_block_close) != Some(&b'}') {
+ // The last character is not `}`. No attribute blocks found.
+ return (heading_len, None);
+ }
+ // move cursor before the closing right brace (`}`)
+ ix -= 1;
+
+ ix -= scan_rev_while(&heading[..ix], |b| {
+ // Characters to be excluded:
+ // * `{` and `}`: special characters to open and close an attribute block.
+ // * `\\`: a special character to escape many characters and disable some syntaxes.
+ // + Handling of this escape character differs among markdown processors.
+ // + Escaped characters will be separate text node from neighbors, so
+ // it is not easy to handle unescaped string and trim the trailing block.
+ // * `<` and `>`: special characters to start and end HTML tag.
+ // + No known processors converts `{#<i>foo</i>}` into
+ // `id="&lt;i&gt;foo&lt;/&gt;"` as of this writing, so hopefully
+ // this restriction won't cause compatibility issues.
+ // * `\n` and `\r`: a newline character.
+ // + Setext heading can have multiple lines. However it is hard to support
+ // attribute blocks that have newline inside, since the parsing proceeds line by
+ // line and lines will be separate nodes even they are logically a single text.
+ !matches!(b, b'{' | b'}' | b'<' | b'>' | b'\\' | b'\n' | b'\r')
+ });
+ if ix == 0 {
+ // `{` is not found. No attribute blocks available.
+ return (heading_len, None);
+ }
+ let attr_block_open = ix - 1;
+ if heading[attr_block_open] != b'{' {
+ // `{` is not found. No attribute blocks available.
+ return (heading_len, None);
+ }
+
+ (attr_block_open, Some(ix..attr_block_close))
+}
+
+/// Parses an attribute block content, such as `.class1 #id .class2`.
+///
+/// Returns `(id, classes)`.
+///
+/// It is the caller's responsibility to find the opening and closing characters of the
+/// attribute block. Usually the [`extract_attribute_block_content_from_header_text`]
+/// function does that for you.
+///
+/// Note that this parsing requires explicit whitespace separators between
+/// attributes. This is an intentional design choice, for the reasons below:
+///
+/// * to keep the conversion simple and easy to understand for any possible input,
+/// * to avoid adding a less obvious conversion rule that could further reduce
+/// compatibility with other implementations, and
+/// * to follow the prevailing design among implementations that support the
+/// attribute blocks extension (as of this writing).
+///
+/// See also: [`Options::ENABLE_HEADING_ATTRIBUTES`].
+///
+/// [`Options::ENABLE_HEADING_ATTRIBUTES`]: `crate::Options::ENABLE_HEADING_ATTRIBUTES`
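+///
+/// For example (illustrative, not from the original sources), the input
+/// `.class1 #id .class2` yields `id == Some("id")` and
+/// `classes == ["class1", "class2"]`.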
+fn parse_inside_attribute_block(inside_attr_block: &str) -> Option<HeadingAttributes> {
+ let mut id = None;
+ let mut classes = Vec::new();
+
+ for attr in inside_attr_block.split_ascii_whitespace() {
+ // iterator returned by `str::split_ascii_whitespace` never emits empty
+ // strings, so taking first byte won't panic.
+ if attr.len() > 1 {
+ let first_byte = attr.as_bytes()[0];
+ if first_byte == b'#' {
+ id = Some(&attr[1..]);
+ } else if first_byte == b'.' {
+ classes.push(&attr[1..]);
+ }
+ }
+ }
+
+ Some(HeadingAttributes { id, classes })
+}
+
+#[cfg(all(target_arch = "x86_64", feature = "simd"))]
+mod simd {
+ //! SIMD byte scanning logic.
+ //!
+ //! This module provides functions that allow walking through byteslices, calling
+ //! provided callback functions on special bytes and their indices using SIMD.
+ //! The byteset is defined in `compute_lookup`.
+ //!
+ //! The idea is to load in a chunk of 16 bytes and perform a lookup into a set of
+ //! bytes on all the bytes in this chunk simultaneously. We produce a 16 bit bitmask
+ //! from this and call the callback on every index corresponding to a 1 in this mask
+ //! before moving on to the next chunk. This allows us to move quickly when there
+ //! are no or few matches.
+ //!
+ //! The table lookup is inspired by this [great overview]. However, since all of the
+ //! bytes we're interested in are ASCII, we don't quite need the full generality of
+ //! the universal algorithm and are hence able to skip a few instructions.
+ //!
+ //! [great overview]: http://0x80.pl/articles/simd-byte-lookup.html
+
+ use super::{LookupTable, LoopInstruction};
+ use crate::Options;
+ use core::arch::x86_64::*;
+
+ const VECTOR_SIZE: usize = std::mem::size_of::<__m128i>();
+
+ /// Generates a lookup table containing the bitmaps for our
+ /// special marker bytes. This is effectively a 128 element 2d bitvector,
+ /// that can be indexed by a four bit row index (the lower nibble)
+ /// and a three bit column index (upper nibble).
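+ ///
+ /// For example (illustrative, not from the original sources): `b'*'` is
+ /// 0x2a, so `add_lookup_byte` sets bit 2 (its high nibble) of `lookup[0x0a]`
+ /// (its low nibble), i.e. `lookup[10] |= 1 << 2`.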
+ pub(super) fn compute_lookup(options: &Options) -> [u8; 16] {
+ let mut lookup = [0u8; 16];
+ let standard_bytes = [
+ b'\n', b'\r', b'*', b'_', b'&', b'\\', b'[', b']', b'<', b'!', b'`',
+ ];
+
+ for &byte in &standard_bytes {
+ add_lookup_byte(&mut lookup, byte);
+ }
+ if options.contains(Options::ENABLE_TABLES) {
+ add_lookup_byte(&mut lookup, b'|');
+ }
+ if options.contains(Options::ENABLE_STRIKETHROUGH) {
+ add_lookup_byte(&mut lookup, b'~');
+ }
+ if options.contains(Options::ENABLE_SMART_PUNCTUATION) {
+ for &byte in &[b'.', b'-', b'"', b'\''] {
+ add_lookup_byte(&mut lookup, byte);
+ }
+ }
+
+ lookup
+ }
+
+ fn add_lookup_byte(lookup: &mut [u8; 16], byte: u8) {
+ lookup[(byte & 0x0f) as usize] |= 1 << (byte >> 4);
+ }
+
+ /// Computes a bit mask for the given byteslice starting from the given index,
+ /// where the 16 least significant bits indicate (by value of 1) whether or not
+ /// there is a special character at that byte position. The least significant bit
+ /// corresponds to `bytes[ix]` and the most significant bit corresponds to
+ /// `bytes[ix + 15]`.
+ /// It is only safe to call this function when `bytes.len() >= ix + VECTOR_SIZE`.
+ #[target_feature(enable = "ssse3")]
+ #[inline]
+ unsafe fn compute_mask(lut: &[u8; 16], bytes: &[u8], ix: usize) -> i32 {
+ debug_assert!(bytes.len() >= ix + VECTOR_SIZE);
+
+ let bitmap = _mm_loadu_si128(lut.as_ptr() as *const __m128i);
+ // Small lookup table to compute single bit bitshifts
+ // for 16 bytes at once.
+ let bitmask_lookup =
+ _mm_setr_epi8(1, 2, 4, 8, 16, 32, 64, -128, -1, -1, -1, -1, -1, -1, -1, -1);
+
+ // Load input from memory.
+ let raw_ptr = bytes.as_ptr().add(ix) as *const __m128i;
+ let input = _mm_loadu_si128(raw_ptr);
+ // Compute the bitmap using the bottom nibble as an index
+ // into the lookup table. Note that non-ascii bytes will have
+ // their most significant bit set and will map to lookup[0].
+ let bitset = _mm_shuffle_epi8(bitmap, input);
+ // Compute the high nibbles of the input using a 16-bit rightshift of four
+ // and a mask to prevent most-significant bit issues.
+ let higher_nibbles = _mm_and_si128(_mm_srli_epi16(input, 4), _mm_set1_epi8(0x0f));
+ // Create a bitmask for the bitmap by performing a left shift of the value
+ // of the higher nibble. Bytes with their most significant bit set are mapped
+ // to -1 (all ones).
+ let bitmask = _mm_shuffle_epi8(bitmask_lookup, higher_nibbles);
+ // Test the bit of the bitmap by AND'ing the bitmap and the mask together.
+ let tmp = _mm_and_si128(bitset, bitmask);
+ // Check whether the result was not null. NEQ is not a SIMD intrinsic,
+ // but comparing to the bitmask is logically equivalent. This also prevents us
+ // from matching any non-ASCII bytes since none of the bitmaps were all ones
+ // (-1).
+ let result = _mm_cmpeq_epi8(tmp, bitmask);
+
+ // Return the resulting bitmask.
+ _mm_movemask_epi8(result)
+ }
+
+ /// Calls callback on byte indices and their value.
+ /// Breaks when callback returns LoopInstruction::BreakAtWith(ix, val). And skips the
+ /// number of bytes in callback return value otherwise.
+ /// Returns the final index and a possible break value.
+ pub(super) fn iterate_special_bytes<F, T>(
+ lut: &LookupTable,
+ bytes: &[u8],
+ ix: usize,
+ callback: F,
+ ) -> (usize, Option<T>)
+ where
+ F: FnMut(usize, u8) -> LoopInstruction<Option<T>>,
+ {
+ if is_x86_feature_detected!("ssse3") && bytes.len() >= VECTOR_SIZE {
+ unsafe { simd_iterate_special_bytes(&lut.simd, bytes, ix, callback) }
+ } else {
+ super::scalar_iterate_special_bytes(&lut.scalar, bytes, ix, callback)
+ }
+ }
+
+ /// Calls the callback function for every 1 in the given bitmask with
+ /// the index `offset + ix`, where `ix` is the position of the 1 in the mask.
+ /// Returns `Ok(ix)` to continue from index `ix`, or `Err((end_ix, opt_val))` to break with
+ /// final index `end_ix` and optional value `opt_val`.
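+ ///
+ /// Illustrative sketch (not from the original sources): with `mask == 0b101`
+ /// and `offset == 0`, the callback fires at offsets 0 and 2; a
+ /// `ContinueAndSkip(n)` return shifts the mask so that set bits within the
+ /// next `n` positions are skipped as well.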
+ unsafe fn process_mask<F, T>(
+ mut mask: i32,
+ bytes: &[u8],
+ mut offset: usize,
+ callback: &mut F,
+ ) -> Result<usize, (usize, Option<T>)>
+ where
+ F: FnMut(usize, u8) -> LoopInstruction<Option<T>>,
+ {
+ while mask != 0 {
+ let mask_ix = mask.trailing_zeros() as usize;
+ offset += mask_ix;
+ match callback(offset, *bytes.get_unchecked(offset)) {
+ LoopInstruction::ContinueAndSkip(skip) => {
+ offset += skip + 1;
+ mask >>= skip + 1 + mask_ix;
+ }
+ LoopInstruction::BreakAtWith(ix, val) => return Err((ix, val)),
+ }
+ }
+ Ok(offset)
+ }
+
+ #[target_feature(enable = "ssse3")]
+ /// Important: only call this function when `bytes.len() >= 16`; calling it
+ /// on shorter slices may exhibit undefined behaviour.
+ unsafe fn simd_iterate_special_bytes<F, T>(
+ lut: &[u8; 16],
+ bytes: &[u8],
+ mut ix: usize,
+ mut callback: F,
+ ) -> (usize, Option<T>)
+ where
+ F: FnMut(usize, u8) -> LoopInstruction<Option<T>>,
+ {
+ debug_assert!(bytes.len() >= VECTOR_SIZE);
+ let upperbound = bytes.len() - VECTOR_SIZE;
+
+ while ix < upperbound {
+ let mask = compute_mask(lut, bytes, ix);
+ let block_start = ix;
+ ix = match process_mask(mask, bytes, ix, &mut callback) {
+ Ok(ix) => std::cmp::max(ix, VECTOR_SIZE + block_start),
+ Err((end_ix, val)) => return (end_ix, val),
+ };
+ }
+
+ if bytes.len() > ix {
+ // shift off the bytes at start we have already scanned
+ let mask = compute_mask(lut, bytes, upperbound) >> ix - upperbound;
+ if let Err((end_ix, val)) = process_mask(mask, bytes, ix, &mut callback) {
+ return (end_ix, val);
+ }
+ }
+
+ (bytes.len(), None)
+ }
+
+ #[cfg(test)]
+ mod simd_test {
+ use super::super::create_lut;
+ use super::{iterate_special_bytes, LoopInstruction};
+ use crate::Options;
+
+ fn check_expected_indices(bytes: &[u8], expected: &[usize], skip: usize) {
+ let mut opts = Options::empty();
+ opts.insert(Options::ENABLE_TABLES);
+ opts.insert(Options::ENABLE_FOOTNOTES);
+ opts.insert(Options::ENABLE_STRIKETHROUGH);
+ opts.insert(Options::ENABLE_TASKLISTS);
+
+ let lut = create_lut(&opts);
+ let mut indices = vec![];
+
+ iterate_special_bytes::<_, i32>(&lut, bytes, 0, |ix, _byte_ty| {
+ indices.push(ix);
+ LoopInstruction::ContinueAndSkip(skip)
+ });
+
+ assert_eq!(&indices[..], expected);
+ }
+
+ #[test]
+ fn simple_no_match() {
+ check_expected_indices("abcdef0123456789".as_bytes(), &[], 0);
+ }
+
+ #[test]
+ fn simple_match() {
+ check_expected_indices("*bcd&f0123456789".as_bytes(), &[0, 4], 0);
+ }
+
+ #[test]
+ fn single_open_fish() {
+ check_expected_indices("<".as_bytes(), &[0], 0);
+ }
+
+ #[test]
+ fn long_match() {
+ check_expected_indices("0123456789abcde~*bcd&f0".as_bytes(), &[15, 16, 20], 0);
+ }
+
+ #[test]
+ fn border_skip() {
+ check_expected_indices("0123456789abcde~~~~d&f0".as_bytes(), &[15, 20], 3);
+ }
+
+ #[test]
+ fn exhaustive_search() {
+ let chars = [
+ b'\n', b'\r', b'*', b'_', b'~', b'|', b'&', b'\\', b'[', b']', b'<', b'!', b'`',
+ ];
+
+ for &c in &chars {
+ for i in 0u8..=255 {
+ if !chars.contains(&i) {
+ // full match
+ let mut buf = [i; 18];
+ buf[3] = c;
+ buf[6] = c;
+
+ check_expected_indices(&buf[..], &[3, 6], 0);
+ }
+ }
+ }
+ }
+ }
+}