ZIm/crates/markdown/src/parser.rs
CharlesChen0823 134463f043
markdown: Fix parse inline code display (#30937)
inline code startswith "\`", "\`\`", "\`\`\`", should all valid. current
implement only work with startswith "\`".

Release Notes:

- N/A
2025-05-26 11:28:44 +00:00

803 lines
31 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

use gpui::SharedString;
use linkify::LinkFinder;
pub use pulldown_cmark::TagEnd as MarkdownTagEnd;
use pulldown_cmark::{
Alignment, CowStr, HeadingLevel, LinkType, MetadataBlockKind, Options, Parser,
};
use std::{collections::HashSet, ops::Range, path::Path, sync::Arc};
use crate::path_range::PathWithRange;
const PARSE_OPTIONS: Options = Options::ENABLE_TABLES
.union(Options::ENABLE_FOOTNOTES)
.union(Options::ENABLE_STRIKETHROUGH)
.union(Options::ENABLE_TASKLISTS)
.union(Options::ENABLE_SMART_PUNCTUATION)
.union(Options::ENABLE_HEADING_ATTRIBUTES)
.union(Options::ENABLE_PLUSES_DELIMITED_METADATA_BLOCKS)
.union(Options::ENABLE_OLD_FOOTNOTES)
.union(Options::ENABLE_GFM);
pub fn parse_markdown(
text: &str,
) -> (
Vec<(Range<usize>, MarkdownEvent)>,
HashSet<SharedString>,
HashSet<Arc<Path>>,
) {
let mut events = Vec::new();
let mut language_names = HashSet::new();
let mut language_paths = HashSet::new();
let mut within_link = false;
let mut within_metadata = false;
let mut parser = Parser::new_ext(text, PARSE_OPTIONS)
.into_offset_iter()
.peekable();
while let Some((pulldown_event, range)) = parser.next() {
if within_metadata {
if let pulldown_cmark::Event::End(pulldown_cmark::TagEnd::MetadataBlock { .. }) =
pulldown_event
{
within_metadata = false;
}
continue;
}
match pulldown_event {
pulldown_cmark::Event::Start(tag) => {
let tag = match tag {
pulldown_cmark::Tag::Link {
link_type,
dest_url,
title,
id,
} => {
within_link = true;
MarkdownTag::Link {
link_type,
dest_url: SharedString::from(dest_url.into_string()),
title: SharedString::from(title.into_string()),
id: SharedString::from(id.into_string()),
}
}
pulldown_cmark::Tag::MetadataBlock(kind) => {
within_metadata = true;
MarkdownTag::MetadataBlock(kind)
}
pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Indented) => {
MarkdownTag::CodeBlock {
kind: CodeBlockKind::Indented,
metadata: CodeBlockMetadata {
content_range: range.start + 1..range.end + 1,
line_count: 1,
},
}
}
pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Fenced(
ref info,
)) => {
let content_range = extract_code_block_content_range(&text[range.clone()]);
let content_range =
content_range.start + range.start..content_range.end + range.start;
// Valid to use bytes since multi-byte UTF-8 doesn't use ASCII chars.
let line_count = text[content_range.clone()]
.bytes()
.filter(|c| *c == b'\n')
.count();
let metadata = CodeBlockMetadata {
content_range,
line_count,
};
let info = info.trim();
let kind = if info.is_empty() {
CodeBlockKind::Fenced
// Languages should never contain a slash, and PathRanges always should.
// (Models are told to specify them relative to a workspace root.)
} else if info.contains('/') {
let path_range = PathWithRange::new(info);
language_paths.insert(path_range.path.clone());
CodeBlockKind::FencedSrc(path_range)
} else {
let language = SharedString::from(info.to_string());
language_names.insert(language.clone());
CodeBlockKind::FencedLang(language)
};
MarkdownTag::CodeBlock { kind, metadata }
}
pulldown_cmark::Tag::Paragraph => MarkdownTag::Paragraph,
pulldown_cmark::Tag::Heading {
level,
id,
classes,
attrs,
} => {
let id = id.map(|id| SharedString::from(id.into_string()));
let classes = classes
.into_iter()
.map(|c| SharedString::from(c.into_string()))
.collect();
let attrs = attrs
.into_iter()
.map(|(key, value)| {
(
SharedString::from(key.into_string()),
value.map(|v| SharedString::from(v.into_string())),
)
})
.collect();
MarkdownTag::Heading {
level,
id,
classes,
attrs,
}
}
pulldown_cmark::Tag::BlockQuote(_kind) => MarkdownTag::BlockQuote,
pulldown_cmark::Tag::List(start_number) => MarkdownTag::List(start_number),
pulldown_cmark::Tag::Item => MarkdownTag::Item,
pulldown_cmark::Tag::FootnoteDefinition(label) => {
MarkdownTag::FootnoteDefinition(SharedString::from(label.to_string()))
}
pulldown_cmark::Tag::Table(alignments) => MarkdownTag::Table(alignments),
pulldown_cmark::Tag::TableHead => MarkdownTag::TableHead,
pulldown_cmark::Tag::TableRow => MarkdownTag::TableRow,
pulldown_cmark::Tag::TableCell => MarkdownTag::TableCell,
pulldown_cmark::Tag::Emphasis => MarkdownTag::Emphasis,
pulldown_cmark::Tag::Strong => MarkdownTag::Strong,
pulldown_cmark::Tag::Strikethrough => MarkdownTag::Strikethrough,
pulldown_cmark::Tag::Image {
link_type,
dest_url,
title,
id,
} => MarkdownTag::Image {
link_type,
dest_url: SharedString::from(dest_url.into_string()),
title: SharedString::from(title.into_string()),
id: SharedString::from(id.into_string()),
},
pulldown_cmark::Tag::HtmlBlock => MarkdownTag::HtmlBlock,
pulldown_cmark::Tag::DefinitionList => MarkdownTag::DefinitionList,
pulldown_cmark::Tag::DefinitionListTitle => MarkdownTag::DefinitionListTitle,
pulldown_cmark::Tag::DefinitionListDefinition => {
MarkdownTag::DefinitionListDefinition
}
};
events.push((range, MarkdownEvent::Start(tag)))
}
pulldown_cmark::Event::End(tag) => {
if let pulldown_cmark::TagEnd::Link = tag {
within_link = false;
}
events.push((range, MarkdownEvent::End(tag)));
}
pulldown_cmark::Event::Text(parsed) => {
fn event_for(
text: &str,
range: Range<usize>,
str: &str,
) -> (Range<usize>, MarkdownEvent) {
if str == &text[range.clone()] {
(range, MarkdownEvent::Text)
} else {
(range, MarkdownEvent::SubstitutedText(str.to_owned()))
}
}
#[derive(Debug)]
struct TextRange<'a> {
source_range: Range<usize>,
merged_range: Range<usize>,
parsed: CowStr<'a>,
}
let mut last_len = parsed.len();
let mut ranges = vec![TextRange {
source_range: range.clone(),
merged_range: 0..last_len,
parsed,
}];
while matches!(parser.peek(), Some((pulldown_cmark::Event::Text(_), _))) {
let Some((pulldown_cmark::Event::Text(next_event), next_range)) = parser.next()
else {
unreachable!()
};
let next_len = last_len + next_event.len();
ranges.push(TextRange {
source_range: next_range.clone(),
merged_range: last_len..next_len,
parsed: next_event,
});
last_len = next_len;
}
let mut merged_text =
String::with_capacity(ranges.last().unwrap().merged_range.end);
for range in &ranges {
merged_text.push_str(&range.parsed);
}
let mut ranges = ranges.into_iter().peekable();
if !within_link {
let mut finder = LinkFinder::new();
finder.kinds(&[linkify::LinkKind::Url]);
// Find links in the merged text
for link in finder.links(&merged_text) {
let link_start_in_merged = link.start();
let link_end_in_merged = link.end();
while ranges
.peek()
.is_some_and(|range| range.merged_range.end <= link_start_in_merged)
{
let range = ranges.next().unwrap();
events.push(event_for(text, range.source_range, &range.parsed));
}
let Some(range) = ranges.peek_mut() else {
continue;
};
let prefix_len = link_start_in_merged - range.merged_range.start;
if prefix_len > 0 {
let (head, tail) = range.parsed.split_at(prefix_len);
events.push(event_for(
text,
range.source_range.start..range.source_range.start + prefix_len,
&head,
));
range.parsed = CowStr::Boxed(tail.into());
range.merged_range.start += prefix_len;
range.source_range.start += prefix_len;
}
let link_start_in_source = range.source_range.start;
let mut link_end_in_source = range.source_range.end;
let mut link_events = Vec::new();
while ranges
.peek()
.is_some_and(|range| range.merged_range.end <= link_end_in_merged)
{
let range = ranges.next().unwrap();
link_end_in_source = range.source_range.end;
link_events.push(event_for(text, range.source_range, &range.parsed));
}
if let Some(range) = ranges.peek_mut() {
let prefix_len = link_end_in_merged - range.merged_range.start;
if prefix_len > 0 {
let (head, tail) = range.parsed.split_at(prefix_len);
link_events.push(event_for(
text,
range.source_range.start..range.source_range.start + prefix_len,
head,
));
range.parsed = CowStr::Boxed(tail.into());
range.merged_range.start += prefix_len;
range.source_range.start += prefix_len;
link_end_in_source = range.source_range.start;
}
}
let link_range = link_start_in_source..link_end_in_source;
events.push((
link_range.clone(),
MarkdownEvent::Start(MarkdownTag::Link {
link_type: LinkType::Autolink,
dest_url: SharedString::from(link.as_str().to_string()),
title: SharedString::default(),
id: SharedString::default(),
}),
));
events.extend(link_events);
events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
}
}
for range in ranges {
events.push(event_for(text, range.source_range, &range.parsed));
}
}
pulldown_cmark::Event::Code(_) => {
let content_range = extract_code_content_range(&text[range.clone()]);
let content_range =
content_range.start + range.start..content_range.end + range.start;
events.push((content_range, MarkdownEvent::Code))
}
pulldown_cmark::Event::Html(_) => events.push((range, MarkdownEvent::Html)),
pulldown_cmark::Event::InlineHtml(_) => events.push((range, MarkdownEvent::InlineHtml)),
pulldown_cmark::Event::FootnoteReference(_) => {
events.push((range, MarkdownEvent::FootnoteReference))
}
pulldown_cmark::Event::SoftBreak => events.push((range, MarkdownEvent::SoftBreak)),
pulldown_cmark::Event::HardBreak => events.push((range, MarkdownEvent::HardBreak)),
pulldown_cmark::Event::Rule => events.push((range, MarkdownEvent::Rule)),
pulldown_cmark::Event::TaskListMarker(checked) => {
events.push((range, MarkdownEvent::TaskListMarker(checked)))
}
pulldown_cmark::Event::InlineMath(_) | pulldown_cmark::Event::DisplayMath(_) => {}
}
}
(events, language_names, language_paths)
}
pub fn parse_links_only(text: &str) -> Vec<(Range<usize>, MarkdownEvent)> {
let mut events = Vec::new();
let mut finder = LinkFinder::new();
finder.kinds(&[linkify::LinkKind::Url]);
let mut text_range = Range {
start: 0,
end: text.len(),
};
for link in finder.links(text) {
let link_range = link.start()..link.end();
if link_range.start > text_range.start {
events.push((text_range.start..link_range.start, MarkdownEvent::Text));
}
events.push((
link_range.clone(),
MarkdownEvent::Start(MarkdownTag::Link {
link_type: LinkType::Autolink,
dest_url: SharedString::from(link.as_str().to_string()),
title: SharedString::default(),
id: SharedString::default(),
}),
));
events.push((link_range.clone(), MarkdownEvent::Text));
events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
text_range.start = link_range.end;
}
if text_range.end > text_range.start {
events.push((text_range, MarkdownEvent::Text));
}
events
}
/// A static-lifetime equivalent of pulldown_cmark::Event so we can cache the
/// parse result for rendering without resorting to unsafe lifetime coercion.
#[derive(Clone, Debug, PartialEq)]
pub enum MarkdownEvent {
/// Start of a tagged element. Events that are yielded after this event
/// and before its corresponding `End` event are inside this element.
/// Start and end events are guaranteed to be balanced.
Start(MarkdownTag),
/// End of a tagged element.
End(MarkdownTagEnd),
/// Text that uses the associated range from the markdown source.
Text,
/// Text that differs from the markdown source - typically due to substitution of HTML entities
/// and smart punctuation.
SubstitutedText(String),
/// An inline code node.
Code,
/// An HTML node.
Html,
/// An inline HTML node.
InlineHtml,
/// A reference to a footnote with given label, which may or may not be defined
/// by an event with a `Tag::FootnoteDefinition` tag. Definitions and references to them may
/// occur in any order.
FootnoteReference,
/// A soft line break.
SoftBreak,
/// A hard line break.
HardBreak,
/// A horizontal ruler.
Rule,
/// A task list marker, rendered as a checkbox in HTML. Contains a true when it is checked.
TaskListMarker(bool),
}
/// Tags for elements that can contain other elements.
#[derive(Clone, Debug, PartialEq)]
pub enum MarkdownTag {
/// A paragraph of text and other inline elements.
Paragraph,
/// A heading, with optional identifier, classes and custom attributes.
/// The identifier is prefixed with `#` and the last one in the attributes
/// list is chosen, classes are prefixed with `.` and custom attributes
/// have no prefix and can optionally have a value (`myattr` o `myattr=myvalue`).
Heading {
level: HeadingLevel,
id: Option<SharedString>,
classes: Vec<SharedString>,
/// The first item of the tuple is the attr and second one the value.
attrs: Vec<(SharedString, Option<SharedString>)>,
},
BlockQuote,
/// A code block.
CodeBlock {
kind: CodeBlockKind,
metadata: CodeBlockMetadata,
},
/// A HTML block.
HtmlBlock,
/// A list. If the list is ordered the field indicates the number of the first item.
/// Contains only list items.
List(Option<u64>), // TODO: add delim and tight for ast (not needed for html)
/// A list item.
Item,
/// A footnote definition. The value contained is the footnote's label by which it can
/// be referred to.
FootnoteDefinition(SharedString),
/// A table. Contains a vector describing the text-alignment for each of its columns.
Table(Vec<Alignment>),
/// A table header. Contains only `TableCell`s. Note that the table body starts immediately
/// after the closure of the `TableHead` tag. There is no `TableBody` tag.
TableHead,
/// A table row. Is used both for header rows as body rows. Contains only `TableCell`s.
TableRow,
TableCell,
// span-level tags
Emphasis,
Strong,
Strikethrough,
/// A link.
Link {
link_type: LinkType,
dest_url: SharedString,
title: SharedString,
/// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
id: SharedString,
},
/// An image. The first field is the link type, the second the destination URL and the third is a title,
/// the fourth is the link identifier.
Image {
link_type: LinkType,
dest_url: SharedString,
title: SharedString,
/// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
id: SharedString,
},
/// A metadata block.
MetadataBlock(MetadataBlockKind),
DefinitionList,
DefinitionListTitle,
DefinitionListDefinition,
}
#[derive(Clone, Debug, PartialEq)]
pub enum CodeBlockKind {
Indented,
/// "Fenced" means "surrounded by triple backticks."
/// There can optionally be either a language after the backticks (like in traditional Markdown)
/// or, if an agent is specifying a path for a source location in the project, it can be a PathRange,
/// e.g. ```path/to/foo.rs#L123-456 instead of ```rust
Fenced,
FencedLang(SharedString),
FencedSrc(PathWithRange),
}
#[derive(Default, Clone, Debug, PartialEq)]
pub struct CodeBlockMetadata {
pub content_range: Range<usize>,
pub line_count: usize,
}
fn extract_code_content_range(text: &str) -> Range<usize> {
let text_len = text.len();
if text_len == 0 {
return 0..0;
}
let start_ticks = text.chars().take_while(|&c| c == '`').count();
if start_ticks == 0 || start_ticks > text_len {
return 0..text_len;
}
let end_ticks = text.chars().rev().take_while(|&c| c == '`').count();
if end_ticks != start_ticks || text_len < start_ticks + end_ticks {
return 0..text_len;
}
start_ticks..text_len - end_ticks
}
pub(crate) fn extract_code_block_content_range(text: &str) -> Range<usize> {
let mut range = 0..text.len();
if text.starts_with("```") {
range.start += 3;
if let Some(newline_ix) = text[range.clone()].find('\n') {
range.start += newline_ix + 1;
}
}
if !range.is_empty() && text.ends_with("```") {
range.end -= 3;
}
if range.start > range.end {
range.end = range.start;
}
range
}
#[cfg(test)]
mod tests {
use super::MarkdownEvent::*;
use super::MarkdownTag::*;
use super::*;
const UNWANTED_OPTIONS: Options = Options::ENABLE_YAML_STYLE_METADATA_BLOCKS
.union(Options::ENABLE_MATH)
.union(Options::ENABLE_DEFINITION_LIST);
#[test]
fn all_options_considered() {
// The purpose of this is to fail when new options are added to pulldown_cmark, so that they
// can be evaluated for inclusion.
assert_eq!(PARSE_OPTIONS.union(UNWANTED_OPTIONS), Options::all());
}
#[test]
fn wanted_and_unwanted_options_disjoint() {
assert_eq!(
PARSE_OPTIONS.intersection(UNWANTED_OPTIONS),
Options::empty()
);
}
#[test]
fn test_html_comments() {
assert_eq!(
parse_markdown(" <!--\nrdoc-file=string.c\n-->\nReturns"),
(
vec![
(2..30, Start(HtmlBlock)),
(2..2, SubstitutedText(" ".into())),
(2..7, Html),
(7..26, Html),
(26..30, Html),
(2..30, End(MarkdownTagEnd::HtmlBlock)),
(30..37, Start(Paragraph)),
(30..37, Text),
(30..37, End(MarkdownTagEnd::Paragraph))
],
HashSet::new(),
HashSet::new()
)
)
}
#[test]
fn test_plain_urls_and_escaped_text() {
assert_eq!(
parse_markdown("&nbsp;&nbsp; https://some.url some \\`&#9658;\\` text"),
(
vec![
(0..51, Start(Paragraph)),
(0..6, SubstitutedText("\u{a0}".into())),
(6..12, SubstitutedText("\u{a0}".into())),
(12..13, Text),
(
13..29,
Start(Link {
link_type: LinkType::Autolink,
dest_url: "https://some.url".into(),
title: "".into(),
id: "".into(),
})
),
(13..29, Text),
(13..29, End(MarkdownTagEnd::Link)),
(29..35, Text),
(36..37, Text), // Escaped backtick
(37..44, SubstitutedText("".into())),
(45..46, Text), // Escaped backtick
(46..51, Text),
(0..51, End(MarkdownTagEnd::Paragraph))
],
HashSet::new(),
HashSet::new()
)
);
}
#[test]
fn test_incomplete_link() {
assert_eq!(
parse_markdown("You can use the [GitHub Search API](https://docs.github.com/en").0,
vec![
(0..62, Start(Paragraph)),
(0..16, Text),
(16..17, Text),
(17..34, Text),
(34..35, Text),
(35..36, Text),
(
36..62,
Start(Link {
link_type: LinkType::Autolink,
dest_url: "https://docs.github.com/en".into(),
title: "".into(),
id: "".into()
})
),
(36..62, Text),
(36..62, End(MarkdownTagEnd::Link)),
(0..62, End(MarkdownTagEnd::Paragraph))
],
);
}
#[test]
fn test_smart_punctuation() {
assert_eq!(
parse_markdown("-- --- ... \"double quoted\" 'single quoted' ----------"),
(
vec![
(0..53, Start(Paragraph)),
(0..2, SubstitutedText("".into())),
(2..3, Text),
(3..6, SubstitutedText("".into())),
(6..7, Text),
(7..10, SubstitutedText("".into())),
(10..11, Text),
(11..12, SubstitutedText("".into())),
(12..25, Text),
(25..26, SubstitutedText("".into())),
(26..27, Text),
(27..28, SubstitutedText("".into())),
(28..41, Text),
(41..42, SubstitutedText("".into())),
(42..43, Text),
(43..53, SubstitutedText("".into())),
(0..53, End(MarkdownTagEnd::Paragraph))
],
HashSet::new(),
HashSet::new()
)
)
}
#[test]
fn test_code_block_metadata() {
assert_eq!(
parse_markdown("```rust\nfn main() {\n let a = 1;\n}\n```"),
(
vec![
(
0..37,
Start(CodeBlock {
kind: CodeBlockKind::FencedLang("rust".into()),
metadata: CodeBlockMetadata {
content_range: 8..34,
line_count: 3
}
})
),
(8..34, Text),
(0..37, End(MarkdownTagEnd::CodeBlock)),
],
HashSet::from(["rust".into()]),
HashSet::new()
)
)
}
#[test]
fn test_extract_code_content_range() {
let input = "```let x = 5;```";
assert_eq!(extract_code_content_range(input), 3..13);
let input = "``let x = 5;``";
assert_eq!(extract_code_content_range(input), 2..12);
let input = "`let x = 5;`";
assert_eq!(extract_code_content_range(input), 1..11);
let input = "plain text";
assert_eq!(extract_code_content_range(input), 0..10);
let input = "``let x = 5;`";
assert_eq!(extract_code_content_range(input), 0..13);
}
#[test]
fn test_extract_code_block_content_range() {
let input = "```rust\nlet x = 5;\n```";
assert_eq!(extract_code_block_content_range(input), 8..19);
let input = "plain text";
assert_eq!(extract_code_block_content_range(input), 0..10);
let input = "```python\nprint('hello')\nprint('world')\n```";
assert_eq!(extract_code_block_content_range(input), 10..40);
// Malformed input
let input = "`````";
assert_eq!(extract_code_block_content_range(input), 3..3);
}
#[test]
fn test_links_split_across_fragments() {
// This test verifies that links split across multiple text fragments due to escaping or other issues
// are correctly detected and processed
// Note: In real usage, pulldown_cmark creates separate text events for the escaped character
// We're verifying our parser can handle this correctly
assert_eq!(
parse_markdown("https:/\\/example.com is equivalent to https://example&#46;com!").0,
vec![
(0..62, Start(Paragraph)),
(
0..20,
Start(Link {
link_type: LinkType::Autolink,
dest_url: "https://example.com".into(),
title: "".into(),
id: "".into()
})
),
(0..7, Text),
(8..20, Text),
(0..20, End(MarkdownTagEnd::Link)),
(20..38, Text),
(
38..61,
Start(Link {
link_type: LinkType::Autolink,
dest_url: "https://example.com".into(),
title: "".into(),
id: "".into()
})
),
(38..53, Text),
(53..58, SubstitutedText(".".into())),
(58..61, Text),
(38..61, End(MarkdownTagEnd::Link)),
(61..62, Text),
(0..62, End(MarkdownTagEnd::Paragraph))
],
);
assert_eq!(
parse_markdown("Visit https://example.com/cat\\/é&#8205;☕ for coffee!").0,
[
(0..55, Start(Paragraph)),
(0..6, Text),
(
6..43,
Start(Link {
link_type: LinkType::Autolink,
dest_url: "https://example.com/cat/é\u{200d}".into(),
title: "".into(),
id: "".into()
})
),
(6..29, Text),
(30..33, Text),
(33..40, SubstitutedText("\u{200d}".into())),
(40..43, Text),
(6..43, End(MarkdownTagEnd::Link)),
(43..55, Text),
(0..55, End(MarkdownTagEnd::Paragraph))
]
);
}
}