
For some reason `pulldown_cmark` treats \````` as a codeblock, meaning that we could end up with an invalid range generated from `extract_code_block_content_range` (`3..2`) Closes #30495 Release Notes: - agent: Fix an edge case where the editor would crash when model generated malformed markdown
763 lines
29 KiB
Rust
763 lines
29 KiB
Rust
use gpui::SharedString;
|
||
use linkify::LinkFinder;
|
||
pub use pulldown_cmark::TagEnd as MarkdownTagEnd;
|
||
use pulldown_cmark::{
|
||
Alignment, CowStr, HeadingLevel, LinkType, MetadataBlockKind, Options, Parser,
|
||
};
|
||
use std::{collections::HashSet, ops::Range, path::Path, sync::Arc};
|
||
|
||
use crate::path_range::PathWithRange;
|
||
|
||
const PARSE_OPTIONS: Options = Options::ENABLE_TABLES
|
||
.union(Options::ENABLE_FOOTNOTES)
|
||
.union(Options::ENABLE_STRIKETHROUGH)
|
||
.union(Options::ENABLE_TASKLISTS)
|
||
.union(Options::ENABLE_SMART_PUNCTUATION)
|
||
.union(Options::ENABLE_HEADING_ATTRIBUTES)
|
||
.union(Options::ENABLE_PLUSES_DELIMITED_METADATA_BLOCKS)
|
||
.union(Options::ENABLE_OLD_FOOTNOTES)
|
||
.union(Options::ENABLE_GFM);
|
||
|
||
pub fn parse_markdown(
|
||
text: &str,
|
||
) -> (
|
||
Vec<(Range<usize>, MarkdownEvent)>,
|
||
HashSet<SharedString>,
|
||
HashSet<Arc<Path>>,
|
||
) {
|
||
let mut events = Vec::new();
|
||
let mut language_names = HashSet::new();
|
||
let mut language_paths = HashSet::new();
|
||
let mut within_link = false;
|
||
let mut within_metadata = false;
|
||
let mut parser = Parser::new_ext(text, PARSE_OPTIONS)
|
||
.into_offset_iter()
|
||
.peekable();
|
||
while let Some((pulldown_event, mut range)) = parser.next() {
|
||
if within_metadata {
|
||
if let pulldown_cmark::Event::End(pulldown_cmark::TagEnd::MetadataBlock { .. }) =
|
||
pulldown_event
|
||
{
|
||
within_metadata = false;
|
||
}
|
||
continue;
|
||
}
|
||
match pulldown_event {
|
||
pulldown_cmark::Event::Start(tag) => {
|
||
let tag = match tag {
|
||
pulldown_cmark::Tag::Link {
|
||
link_type,
|
||
dest_url,
|
||
title,
|
||
id,
|
||
} => {
|
||
within_link = true;
|
||
MarkdownTag::Link {
|
||
link_type,
|
||
dest_url: SharedString::from(dest_url.into_string()),
|
||
title: SharedString::from(title.into_string()),
|
||
id: SharedString::from(id.into_string()),
|
||
}
|
||
}
|
||
pulldown_cmark::Tag::MetadataBlock(kind) => {
|
||
within_metadata = true;
|
||
MarkdownTag::MetadataBlock(kind)
|
||
}
|
||
pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Indented) => {
|
||
MarkdownTag::CodeBlock {
|
||
kind: CodeBlockKind::Indented,
|
||
metadata: CodeBlockMetadata {
|
||
content_range: range.start + 1..range.end + 1,
|
||
line_count: 1,
|
||
},
|
||
}
|
||
}
|
||
pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Fenced(
|
||
ref info,
|
||
)) => {
|
||
let content_range = extract_code_block_content_range(&text[range.clone()]);
|
||
let content_range =
|
||
content_range.start + range.start..content_range.end + range.start;
|
||
|
||
// Valid to use bytes since multi-byte UTF-8 doesn't use ASCII chars.
|
||
let line_count = text[content_range.clone()]
|
||
.bytes()
|
||
.filter(|c| *c == b'\n')
|
||
.count();
|
||
let metadata = CodeBlockMetadata {
|
||
content_range,
|
||
line_count,
|
||
};
|
||
|
||
let info = info.trim();
|
||
let kind = if info.is_empty() {
|
||
CodeBlockKind::Fenced
|
||
// Languages should never contain a slash, and PathRanges always should.
|
||
// (Models are told to specify them relative to a workspace root.)
|
||
} else if info.contains('/') {
|
||
let path_range = PathWithRange::new(info);
|
||
language_paths.insert(path_range.path.clone());
|
||
CodeBlockKind::FencedSrc(path_range)
|
||
} else {
|
||
let language = SharedString::from(info.to_string());
|
||
language_names.insert(language.clone());
|
||
CodeBlockKind::FencedLang(language)
|
||
};
|
||
|
||
MarkdownTag::CodeBlock { kind, metadata }
|
||
}
|
||
pulldown_cmark::Tag::Paragraph => MarkdownTag::Paragraph,
|
||
pulldown_cmark::Tag::Heading {
|
||
level,
|
||
id,
|
||
classes,
|
||
attrs,
|
||
} => {
|
||
let id = id.map(|id| SharedString::from(id.into_string()));
|
||
let classes = classes
|
||
.into_iter()
|
||
.map(|c| SharedString::from(c.into_string()))
|
||
.collect();
|
||
let attrs = attrs
|
||
.into_iter()
|
||
.map(|(key, value)| {
|
||
(
|
||
SharedString::from(key.into_string()),
|
||
value.map(|v| SharedString::from(v.into_string())),
|
||
)
|
||
})
|
||
.collect();
|
||
MarkdownTag::Heading {
|
||
level,
|
||
id,
|
||
classes,
|
||
attrs,
|
||
}
|
||
}
|
||
pulldown_cmark::Tag::BlockQuote(_kind) => MarkdownTag::BlockQuote,
|
||
pulldown_cmark::Tag::List(start_number) => MarkdownTag::List(start_number),
|
||
pulldown_cmark::Tag::Item => MarkdownTag::Item,
|
||
pulldown_cmark::Tag::FootnoteDefinition(label) => {
|
||
MarkdownTag::FootnoteDefinition(SharedString::from(label.to_string()))
|
||
}
|
||
pulldown_cmark::Tag::Table(alignments) => MarkdownTag::Table(alignments),
|
||
pulldown_cmark::Tag::TableHead => MarkdownTag::TableHead,
|
||
pulldown_cmark::Tag::TableRow => MarkdownTag::TableRow,
|
||
pulldown_cmark::Tag::TableCell => MarkdownTag::TableCell,
|
||
pulldown_cmark::Tag::Emphasis => MarkdownTag::Emphasis,
|
||
pulldown_cmark::Tag::Strong => MarkdownTag::Strong,
|
||
pulldown_cmark::Tag::Strikethrough => MarkdownTag::Strikethrough,
|
||
pulldown_cmark::Tag::Image {
|
||
link_type,
|
||
dest_url,
|
||
title,
|
||
id,
|
||
} => MarkdownTag::Image {
|
||
link_type,
|
||
dest_url: SharedString::from(dest_url.into_string()),
|
||
title: SharedString::from(title.into_string()),
|
||
id: SharedString::from(id.into_string()),
|
||
},
|
||
pulldown_cmark::Tag::HtmlBlock => MarkdownTag::HtmlBlock,
|
||
pulldown_cmark::Tag::DefinitionList => MarkdownTag::DefinitionList,
|
||
pulldown_cmark::Tag::DefinitionListTitle => MarkdownTag::DefinitionListTitle,
|
||
pulldown_cmark::Tag::DefinitionListDefinition => {
|
||
MarkdownTag::DefinitionListDefinition
|
||
}
|
||
};
|
||
events.push((range, MarkdownEvent::Start(tag)))
|
||
}
|
||
pulldown_cmark::Event::End(tag) => {
|
||
if let pulldown_cmark::TagEnd::Link = tag {
|
||
within_link = false;
|
||
}
|
||
events.push((range, MarkdownEvent::End(tag)));
|
||
}
|
||
pulldown_cmark::Event::Text(parsed) => {
|
||
fn event_for(
|
||
text: &str,
|
||
range: Range<usize>,
|
||
str: &str,
|
||
) -> (Range<usize>, MarkdownEvent) {
|
||
if str == &text[range.clone()] {
|
||
(range, MarkdownEvent::Text)
|
||
} else {
|
||
(range, MarkdownEvent::SubstitutedText(str.to_owned()))
|
||
}
|
||
}
|
||
#[derive(Debug)]
|
||
struct TextRange<'a> {
|
||
source_range: Range<usize>,
|
||
merged_range: Range<usize>,
|
||
parsed: CowStr<'a>,
|
||
}
|
||
|
||
let mut last_len = parsed.len();
|
||
let mut ranges = vec![TextRange {
|
||
source_range: range.clone(),
|
||
merged_range: 0..last_len,
|
||
parsed,
|
||
}];
|
||
|
||
while matches!(parser.peek(), Some((pulldown_cmark::Event::Text(_), _))) {
|
||
let Some((pulldown_cmark::Event::Text(next_event), next_range)) = parser.next()
|
||
else {
|
||
unreachable!()
|
||
};
|
||
let next_len = last_len + next_event.len();
|
||
ranges.push(TextRange {
|
||
source_range: next_range.clone(),
|
||
merged_range: last_len..next_len,
|
||
parsed: next_event,
|
||
});
|
||
last_len = next_len;
|
||
}
|
||
|
||
let mut merged_text =
|
||
String::with_capacity(ranges.last().unwrap().merged_range.end);
|
||
for range in &ranges {
|
||
merged_text.push_str(&range.parsed);
|
||
}
|
||
|
||
let mut ranges = ranges.into_iter().peekable();
|
||
|
||
if !within_link {
|
||
let mut finder = LinkFinder::new();
|
||
finder.kinds(&[linkify::LinkKind::Url]);
|
||
|
||
// Find links in the merged text
|
||
for link in finder.links(&merged_text) {
|
||
let link_start_in_merged = link.start();
|
||
let link_end_in_merged = link.end();
|
||
|
||
while ranges
|
||
.peek()
|
||
.is_some_and(|range| range.merged_range.end <= link_start_in_merged)
|
||
{
|
||
let range = ranges.next().unwrap();
|
||
events.push(event_for(text, range.source_range, &range.parsed));
|
||
}
|
||
|
||
let Some(range) = ranges.peek_mut() else {
|
||
continue;
|
||
};
|
||
let prefix_len = link_start_in_merged - range.merged_range.start;
|
||
if prefix_len > 0 {
|
||
let (head, tail) = range.parsed.split_at(prefix_len);
|
||
events.push(event_for(
|
||
text,
|
||
range.source_range.start..range.source_range.start + prefix_len,
|
||
&head,
|
||
));
|
||
range.parsed = CowStr::Boxed(tail.into());
|
||
range.merged_range.start += prefix_len;
|
||
range.source_range.start += prefix_len;
|
||
}
|
||
|
||
let link_start_in_source = range.source_range.start;
|
||
let mut link_end_in_source = range.source_range.end;
|
||
let mut link_events = Vec::new();
|
||
|
||
while ranges
|
||
.peek()
|
||
.is_some_and(|range| range.merged_range.end <= link_end_in_merged)
|
||
{
|
||
let range = ranges.next().unwrap();
|
||
link_end_in_source = range.source_range.end;
|
||
link_events.push(event_for(text, range.source_range, &range.parsed));
|
||
}
|
||
|
||
if let Some(range) = ranges.peek_mut() {
|
||
let prefix_len = link_end_in_merged - range.merged_range.start;
|
||
if prefix_len > 0 {
|
||
let (head, tail) = range.parsed.split_at(prefix_len);
|
||
link_events.push(event_for(
|
||
text,
|
||
range.source_range.start..range.source_range.start + prefix_len,
|
||
head,
|
||
));
|
||
range.parsed = CowStr::Boxed(tail.into());
|
||
range.merged_range.start += prefix_len;
|
||
range.source_range.start += prefix_len;
|
||
link_end_in_source = range.source_range.start;
|
||
}
|
||
}
|
||
let link_range = link_start_in_source..link_end_in_source;
|
||
|
||
events.push((
|
||
link_range.clone(),
|
||
MarkdownEvent::Start(MarkdownTag::Link {
|
||
link_type: LinkType::Autolink,
|
||
dest_url: SharedString::from(link.as_str().to_string()),
|
||
title: SharedString::default(),
|
||
id: SharedString::default(),
|
||
}),
|
||
));
|
||
events.extend(link_events);
|
||
events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
|
||
}
|
||
}
|
||
|
||
for range in ranges {
|
||
events.push(event_for(text, range.source_range, &range.parsed));
|
||
}
|
||
}
|
||
pulldown_cmark::Event::Code(_) => {
|
||
range.start += 1;
|
||
range.end -= 1;
|
||
events.push((range, MarkdownEvent::Code))
|
||
}
|
||
pulldown_cmark::Event::Html(_) => events.push((range, MarkdownEvent::Html)),
|
||
pulldown_cmark::Event::InlineHtml(_) => events.push((range, MarkdownEvent::InlineHtml)),
|
||
pulldown_cmark::Event::FootnoteReference(_) => {
|
||
events.push((range, MarkdownEvent::FootnoteReference))
|
||
}
|
||
pulldown_cmark::Event::SoftBreak => events.push((range, MarkdownEvent::SoftBreak)),
|
||
pulldown_cmark::Event::HardBreak => events.push((range, MarkdownEvent::HardBreak)),
|
||
pulldown_cmark::Event::Rule => events.push((range, MarkdownEvent::Rule)),
|
||
pulldown_cmark::Event::TaskListMarker(checked) => {
|
||
events.push((range, MarkdownEvent::TaskListMarker(checked)))
|
||
}
|
||
pulldown_cmark::Event::InlineMath(_) | pulldown_cmark::Event::DisplayMath(_) => {}
|
||
}
|
||
}
|
||
(events, language_names, language_paths)
|
||
}
|
||
|
||
pub fn parse_links_only(text: &str) -> Vec<(Range<usize>, MarkdownEvent)> {
|
||
let mut events = Vec::new();
|
||
let mut finder = LinkFinder::new();
|
||
finder.kinds(&[linkify::LinkKind::Url]);
|
||
let mut text_range = Range {
|
||
start: 0,
|
||
end: text.len(),
|
||
};
|
||
for link in finder.links(text) {
|
||
let link_range = link.start()..link.end();
|
||
|
||
if link_range.start > text_range.start {
|
||
events.push((text_range.start..link_range.start, MarkdownEvent::Text));
|
||
}
|
||
|
||
events.push((
|
||
link_range.clone(),
|
||
MarkdownEvent::Start(MarkdownTag::Link {
|
||
link_type: LinkType::Autolink,
|
||
dest_url: SharedString::from(link.as_str().to_string()),
|
||
title: SharedString::default(),
|
||
id: SharedString::default(),
|
||
}),
|
||
));
|
||
events.push((link_range.clone(), MarkdownEvent::Text));
|
||
events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
|
||
|
||
text_range.start = link_range.end;
|
||
}
|
||
|
||
if text_range.end > text_range.start {
|
||
events.push((text_range, MarkdownEvent::Text));
|
||
}
|
||
|
||
events
|
||
}
|
||
|
||
/// A static-lifetime equivalent of pulldown_cmark::Event so we can cache the
|
||
/// parse result for rendering without resorting to unsafe lifetime coercion.
|
||
#[derive(Clone, Debug, PartialEq)]
|
||
pub enum MarkdownEvent {
|
||
/// Start of a tagged element. Events that are yielded after this event
|
||
/// and before its corresponding `End` event are inside this element.
|
||
/// Start and end events are guaranteed to be balanced.
|
||
Start(MarkdownTag),
|
||
/// End of a tagged element.
|
||
End(MarkdownTagEnd),
|
||
/// Text that uses the associated range from the markdown source.
|
||
Text,
|
||
/// Text that differs from the markdown source - typically due to substitution of HTML entities
|
||
/// and smart punctuation.
|
||
SubstitutedText(String),
|
||
/// An inline code node.
|
||
Code,
|
||
/// An HTML node.
|
||
Html,
|
||
/// An inline HTML node.
|
||
InlineHtml,
|
||
/// A reference to a footnote with given label, which may or may not be defined
|
||
/// by an event with a `Tag::FootnoteDefinition` tag. Definitions and references to them may
|
||
/// occur in any order.
|
||
FootnoteReference,
|
||
/// A soft line break.
|
||
SoftBreak,
|
||
/// A hard line break.
|
||
HardBreak,
|
||
/// A horizontal ruler.
|
||
Rule,
|
||
/// A task list marker, rendered as a checkbox in HTML. Contains a true when it is checked.
|
||
TaskListMarker(bool),
|
||
}
|
||
|
||
/// Tags for elements that can contain other elements.
|
||
#[derive(Clone, Debug, PartialEq)]
|
||
pub enum MarkdownTag {
|
||
/// A paragraph of text and other inline elements.
|
||
Paragraph,
|
||
|
||
/// A heading, with optional identifier, classes and custom attributes.
|
||
/// The identifier is prefixed with `#` and the last one in the attributes
|
||
/// list is chosen, classes are prefixed with `.` and custom attributes
|
||
/// have no prefix and can optionally have a value (`myattr` o `myattr=myvalue`).
|
||
Heading {
|
||
level: HeadingLevel,
|
||
id: Option<SharedString>,
|
||
classes: Vec<SharedString>,
|
||
/// The first item of the tuple is the attr and second one the value.
|
||
attrs: Vec<(SharedString, Option<SharedString>)>,
|
||
},
|
||
|
||
BlockQuote,
|
||
|
||
/// A code block.
|
||
CodeBlock {
|
||
kind: CodeBlockKind,
|
||
metadata: CodeBlockMetadata,
|
||
},
|
||
|
||
/// A HTML block.
|
||
HtmlBlock,
|
||
|
||
/// A list. If the list is ordered the field indicates the number of the first item.
|
||
/// Contains only list items.
|
||
List(Option<u64>), // TODO: add delim and tight for ast (not needed for html)
|
||
|
||
/// A list item.
|
||
Item,
|
||
|
||
/// A footnote definition. The value contained is the footnote's label by which it can
|
||
/// be referred to.
|
||
FootnoteDefinition(SharedString),
|
||
|
||
/// A table. Contains a vector describing the text-alignment for each of its columns.
|
||
Table(Vec<Alignment>),
|
||
|
||
/// A table header. Contains only `TableCell`s. Note that the table body starts immediately
|
||
/// after the closure of the `TableHead` tag. There is no `TableBody` tag.
|
||
TableHead,
|
||
|
||
/// A table row. Is used both for header rows as body rows. Contains only `TableCell`s.
|
||
TableRow,
|
||
TableCell,
|
||
|
||
// span-level tags
|
||
Emphasis,
|
||
Strong,
|
||
Strikethrough,
|
||
|
||
/// A link.
|
||
Link {
|
||
link_type: LinkType,
|
||
dest_url: SharedString,
|
||
title: SharedString,
|
||
/// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
|
||
id: SharedString,
|
||
},
|
||
|
||
/// An image. The first field is the link type, the second the destination URL and the third is a title,
|
||
/// the fourth is the link identifier.
|
||
Image {
|
||
link_type: LinkType,
|
||
dest_url: SharedString,
|
||
title: SharedString,
|
||
/// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
|
||
id: SharedString,
|
||
},
|
||
|
||
/// A metadata block.
|
||
MetadataBlock(MetadataBlockKind),
|
||
|
||
DefinitionList,
|
||
DefinitionListTitle,
|
||
DefinitionListDefinition,
|
||
}
|
||
|
||
#[derive(Clone, Debug, PartialEq)]
|
||
pub enum CodeBlockKind {
|
||
Indented,
|
||
/// "Fenced" means "surrounded by triple backticks."
|
||
/// There can optionally be either a language after the backticks (like in traditional Markdown)
|
||
/// or, if an agent is specifying a path for a source location in the project, it can be a PathRange,
|
||
/// e.g. ```path/to/foo.rs#L123-456 instead of ```rust
|
||
Fenced,
|
||
FencedLang(SharedString),
|
||
FencedSrc(PathWithRange),
|
||
}
|
||
|
||
#[derive(Default, Clone, Debug, PartialEq)]
|
||
pub struct CodeBlockMetadata {
|
||
pub content_range: Range<usize>,
|
||
pub line_count: usize,
|
||
}
|
||
|
||
pub(crate) fn extract_code_block_content_range(text: &str) -> Range<usize> {
|
||
let mut range = 0..text.len();
|
||
if text.starts_with("```") {
|
||
range.start += 3;
|
||
|
||
if let Some(newline_ix) = text[range.clone()].find('\n') {
|
||
range.start += newline_ix + 1;
|
||
}
|
||
}
|
||
|
||
if !range.is_empty() && text.ends_with("```") {
|
||
range.end -= 3;
|
||
}
|
||
if range.start > range.end {
|
||
range.end = range.start;
|
||
}
|
||
range
|
||
}
|
||
|
||
#[cfg(test)]
|
||
mod tests {
|
||
use super::MarkdownEvent::*;
|
||
use super::MarkdownTag::*;
|
||
use super::*;
|
||
|
||
const UNWANTED_OPTIONS: Options = Options::ENABLE_YAML_STYLE_METADATA_BLOCKS
|
||
.union(Options::ENABLE_MATH)
|
||
.union(Options::ENABLE_DEFINITION_LIST);
|
||
|
||
#[test]
|
||
fn all_options_considered() {
|
||
// The purpose of this is to fail when new options are added to pulldown_cmark, so that they
|
||
// can be evaluated for inclusion.
|
||
assert_eq!(PARSE_OPTIONS.union(UNWANTED_OPTIONS), Options::all());
|
||
}
|
||
|
||
#[test]
|
||
fn wanted_and_unwanted_options_disjoint() {
|
||
assert_eq!(
|
||
PARSE_OPTIONS.intersection(UNWANTED_OPTIONS),
|
||
Options::empty()
|
||
);
|
||
}
|
||
|
||
#[test]
|
||
fn test_html_comments() {
|
||
assert_eq!(
|
||
parse_markdown(" <!--\nrdoc-file=string.c\n-->\nReturns"),
|
||
(
|
||
vec![
|
||
(2..30, Start(HtmlBlock)),
|
||
(2..2, SubstitutedText(" ".into())),
|
||
(2..7, Html),
|
||
(7..26, Html),
|
||
(26..30, Html),
|
||
(2..30, End(MarkdownTagEnd::HtmlBlock)),
|
||
(30..37, Start(Paragraph)),
|
||
(30..37, Text),
|
||
(30..37, End(MarkdownTagEnd::Paragraph))
|
||
],
|
||
HashSet::new(),
|
||
HashSet::new()
|
||
)
|
||
)
|
||
}
|
||
|
||
#[test]
|
||
fn test_plain_urls_and_escaped_text() {
|
||
assert_eq!(
|
||
parse_markdown(" https://some.url some \\`►\\` text"),
|
||
(
|
||
vec![
|
||
(0..51, Start(Paragraph)),
|
||
(0..6, SubstitutedText("\u{a0}".into())),
|
||
(6..12, SubstitutedText("\u{a0}".into())),
|
||
(12..13, Text),
|
||
(
|
||
13..29,
|
||
Start(Link {
|
||
link_type: LinkType::Autolink,
|
||
dest_url: "https://some.url".into(),
|
||
title: "".into(),
|
||
id: "".into(),
|
||
})
|
||
),
|
||
(13..29, Text),
|
||
(13..29, End(MarkdownTagEnd::Link)),
|
||
(29..35, Text),
|
||
(36..37, Text), // Escaped backtick
|
||
(37..44, SubstitutedText("►".into())),
|
||
(45..46, Text), // Escaped backtick
|
||
(46..51, Text),
|
||
(0..51, End(MarkdownTagEnd::Paragraph))
|
||
],
|
||
HashSet::new(),
|
||
HashSet::new()
|
||
)
|
||
);
|
||
}
|
||
|
||
#[test]
|
||
fn test_incomplete_link() {
|
||
assert_eq!(
|
||
parse_markdown("You can use the [GitHub Search API](https://docs.github.com/en").0,
|
||
vec![
|
||
(0..62, Start(Paragraph)),
|
||
(0..16, Text),
|
||
(16..17, Text),
|
||
(17..34, Text),
|
||
(34..35, Text),
|
||
(35..36, Text),
|
||
(
|
||
36..62,
|
||
Start(Link {
|
||
link_type: LinkType::Autolink,
|
||
dest_url: "https://docs.github.com/en".into(),
|
||
title: "".into(),
|
||
id: "".into()
|
||
})
|
||
),
|
||
(36..62, Text),
|
||
(36..62, End(MarkdownTagEnd::Link)),
|
||
(0..62, End(MarkdownTagEnd::Paragraph))
|
||
],
|
||
);
|
||
}
|
||
|
||
#[test]
|
||
fn test_smart_punctuation() {
|
||
assert_eq!(
|
||
parse_markdown("-- --- ... \"double quoted\" 'single quoted' ----------"),
|
||
(
|
||
vec![
|
||
(0..53, Start(Paragraph)),
|
||
(0..2, SubstitutedText("–".into())),
|
||
(2..3, Text),
|
||
(3..6, SubstitutedText("—".into())),
|
||
(6..7, Text),
|
||
(7..10, SubstitutedText("…".into())),
|
||
(10..11, Text),
|
||
(11..12, SubstitutedText("“".into())),
|
||
(12..25, Text),
|
||
(25..26, SubstitutedText("”".into())),
|
||
(26..27, Text),
|
||
(27..28, SubstitutedText("‘".into())),
|
||
(28..41, Text),
|
||
(41..42, SubstitutedText("’".into())),
|
||
(42..43, Text),
|
||
(43..53, SubstitutedText("–––––".into())),
|
||
(0..53, End(MarkdownTagEnd::Paragraph))
|
||
],
|
||
HashSet::new(),
|
||
HashSet::new()
|
||
)
|
||
)
|
||
}
|
||
|
||
#[test]
|
||
fn test_code_block_metadata() {
|
||
assert_eq!(
|
||
parse_markdown("```rust\nfn main() {\n let a = 1;\n}\n```"),
|
||
(
|
||
vec![
|
||
(
|
||
0..37,
|
||
Start(CodeBlock {
|
||
kind: CodeBlockKind::FencedLang("rust".into()),
|
||
metadata: CodeBlockMetadata {
|
||
content_range: 8..34,
|
||
line_count: 3
|
||
}
|
||
})
|
||
),
|
||
(8..34, Text),
|
||
(0..37, End(MarkdownTagEnd::CodeBlock)),
|
||
],
|
||
HashSet::from(["rust".into()]),
|
||
HashSet::new()
|
||
)
|
||
)
|
||
}
|
||
|
||
#[test]
|
||
fn test_extract_code_block_content_range() {
|
||
let input = "```rust\nlet x = 5;\n```";
|
||
assert_eq!(extract_code_block_content_range(input), 8..19);
|
||
|
||
let input = "plain text";
|
||
assert_eq!(extract_code_block_content_range(input), 0..10);
|
||
|
||
let input = "```python\nprint('hello')\nprint('world')\n```";
|
||
assert_eq!(extract_code_block_content_range(input), 10..40);
|
||
|
||
// Malformed input
|
||
let input = "`````";
|
||
assert_eq!(extract_code_block_content_range(input), 3..3);
|
||
}
|
||
|
||
#[test]
|
||
fn test_links_split_across_fragments() {
|
||
// This test verifies that links split across multiple text fragments due to escaping or other issues
|
||
// are correctly detected and processed
|
||
// Note: In real usage, pulldown_cmark creates separate text events for the escaped character
|
||
// We're verifying our parser can handle this correctly
|
||
assert_eq!(
|
||
parse_markdown("https:/\\/example.com is equivalent to https://example.com!").0,
|
||
vec![
|
||
(0..62, Start(Paragraph)),
|
||
(
|
||
0..20,
|
||
Start(Link {
|
||
link_type: LinkType::Autolink,
|
||
dest_url: "https://example.com".into(),
|
||
title: "".into(),
|
||
id: "".into()
|
||
})
|
||
),
|
||
(0..7, Text),
|
||
(8..20, Text),
|
||
(0..20, End(MarkdownTagEnd::Link)),
|
||
(20..38, Text),
|
||
(
|
||
38..61,
|
||
Start(Link {
|
||
link_type: LinkType::Autolink,
|
||
dest_url: "https://example.com".into(),
|
||
title: "".into(),
|
||
id: "".into()
|
||
})
|
||
),
|
||
(38..53, Text),
|
||
(53..58, SubstitutedText(".".into())),
|
||
(58..61, Text),
|
||
(38..61, End(MarkdownTagEnd::Link)),
|
||
(61..62, Text),
|
||
(0..62, End(MarkdownTagEnd::Paragraph))
|
||
],
|
||
);
|
||
|
||
assert_eq!(
|
||
parse_markdown("Visit https://example.com/cat\\/é‍☕ for coffee!").0,
|
||
[
|
||
(0..55, Start(Paragraph)),
|
||
(0..6, Text),
|
||
(
|
||
6..43,
|
||
Start(Link {
|
||
link_type: LinkType::Autolink,
|
||
dest_url: "https://example.com/cat/é\u{200d}☕".into(),
|
||
title: "".into(),
|
||
id: "".into()
|
||
})
|
||
),
|
||
(6..29, Text),
|
||
(30..33, Text),
|
||
(33..40, SubstitutedText("\u{200d}".into())),
|
||
(40..43, Text),
|
||
(6..43, End(MarkdownTagEnd::Link)),
|
||
(43..55, Text),
|
||
(0..55, End(MarkdownTagEnd::Paragraph))
|
||
]
|
||
);
|
||
}
|
||
}
|