use gpui::SharedString; use linkify::LinkFinder; pub use pulldown_cmark::TagEnd as MarkdownTagEnd; use pulldown_cmark::{ Alignment, CowStr, HeadingLevel, LinkType, MetadataBlockKind, Options, Parser, }; use std::{collections::HashSet, ops::Range, path::Path, sync::Arc}; use crate::path_range::PathWithRange; const PARSE_OPTIONS: Options = Options::ENABLE_TABLES .union(Options::ENABLE_FOOTNOTES) .union(Options::ENABLE_STRIKETHROUGH) .union(Options::ENABLE_TASKLISTS) .union(Options::ENABLE_SMART_PUNCTUATION) .union(Options::ENABLE_HEADING_ATTRIBUTES) .union(Options::ENABLE_PLUSES_DELIMITED_METADATA_BLOCKS) .union(Options::ENABLE_OLD_FOOTNOTES) .union(Options::ENABLE_GFM); pub fn parse_markdown( text: &str, ) -> ( Vec<(Range, MarkdownEvent)>, HashSet, HashSet>, ) { let mut events = Vec::new(); let mut language_names = HashSet::new(); let mut language_paths = HashSet::new(); let mut within_link = false; let mut within_metadata = false; let mut parser = Parser::new_ext(text, PARSE_OPTIONS) .into_offset_iter() .peekable(); while let Some((pulldown_event, range)) = parser.next() { if within_metadata { if let pulldown_cmark::Event::End(pulldown_cmark::TagEnd::MetadataBlock { .. }) = pulldown_event { within_metadata = false; } continue; } match pulldown_event { pulldown_cmark::Event::Start(tag) => { let tag = match tag { pulldown_cmark::Tag::Link { link_type, dest_url, title, id, } => { within_link = true; MarkdownTag::Link { link_type, dest_url: SharedString::from(dest_url.into_string()), title: SharedString::from(title.into_string()), id: SharedString::from(id.into_string()), } } pulldown_cmark::Tag::MetadataBlock(kind) => { within_metadata = true; MarkdownTag::MetadataBlock(kind) } pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Indented) => { MarkdownTag::CodeBlock { kind: CodeBlockKind::Indented, metadata: CodeBlockMetadata { content_range: range.start + 1..range.end + 1, line_count: 1, }, } } pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Fenced( ref info, )) => { let content_range = extract_code_block_content_range(&text[range.clone()]); let content_range = content_range.start + range.start..content_range.end + range.start; // Valid to use bytes since multi-byte UTF-8 doesn't use ASCII chars. let line_count = text[content_range.clone()] .bytes() .filter(|c| *c == b'\n') .count(); let metadata = CodeBlockMetadata { content_range, line_count, }; let info = info.trim(); let kind = if info.is_empty() { CodeBlockKind::Fenced // Languages should never contain a slash, and PathRanges always should. // (Models are told to specify them relative to a workspace root.) } else if info.contains('/') { let path_range = PathWithRange::new(info); language_paths.insert(path_range.path.clone()); CodeBlockKind::FencedSrc(path_range) } else { let language = SharedString::from(info.to_string()); language_names.insert(language.clone()); CodeBlockKind::FencedLang(language) }; MarkdownTag::CodeBlock { kind, metadata } } pulldown_cmark::Tag::Paragraph => MarkdownTag::Paragraph, pulldown_cmark::Tag::Heading { level, id, classes, attrs, } => { let id = id.map(|id| SharedString::from(id.into_string())); let classes = classes .into_iter() .map(|c| SharedString::from(c.into_string())) .collect(); let attrs = attrs .into_iter() .map(|(key, value)| { ( SharedString::from(key.into_string()), value.map(|v| SharedString::from(v.into_string())), ) }) .collect(); MarkdownTag::Heading { level, id, classes, attrs, } } pulldown_cmark::Tag::BlockQuote(_kind) => MarkdownTag::BlockQuote, pulldown_cmark::Tag::List(start_number) => MarkdownTag::List(start_number), pulldown_cmark::Tag::Item => MarkdownTag::Item, pulldown_cmark::Tag::FootnoteDefinition(label) => { MarkdownTag::FootnoteDefinition(SharedString::from(label.to_string())) } pulldown_cmark::Tag::Table(alignments) => MarkdownTag::Table(alignments), pulldown_cmark::Tag::TableHead => MarkdownTag::TableHead, pulldown_cmark::Tag::TableRow => MarkdownTag::TableRow, pulldown_cmark::Tag::TableCell => MarkdownTag::TableCell, pulldown_cmark::Tag::Emphasis => MarkdownTag::Emphasis, pulldown_cmark::Tag::Strong => MarkdownTag::Strong, pulldown_cmark::Tag::Strikethrough => MarkdownTag::Strikethrough, pulldown_cmark::Tag::Image { link_type, dest_url, title, id, } => MarkdownTag::Image { link_type, dest_url: SharedString::from(dest_url.into_string()), title: SharedString::from(title.into_string()), id: SharedString::from(id.into_string()), }, pulldown_cmark::Tag::HtmlBlock => MarkdownTag::HtmlBlock, pulldown_cmark::Tag::DefinitionList => MarkdownTag::DefinitionList, pulldown_cmark::Tag::DefinitionListTitle => MarkdownTag::DefinitionListTitle, pulldown_cmark::Tag::DefinitionListDefinition => { MarkdownTag::DefinitionListDefinition } }; events.push((range, MarkdownEvent::Start(tag))) } pulldown_cmark::Event::End(tag) => { if let pulldown_cmark::TagEnd::Link = tag { within_link = false; } events.push((range, MarkdownEvent::End(tag))); } pulldown_cmark::Event::Text(parsed) => { fn event_for( text: &str, range: Range, str: &str, ) -> (Range, MarkdownEvent) { if str == &text[range.clone()] { (range, MarkdownEvent::Text) } else { (range, MarkdownEvent::SubstitutedText(str.to_owned())) } } #[derive(Debug)] struct TextRange<'a> { source_range: Range, merged_range: Range, parsed: CowStr<'a>, } let mut last_len = parsed.len(); let mut ranges = vec![TextRange { source_range: range.clone(), merged_range: 0..last_len, parsed, }]; while matches!(parser.peek(), Some((pulldown_cmark::Event::Text(_), _))) { let Some((pulldown_cmark::Event::Text(next_event), next_range)) = parser.next() else { unreachable!() }; let next_len = last_len + next_event.len(); ranges.push(TextRange { source_range: next_range.clone(), merged_range: last_len..next_len, parsed: next_event, }); last_len = next_len; } let mut merged_text = String::with_capacity(ranges.last().unwrap().merged_range.end); for range in &ranges { merged_text.push_str(&range.parsed); } let mut ranges = ranges.into_iter().peekable(); if !within_link { let mut finder = LinkFinder::new(); finder.kinds(&[linkify::LinkKind::Url]); // Find links in the merged text for link in finder.links(&merged_text) { let link_start_in_merged = link.start(); let link_end_in_merged = link.end(); while ranges .peek() .is_some_and(|range| range.merged_range.end <= link_start_in_merged) { let range = ranges.next().unwrap(); events.push(event_for(text, range.source_range, &range.parsed)); } let Some(range) = ranges.peek_mut() else { continue; }; let prefix_len = link_start_in_merged - range.merged_range.start; if prefix_len > 0 { let (head, tail) = range.parsed.split_at(prefix_len); events.push(event_for( text, range.source_range.start..range.source_range.start + prefix_len, &head, )); range.parsed = CowStr::Boxed(tail.into()); range.merged_range.start += prefix_len; range.source_range.start += prefix_len; } let link_start_in_source = range.source_range.start; let mut link_end_in_source = range.source_range.end; let mut link_events = Vec::new(); while ranges .peek() .is_some_and(|range| range.merged_range.end <= link_end_in_merged) { let range = ranges.next().unwrap(); link_end_in_source = range.source_range.end; link_events.push(event_for(text, range.source_range, &range.parsed)); } if let Some(range) = ranges.peek_mut() { let prefix_len = link_end_in_merged - range.merged_range.start; if prefix_len > 0 { let (head, tail) = range.parsed.split_at(prefix_len); link_events.push(event_for( text, range.source_range.start..range.source_range.start + prefix_len, head, )); range.parsed = CowStr::Boxed(tail.into()); range.merged_range.start += prefix_len; range.source_range.start += prefix_len; link_end_in_source = range.source_range.start; } } let link_range = link_start_in_source..link_end_in_source; events.push(( link_range.clone(), MarkdownEvent::Start(MarkdownTag::Link { link_type: LinkType::Autolink, dest_url: SharedString::from(link.as_str().to_string()), title: SharedString::default(), id: SharedString::default(), }), )); events.extend(link_events); events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link))); } } for range in ranges { events.push(event_for(text, range.source_range, &range.parsed)); } } pulldown_cmark::Event::Code(_) => { let content_range = extract_code_content_range(&text[range.clone()]); let content_range = content_range.start + range.start..content_range.end + range.start; events.push((content_range, MarkdownEvent::Code)) } pulldown_cmark::Event::Html(_) => events.push((range, MarkdownEvent::Html)), pulldown_cmark::Event::InlineHtml(_) => events.push((range, MarkdownEvent::InlineHtml)), pulldown_cmark::Event::FootnoteReference(_) => { events.push((range, MarkdownEvent::FootnoteReference)) } pulldown_cmark::Event::SoftBreak => events.push((range, MarkdownEvent::SoftBreak)), pulldown_cmark::Event::HardBreak => events.push((range, MarkdownEvent::HardBreak)), pulldown_cmark::Event::Rule => events.push((range, MarkdownEvent::Rule)), pulldown_cmark::Event::TaskListMarker(checked) => { events.push((range, MarkdownEvent::TaskListMarker(checked))) } pulldown_cmark::Event::InlineMath(_) | pulldown_cmark::Event::DisplayMath(_) => {} } } (events, language_names, language_paths) } pub fn parse_links_only(text: &str) -> Vec<(Range, MarkdownEvent)> { let mut events = Vec::new(); let mut finder = LinkFinder::new(); finder.kinds(&[linkify::LinkKind::Url]); let mut text_range = Range { start: 0, end: text.len(), }; for link in finder.links(text) { let link_range = link.start()..link.end(); if link_range.start > text_range.start { events.push((text_range.start..link_range.start, MarkdownEvent::Text)); } events.push(( link_range.clone(), MarkdownEvent::Start(MarkdownTag::Link { link_type: LinkType::Autolink, dest_url: SharedString::from(link.as_str().to_string()), title: SharedString::default(), id: SharedString::default(), }), )); events.push((link_range.clone(), MarkdownEvent::Text)); events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link))); text_range.start = link_range.end; } if text_range.end > text_range.start { events.push((text_range, MarkdownEvent::Text)); } events } /// A static-lifetime equivalent of pulldown_cmark::Event so we can cache the /// parse result for rendering without resorting to unsafe lifetime coercion. #[derive(Clone, Debug, PartialEq)] pub enum MarkdownEvent { /// Start of a tagged element. Events that are yielded after this event /// and before its corresponding `End` event are inside this element. /// Start and end events are guaranteed to be balanced. Start(MarkdownTag), /// End of a tagged element. End(MarkdownTagEnd), /// Text that uses the associated range from the markdown source. Text, /// Text that differs from the markdown source - typically due to substitution of HTML entities /// and smart punctuation. SubstitutedText(String), /// An inline code node. Code, /// An HTML node. Html, /// An inline HTML node. InlineHtml, /// A reference to a footnote with given label, which may or may not be defined /// by an event with a `Tag::FootnoteDefinition` tag. Definitions and references to them may /// occur in any order. FootnoteReference, /// A soft line break. SoftBreak, /// A hard line break. HardBreak, /// A horizontal ruler. Rule, /// A task list marker, rendered as a checkbox in HTML. Contains a true when it is checked. TaskListMarker(bool), } /// Tags for elements that can contain other elements. #[derive(Clone, Debug, PartialEq)] pub enum MarkdownTag { /// A paragraph of text and other inline elements. Paragraph, /// A heading, with optional identifier, classes and custom attributes. /// The identifier is prefixed with `#` and the last one in the attributes /// list is chosen, classes are prefixed with `.` and custom attributes /// have no prefix and can optionally have a value (`myattr` o `myattr=myvalue`). Heading { level: HeadingLevel, id: Option, classes: Vec, /// The first item of the tuple is the attr and second one the value. attrs: Vec<(SharedString, Option)>, }, BlockQuote, /// A code block. CodeBlock { kind: CodeBlockKind, metadata: CodeBlockMetadata, }, /// A HTML block. HtmlBlock, /// A list. If the list is ordered the field indicates the number of the first item. /// Contains only list items. List(Option), // TODO: add delim and tight for ast (not needed for html) /// A list item. Item, /// A footnote definition. The value contained is the footnote's label by which it can /// be referred to. FootnoteDefinition(SharedString), /// A table. Contains a vector describing the text-alignment for each of its columns. Table(Vec), /// A table header. Contains only `TableCell`s. Note that the table body starts immediately /// after the closure of the `TableHead` tag. There is no `TableBody` tag. TableHead, /// A table row. Is used both for header rows as body rows. Contains only `TableCell`s. TableRow, TableCell, // span-level tags Emphasis, Strong, Strikethrough, /// A link. Link { link_type: LinkType, dest_url: SharedString, title: SharedString, /// Identifier of reference links, e.g. `world` in the link `[hello][world]`. id: SharedString, }, /// An image. The first field is the link type, the second the destination URL and the third is a title, /// the fourth is the link identifier. Image { link_type: LinkType, dest_url: SharedString, title: SharedString, /// Identifier of reference links, e.g. `world` in the link `[hello][world]`. id: SharedString, }, /// A metadata block. MetadataBlock(MetadataBlockKind), DefinitionList, DefinitionListTitle, DefinitionListDefinition, } #[derive(Clone, Debug, PartialEq)] pub enum CodeBlockKind { Indented, /// "Fenced" means "surrounded by triple backticks." /// There can optionally be either a language after the backticks (like in traditional Markdown) /// or, if an agent is specifying a path for a source location in the project, it can be a PathRange, /// e.g. ```path/to/foo.rs#L123-456 instead of ```rust Fenced, FencedLang(SharedString), FencedSrc(PathWithRange), } #[derive(Default, Clone, Debug, PartialEq)] pub struct CodeBlockMetadata { pub content_range: Range, pub line_count: usize, } fn extract_code_content_range(text: &str) -> Range { let text_len = text.len(); if text_len == 0 { return 0..0; } let start_ticks = text.chars().take_while(|&c| c == '`').count(); if start_ticks == 0 || start_ticks > text_len { return 0..text_len; } let end_ticks = text.chars().rev().take_while(|&c| c == '`').count(); if end_ticks != start_ticks || text_len < start_ticks + end_ticks { return 0..text_len; } start_ticks..text_len - end_ticks } pub(crate) fn extract_code_block_content_range(text: &str) -> Range { let mut range = 0..text.len(); if text.starts_with("```") { range.start += 3; if let Some(newline_ix) = text[range.clone()].find('\n') { range.start += newline_ix + 1; } } if !range.is_empty() && text.ends_with("```") { range.end -= 3; } if range.start > range.end { range.end = range.start; } range } #[cfg(test)] mod tests { use super::MarkdownEvent::*; use super::MarkdownTag::*; use super::*; const UNWANTED_OPTIONS: Options = Options::ENABLE_YAML_STYLE_METADATA_BLOCKS .union(Options::ENABLE_MATH) .union(Options::ENABLE_DEFINITION_LIST); #[test] fn all_options_considered() { // The purpose of this is to fail when new options are added to pulldown_cmark, so that they // can be evaluated for inclusion. assert_eq!(PARSE_OPTIONS.union(UNWANTED_OPTIONS), Options::all()); } #[test] fn wanted_and_unwanted_options_disjoint() { assert_eq!( PARSE_OPTIONS.intersection(UNWANTED_OPTIONS), Options::empty() ); } #[test] fn test_html_comments() { assert_eq!( parse_markdown(" \nReturns"), ( vec![ (2..30, Start(HtmlBlock)), (2..2, SubstitutedText(" ".into())), (2..7, Html), (7..26, Html), (26..30, Html), (2..30, End(MarkdownTagEnd::HtmlBlock)), (30..37, Start(Paragraph)), (30..37, Text), (30..37, End(MarkdownTagEnd::Paragraph)) ], HashSet::new(), HashSet::new() ) ) } #[test] fn test_plain_urls_and_escaped_text() { assert_eq!( parse_markdown("   https://some.url some \\`►\\` text"), ( vec![ (0..51, Start(Paragraph)), (0..6, SubstitutedText("\u{a0}".into())), (6..12, SubstitutedText("\u{a0}".into())), (12..13, Text), ( 13..29, Start(Link { link_type: LinkType::Autolink, dest_url: "https://some.url".into(), title: "".into(), id: "".into(), }) ), (13..29, Text), (13..29, End(MarkdownTagEnd::Link)), (29..35, Text), (36..37, Text), // Escaped backtick (37..44, SubstitutedText("►".into())), (45..46, Text), // Escaped backtick (46..51, Text), (0..51, End(MarkdownTagEnd::Paragraph)) ], HashSet::new(), HashSet::new() ) ); } #[test] fn test_incomplete_link() { assert_eq!( parse_markdown("You can use the [GitHub Search API](https://docs.github.com/en").0, vec![ (0..62, Start(Paragraph)), (0..16, Text), (16..17, Text), (17..34, Text), (34..35, Text), (35..36, Text), ( 36..62, Start(Link { link_type: LinkType::Autolink, dest_url: "https://docs.github.com/en".into(), title: "".into(), id: "".into() }) ), (36..62, Text), (36..62, End(MarkdownTagEnd::Link)), (0..62, End(MarkdownTagEnd::Paragraph)) ], ); } #[test] fn test_smart_punctuation() { assert_eq!( parse_markdown("-- --- ... \"double quoted\" 'single quoted' ----------"), ( vec![ (0..53, Start(Paragraph)), (0..2, SubstitutedText("–".into())), (2..3, Text), (3..6, SubstitutedText("—".into())), (6..7, Text), (7..10, SubstitutedText("…".into())), (10..11, Text), (11..12, SubstitutedText("“".into())), (12..25, Text), (25..26, SubstitutedText("”".into())), (26..27, Text), (27..28, SubstitutedText("‘".into())), (28..41, Text), (41..42, SubstitutedText("’".into())), (42..43, Text), (43..53, SubstitutedText("–––––".into())), (0..53, End(MarkdownTagEnd::Paragraph)) ], HashSet::new(), HashSet::new() ) ) } #[test] fn test_code_block_metadata() { assert_eq!( parse_markdown("```rust\nfn main() {\n let a = 1;\n}\n```"), ( vec![ ( 0..37, Start(CodeBlock { kind: CodeBlockKind::FencedLang("rust".into()), metadata: CodeBlockMetadata { content_range: 8..34, line_count: 3 } }) ), (8..34, Text), (0..37, End(MarkdownTagEnd::CodeBlock)), ], HashSet::from(["rust".into()]), HashSet::new() ) ) } #[test] fn test_extract_code_content_range() { let input = "```let x = 5;```"; assert_eq!(extract_code_content_range(input), 3..13); let input = "``let x = 5;``"; assert_eq!(extract_code_content_range(input), 2..12); let input = "`let x = 5;`"; assert_eq!(extract_code_content_range(input), 1..11); let input = "plain text"; assert_eq!(extract_code_content_range(input), 0..10); let input = "``let x = 5;`"; assert_eq!(extract_code_content_range(input), 0..13); } #[test] fn test_extract_code_block_content_range() { let input = "```rust\nlet x = 5;\n```"; assert_eq!(extract_code_block_content_range(input), 8..19); let input = "plain text"; assert_eq!(extract_code_block_content_range(input), 0..10); let input = "```python\nprint('hello')\nprint('world')\n```"; assert_eq!(extract_code_block_content_range(input), 10..40); // Malformed input let input = "`````"; assert_eq!(extract_code_block_content_range(input), 3..3); } #[test] fn test_links_split_across_fragments() { // This test verifies that links split across multiple text fragments due to escaping or other issues // are correctly detected and processed // Note: In real usage, pulldown_cmark creates separate text events for the escaped character // We're verifying our parser can handle this correctly assert_eq!( parse_markdown("https:/\\/example.com is equivalent to https://example.com!").0, vec![ (0..62, Start(Paragraph)), ( 0..20, Start(Link { link_type: LinkType::Autolink, dest_url: "https://example.com".into(), title: "".into(), id: "".into() }) ), (0..7, Text), (8..20, Text), (0..20, End(MarkdownTagEnd::Link)), (20..38, Text), ( 38..61, Start(Link { link_type: LinkType::Autolink, dest_url: "https://example.com".into(), title: "".into(), id: "".into() }) ), (38..53, Text), (53..58, SubstitutedText(".".into())), (58..61, Text), (38..61, End(MarkdownTagEnd::Link)), (61..62, Text), (0..62, End(MarkdownTagEnd::Paragraph)) ], ); assert_eq!( parse_markdown("Visit https://example.com/cat\\/é‍☕ for coffee!").0, [ (0..55, Start(Paragraph)), (0..6, Text), ( 6..43, Start(Link { link_type: LinkType::Autolink, dest_url: "https://example.com/cat/é\u{200d}☕".into(), title: "".into(), id: "".into() }) ), (6..29, Text), (30..33, Text), (33..40, SubstitutedText("\u{200d}".into())), (40..43, Text), (6..43, End(MarkdownTagEnd::Link)), (43..55, Text), (0..55, End(MarkdownTagEnd::Paragraph)) ] ); } }