Fix markdown escaping

Closes #29255

Release Notes:

- Improved handling of markdown escape sequences
This commit is contained in:
Conrad Irwin 2025-04-25 16:53:44 -06:00 committed by GitHub
parent d23024609f
commit 053fafa90e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 200 additions and 112 deletions

View file

@ -21,7 +21,6 @@ function a(b: T) {
} }
``` ```
Remember, markdown processors may have slight differences and extensions, so always refer to the specific documentation or guides relevant to your platform or editor for the best practices and additional features. Remember, markdown processors may have slight differences and extensions, so always refer to the specific documentation or guides relevant to your platform or editor for the best practices and additional features.
"#; "#;

View file

@ -221,7 +221,12 @@ impl Markdown {
.count(); .count();
if count > 0 { if count > 0 {
let mut output = String::with_capacity(s.len() + count); let mut output = String::with_capacity(s.len() + count);
let mut is_newline = false;
for c in s.chars() { for c in s.chars() {
if is_newline && c == ' ' {
continue;
}
is_newline = c == '\n';
if c == '\n' { if c == '\n' {
output.push('\n') output.push('\n')
} else if c.is_ascii_punctuation() { } else if c.is_ascii_punctuation() {
@ -1722,6 +1727,15 @@ mod tests {
rendered.text rendered.text
} }
#[test]
fn test_escape() {
assert_eq!(Markdown::escape("hello `world`"), "hello \\`world\\`");
assert_eq!(
Markdown::escape("hello\n cool world"),
"hello\n\ncool world"
);
}
#[track_caller] #[track_caller]
fn assert_mappings(rendered: &RenderedText, expected: Vec<Vec<(usize, usize)>>) { fn assert_mappings(rendered: &RenderedText, expected: Vec<Vec<(usize, usize)>>) {
assert_eq!(rendered.lines.len(), expected.len(), "line count mismatch"); assert_eq!(rendered.lines.len(), expected.len(), "line count mismatch");

View file

@ -2,14 +2,9 @@ use gpui::SharedString;
use linkify::LinkFinder; use linkify::LinkFinder;
pub use pulldown_cmark::TagEnd as MarkdownTagEnd; pub use pulldown_cmark::TagEnd as MarkdownTagEnd;
use pulldown_cmark::{ use pulldown_cmark::{
Alignment, HeadingLevel, InlineStr, LinkType, MetadataBlockKind, Options, Parser, Alignment, CowStr, HeadingLevel, LinkType, MetadataBlockKind, Options, Parser,
};
use std::{
collections::HashSet,
ops::{Deref, Range},
path::Path,
sync::Arc,
}; };
use std::{collections::HashSet, ops::Range, path::Path, sync::Arc};
use crate::path_range::PathWithRange; use crate::path_range::PathWithRange;
@ -35,7 +30,10 @@ pub fn parse_markdown(
let mut language_paths = HashSet::new(); let mut language_paths = HashSet::new();
let mut within_link = false; let mut within_link = false;
let mut within_metadata = false; let mut within_metadata = false;
for (pulldown_event, mut range) in Parser::new_ext(text, PARSE_OPTIONS).into_offset_iter() { let mut parser = Parser::new_ext(text, PARSE_OPTIONS)
.into_offset_iter()
.peekable();
while let Some((pulldown_event, mut range)) = parser.next() {
if within_metadata { if within_metadata {
if let pulldown_cmark::Event::End(pulldown_cmark::TagEnd::MetadataBlock { .. }) = if let pulldown_cmark::Event::End(pulldown_cmark::TagEnd::MetadataBlock { .. }) =
pulldown_event pulldown_event
@ -175,47 +173,125 @@ pub fn parse_markdown(
events.push((range, MarkdownEvent::End(tag))); events.push((range, MarkdownEvent::End(tag)));
} }
pulldown_cmark::Event::Text(parsed) => { pulldown_cmark::Event::Text(parsed) => {
// `parsed` will share bytes with the input unless a substitution like handling of fn event_for(
// HTML entities or smart punctuation has occurred. When these substitutions occur, text: &str,
// `parsed` only consists of the result of a single substitution. range: Range<usize>,
if !cow_str_points_inside(&parsed, text) { str: &str,
events.push((range, MarkdownEvent::SubstitutedText(parsed.into()))); ) -> (Range<usize>, MarkdownEvent) {
} else { if str == &text[range.clone()] {
// Automatically detect links in text if not already within a markdown link. (range, MarkdownEvent::Text)
if !within_link { } else {
let mut finder = LinkFinder::new(); (range, MarkdownEvent::SubstitutedText(str.to_owned()))
finder.kinds(&[linkify::LinkKind::Url]); }
let text_range = range.clone(); }
for link in finder.links(&text[text_range.clone()]) { struct TextRange<'a> {
let link_range = source_range: Range<usize>,
text_range.start + link.start()..text_range.start + link.end(); merged_range: Range<usize>,
parsed: CowStr<'a>,
}
if link_range.start > range.start { let mut last_len = parsed.len();
events.push((range.start..link_range.start, MarkdownEvent::Text)); let mut ranges = vec![TextRange {
} source_range: range.clone(),
merged_range: 0..last_len,
parsed,
}];
events.push(( while matches!(parser.peek(), Some((pulldown_cmark::Event::Text(_), _))) {
link_range.clone(), let Some((pulldown_cmark::Event::Text(next_event), next_range)) = parser.next()
MarkdownEvent::Start(MarkdownTag::Link { else {
link_type: LinkType::Autolink, unreachable!()
dest_url: SharedString::from(link.as_str().to_string()), };
title: SharedString::default(), let next_len = last_len + next_event.len();
id: SharedString::default(), ranges.push(TextRange {
}), source_range: next_range.clone(),
)); merged_range: last_len..next_len,
parsed: next_event,
});
last_len = next_len;
}
events.push((link_range.clone(), MarkdownEvent::Text)); let mut merged_text =
events.push(( String::with_capacity(ranges.last().unwrap().merged_range.end);
link_range.clone(), for range in &ranges {
MarkdownEvent::End(MarkdownTagEnd::Link), merged_text.push_str(&range.parsed);
)); }
range.start = link_range.end; let mut ranges = ranges.into_iter().peekable();
if !within_link {
let mut finder = LinkFinder::new();
finder.kinds(&[linkify::LinkKind::Url]);
// Find links in the merged text
for link in finder.links(&merged_text) {
let link_start_in_merged = link.start();
let link_end_in_merged = link.end();
while ranges
.peek()
.is_some_and(|range| range.merged_range.end <= link_start_in_merged)
{
let range = ranges.next().unwrap();
events.push(event_for(text, range.source_range, &range.parsed));
} }
let range = ranges.peek_mut().unwrap();
let prefix_len = link_start_in_merged - range.merged_range.start;
if prefix_len > 0 {
let (head, tail) = range.parsed.split_at(prefix_len);
events.push(event_for(
text,
range.source_range.start..range.source_range.start + prefix_len,
&head,
));
range.parsed = CowStr::Boxed(tail.into());
range.merged_range.start += prefix_len;
range.source_range.start += prefix_len;
}
let link_start_in_source = range.source_range.start;
let mut link_events = Vec::new();
while ranges
.peek()
.is_some_and(|range| range.merged_range.end <= link_end_in_merged)
{
let range = ranges.next().unwrap();
link_events.push(event_for(text, range.source_range, &range.parsed));
}
let range = ranges.peek_mut().unwrap();
let prefix_len = link_end_in_merged - range.merged_range.start;
if prefix_len > 0 {
let (head, tail) = range.parsed.split_at(prefix_len);
link_events.push(event_for(
text,
range.source_range.start..range.source_range.start + prefix_len,
head,
));
range.parsed = CowStr::Boxed(tail.into());
range.merged_range.start += prefix_len;
range.source_range.start += prefix_len;
}
let link_range = link_start_in_source..range.source_range.start;
events.push((
link_range.clone(),
MarkdownEvent::Start(MarkdownTag::Link {
link_type: LinkType::Autolink,
dest_url: SharedString::from(link.as_str().to_string()),
title: SharedString::default(),
id: SharedString::default(),
}),
));
events.extend(link_events);
events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
} }
if range.start < range.end { }
events.push((range, MarkdownEvent::Text));
} for range in ranges {
events.push(event_for(text, range.source_range, &range.parsed));
} }
} }
pulldown_cmark::Event::Code(_) => { pulldown_cmark::Event::Code(_) => {
@ -291,7 +367,7 @@ pub enum MarkdownEvent {
Text, Text,
/// Text that differs from the markdown source - typically due to substitution of HTML entities /// Text that differs from the markdown source - typically due to substitution of HTML entities
/// and smart punctuation. /// and smart punctuation.
SubstitutedText(CompactStr), SubstitutedText(String),
/// An inline code node. /// An inline code node.
Code, Code,
/// An HTML node. /// An HTML node.
@ -429,73 +505,6 @@ pub(crate) fn extract_code_block_content_range(text: &str) -> Range<usize> {
range range
} }
/// Represents either an owned or inline string. Motivation for this is to make `SubstitutedText`
/// more efficient - it fits within a `pulldown_cmark::InlineStr` in all known cases.
///
/// Same as `pulldown_cmark::CowStr` but without the `Borrow` case.
#[derive(Clone)]
pub enum CompactStr {
Boxed(Box<str>),
Inlined(InlineStr),
}
impl std::fmt::Debug for CompactStr {
fn fmt(&self, formatter: &mut std::fmt::Formatter) -> Result<(), std::fmt::Error> {
self.deref().fmt(formatter)
}
}
impl Deref for CompactStr {
type Target = str;
fn deref(&self) -> &str {
match self {
CompactStr::Boxed(b) => b,
CompactStr::Inlined(i) => i,
}
}
}
impl From<&str> for CompactStr {
fn from(s: &str) -> Self {
if let Ok(inlined) = s.try_into() {
CompactStr::Inlined(inlined)
} else {
CompactStr::Boxed(s.into())
}
}
}
impl From<pulldown_cmark::CowStr<'_>> for CompactStr {
fn from(cow_str: pulldown_cmark::CowStr) -> Self {
match cow_str {
pulldown_cmark::CowStr::Boxed(b) => CompactStr::Boxed(b),
pulldown_cmark::CowStr::Borrowed(b) => b.into(),
pulldown_cmark::CowStr::Inlined(i) => CompactStr::Inlined(i),
}
}
}
impl PartialEq for CompactStr {
fn eq(&self, other: &Self) -> bool {
self.deref() == other.deref()
}
}
fn cow_str_points_inside(substring: &pulldown_cmark::CowStr, container: &str) -> bool {
match substring {
pulldown_cmark::CowStr::Boxed(b) => str_points_inside(b, container),
pulldown_cmark::CowStr::Borrowed(b) => str_points_inside(b, container),
pulldown_cmark::CowStr::Inlined(_) => false,
}
}
fn str_points_inside(substring: &str, container: &str) -> bool {
let substring_ptr = substring.as_ptr();
let container_ptr = container.as_ptr();
unsafe { substring_ptr >= container_ptr && substring_ptr < container_ptr.add(container.len()) }
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::MarkdownEvent::*; use super::MarkdownEvent::*;
@ -621,4 +630,70 @@ mod tests {
let input = "```python\nprint('hello')\nprint('world')\n```"; let input = "```python\nprint('hello')\nprint('world')\n```";
assert_eq!(extract_code_block_content_range(input), 10..40); assert_eq!(extract_code_block_content_range(input), 10..40);
} }
#[test]
fn test_links_split_across_fragments() {
// This test verifies that links split across multiple text fragments due to escaping or other issues
// are correctly detected and processed
// Note: In real usage, pulldown_cmark creates separate text events for the escaped character
// We're verifying our parser can handle this correctly
assert_eq!(
parse_markdown("https:/\\/example.com is equivalent to https://example&#46;com!").0,
vec![
(0..62, Start(Paragraph)),
(
0..20,
Start(Link {
link_type: LinkType::Autolink,
dest_url: "https://example.com".into(),
title: "".into(),
id: "".into()
})
),
(0..7, Text),
(8..20, Text),
(0..20, End(MarkdownTagEnd::Link)),
(20..38, Text),
(
38..61,
Start(Link {
link_type: LinkType::Autolink,
dest_url: "https://example.com".into(),
title: "".into(),
id: "".into()
})
),
(38..53, Text),
(53..58, SubstitutedText(".".into())),
(58..61, Text),
(38..61, End(MarkdownTagEnd::Link)),
(61..62, Text),
(0..62, End(MarkdownTagEnd::Paragraph))
],
);
assert_eq!(
parse_markdown("Visit https://example.com/cat\\/é&#8205;☕ for coffee!").0,
[
(0..55, Start(Paragraph)),
(0..6, Text),
(
6..43,
Start(Link {
link_type: LinkType::Autolink,
dest_url: "https://example.com/cat/é\u{200d}".into(),
title: "".into(),
id: "".into()
})
),
(6..29, Text),
(30..33, Text),
(33..40, SubstitutedText("\u{200d}".into())),
(40..43, Text),
(6..43, End(MarkdownTagEnd::Link)),
(43..55, Text),
(0..55, End(MarkdownTagEnd::Paragraph))
]
);
}
} }