markdown: Track code block metadata in parser (#28543)

This lets us avoid scanning the code block content for newlines on every
frame in `active_thread`

Release Notes:

- N/A
This commit is contained in:
Bennet Bo Fenner 2025-04-10 15:49:08 -06:00 committed by GitHub
parent 73305ce45e
commit 44cb8e582b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 247 additions and 203 deletions

View file

@ -18,6 +18,7 @@ use gpui::{
TextStyleRefinement, actions, point, quad,
};
use language::{Language, LanguageRegistry, Rope};
use parser::CodeBlockMetadata;
use parser::{MarkdownEvent, MarkdownTag, MarkdownTagEnd, parse_links_only, parse_markdown};
use pulldown_cmark::Alignment;
use sum_tree::TreeMap;
@ -99,10 +100,19 @@ pub enum CodeBlockRenderer {
},
}
/// Shared callback that builds the container `Div` for a code block.
///
/// Matches the `render` callback of `CodeBlockRenderer::Custom`, which is called with
/// the block's kind, the parsed markdown, the block's source range, the window and the
/// app context; the returned `Div` is pushed as the code block's container.
pub type CodeBlockRenderFn =
Arc<dyn Fn(&CodeBlockKind, &ParsedMarkdown, Range<usize>, &mut Window, &App) -> Div>;
/// Shared callback that builds the container `Div` for a code block.
///
/// Matches the `render` callback of `CodeBlockRenderer::Custom`, which is called with
/// the block's kind, the parsed markdown, the block's source range, a clone of the
/// `CodeBlockMetadata` carried by the block's start tag, the window and the app
/// context; the returned `Div` is pushed as the code block's container.
pub type CodeBlockRenderFn = Arc<
dyn Fn(
&CodeBlockKind,
&ParsedMarkdown,
Range<usize>,
// Metadata taken from the `MarkdownTag::CodeBlock` start tag.
CodeBlockMetadata,
&mut Window,
&App,
) -> Div,
>;
/// Shared callback that post-processes the element of a code block when the block ends.
///
/// Matches the optional `transform` callback of `CodeBlockRenderer::Custom`, which is
/// applied to the current div with the block's source range, the window and the app
/// context, and returns the replacement element.
pub type CodeBlockTransformFn = Arc<dyn Fn(AnyDiv, Range<usize>, &mut Window, &App) -> AnyDiv>;
/// Shared callback that post-processes the element of a code block when the block ends.
///
/// Matches the optional `transform` callback of `CodeBlockRenderer::Custom`, which is
/// applied to the current div with the block's source range, the metadata recorded
/// when the block started (or `CodeBlockMetadata::default()` if none was recorded),
/// the window and the app context, and returns the replacement element.
pub type CodeBlockTransformFn =
Arc<dyn Fn(AnyDiv, Range<usize>, CodeBlockMetadata, &mut Window, &App) -> AnyDiv>;
actions!(markdown, [Copy, CopyAsMarkdown]);
@ -603,6 +613,8 @@ impl Element for MarkdownElement {
0
};
let mut current_code_block_metadata = None;
for (range, event) in parsed_markdown.events.iter() {
match event {
MarkdownEvent::Start(tag) => {
@ -641,7 +653,7 @@ impl Element for MarkdownElement {
markdown_end,
);
}
MarkdownTag::CodeBlock(kind) => {
MarkdownTag::CodeBlock { kind, metadata } => {
let language = match kind {
CodeBlockKind::Fenced => None,
CodeBlockKind::FencedLang(language) => {
@ -654,6 +666,8 @@ impl Element for MarkdownElement {
_ => None,
};
current_code_block_metadata = Some(metadata.clone());
let is_indented = matches!(kind, CodeBlockKind::Indented);
match (&self.code_block_renderer, is_indented) {
@ -686,8 +700,14 @@ impl Element for MarkdownElement {
builder.push_div(code_block, range, markdown_end);
}
(CodeBlockRenderer::Custom { render, .. }, _) => {
let parent_container =
render(kind, &parsed_markdown, range.clone(), window, cx);
let parent_container = render(
kind,
&parsed_markdown,
range.clone(),
metadata.clone(),
window,
cx,
);
builder.push_div(parent_container, range, markdown_end);
@ -852,12 +872,22 @@ impl Element for MarkdownElement {
builder.pop_text_style();
}
let metadata = current_code_block_metadata.take();
if let CodeBlockRenderer::Custom {
transform: Some(modify),
transform: Some(transform),
..
} = &self.code_block_renderer
{
builder.modify_current_div(|el| modify(el, range.clone(), window, cx));
builder.modify_current_div(|el| {
transform(
el,
range.clone(),
metadata.clone().unwrap_or_default(),
window,
cx,
)
});
}
if matches!(
@ -866,9 +896,13 @@ impl Element for MarkdownElement {
) {
builder.flush_text();
builder.modify_current_div(|el| {
let code =
without_fences(parsed_markdown.source()[range.clone()].trim())
.to_string();
let content_range = parser::extract_code_block_content_range(
parsed_markdown.source()[range.clone()].trim(),
);
let content_range = content_range.start + range.start
..content_range.end + range.start;
let code = parsed_markdown.source()[content_range].to_string();
let codeblock = render_copy_code_block_button(
range.end,
code,
@ -1507,43 +1541,3 @@ impl RenderedText {
.find(|link| link.source_range.contains(&source_index))
}
}
/// Strips a triple-backtick fence from a markdown code block, returning only its body.
///
/// Indented blocks carry no fence and are returned untouched. For fenced blocks,
/// everything up to and including the first newline after the opening fence is dropped
/// (which also removes a language name), and the text is cut at the last fence.
/// Used when copying code blocks to the clipboard.
pub fn without_fences(markdown: &str) -> &str {
    let body = match markdown.find("```") {
        Some(fence_ix) => {
            let fenced = &markdown[fence_ix..];
            // Skip the rest of the fence line; without a newline, keep the text as is.
            fenced.split_once('\n').map_or(fenced, |(_, rest)| rest)
        }
        None => markdown,
    };

    match body.rfind("```") {
        Some(closing_ix) => &body[..closing_ix],
        None => body,
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Table of (input, expected) pairs covering fenced blocks with and without a
    /// language, surrounding whitespace, and text without any fence.
    #[test]
    fn test_without_fences() {
        let cases = [
            ("```rust\nlet x = 5;\n```", "let x = 5;\n"),
            (" ```\nno language\n``` ", "no language\n"),
            ("plain text", "plain text"),
            (
                "```python\nprint('hello')\nprint('world')\n```",
                "print('hello')\nprint('world')\n",
            ),
        ];

        for (input, expected) in cases {
            assert_eq!(without_fences(input), expected);
        }
    }
}

View file

@ -65,11 +65,33 @@ pub fn parse_markdown(
within_metadata = true;
MarkdownTag::MetadataBlock(kind)
}
pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Indented) => {
MarkdownTag::CodeBlock {
kind: CodeBlockKind::Indented,
metadata: CodeBlockMetadata {
content_range: range.start + 1..range.end + 1,
line_count: 1,
},
}
}
pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Fenced(
ref info,
)) => {
let content_range = extract_code_block_content_range(&text[range.clone()]);
let content_range =
content_range.start + range.start..content_range.end + range.start;
let line_count = text[content_range.clone()]
.bytes()
.filter(|c| *c == b'\n')
.count();
let metadata = CodeBlockMetadata {
content_range,
line_count,
};
let info = info.trim();
MarkdownTag::CodeBlock(if info.is_empty() {
let kind = if info.is_empty() {
CodeBlockKind::Fenced
// Languages should never contain a slash, and PathRanges always should.
// (Models are told to specify them relative to a workspace root.)
@ -81,9 +103,68 @@ pub fn parse_markdown(
let language = SharedString::from(info.to_string());
language_names.insert(language.clone());
CodeBlockKind::FencedLang(language)
})
};
MarkdownTag::CodeBlock { kind, metadata }
}
pulldown_cmark::Tag::Paragraph => MarkdownTag::Paragraph,
pulldown_cmark::Tag::Heading {
level,
id,
classes,
attrs,
} => {
let id = id.map(|id| SharedString::from(id.into_string()));
let classes = classes
.into_iter()
.map(|c| SharedString::from(c.into_string()))
.collect();
let attrs = attrs
.into_iter()
.map(|(key, value)| {
(
SharedString::from(key.into_string()),
value.map(|v| SharedString::from(v.into_string())),
)
})
.collect();
MarkdownTag::Heading {
level,
id,
classes,
attrs,
}
}
pulldown_cmark::Tag::BlockQuote(_kind) => MarkdownTag::BlockQuote,
pulldown_cmark::Tag::List(start_number) => MarkdownTag::List(start_number),
pulldown_cmark::Tag::Item => MarkdownTag::Item,
pulldown_cmark::Tag::FootnoteDefinition(label) => {
MarkdownTag::FootnoteDefinition(SharedString::from(label.to_string()))
}
pulldown_cmark::Tag::Table(alignments) => MarkdownTag::Table(alignments),
pulldown_cmark::Tag::TableHead => MarkdownTag::TableHead,
pulldown_cmark::Tag::TableRow => MarkdownTag::TableRow,
pulldown_cmark::Tag::TableCell => MarkdownTag::TableCell,
pulldown_cmark::Tag::Emphasis => MarkdownTag::Emphasis,
pulldown_cmark::Tag::Strong => MarkdownTag::Strong,
pulldown_cmark::Tag::Strikethrough => MarkdownTag::Strikethrough,
pulldown_cmark::Tag::Image {
link_type,
dest_url,
title,
id,
} => MarkdownTag::Image {
link_type,
dest_url: SharedString::from(dest_url.into_string()),
title: SharedString::from(title.into_string()),
id: SharedString::from(id.into_string()),
},
pulldown_cmark::Tag::HtmlBlock => MarkdownTag::HtmlBlock,
pulldown_cmark::Tag::DefinitionList => MarkdownTag::DefinitionList,
pulldown_cmark::Tag::DefinitionListTitle => MarkdownTag::DefinitionListTitle,
pulldown_cmark::Tag::DefinitionListDefinition => {
MarkdownTag::DefinitionListDefinition
}
tag => tag.into(),
};
events.push((range, MarkdownEvent::Start(tag)))
}
@ -252,7 +333,10 @@ pub enum MarkdownTag {
BlockQuote,
/// A code block.
CodeBlock(CodeBlockKind),
CodeBlock {
kind: CodeBlockKind,
metadata: CodeBlockMetadata,
},
/// A HTML block.
HtmlBlock,
@ -323,96 +407,26 @@ pub enum CodeBlockKind {
FencedSrc(PathWithRange),
}
impl From<pulldown_cmark::Tag<'_>> for MarkdownTag {
fn from(tag: pulldown_cmark::Tag) -> Self {
match tag {
pulldown_cmark::Tag::Paragraph => MarkdownTag::Paragraph,
pulldown_cmark::Tag::Heading {
level,
id,
classes,
attrs,
} => {
let id = id.map(|id| SharedString::from(id.into_string()));
let classes = classes
.into_iter()
.map(|c| SharedString::from(c.into_string()))
.collect();
let attrs = attrs
.into_iter()
.map(|(key, value)| {
(
SharedString::from(key.into_string()),
value.map(|v| SharedString::from(v.into_string())),
)
})
.collect();
MarkdownTag::Heading {
level,
id,
classes,
attrs,
}
}
pulldown_cmark::Tag::BlockQuote(_kind) => MarkdownTag::BlockQuote,
pulldown_cmark::Tag::CodeBlock(kind) => match kind {
pulldown_cmark::CodeBlockKind::Indented => {
MarkdownTag::CodeBlock(CodeBlockKind::Indented)
}
pulldown_cmark::CodeBlockKind::Fenced(info) => {
let info = info.trim();
MarkdownTag::CodeBlock(if info.is_empty() {
CodeBlockKind::Fenced
} else if info.contains('/') {
// Languages should never contain a slash, and PathRanges always should.
// (Models are told to specify them relative to a workspace root.)
CodeBlockKind::FencedSrc(PathWithRange::new(info))
} else {
CodeBlockKind::FencedLang(SharedString::from(info.to_string()))
})
}
},
pulldown_cmark::Tag::List(start_number) => MarkdownTag::List(start_number),
pulldown_cmark::Tag::Item => MarkdownTag::Item,
pulldown_cmark::Tag::FootnoteDefinition(label) => {
MarkdownTag::FootnoteDefinition(SharedString::from(label.to_string()))
}
pulldown_cmark::Tag::Table(alignments) => MarkdownTag::Table(alignments),
pulldown_cmark::Tag::TableHead => MarkdownTag::TableHead,
pulldown_cmark::Tag::TableRow => MarkdownTag::TableRow,
pulldown_cmark::Tag::TableCell => MarkdownTag::TableCell,
pulldown_cmark::Tag::Emphasis => MarkdownTag::Emphasis,
pulldown_cmark::Tag::Strong => MarkdownTag::Strong,
pulldown_cmark::Tag::Strikethrough => MarkdownTag::Strikethrough,
pulldown_cmark::Tag::Link {
link_type,
dest_url,
title,
id,
} => MarkdownTag::Link {
link_type,
dest_url: SharedString::from(dest_url.into_string()),
title: SharedString::from(title.into_string()),
id: SharedString::from(id.into_string()),
},
pulldown_cmark::Tag::Image {
link_type,
dest_url,
title,
id,
} => MarkdownTag::Image {
link_type,
dest_url: SharedString::from(dest_url.into_string()),
title: SharedString::from(title.into_string()),
id: SharedString::from(id.into_string()),
},
pulldown_cmark::Tag::HtmlBlock => MarkdownTag::HtmlBlock,
pulldown_cmark::Tag::MetadataBlock(kind) => MarkdownTag::MetadataBlock(kind),
pulldown_cmark::Tag::DefinitionList => MarkdownTag::DefinitionList,
pulldown_cmark::Tag::DefinitionListTitle => MarkdownTag::DefinitionListTitle,
pulldown_cmark::Tag::DefinitionListDefinition => MarkdownTag::DefinitionListDefinition,
/// Metadata about a code block, attached to its `MarkdownTag::CodeBlock` start tag
/// by the parser.
#[derive(Default, Clone, Debug, PartialEq)]
pub struct CodeBlockMetadata {
/// Byte range of the block's content within the parsed text. For fenced blocks the
/// parser derives it from `extract_code_block_content_range`, offset by the block's
/// start; for indented blocks it is set to the block's range shifted by one.
pub content_range: Range<usize>,
/// For fenced blocks, the number of `\n` bytes inside `content_range`; the parser
/// records `1` for indented blocks.
pub line_count: usize,
}
/// Returns the byte range, relative to `text`, of the content of a code block.
///
/// * If `text` starts with a triple-backtick fence, the range starts after the first
///   newline following the fence (which also skips an info string such as a language
///   name), or directly after the fence when there is no newline.
/// * If content remains and `text` ends with a triple-backtick fence, the trailing
///   three bytes are excluded.
/// * Text without fences is returned whole.
///
/// The returned range always satisfies `start <= end`, so it is safe to slice `text`
/// with it even for degenerate input such as four backticks in a row, where the
/// opening and closing fence overlap.
pub(crate) fn extract_code_block_content_range(text: &str) -> Range<usize> {
    let mut range = 0..text.len();
    if text.starts_with("```") {
        range.start += 3;
        if let Some(newline_ix) = text[range.clone()].find('\n') {
            range.start += newline_ix + 1;
        }
    }
    if !range.is_empty() && text.ends_with("```") {
        // `text` ends with the fence, so `end >= 3`. Clamp to `start`: when the
        // closing fence overlaps the opening one, plain subtraction would produce an
        // inverted range that panics as soon as it is used to slice `text`.
        range.end = (range.end - 3).max(range.start);
    }
    range
}
/// Represents either an owned or inline string. Motivation for this is to make `SubstitutedText`
@ -570,4 +584,41 @@ mod tests {
)
)
}
// Parses a fenced `rust` block and checks the metadata attached to the
// `Start(CodeBlock { .. })` event alongside the emitted text and end events.
#[test]
fn test_code_block_metadata() {
assert_eq!(
parse_markdown("```rust\nfn main() {\n let a = 1;\n}\n```"),
(
vec![
(
// The whole input (37 bytes) is the code block.
0..37,
Start(CodeBlock {
kind: CodeBlockKind::FencedLang("rust".into()),
metadata: CodeBlockMetadata {
// Content starts after the 8-byte "```rust\n" line and stops
// before the closing fence.
content_range: 8..34,
// Three `\n` bytes inside the content range.
line_count: 3
}
})
),
(8..34, Text),
(0..37, End(MarkdownTagEnd::CodeBlock)),
],
// The language name found in the fence info string.
HashSet::from(["rust".into()]),
HashSet::new()
)
)
}
/// Checks the content range for a fenced block with a language, for text without
/// fences, and for a multi-line fenced block.
#[test]
fn test_extract_code_block_content_range() {
    let cases = [
        ("```rust\nlet x = 5;\n```", 8..19),
        ("plain text", 0..10),
        ("```python\nprint('hello')\nprint('world')\n```", 10..40),
    ];

    for (input, expected) in cases {
        assert_eq!(extract_code_block_content_range(input), expected);
    }
}
}