Restore direct use of the input text for Markdown Text
(#27620)
PR #24388 changed the markdown parsing to copy parsed text in order to handle markdown escaping, removing the optimization to instead reuse text from the input. Another issue with that change was that handling of finding links within `Text` intermixed use of `text` and `parsed`, relying on the offsets matching up (which I believe was true in practice). The solution is to distinguish pulldown_cmark `Text` nodes that share bytes with the input and those that do not. Release Notes: - N/A
This commit is contained in:
parent
4bcd37a537
commit
e74af03065
3 changed files with 201 additions and 64 deletions
|
@ -951,8 +951,10 @@ mod tests {
|
|||
|
||||
for (range, event) in slice.iter() {
|
||||
match event {
|
||||
MarkdownEvent::Text(parsed) => rendered_text.push_str(parsed),
|
||||
MarkdownEvent::Code => rendered_text.push_str(&text[range.clone()]),
|
||||
MarkdownEvent::SubstitutedText(parsed) => rendered_text.push_str(parsed),
|
||||
MarkdownEvent::Text | MarkdownEvent::Code => {
|
||||
rendered_text.push_str(&text[range.clone()])
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -868,8 +868,11 @@ impl Element for MarkdownElement {
|
|||
}
|
||||
_ => log::debug!("unsupported markdown tag end: {:?}", tag),
|
||||
},
|
||||
MarkdownEvent::Text(parsed) => {
|
||||
builder.push_text(parsed, range.start);
|
||||
MarkdownEvent::Text => {
|
||||
builder.push_text(&parsed_markdown.source[range.clone()], range.start);
|
||||
}
|
||||
MarkdownEvent::SubstitutedText(text) => {
|
||||
builder.push_text(text, range.start);
|
||||
}
|
||||
MarkdownEvent::Code => {
|
||||
builder.push_text_style(self.style.inline_code.clone());
|
||||
|
|
|
@ -1,8 +1,13 @@
|
|||
use gpui::SharedString;
|
||||
use linkify::LinkFinder;
|
||||
pub use pulldown_cmark::TagEnd as MarkdownTagEnd;
|
||||
use pulldown_cmark::{Alignment, HeadingLevel, LinkType, MetadataBlockKind, Options, Parser};
|
||||
use std::{collections::HashSet, ops::Range};
|
||||
use pulldown_cmark::{
|
||||
Alignment, HeadingLevel, InlineStr, LinkType, MetadataBlockKind, Options, Parser,
|
||||
};
|
||||
use std::{
|
||||
collections::HashSet,
|
||||
ops::{Deref, Range},
|
||||
};
|
||||
|
||||
const PARSE_OPTIONS: Options = Options::ENABLE_TABLES
|
||||
.union(Options::ENABLE_FOOTNOTES)
|
||||
|
@ -49,49 +54,60 @@ pub fn parse_markdown(text: &str) -> (Vec<(Range<usize>, MarkdownEvent)>, HashSe
|
|||
events.push((range, MarkdownEvent::End(tag)));
|
||||
}
|
||||
pulldown_cmark::Event::Text(parsed) => {
|
||||
// Automatically detect links in text if we're not already within a markdown
|
||||
// link.
|
||||
let mut parsed = parsed.as_ref();
|
||||
if !within_link {
|
||||
let mut finder = LinkFinder::new();
|
||||
finder.kinds(&[linkify::LinkKind::Url]);
|
||||
let text_range = range.clone();
|
||||
for link in finder.links(&text[text_range.clone()]) {
|
||||
let link_range =
|
||||
text_range.start + link.start()..text_range.start + link.end();
|
||||
|
||||
if link_range.start > range.start {
|
||||
let (text, tail) = parsed.split_at(link_range.start - range.start);
|
||||
events.push((
|
||||
range.start..link_range.start,
|
||||
MarkdownEvent::Text(SharedString::new(text)),
|
||||
));
|
||||
parsed = tail;
|
||||
}
|
||||
|
||||
events.push((
|
||||
link_range.clone(),
|
||||
MarkdownEvent::Start(MarkdownTag::Link {
|
||||
link_type: LinkType::Autolink,
|
||||
dest_url: SharedString::from(link.as_str().to_string()),
|
||||
title: SharedString::default(),
|
||||
id: SharedString::default(),
|
||||
}),
|
||||
));
|
||||
|
||||
let (link_text, tail) = parsed.split_at(link_range.end - link_range.start);
|
||||
events.push((
|
||||
link_range.clone(),
|
||||
MarkdownEvent::Text(SharedString::new(link_text)),
|
||||
));
|
||||
events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
|
||||
|
||||
range.start = link_range.end;
|
||||
parsed = tail;
|
||||
// `parsed` will share bytes with the input unless a substitution like handling of
|
||||
// HTML entities or smart punctuation has occurred. When these substitutions occur,
|
||||
// `parsed` only consists of the result of a single substitution.
|
||||
if !cow_str_points_inside(&parsed, text) {
|
||||
// Attempt to detect cases where the assumptions here are not valid or the
|
||||
// behavior has changed.
|
||||
if parsed.len() > 4 {
|
||||
log::error!(
|
||||
"Bug in markdown parser. \
|
||||
pulldown_cmark::Event::Text expected to a substituted HTML entity, \
|
||||
but it was longer than expected.\n\
|
||||
Source: {}\n\
|
||||
Parsed: {}",
|
||||
&text[range.clone()],
|
||||
parsed
|
||||
);
|
||||
}
|
||||
events.push((range, MarkdownEvent::SubstitutedText(parsed.into())));
|
||||
} else {
|
||||
// Automatically detect links in text if not already within a markdown link.
|
||||
if !within_link {
|
||||
let mut finder = LinkFinder::new();
|
||||
finder.kinds(&[linkify::LinkKind::Url]);
|
||||
let text_range = range.clone();
|
||||
for link in finder.links(&text[text_range.clone()]) {
|
||||
let link_range =
|
||||
text_range.start + link.start()..text_range.start + link.end();
|
||||
|
||||
if link_range.start > range.start {
|
||||
events.push((range.start..link_range.start, MarkdownEvent::Text));
|
||||
}
|
||||
|
||||
events.push((
|
||||
link_range.clone(),
|
||||
MarkdownEvent::Start(MarkdownTag::Link {
|
||||
link_type: LinkType::Autolink,
|
||||
dest_url: SharedString::from(link.as_str().to_string()),
|
||||
title: SharedString::default(),
|
||||
id: SharedString::default(),
|
||||
}),
|
||||
));
|
||||
|
||||
events.push((link_range.clone(), MarkdownEvent::Text));
|
||||
events.push((
|
||||
link_range.clone(),
|
||||
MarkdownEvent::End(MarkdownTagEnd::Link),
|
||||
));
|
||||
|
||||
range.start = link_range.end;
|
||||
}
|
||||
}
|
||||
if range.start < range.end {
|
||||
events.push((range, MarkdownEvent::Text));
|
||||
}
|
||||
}
|
||||
if range.start < range.end {
|
||||
events.push((range, MarkdownEvent::Text(SharedString::new(parsed))));
|
||||
}
|
||||
}
|
||||
pulldown_cmark::Event::Code(_) => {
|
||||
|
@ -116,7 +132,7 @@ pub fn parse_markdown(text: &str) -> (Vec<(Range<usize>, MarkdownEvent)>, HashSe
|
|||
(events, languages)
|
||||
}
|
||||
|
||||
pub fn parse_links_only(mut text: &str) -> Vec<(Range<usize>, MarkdownEvent)> {
|
||||
pub fn parse_links_only(text: &str) -> Vec<(Range<usize>, MarkdownEvent)> {
|
||||
let mut events = Vec::new();
|
||||
let mut finder = LinkFinder::new();
|
||||
finder.kinds(&[linkify::LinkKind::Url]);
|
||||
|
@ -128,15 +144,9 @@ pub fn parse_links_only(mut text: &str) -> Vec<(Range<usize>, MarkdownEvent)> {
|
|||
let link_range = link.start()..link.end();
|
||||
|
||||
if link_range.start > text_range.start {
|
||||
let (head, tail) = text.split_at(link_range.start - text_range.start);
|
||||
events.push((
|
||||
text_range.start..link_range.start,
|
||||
MarkdownEvent::Text(SharedString::new(head)),
|
||||
));
|
||||
text = tail;
|
||||
events.push((text_range.start..link_range.start, MarkdownEvent::Text));
|
||||
}
|
||||
|
||||
let (link_text, tail) = text.split_at(link_range.end - link_range.start);
|
||||
events.push((
|
||||
link_range.clone(),
|
||||
MarkdownEvent::Start(MarkdownTag::Link {
|
||||
|
@ -146,18 +156,14 @@ pub fn parse_links_only(mut text: &str) -> Vec<(Range<usize>, MarkdownEvent)> {
|
|||
id: SharedString::default(),
|
||||
}),
|
||||
));
|
||||
events.push((
|
||||
link_range.clone(),
|
||||
MarkdownEvent::Text(SharedString::new(link_text)),
|
||||
));
|
||||
events.push((link_range.clone(), MarkdownEvent::Text));
|
||||
events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
|
||||
|
||||
text_range.start = link_range.end;
|
||||
text = tail;
|
||||
}
|
||||
|
||||
if text_range.end > text_range.start {
|
||||
events.push((text_range, MarkdownEvent::Text(SharedString::new(text))));
|
||||
events.push((text_range, MarkdownEvent::Text));
|
||||
}
|
||||
|
||||
events
|
||||
|
@ -173,8 +179,11 @@ pub enum MarkdownEvent {
|
|||
Start(MarkdownTag),
|
||||
/// End of a tagged element.
|
||||
End(MarkdownTagEnd),
|
||||
/// A text node.
|
||||
Text(SharedString),
|
||||
/// Text that uses the associated range from the mardown source.
|
||||
Text,
|
||||
/// Text that differs from the markdown source - typically due to substitution of HTML entities
|
||||
/// and smart punctuation.
|
||||
SubstitutedText(CompactStr),
|
||||
/// An inline code node.
|
||||
Code,
|
||||
/// An HTML node.
|
||||
|
@ -365,8 +374,71 @@ impl From<pulldown_cmark::Tag<'_>> for MarkdownTag {
|
|||
}
|
||||
}
|
||||
|
||||
/// Represents either an owned or inline string. Motivation for this is to make `SubstitutedText`
|
||||
/// more efficient - it fits within a `pulldown_cmark::InlineStr` in all known cases.
|
||||
///
|
||||
/// Same as `pulldown_cmark::CowStr` but without the `Borrow` case.
|
||||
#[derive(Clone, Debug)]
|
||||
pub enum CompactStr {
|
||||
Boxed(Box<str>),
|
||||
Inlined(InlineStr),
|
||||
}
|
||||
|
||||
impl Deref for CompactStr {
|
||||
type Target = str;
|
||||
|
||||
fn deref(&self) -> &str {
|
||||
match self {
|
||||
CompactStr::Boxed(b) => b,
|
||||
CompactStr::Inlined(i) => i,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&str> for CompactStr {
|
||||
fn from(s: &str) -> Self {
|
||||
if let Ok(inlined) = s.try_into() {
|
||||
CompactStr::Inlined(inlined)
|
||||
} else {
|
||||
CompactStr::Boxed(s.into())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<pulldown_cmark::CowStr<'_>> for CompactStr {
|
||||
fn from(cow_str: pulldown_cmark::CowStr) -> Self {
|
||||
match cow_str {
|
||||
pulldown_cmark::CowStr::Boxed(b) => CompactStr::Boxed(b),
|
||||
pulldown_cmark::CowStr::Borrowed(b) => b.into(),
|
||||
pulldown_cmark::CowStr::Inlined(i) => CompactStr::Inlined(i),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq for CompactStr {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.deref() == other.deref()
|
||||
}
|
||||
}
|
||||
|
||||
fn cow_str_points_inside(substring: &pulldown_cmark::CowStr, container: &str) -> bool {
|
||||
match substring {
|
||||
pulldown_cmark::CowStr::Boxed(b) => str_points_inside(b, container),
|
||||
pulldown_cmark::CowStr::Borrowed(b) => str_points_inside(b, container),
|
||||
pulldown_cmark::CowStr::Inlined(_) => false,
|
||||
}
|
||||
}
|
||||
|
||||
fn str_points_inside(substring: &str, container: &str) -> bool {
|
||||
let substring_ptr = substring.as_ptr();
|
||||
let container_ptr = container.as_ptr();
|
||||
unsafe { substring_ptr >= container_ptr && substring_ptr < container_ptr.add(container.len()) }
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::MarkdownEvent::*;
|
||||
use super::MarkdownTag::*;
|
||||
use super::*;
|
||||
|
||||
const UNWANTED_OPTIONS: Options = Options::ENABLE_YAML_STYLE_METADATA_BLOCKS
|
||||
|
@ -387,4 +459,64 @@ mod tests {
|
|||
Options::empty()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_plain_urls_and_escaped_text() {
|
||||
assert_eq!(
|
||||
parse_markdown(" https://some.url some \\`►\\` text"),
|
||||
(
|
||||
vec![
|
||||
(0..51, Start(Paragraph)),
|
||||
(0..6, SubstitutedText("\u{a0}".into())),
|
||||
(6..12, SubstitutedText("\u{a0}".into())),
|
||||
(12..13, Text),
|
||||
(
|
||||
13..29,
|
||||
Start(Link {
|
||||
link_type: LinkType::Autolink,
|
||||
dest_url: "https://some.url".into(),
|
||||
title: "".into(),
|
||||
id: "".into(),
|
||||
})
|
||||
),
|
||||
(13..29, Text),
|
||||
(13..29, End(MarkdownTagEnd::Link)),
|
||||
(29..35, Text),
|
||||
(36..37, Text), // Escaped backtick
|
||||
(37..44, SubstitutedText("►".into())),
|
||||
(45..46, Text), // Escaped backtick
|
||||
(46..51, Text),
|
||||
(0..51, End(MarkdownTagEnd::Paragraph))
|
||||
],
|
||||
HashSet::new()
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_smart_punctuation() {
|
||||
assert_eq!(
|
||||
parse_markdown("-- --- ... \"double quoted\" 'single quoted'"),
|
||||
(
|
||||
vec![
|
||||
(0..42, Start(Paragraph)),
|
||||
(0..2, SubstitutedText("–".into())),
|
||||
(2..3, Text),
|
||||
(3..6, SubstitutedText("—".into())),
|
||||
(6..7, Text),
|
||||
(7..10, SubstitutedText("…".into())),
|
||||
(10..11, Text),
|
||||
(11..12, SubstitutedText("“".into())),
|
||||
(12..25, Text),
|
||||
(25..26, SubstitutedText("”".into())),
|
||||
(26..27, Text),
|
||||
(27..28, SubstitutedText("‘".into())),
|
||||
(28..41, Text),
|
||||
(41..42, SubstitutedText("’".into())),
|
||||
(0..42, End(MarkdownTagEnd::Paragraph))
|
||||
],
|
||||
HashSet::new()
|
||||
)
|
||||
)
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue