Implement better markdown escaping and inline code escape (#23222)

Motivation for this is using markdown for keymap error notifications in #23113, but it also benefits the copied text of repl tables. Release Notes: - N/A
2025-01-16 04:06:57 -07:00 · 2025-01-16 04:06:57 -07:00 · 8e6fc3c807
commit 8e6fc3c807
parent 5fdd7edb90
3 changed files with 250 additions and 12 deletions
--- a/crates/repl/src/outputs/table.rs
+++ b/crates/repl/src/outputs/table.rs
@ -61,6 +61,7 @@ use serde_json::Value;
 use settings::Settings;
 use theme::ThemeSettings;
 use ui::{div, prelude::*, v_flex, IntoElement, Styled};
 use util::markdown::MarkdownString;
 use crate::outputs::OutputContent;
@ -139,17 +140,6 @@ impl TableView {
        }
    }
    /// Escapes a string for use as markdown text.
    ///
    /// `|`, `*`, `_`, `` ` ``, `[` and `]` are prefixed with a backslash, while `<` and `>` are
    /// replaced with the HTML entities `&lt;` and `&gt;`. Every other character is copied as-is.
    fn escape_markdown(s: &str) -> String {
        let mut escaped = String::with_capacity(s.len());
        for ch in s.chars() {
            match ch {
                '|' | '*' | '_' | '`' | '[' | ']' => {
                    escaped.push('\\');
                    escaped.push(ch);
                }
                '<' => escaped.push_str("&lt;"),
                '>' => escaped.push_str("&gt;"),
                _ => escaped.push(ch),
            }
        }
        escaped
    }
    fn create_clipboard_content(table: &TabularDataResource) -> String {
        let data = match table.data.as_ref() {
            Some(data) => data,
@ -180,7 +170,7 @@ impl TableView {
                let row_content = schema
                    .fields
                    .iter()
-                    .map(|field| Self::escape_markdown(&cell_content(record, &field.name)))
+                    .map(|field| MarkdownString::escape(&cell_content(record, &field.name)).0)
                    .collect::<Vec<_>>();
                row_content.join(" | ")
--- a/crates/util/src/markdown.rs
+++ b/crates/util/src/markdown.rs
@ -0,0 +1,247 @@
 use std::fmt::{Display, Formatter};
 /// A string holding markdown source text.
 ///
 /// The wrapped `String` is a public field, so it can be read or moved out through `.0`.
 #[derive(Debug)]
 pub struct MarkdownString(pub String);
 impl Display for MarkdownString {
    /// Writes the wrapped markdown text verbatim.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.0)
    }
 }
 impl MarkdownString {
    /// Escapes markdown special characters in `text` so that it is rendered as plain text.
    ///
    /// Most special characters are prefixed with a backslash. `<` and `>` are instead replaced
    /// with the HTML entities `&lt;` and `&gt;`.
    ///
    /// Also escapes the following markdown extensions:
    ///
    /// * `^` for superscripts
    /// * `$` for inline math
    /// * `~` for strikethrough
    ///
    /// Escaping some characters is unnecessary because, while they are involved in markdown
    /// syntax, the other characters involved are escaped:
    ///
    /// * `!`, `]`, `(`, and `)` are used in link syntax, but `[` is escaped so these are parsed as
    ///   plaintext.
    ///
    /// * `;` is used in HTML entity syntax, but `&` is escaped, so they are parsed as plaintext.
    ///
    /// Note that `|` is not escaped, so callers that place the result in a table cell need to
    /// handle it themselves.
    ///
    /// TODO: There is one escape this doesn't do currently. Period after numbers at the start of the
    /// line (`[0-9]*\.`) should also be escaped to avoid it being interpreted as a list item.
    pub fn escape(text: &str) -> Self {
        // Every input character is copied to the output, so the input length is a lower bound
        // for the output length.
        let mut escaped = String::with_capacity(text.len());
        for c in text.chars() {
            match c {
                // Always escaped.
                '\\' | '`' | '*' | '_' | '[' | '^' | '$' | '~' | '&' |
                // TODO: these only need to be escaped when they are the first non-whitespace
                // character of the line of a block. There should probably be both an `escape_block`
                // which does this and an `escape_inline` method which does not escape these.
                '#' | '+' | '=' | '-' => {
                    escaped.push('\\');
                    escaped.push(c);
                }
                // Escaped since `<` is used in opening HTML tags. `&lt;` is used since Markdown
                // supports HTML entities, and this allows the text to be used directly in HTML.
                '<' => escaped.push_str("&lt;"),
                // Escaped since `>` is used for blockquotes. `&gt;` is used since Markdown supports
                // HTML entities, and this allows the text to be used directly in HTML.
                '>' => escaped.push_str("&gt;"),
                _ => escaped.push(c),
            }
        }
        Self(escaped)
    }
    /// Returns markdown for inline code (wrapped in backticks), handling code that contains backticks
    /// and spaces. Every whitespace character is replaced with a single space character. For text
    /// that does not contain whitespace other than ' ', this escaping roundtrips through
    /// pulldown-cmark.
    ///
    /// When used in tables, `|` should be escaped like `\|` in the text provided to this function.
    pub fn inline_code(text: &str) -> Self {
        // Apache License 2.0, same as this crate.
        //
        // Copied from `pulldown-cmark-to-cmark-20.0.0` with modifications:
        //
        // * Handling of all whitespace. pulldown-cmark-to-cmark is anticipating
        // `Code` events parsed by pulldown-cmark.
        //
        // * Direct return of string.
        //
        // https://github.com/Byron/pulldown-cmark-to-cmark/blob/3c850de2d3d1d79f19ca5f375e1089a653cf3ff7/src/lib.rs#L290
        let mut all_whitespace = true;
        let text = text
            .chars()
            .map(|c| {
                if c.is_whitespace() {
                    ' '
                } else {
                    all_whitespace = false;
                    c
                }
            })
            .collect::<String>();
        // When inline code has leading and trailing ' ' characters, additional space is needed
        // to escape it, unless all characters are space.
        if all_whitespace {
            return Self(format!("`{text}`"));
        }
        // More backticks are needed to delimit the inline code than the maximum number of
        // backticks in a consecutive run.
        let backticks = "`".repeat(count_max_consecutive_chars(&text, '`') + 1);
        let space = match text.as_bytes() {
            [b'`', ..] | [.., b'`'] => " ", // Space needed to separate backtick.
            [b' ', .., b' '] => " ",        // Space needed to escape inner space.
            _ => "",                        // No space needed.
        };
        Self(format!("{backticks}{space}{text}{space}{backticks}"))
    }
 }
 // Derived from `pulldown-cmark-to-cmark-20.0.0` with changed names and loop shape.
 // https://github.com/Byron/pulldown-cmark-to-cmark/blob/3c850de2d3d1d79f19ca5f375e1089a653cf3ff7/src/lib.rs#L1063
 // Apache License 2.0, same as this code.
 //
 // Returns the length, in characters, of the longest run of consecutive `search` characters
 // in `text`, or 0 when `search` does not occur.
 fn count_max_consecutive_chars(text: &str, search: char) -> usize {
    let (longest_run, _) = text
        .chars()
        .fold((0usize, 0usize), |(longest, current), ch| {
            // A non-matching character ends the current run.
            let current = if ch == search { current + 1 } else { 0 };
            (longest.max(current), current)
        });
    longest_run
 }
 // Unit tests for `MarkdownString` and the `count_max_consecutive_chars` helper.
 #[cfg(test)]
 mod tests {
    use super::*;
    // Escapes a multi-line sample containing headings, a link, inline code, emphasis,
    // strikethrough, a fenced code block, list markers, inline math, and an HTML entity, and
    // compares against the expected backslash-escaped text. `]`, `(`, and `)` stay unescaped in
    // the expected output. The sample contains no `<` or `>`, so the `&lt;` / `&gt;` replacements
    // are not exercised here.
    #[test]
    fn test_markdown_string_escape() {
        let input = r#"
        # Heading
        Another heading
        ===
        Another heading variant
        ---
        Paragraph with [link](https://example.com) and `code`, *emphasis*, and ~strikethrough~.
        ```
        code block
        ```
        List with varying leaders:
          - Item 1
          * Item 2
          + Item 3
        Some math:  $`\sqrt{3x-1}+(1+x)^2`$
        HTML entity: &nbsp;
        "#;
        let expected = r#"
        \# Heading
        Another heading
        \=\=\=
        Another heading variant
        \-\-\-
        Paragraph with \[link](https://example.com) and \`code\`, \*emphasis\*, and \~strikethrough\~.
        \`\`\`
        code block
        \`\`\`
        List with varying leaders:
          \- Item 1
          \* Item 2
          \+ Item 3
        Some math:  \$\`\\sqrt{3x\-1}\+(1\+x)\^2\`\$
        HTML entity: \&nbsp;
        "#;
        assert_eq!(MarkdownString::escape(input).0, expected);
    }
    // Covers the delimiter and padding choices of `inline_code`: whitespace-only text, plain
    // text, text with spaces on one or both ends, and text containing backticks at the edges or
    // only in the middle.
    #[test]
    fn test_markdown_string_inline_code() {
        assert_eq!(MarkdownString::inline_code(" ").0, "` `");
        assert_eq!(MarkdownString::inline_code("text").0, "`text`");
        // A space on only one end needs no extra padding.
        assert_eq!(MarkdownString::inline_code("text ").0, "`text `");
        // Spaces on both ends get one extra space of padding on each side.
        assert_eq!(MarkdownString::inline_code(" text ").0, "`  text  `");
        // The delimiter is one backtick longer than the longest run inside the text, and a
        // space separates the delimiter from a leading or trailing backtick.
        assert_eq!(MarkdownString::inline_code("`").0, "`` ` ``");
        assert_eq!(MarkdownString::inline_code("``").0, "``` `` ```");
        assert_eq!(MarkdownString::inline_code("`text`").0, "`` `text` ``");
        assert_eq!(
            MarkdownString::inline_code("some `text` no leading or trailing backticks").0,
            "``some `text` no leading or trailing backticks``"
        );
    }
    // Checks that the longest run wins regardless of whether it appears in the middle or at the
    // start of the text.
    #[test]
    fn test_count_max_consecutive_chars() {
        assert_eq!(
            count_max_consecutive_chars("``a```b``", '`'),
            3,
            "the highest seen consecutive segment of backticks counts"
        );
        assert_eq!(
            count_max_consecutive_chars("```a``b`", '`'),
            3,
            "it can't be downgraded later"
        );
    }
 }
--- a/crates/util/src/util.rs
+++ b/crates/util/src/util.rs
@ -1,6 +1,7 @@
 pub mod arc_cow;
 pub mod command;
 pub mod fs;
 pub mod markdown;
 pub mod paths;
 pub mod serde;
 #[cfg(any(test, feature = "test-support"))]