From 834089feb15af02e70f975287f0da5dddbc59e62 Mon Sep 17 00:00:00 2001 From: Marshall Bowers Date: Fri, 7 Jun 2024 12:54:33 -0400 Subject: [PATCH] Handle Wikipedia code blocks in `/fetch` command (#12780) This PR extends the `/fetch` command with support for Wikipedia code blocks. Release Notes: - N/A --- .../src/slash_command/fetch_command.rs | 5 +- .../html_to_markdown/src/markdown_writer.rs | 2 +- .../src/structure/wikipedia.rs | 104 +++++++++++++++++- 3 files changed, 107 insertions(+), 4 deletions(-) diff --git a/crates/assistant/src/slash_command/fetch_command.rs b/crates/assistant/src/slash_command/fetch_command.rs index 952659274d..718f6674b9 100644 --- a/crates/assistant/src/slash_command/fetch_command.rs +++ b/crates/assistant/src/slash_command/fetch_command.rs @@ -43,12 +43,15 @@ impl FetchSlashCommand { Box::new(markdown::ListHandler), Box::new(markdown::TableHandler::new()), Box::new(markdown::StyledTextHandler), - Box::new(markdown::CodeHandler), ]; if url.contains("wikipedia.org") { use html_to_markdown::structure::wikipedia; handlers.push(Box::new(wikipedia::WikipediaChromeRemover)); + handlers.push(Box::new(wikipedia::WikipediaInfoboxHandler)); + handlers.push(Box::new(wikipedia::WikipediaCodeHandler::new())); + } else { + handlers.push(Box::new(markdown::CodeHandler)); } convert_html_to_markdown(&body[..], handlers) diff --git a/crates/html_to_markdown/src/markdown_writer.rs b/crates/html_to_markdown/src/markdown_writer.rs index 436f895d7e..2022a62d49 100644 --- a/crates/html_to_markdown/src/markdown_writer.rs +++ b/crates/html_to_markdown/src/markdown_writer.rs @@ -162,7 +162,7 @@ impl MarkdownWriter { } let text = text - .trim_matches(|char| char == '\n' || char == '\r') + .trim_matches(|char| char == '\n' || char == '\r' || char == '\t') .replace('\n', " "); self.push_str(&text); diff --git a/crates/html_to_markdown/src/structure/wikipedia.rs b/crates/html_to_markdown/src/structure/wikipedia.rs index c291d4c0f5..2ef8f7eb6c 100644 --- 
a/crates/html_to_markdown/src/structure/wikipedia.rs +++ b/crates/html_to_markdown/src/structure/wikipedia.rs @@ -1,5 +1,5 @@ use crate::html_element::HtmlElement; -use crate::markdown_writer::{MarkdownWriter, StartTagOutcome}; +use crate::markdown_writer::{HandlerOutcome, MarkdownWriter, StartTagOutcome}; use crate::HandleTag; pub struct WikipediaChromeRemover; @@ -30,7 +30,7 @@ impl HandleTag for WikipediaChromeRemover { return StartTagOutcome::Skip; } - let classes_to_skip = ["mw-editsection", "mw-jump-link"]; + let classes_to_skip = ["noprint", "mw-editsection", "mw-jump-link"]; if tag.has_any_classes(&classes_to_skip) { return StartTagOutcome::Skip; } @@ -42,6 +42,106 @@ impl HandleTag for WikipediaChromeRemover { } } +pub struct WikipediaInfoboxHandler; + +impl HandleTag for WikipediaInfoboxHandler { + fn should_handle(&self, tag: &str) -> bool { + tag == "table" + } + + fn handle_tag_start( + &mut self, + tag: &HtmlElement, + _writer: &mut MarkdownWriter, + ) -> StartTagOutcome { + match tag.tag.as_str() { + "table" => { + if tag.has_class("infobox") { + return StartTagOutcome::Skip; + } + } + _ => {} + } + + StartTagOutcome::Continue + } +} + +pub struct WikipediaCodeHandler { + language: Option<String>, +} + +impl WikipediaCodeHandler { + pub fn new() -> Self { + Self { language: None } + } +} + +impl HandleTag for WikipediaCodeHandler { + fn should_handle(&self, tag: &str) -> bool { + match tag { + "div" | "pre" | "code" => true, + _ => false, + } + } + + fn handle_tag_start( + &mut self, + tag: &HtmlElement, + writer: &mut MarkdownWriter, + ) -> StartTagOutcome { + match tag.tag.as_str() { + "code" => { + if !writer.is_inside("pre") { + writer.push_str("`"); + } + } + "div" => { + let classes = tag.classes(); + self.language = classes.iter().find_map(|class| { + if let Some((_, language)) = class.split_once("mw-highlight-lang-") { + Some(language.trim().to_owned()) + } else { + None + } + }); + } + "pre" => { + writer.push_blank_line(); + writer.push_str("```"); 
+ if let Some(language) = self.language.take() { + writer.push_str(&language); + } + writer.push_newline(); + } + _ => {} + } + + StartTagOutcome::Continue + } + + fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) { + match tag.tag.as_str() { + "code" => { + if !writer.is_inside("pre") { + writer.push_str("`"); + } + } + "pre" => writer.push_str("\n```\n"), + _ => {} + } + } + + fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome { + if writer.is_inside("pre") { + writer.push_str(&text); + return HandlerOutcome::Handled; + } + + HandlerOutcome::NoOp + } +} + #[cfg(test)] mod tests { use indoc::indoc;