diff --git a/crates/assistant/src/slash_command/fetch_command.rs b/crates/assistant/src/slash_command/fetch_command.rs index f91c156d61..952659274d 100644 --- a/crates/assistant/src/slash_command/fetch_command.rs +++ b/crates/assistant/src/slash_command/fetch_command.rs @@ -5,7 +5,7 @@ use anyhow::{anyhow, bail, Context, Result}; use assistant_slash_command::{SlashCommand, SlashCommandOutput, SlashCommandOutputSection}; use futures::AsyncReadExt; use gpui::{AppContext, Task, WeakView}; -use html_to_markdown::convert_html_to_markdown; +use html_to_markdown::{convert_html_to_markdown, markdown, HandleTag}; use http::{AsyncBody, HttpClient, HttpClientWithUrl}; use language::LspAdapterDelegate; use ui::{prelude::*, ButtonLike, ElevationIndex}; @@ -37,7 +37,21 @@ impl FetchSlashCommand { ); } - convert_html_to_markdown(&body[..]) + let mut handlers: Vec> = vec![ + Box::new(markdown::ParagraphHandler), + Box::new(markdown::HeadingHandler), + Box::new(markdown::ListHandler), + Box::new(markdown::TableHandler::new()), + Box::new(markdown::StyledTextHandler), + Box::new(markdown::CodeHandler), + ]; + if url.contains("wikipedia.org") { + use html_to_markdown::structure::wikipedia; + + handlers.push(Box::new(wikipedia::WikipediaChromeRemover)); + } + + convert_html_to_markdown(&body[..], handlers) } } diff --git a/crates/html_to_markdown/src/html_to_markdown.rs b/crates/html_to_markdown/src/html_to_markdown.rs index 5417c51dd0..df04f30778 100644 --- a/crates/html_to_markdown/src/html_to_markdown.rs +++ b/crates/html_to_markdown/src/html_to_markdown.rs @@ -1,11 +1,9 @@ //! Provides conversion from rustdoc's HTML output to Markdown. -#![deny(missing_docs)] - mod html_element; -mod markdown; +pub mod markdown; mod markdown_writer; -mod structure; +pub mod structure; use std::io::Read; @@ -19,24 +17,17 @@ use markup5ever_rcdom::RcDom; use crate::markdown::{ HeadingHandler, ListHandler, ParagraphHandler, StyledTextHandler, TableHandler, }; -use crate::markdown_writer::{HandleTag, MarkdownWriter}; +use crate::markdown_writer::MarkdownWriter; + +pub use crate::markdown_writer::HandleTag; /// Converts the provided HTML to Markdown. -pub fn convert_html_to_markdown(html: impl Read) -> Result { +pub fn convert_html_to_markdown( + html: impl Read, + handlers: Vec>, +) -> Result { let dom = parse_html(html).context("failed to parse HTML")?; - let handlers: Vec> = vec![ - Box::new(ParagraphHandler), - Box::new(HeadingHandler), - Box::new(ListHandler), - Box::new(TableHandler::new()), - Box::new(StyledTextHandler), - Box::new(structure::rustdoc::RustdocChromeRemover), - Box::new(structure::rustdoc::RustdocHeadingHandler), - Box::new(structure::rustdoc::RustdocCodeHandler), - Box::new(structure::rustdoc::RustdocItemHandler), - ]; - let markdown_writer = MarkdownWriter::new(); let markdown = markdown_writer .run(&dom.document, handlers) @@ -47,26 +38,20 @@ pub fn convert_html_to_markdown(html: impl Read) -> Result { /// Converts the provided rustdoc HTML to Markdown. pub fn convert_rustdoc_to_markdown(html: impl Read) -> Result { - let dom = parse_html(html).context("failed to parse rustdoc HTML")?; - - let handlers: Vec> = vec![ - Box::new(ParagraphHandler), - Box::new(HeadingHandler), - Box::new(ListHandler), - Box::new(TableHandler::new()), - Box::new(StyledTextHandler), - Box::new(structure::rustdoc::RustdocChromeRemover), - Box::new(structure::rustdoc::RustdocHeadingHandler), - Box::new(structure::rustdoc::RustdocCodeHandler), - Box::new(structure::rustdoc::RustdocItemHandler), - ]; - - let markdown_writer = MarkdownWriter::new(); - let markdown = markdown_writer - .run(&dom.document, handlers) - .context("failed to convert rustdoc HTML to Markdown")?; - - Ok(markdown) + convert_html_to_markdown( + html, + vec![ + Box::new(ParagraphHandler), + Box::new(HeadingHandler), + Box::new(ListHandler), + Box::new(TableHandler::new()), + Box::new(StyledTextHandler), + Box::new(structure::rustdoc::RustdocChromeRemover), + Box::new(structure::rustdoc::RustdocHeadingHandler), + Box::new(structure::rustdoc::RustdocCodeHandler), + Box::new(structure::rustdoc::RustdocItemHandler), + ], + ) } fn parse_html(mut html: impl Read) -> Result { diff --git a/crates/html_to_markdown/src/markdown.rs b/crates/html_to_markdown/src/markdown.rs index 0d45b17517..3a12cffcee 100644 --- a/crates/html_to_markdown/src/markdown.rs +++ b/crates/html_to_markdown/src/markdown.rs @@ -1,5 +1,5 @@ use crate::html_element::HtmlElement; -use crate::markdown_writer::{HandleTag, MarkdownWriter, StartTagOutcome}; +use crate::markdown_writer::{HandleTag, HandlerOutcome, MarkdownWriter, StartTagOutcome}; pub struct ParagraphHandler; @@ -214,3 +214,53 @@ impl HandleTag for StyledTextHandler { } } } + +pub struct CodeHandler; + +impl HandleTag for CodeHandler { + fn should_handle(&self, tag: &str) -> bool { + match tag { + "pre" | "code" => true, + _ => false, + } + } + + fn handle_tag_start( + &mut self, + tag: &HtmlElement, + writer: &mut MarkdownWriter, + ) -> StartTagOutcome { + match tag.tag.as_str() { + "code" => { + if !writer.is_inside("pre") { + writer.push_str("`"); + } + } + "pre" => writer.push_str("\n\n```\n"), + _ => {} + } + + StartTagOutcome::Continue + } + + fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) { + match tag.tag.as_str() { + "code" => { + if !writer.is_inside("pre") { + writer.push_str("`"); + } + } + "pre" => writer.push_str("\n```\n"), + _ => {} + } + } + + fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome { + if writer.is_inside("pre") { + writer.push_str(&text); + return HandlerOutcome::Handled; + } + + HandlerOutcome::NoOp + } +} diff --git a/crates/html_to_markdown/src/structure.rs b/crates/html_to_markdown/src/structure.rs index c6505a2ab6..2ff6b4d4b5 100644 --- a/crates/html_to_markdown/src/structure.rs +++ b/crates/html_to_markdown/src/structure.rs @@ -1 +1,2 @@ pub mod rustdoc; +pub mod wikipedia; diff --git a/crates/html_to_markdown/src/structure/wikipedia.rs b/crates/html_to_markdown/src/structure/wikipedia.rs new file mode 100644 index 0000000000..c291d4c0f5 --- /dev/null +++ b/crates/html_to_markdown/src/structure/wikipedia.rs @@ -0,0 +1,80 @@ +use crate::html_element::HtmlElement; +use crate::markdown_writer::{MarkdownWriter, StartTagOutcome}; +use crate::HandleTag; + +pub struct WikipediaChromeRemover; + +impl HandleTag for WikipediaChromeRemover { + fn should_handle(&self, _tag: &str) -> bool { + true + } + + fn handle_tag_start( + &mut self, + tag: &HtmlElement, + _writer: &mut MarkdownWriter, + ) -> StartTagOutcome { + match tag.tag.as_str() { + "head" | "script" | "style" | "nav" => return StartTagOutcome::Skip, + "sup" => { + if tag.has_class("reference") { + return StartTagOutcome::Skip; + } + } + "div" | "span" | "a" => { + if tag.attr("id").as_deref() == Some("p-lang-btn") { + return StartTagOutcome::Skip; + } + + if tag.attr("id").as_deref() == Some("p-search") { + return StartTagOutcome::Skip; + } + + let classes_to_skip = ["mw-editsection", "mw-jump-link"]; + if tag.has_any_classes(&classes_to_skip) { + return StartTagOutcome::Skip; + } + } + _ => {} + } + + StartTagOutcome::Continue + } +} + +#[cfg(test)] +mod tests { + use indoc::indoc; + use pretty_assertions::assert_eq; + + use crate::{convert_html_to_markdown, markdown}; + + use super::*; + + fn wikipedia_handlers() -> Vec> { + vec![ + Box::new(markdown::ParagraphHandler), + Box::new(markdown::HeadingHandler), + Box::new(markdown::ListHandler), + Box::new(markdown::StyledTextHandler), + Box::new(WikipediaChromeRemover), + ] + } + + #[test] + fn test_citation_references_get_removed() { + let html = indoc! {r##" +

Rust began as a personal project in 2006 by Mozilla Research employee Graydon Hoare.[20] Mozilla began sponsoring the project in 2009 as a part of the ongoing development of an experimental browser engine called Servo,[21] which was officially announced by Mozilla in 2010.[22][23] Rust's memory and ownership system was influenced by region-based memory management in languages such as Cyclone and ML Kit.[5] +

+ "##}; + let expected = indoc! {" + Rust began as a personal project in 2006 by Mozilla Research employee Graydon Hoare. Mozilla began sponsoring the project in 2009 as a part of the ongoing development of an experimental browser engine called Servo, which was officially announced by Mozilla in 2010. Rust's memory and ownership system was influenced by region-based memory management in languages such as Cyclone and ML Kit. + "} + .trim(); + + assert_eq!( + convert_html_to_markdown(html.as_bytes(), wikipedia_handlers()).unwrap(), + expected + ) + } +}