From 8ccd2a0c999e3be3859efbcab534a2f4f000dc5c Mon Sep 17 00:00:00 2001 From: Marshall Bowers Date: Tue, 11 Jun 2024 15:56:37 -0400 Subject: [PATCH] Add tag handler for collecting crate items from rustdoc output (#12903) This PR adds a tag handler for collecting crate items from rustdoc's HTML output. This will serve as the foundation for getting more insight into a crate's contents. Release Notes: - N/A --- Cargo.lock | 2 + .../src/slash_command/fetch_command.rs | 28 ++-- .../src/slash_command/rustdoc_command.rs | 14 +- crates/html_to_markdown/Cargo.toml | 2 + .../html_to_markdown/src/html_to_markdown.rs | 81 +++++++---- .../html_to_markdown/src/markdown_writer.rs | 34 ++--- .../html_to_markdown/src/structure/rustdoc.rs | 132 ++++++++++++++++++ .../src/structure/wikipedia.rs | 19 +-- 8 files changed, 237 insertions(+), 75 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a96bcab0bb..c18c904f68 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5072,10 +5072,12 @@ version = "0.1.0" dependencies = [ "anyhow", "html5ever", + "indexmap 1.9.3", "indoc", "markup5ever_rcdom", "pretty_assertions", "regex", + "strum", ] [[package]] diff --git a/crates/assistant/src/slash_command/fetch_command.rs b/crates/assistant/src/slash_command/fetch_command.rs index 4799661020..9d8fed1012 100644 --- a/crates/assistant/src/slash_command/fetch_command.rs +++ b/crates/assistant/src/slash_command/fetch_command.rs @@ -1,3 +1,5 @@ +use std::cell::RefCell; +use std::rc::Rc; use std::sync::atomic::AtomicBool; use std::sync::Arc; @@ -5,7 +7,7 @@ use anyhow::{anyhow, bail, Context, Result}; use assistant_slash_command::{SlashCommand, SlashCommandOutput, SlashCommandOutputSection}; use futures::AsyncReadExt; use gpui::{AppContext, Task, WeakView}; -use html_to_markdown::{convert_html_to_markdown, markdown, HandleTag}; +use html_to_markdown::{convert_html_to_markdown, markdown, TagHandler}; use http::{AsyncBody, HttpClient, HttpClientWithUrl}; use language::LspAdapterDelegate; use ui::{prelude::*, ButtonLike, ElevationIndex}; @@ -59,24 +61,26 @@ impl FetchSlashCommand { match content_type { ContentType::Html => { - let mut handlers: Vec> = vec![ - Box::new(markdown::ParagraphHandler), - Box::new(markdown::HeadingHandler), - Box::new(markdown::ListHandler), - Box::new(markdown::TableHandler::new()), - Box::new(markdown::StyledTextHandler), + let mut handlers: Vec = vec![ + Rc::new(RefCell::new(markdown::ParagraphHandler)), + Rc::new(RefCell::new(markdown::HeadingHandler)), + Rc::new(RefCell::new(markdown::ListHandler)), + Rc::new(RefCell::new(markdown::TableHandler::new())), + Rc::new(RefCell::new(markdown::StyledTextHandler)), ]; if url.contains("wikipedia.org") { use html_to_markdown::structure::wikipedia; - handlers.push(Box::new(wikipedia::WikipediaChromeRemover)); - handlers.push(Box::new(wikipedia::WikipediaInfoboxHandler)); - handlers.push(Box::new(wikipedia::WikipediaCodeHandler::new())); + handlers.push(Rc::new(RefCell::new(wikipedia::WikipediaChromeRemover))); + handlers.push(Rc::new(RefCell::new(wikipedia::WikipediaInfoboxHandler))); + handlers.push(Rc::new( + RefCell::new(wikipedia::WikipediaCodeHandler::new()), + )); } else { - handlers.push(Box::new(markdown::CodeHandler)); + handlers.push(Rc::new(RefCell::new(markdown::CodeHandler))); } - convert_html_to_markdown(&body[..], handlers) + convert_html_to_markdown(&body[..], &mut handlers) } ContentType::Plaintext => Ok(std::str::from_utf8(&body)?.to_owned()), ContentType::Json => { diff --git a/crates/assistant/src/slash_command/rustdoc_command.rs b/crates/assistant/src/slash_command/rustdoc_command.rs index 164fc969af..adeca134d0 100644 --- a/crates/assistant/src/slash_command/rustdoc_command.rs +++ b/crates/assistant/src/slash_command/rustdoc_command.rs @@ -42,10 +42,9 @@ impl RustdocSlashCommand { local_cargo_doc_path.push("index.html"); if let Ok(contents) = fs.load(&local_cargo_doc_path).await { - return Ok(( - RustdocSource::Local, - convert_rustdoc_to_markdown(contents.as_bytes())?, - )); + let (markdown, _items) = convert_rustdoc_to_markdown(contents.as_bytes())?; + + return Ok((RustdocSource::Local, markdown)); } } @@ -78,10 +77,9 @@ impl RustdocSlashCommand { ); } - Ok(( - RustdocSource::DocsDotRs, - convert_rustdoc_to_markdown(&body[..])?, - )) + let (markdown, _items) = convert_rustdoc_to_markdown(&body[..])?; + + Ok((RustdocSource::DocsDotRs, markdown)) } fn path_to_cargo_toml(project: Model, cx: &mut AppContext) -> Option> { diff --git a/crates/html_to_markdown/Cargo.toml b/crates/html_to_markdown/Cargo.toml index bac60ef9a6..e7c5f29b1e 100644 --- a/crates/html_to_markdown/Cargo.toml +++ b/crates/html_to_markdown/Cargo.toml @@ -14,8 +14,10 @@ path = "src/html_to_markdown.rs" [dependencies] anyhow.workspace = true html5ever.workspace = true +indexmap.workspace = true markup5ever_rcdom.workspace = true regex.workspace = true +strum.workspace = true [dev-dependencies] indoc.workspace = true diff --git a/crates/html_to_markdown/src/html_to_markdown.rs b/crates/html_to_markdown/src/html_to_markdown.rs index df04f30778..3246b53a42 100644 --- a/crates/html_to_markdown/src/html_to_markdown.rs +++ b/crates/html_to_markdown/src/html_to_markdown.rs @@ -5,7 +5,9 @@ pub mod markdown; mod markdown_writer; pub mod structure; +use std::cell::RefCell; use std::io::Read; +use std::rc::Rc; use anyhow::{Context, Result}; use html5ever::driver::ParseOpts; @@ -19,13 +21,11 @@ use crate::markdown::{ }; use crate::markdown_writer::MarkdownWriter; -pub use crate::markdown_writer::HandleTag; +pub use crate::markdown_writer::{HandleTag, TagHandler}; +use crate::structure::rustdoc::RustdocItem; /// Converts the provided HTML to Markdown. -pub fn convert_html_to_markdown( - html: impl Read, - handlers: Vec>, -) -> Result { +pub fn convert_html_to_markdown(html: impl Read, handlers: &mut Vec) -> Result { let dom = parse_html(html).context("failed to parse HTML")?; let markdown_writer = MarkdownWriter::new(); @@ -37,21 +37,32 @@ pub fn convert_html_to_markdown( } /// Converts the provided rustdoc HTML to Markdown. -pub fn convert_rustdoc_to_markdown(html: impl Read) -> Result { - convert_html_to_markdown( - html, - vec![ - Box::new(ParagraphHandler), - Box::new(HeadingHandler), - Box::new(ListHandler), - Box::new(TableHandler::new()), - Box::new(StyledTextHandler), - Box::new(structure::rustdoc::RustdocChromeRemover), - Box::new(structure::rustdoc::RustdocHeadingHandler), - Box::new(structure::rustdoc::RustdocCodeHandler), - Box::new(structure::rustdoc::RustdocItemHandler), - ], - ) +pub fn convert_rustdoc_to_markdown(html: impl Read) -> Result<(String, Vec)> { + let item_collector = Rc::new(RefCell::new(structure::rustdoc::RustdocItemCollector::new())); + + let mut handlers: Vec = vec![ + Rc::new(RefCell::new(ParagraphHandler)), + Rc::new(RefCell::new(HeadingHandler)), + Rc::new(RefCell::new(ListHandler)), + Rc::new(RefCell::new(TableHandler::new())), + Rc::new(RefCell::new(StyledTextHandler)), + Rc::new(RefCell::new(structure::rustdoc::RustdocChromeRemover)), + Rc::new(RefCell::new(structure::rustdoc::RustdocHeadingHandler)), + Rc::new(RefCell::new(structure::rustdoc::RustdocCodeHandler)), + Rc::new(RefCell::new(structure::rustdoc::RustdocItemHandler)), + item_collector.clone(), + ]; + + let markdown = convert_html_to_markdown(html, &mut handlers)?; + + let items = item_collector + .borrow() + .items + .values() + .cloned() + .collect::>(); + + Ok((markdown, items)) } fn parse_html(mut html: impl Read) -> Result { @@ -77,6 +88,20 @@ mod tests { use super::*; + fn rustdoc_handlers() -> Vec { + vec![ + Rc::new(RefCell::new(ParagraphHandler)), + Rc::new(RefCell::new(HeadingHandler)), + Rc::new(RefCell::new(ListHandler)), + Rc::new(RefCell::new(TableHandler::new())), + Rc::new(RefCell::new(StyledTextHandler)), + Rc::new(RefCell::new(structure::rustdoc::RustdocChromeRemover)), + Rc::new(RefCell::new(structure::rustdoc::RustdocHeadingHandler)), + Rc::new(RefCell::new(structure::rustdoc::RustdocCodeHandler)), + Rc::new(RefCell::new(structure::rustdoc::RustdocItemHandler)), + ] + } + #[test] fn test_main_heading_buttons_get_removed() { let html = indoc! {r##" @@ -93,7 +118,7 @@ mod tests { .trim(); assert_eq!( - convert_rustdoc_to_markdown(html.as_bytes()).unwrap(), + convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(), expected ) } @@ -113,7 +138,7 @@ mod tests { .trim(); assert_eq!( - convert_rustdoc_to_markdown(html.as_bytes()).unwrap(), + convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(), expected ) } @@ -159,7 +184,7 @@ mod tests { .trim(); assert_eq!( - convert_rustdoc_to_markdown(html.as_bytes()).unwrap(), + convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(), expected ) } @@ -178,7 +203,7 @@ mod tests { .trim(); assert_eq!( - convert_rustdoc_to_markdown(html.as_bytes()).unwrap(), + convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(), expected ) } @@ -220,7 +245,7 @@ mod tests { .trim(); assert_eq!( - convert_rustdoc_to_markdown(html.as_bytes()).unwrap(), + convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(), expected ) } @@ -252,7 +277,7 @@ mod tests { .trim(); assert_eq!( - convert_rustdoc_to_markdown(html.as_bytes()).unwrap(), + convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(), expected ) } @@ -288,7 +313,7 @@ mod tests { .trim(); assert_eq!( - convert_rustdoc_to_markdown(html.as_bytes()).unwrap(), + convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(), expected ) } @@ -342,7 +367,7 @@ mod tests { .trim(); assert_eq!( - convert_rustdoc_to_markdown(html.as_bytes()).unwrap(), + convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(), expected ) } diff --git a/crates/html_to_markdown/src/markdown_writer.rs b/crates/html_to_markdown/src/markdown_writer.rs index 2022a62d49..7dc6308ffe 100644 --- a/crates/html_to_markdown/src/markdown_writer.rs +++ b/crates/html_to_markdown/src/markdown_writer.rs @@ -1,4 +1,6 @@ +use std::cell::RefCell; use std::collections::VecDeque; +use std::rc::Rc; use std::sync::OnceLock; use anyhow::Result; @@ -22,6 +24,8 @@ pub enum StartTagOutcome { Skip, } +pub type TagHandler = Rc>; + pub struct MarkdownWriter { current_element_stack: VecDeque, pub(crate) markdown: String, @@ -60,12 +64,8 @@ impl MarkdownWriter { self.push_str("\n\n"); } - pub fn run( - mut self, - root_node: &Handle, - mut handlers: Vec>, - ) -> Result { - self.visit_node(&root_node, &mut handlers)?; + pub fn run(mut self, root_node: &Handle, handlers: &mut Vec) -> Result { + self.visit_node(&root_node, handlers)?; Ok(Self::prettify_markdown(self.markdown)) } @@ -76,7 +76,7 @@ impl MarkdownWriter { markdown.trim().to_string() } - fn visit_node(&mut self, node: &Handle, handlers: &mut [Box]) -> Result<()> { + fn visit_node(&mut self, node: &Handle, handlers: &mut [TagHandler]) -> Result<()> { let mut current_element = None; match node.data { @@ -128,14 +128,10 @@ impl MarkdownWriter { Ok(()) } - fn start_tag( - &mut self, - tag: &HtmlElement, - handlers: &mut [Box], - ) -> StartTagOutcome { + fn start_tag(&mut self, tag: &HtmlElement, handlers: &mut [TagHandler]) -> StartTagOutcome { for handler in handlers { - if handler.should_handle(tag.tag.as_str()) { - match handler.handle_tag_start(tag, self) { + if handler.borrow().should_handle(tag.tag.as_str()) { + match handler.borrow_mut().handle_tag_start(tag, self) { StartTagOutcome::Continue => {} StartTagOutcome::Skip => return StartTagOutcome::Skip, } @@ -145,17 +141,17 @@ impl MarkdownWriter { StartTagOutcome::Continue } - fn end_tag(&mut self, tag: &HtmlElement, handlers: &mut [Box]) { + fn end_tag(&mut self, tag: &HtmlElement, handlers: &mut [TagHandler]) { for handler in handlers { - if handler.should_handle(tag.tag.as_str()) { - handler.handle_tag_end(tag, self); + if handler.borrow().should_handle(tag.tag.as_str()) { + handler.borrow_mut().handle_tag_end(tag, self); } } } - fn visit_text(&mut self, text: String, handlers: &mut [Box]) -> Result<()> { + fn visit_text(&mut self, text: String, handlers: &mut [TagHandler]) -> Result<()> { for handler in handlers { - match handler.handle_text(&text, self) { + match handler.borrow_mut().handle_text(&text, self) { HandlerOutcome::Handled => return Ok(()), HandlerOutcome::NoOp => {} } diff --git a/crates/html_to_markdown/src/structure/rustdoc.rs b/crates/html_to_markdown/src/structure/rustdoc.rs index 7d6cc2f0b3..20ed6b1748 100644 --- a/crates/html_to_markdown/src/structure/rustdoc.rs +++ b/crates/html_to_markdown/src/structure/rustdoc.rs @@ -1,3 +1,6 @@ +use indexmap::IndexMap; +use strum::{EnumIter, IntoEnumIterator}; + use crate::html_element::HtmlElement; use crate::markdown_writer::{HandleTag, HandlerOutcome, MarkdownWriter, StartTagOutcome}; @@ -203,3 +206,132 @@ impl HandleTag for RustdocChromeRemover { StartTagOutcome::Continue } } + +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Clone, Copy, EnumIter)] +pub enum RustdocItemKind { + Mod, + Macro, + Struct, + Enum, + Constant, + Trait, + Function, + TypeAlias, + AttributeMacro, + DeriveMacro, +} + +impl RustdocItemKind { + const fn class(&self) -> &'static str { + match self { + Self::Mod => "mod", + Self::Macro => "macro", + Self::Struct => "struct", + Self::Enum => "enum", + Self::Constant => "constant", + Self::Trait => "trait", + Self::Function => "fn", + Self::TypeAlias => "type", + Self::AttributeMacro => "attr", + Self::DeriveMacro => "derive", + } + } +} + +#[derive(Debug, Clone)] +pub struct RustdocItem { + pub kind: RustdocItemKind, + pub name: String, +} + +impl RustdocItem { + pub fn url_path(&self) -> String { + let name = &self.name; + match self.kind { + RustdocItemKind::Mod => format!("{name}/index.html"), + RustdocItemKind::Macro + | RustdocItemKind::Struct + | RustdocItemKind::Enum + | RustdocItemKind::Constant + | RustdocItemKind::Trait + | RustdocItemKind::Function + | RustdocItemKind::TypeAlias + | RustdocItemKind::AttributeMacro + | RustdocItemKind::DeriveMacro => { + format!("{kind}.{name}.html", kind = self.kind.class()) + } + } + } +} + +pub struct RustdocItemCollector { + pub items: IndexMap<(RustdocItemKind, String), RustdocItem>, +} + +impl RustdocItemCollector { + pub fn new() -> Self { + Self { + items: IndexMap::new(), + } + } + + fn parse_item(tag: &HtmlElement) -> Option { + if tag.tag.as_str() != "a" { + return None; + } + + let href = tag.attr("href")?; + if href == "#" { + return None; + } + + for kind in RustdocItemKind::iter() { + if tag.has_class(kind.class()) { + let name = href + .trim_start_matches(&format!("{}.", kind.class())) + .trim_end_matches("/index.html") + .trim_end_matches(".html"); + + return Some(RustdocItem { + kind, + name: name.to_owned(), + }); + } + } + + None + } +} + +impl HandleTag for RustdocItemCollector { + fn should_handle(&self, tag: &str) -> bool { + tag == "a" + } + + fn handle_tag_start( + &mut self, + tag: &HtmlElement, + writer: &mut MarkdownWriter, + ) -> StartTagOutcome { + match tag.tag.as_str() { + "a" => { + let is_reexport = writer.current_element_stack().iter().any(|element| { + if let Some(id) = element.attr("id") { + id.starts_with("reexport.") + } else { + false + } + }); + + if !is_reexport { + if let Some(item) = Self::parse_item(tag) { + self.items.insert((item.kind, item.name.clone()), item); + } + } + } + _ => {} + } + + StartTagOutcome::Continue + } +} diff --git a/crates/html_to_markdown/src/structure/wikipedia.rs b/crates/html_to_markdown/src/structure/wikipedia.rs index 2ef8f7eb6c..9c9a470685 100644 --- a/crates/html_to_markdown/src/structure/wikipedia.rs +++ b/crates/html_to_markdown/src/structure/wikipedia.rs @@ -144,20 +144,23 @@ impl HandleTag for WikipediaCodeHandler { #[cfg(test)] mod tests { + use std::cell::RefCell; + use std::rc::Rc; + use indoc::indoc; use pretty_assertions::assert_eq; - use crate::{convert_html_to_markdown, markdown}; + use crate::{convert_html_to_markdown, markdown, TagHandler}; use super::*; - fn wikipedia_handlers() -> Vec> { + fn wikipedia_handlers() -> Vec { vec![ - Box::new(markdown::ParagraphHandler), - Box::new(markdown::HeadingHandler), - Box::new(markdown::ListHandler), - Box::new(markdown::StyledTextHandler), - Box::new(WikipediaChromeRemover), + Rc::new(RefCell::new(markdown::ParagraphHandler)), + Rc::new(RefCell::new(markdown::HeadingHandler)), + Rc::new(RefCell::new(markdown::ListHandler)), + Rc::new(RefCell::new(markdown::StyledTextHandler)), + Rc::new(RefCell::new(WikipediaChromeRemover)), ] } @@ -173,7 +176,7 @@ mod tests { .trim(); assert_eq!( - convert_html_to_markdown(html.as_bytes(), wikipedia_handlers()).unwrap(), + convert_html_to_markdown(html.as_bytes(), &mut wikipedia_handlers()).unwrap(), expected ) }