Add tag handler for collecting crate items from rustdoc output (#12903)

This PR adds a tag handler for collecting crate items from rustdoc's HTML output.

This will serve as the foundation for getting more insight into a crate's contents.

Release Notes:

- N/A
2024-06-11 15:56:37 -04:00 · 2024-06-11 15:56:37 -04:00 · 8ccd2a0c99
commit 8ccd2a0c99
parent 57b87be3a0
8 changed files with 237 additions and 75 deletions
--- a/crates/html_to_markdown/src/structure/rustdoc.rs
+++ b/crates/html_to_markdown/src/structure/rustdoc.rs
@ -1,3 +1,6 @@
+use indexmap::IndexMap;
+use strum::{EnumIter, IntoEnumIterator};
+
 use crate::html_element::HtmlElement;
 use crate::markdown_writer::{HandleTag, HandlerOutcome, MarkdownWriter, StartTagOutcome};

@ -203,3 +206,132 @@ impl HandleTag for RustdocChromeRemover {
        StartTagOutcome::Continue
    }
 }
+
+/// The kinds of crate items that can be recognized in rustdoc's HTML output.
+///
+/// Each kind maps to a class name via `RustdocItemKind::class`.
+///
+/// The declaration order matters: it determines the derived `Ord` ordering
+/// and the order in which `EnumIter` yields the kinds, which is the order
+/// `RustdocItemCollector::parse_item` tests an anchor's classes in.
+#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Clone, Copy, EnumIter)]
+pub enum RustdocItemKind {
+    /// A module (class `mod`).
+    Mod,
+    /// A macro (class `macro`).
+    Macro,
+    /// A struct (class `struct`).
+    Struct,
+    /// An enum (class `enum`).
+    Enum,
+    /// A constant (class `constant`).
+    Constant,
+    /// A trait (class `trait`).
+    Trait,
+    /// A function (class `fn`).
+    Function,
+    /// A type alias (class `type`).
+    TypeAlias,
+    /// An attribute macro (class `attr`).
+    AttributeMacro,
+    /// A derive macro (class `derive`).
+    DeriveMacro,
+}
+
+impl RustdocItemKind {
+    /// Returns the short class name associated with this kind of item.
+    ///
+    /// The same string is used in two places: as the class looked for on
+    /// anchor tags when collecting items, and as the file-name prefix when
+    /// building an item's URL path (e.g. `struct` in `struct.Foo.html`).
+    const fn class(&self) -> &'static str {
+        match *self {
+            RustdocItemKind::Mod => "mod",
+            RustdocItemKind::Macro => "macro",
+            RustdocItemKind::Struct => "struct",
+            RustdocItemKind::Enum => "enum",
+            RustdocItemKind::Constant => "constant",
+            RustdocItemKind::Trait => "trait",
+            RustdocItemKind::Function => "fn",
+            RustdocItemKind::TypeAlias => "type",
+            RustdocItemKind::AttributeMacro => "attr",
+            RustdocItemKind::DeriveMacro => "derive",
+        }
+    }
+}
+
+/// A single crate item discovered in rustdoc's HTML output.
+#[derive(Debug, Clone)]
+pub struct RustdocItem {
+    /// What kind of item this is (module, struct, function, ...).
+    pub kind: RustdocItemKind,
+    /// The item's name, as extracted from the `href` of the link to it.
+    pub name: String,
+}
+
+impl RustdocItem {
+    /// Builds the path of the documentation page for this item.
+    ///
+    /// Modules live at `{name}/index.html`; every other kind of item lives at
+    /// `{class}.{name}.html`, where `{class}` is the kind's class name.
+    pub fn url_path(&self) -> String {
+        let Self { kind, name } = self;
+        // The match stays exhaustive (no `_` arm) so that adding a new kind
+        // forces a decision about its URL layout.
+        match kind {
+            RustdocItemKind::Mod => format!("{name}/index.html"),
+            RustdocItemKind::Macro
+            | RustdocItemKind::Struct
+            | RustdocItemKind::Enum
+            | RustdocItemKind::Constant
+            | RustdocItemKind::Trait
+            | RustdocItemKind::Function
+            | RustdocItemKind::TypeAlias
+            | RustdocItemKind::AttributeMacro
+            | RustdocItemKind::DeriveMacro => {
+                let class = kind.class();
+                format!("{class}.{name}.html")
+            }
+        }
+    }
+}
+
+/// A tag handler that collects the crate items linked from rustdoc's HTML
+/// output while the document is being processed.
+pub struct RustdocItemCollector {
+    /// The items collected so far, keyed by kind and name, so that inserting
+    /// the same (kind, name) pair again replaces the earlier entry rather
+    /// than adding a duplicate.
+    pub items: IndexMap<(RustdocItemKind, String), RustdocItem>,
+}
+
+impl RustdocItemCollector {
+    /// Creates a collector that has not seen any items yet.
+    pub fn new() -> Self {
+        Self {
+            items: IndexMap::new(),
+        }
+    }
+
+    /// Tries to interpret an HTML element as a link to a crate item.
+    ///
+    /// Returns `None` when the element is not an `a` tag, has no `href`, has
+    /// the placeholder `href` of `#`, or carries none of the classes known to
+    /// `RustdocItemKind`. Otherwise the first kind (in declaration order)
+    /// whose class is present on the tag is used, and the item name is taken
+    /// from the `href` by removing the `{class}.` prefix and the
+    /// `/index.html` or `.html` suffix.
+    fn parse_item(tag: &HtmlElement) -> Option<RustdocItem> {
+        if tag.tag.as_str() != "a" {
+            return None;
+        }
+
+        let href = tag.attr("href")?;
+        if href == "#" {
+            return None;
+        }
+        let href: &str = &href;
+
+        let kind = RustdocItemKind::iter().find(|kind| tag.has_class(kind.class()))?;
+
+        // Remove each affix at most once. The `trim_*_matches` family strips
+        // a pattern repeatedly, which mangles items named after their own
+        // class (e.g. `derive.derive.html` would yield `html`).
+        let prefix = format!("{}.", kind.class());
+        let name = href.strip_prefix(prefix.as_str()).unwrap_or(href);
+        let name = name
+            .strip_suffix("/index.html")
+            .or_else(|| name.strip_suffix(".html"))
+            .unwrap_or(name);
+
+        Some(RustdocItem {
+            kind,
+            name: name.to_owned(),
+        })
+    }
+}
+
+impl HandleTag for RustdocItemCollector {
+    /// This handler is only interested in anchor (`a`) tags.
+    fn should_handle(&self, tag: &str) -> bool {
+        matches!(tag, "a")
+    }
+
+    /// Records the item an `a` tag links to, if it links to one.
+    ///
+    /// Links are skipped when any element on the writer's current element
+    /// stack has an `id` starting with `reexport.`. The document is never
+    /// altered: the outcome is always `StartTagOutcome::Continue`.
+    fn handle_tag_start(
+        &mut self,
+        tag: &HtmlElement,
+        writer: &mut MarkdownWriter,
+    ) -> StartTagOutcome {
+        if tag.tag.as_str() != "a" {
+            return StartTagOutcome::Continue;
+        }
+
+        let inside_reexport = writer
+            .current_element_stack()
+            .iter()
+            .any(|element| {
+                element
+                    .attr("id")
+                    .map_or(false, |id| id.starts_with("reexport."))
+            });
+        if inside_reexport {
+            return StartTagOutcome::Continue;
+        }
+
+        if let Some(item) = Self::parse_item(tag) {
+            let key = (item.kind, item.name.clone());
+            self.items.insert(key, item);
+        }
+
+        StartTagOutcome::Continue
+    }
+}
--- a/crates/html_to_markdown/src/structure/wikipedia.rs
+++ b/crates/html_to_markdown/src/structure/wikipedia.rs
@ -144,20 +144,23 @@ impl HandleTag for WikipediaCodeHandler {

 #[cfg(test)]
 mod tests {
+    use std::cell::RefCell;
+    use std::rc::Rc;
+
    use indoc::indoc;
    use pretty_assertions::assert_eq;

-    use crate::{convert_html_to_markdown, markdown};
+    use crate::{convert_html_to_markdown, markdown, TagHandler};

    use super::*;

-    fn wikipedia_handlers() -> Vec<Box<dyn HandleTag>> {
+    fn wikipedia_handlers() -> Vec<TagHandler> {
        vec![
-            Box::new(markdown::ParagraphHandler),
-            Box::new(markdown::HeadingHandler),
-            Box::new(markdown::ListHandler),
-            Box::new(markdown::StyledTextHandler),
-            Box::new(WikipediaChromeRemover),
+            Rc::new(RefCell::new(markdown::ParagraphHandler)),
+            Rc::new(RefCell::new(markdown::HeadingHandler)),
+            Rc::new(RefCell::new(markdown::ListHandler)),
+            Rc::new(RefCell::new(markdown::StyledTextHandler)),
+            Rc::new(RefCell::new(WikipediaChromeRemover)),
        ]
    }

@ -173,7 +176,7 @@ mod tests {
        .trim();

        assert_eq!(
-            convert_html_to_markdown(html.as_bytes(), wikipedia_handlers()).unwrap(),
+            convert_html_to_markdown(html.as_bytes(), &mut wikipedia_handlers()).unwrap(),
            expected
        )
    }