Add tag handler for collecting crate items from rustdoc output (#12903)

This PR adds a tag handler for collecting crate items from rustdoc's
HTML output.

This will serve as the foundation for getting more insight into a
crate's contents.

Release Notes:

- N/A
This commit is contained in:
Marshall Bowers 2024-06-11 15:56:37 -04:00 committed by GitHub
parent 57b87be3a0
commit 8ccd2a0c99
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 237 additions and 75 deletions

View file

@ -14,8 +14,10 @@ path = "src/html_to_markdown.rs"
[dependencies]
anyhow.workspace = true
html5ever.workspace = true
indexmap.workspace = true
markup5ever_rcdom.workspace = true
regex.workspace = true
strum.workspace = true
[dev-dependencies]
indoc.workspace = true

View file

@ -5,7 +5,9 @@ pub mod markdown;
mod markdown_writer;
pub mod structure;
use std::cell::RefCell;
use std::io::Read;
use std::rc::Rc;
use anyhow::{Context, Result};
use html5ever::driver::ParseOpts;
@ -19,13 +21,11 @@ use crate::markdown::{
};
use crate::markdown_writer::MarkdownWriter;
pub use crate::markdown_writer::HandleTag;
pub use crate::markdown_writer::{HandleTag, TagHandler};
use crate::structure::rustdoc::RustdocItem;
/// Converts the provided HTML to Markdown.
pub fn convert_html_to_markdown(
html: impl Read,
handlers: Vec<Box<dyn HandleTag>>,
) -> Result<String> {
pub fn convert_html_to_markdown(html: impl Read, handlers: &mut Vec<TagHandler>) -> Result<String> {
let dom = parse_html(html).context("failed to parse HTML")?;
let markdown_writer = MarkdownWriter::new();
@ -37,21 +37,32 @@ pub fn convert_html_to_markdown(
}
/// Converts the provided rustdoc HTML to Markdown.
pub fn convert_rustdoc_to_markdown(html: impl Read) -> Result<String> {
convert_html_to_markdown(
html,
vec![
Box::new(ParagraphHandler),
Box::new(HeadingHandler),
Box::new(ListHandler),
Box::new(TableHandler::new()),
Box::new(StyledTextHandler),
Box::new(structure::rustdoc::RustdocChromeRemover),
Box::new(structure::rustdoc::RustdocHeadingHandler),
Box::new(structure::rustdoc::RustdocCodeHandler),
Box::new(structure::rustdoc::RustdocItemHandler),
],
)
/// Converts the provided rustdoc HTML to Markdown and gathers the crate items
/// linked from it.
///
/// A `RustdocItemCollector` is appended to the usual rustdoc handlers. A second
/// handle to it is kept here so the collected items can be read back once the
/// conversion has finished.
///
/// Returns the Markdown together with a copy of every collected item, in the
/// order the collector's map yields them.
///
/// # Errors
///
/// Propagates any error returned by `convert_html_to_markdown`.
pub fn convert_rustdoc_to_markdown(html: impl Read) -> Result<(String, Vec<RustdocItem>)> {
    let collector = Rc::new(RefCell::new(structure::rustdoc::RustdocItemCollector::new()));

    let mut tag_handlers: Vec<TagHandler> = vec![
        Rc::new(RefCell::new(ParagraphHandler)),
        Rc::new(RefCell::new(HeadingHandler)),
        Rc::new(RefCell::new(ListHandler)),
        Rc::new(RefCell::new(TableHandler::new())),
        Rc::new(RefCell::new(StyledTextHandler)),
        Rc::new(RefCell::new(structure::rustdoc::RustdocChromeRemover)),
        Rc::new(RefCell::new(structure::rustdoc::RustdocHeadingHandler)),
        Rc::new(RefCell::new(structure::rustdoc::RustdocCodeHandler)),
        Rc::new(RefCell::new(structure::rustdoc::RustdocItemHandler)),
        // Method-call `clone` (rather than `Rc::clone(&collector)`) so the
        // resulting `Rc` can be unsize-coerced to `TagHandler`.
        collector.clone(),
    ];

    let markdown = convert_html_to_markdown(html, &mut tag_handlers)?;

    // The shared borrow only lives for this statement.
    let items: Vec<RustdocItem> = collector.borrow().items.values().cloned().collect();

    Ok((markdown, items))
}
fn parse_html(mut html: impl Read) -> Result<RcDom> {
@ -77,6 +88,20 @@ mod tests {
use super::*;
fn rustdoc_handlers() -> Vec<TagHandler> {
vec![
Rc::new(RefCell::new(ParagraphHandler)),
Rc::new(RefCell::new(HeadingHandler)),
Rc::new(RefCell::new(ListHandler)),
Rc::new(RefCell::new(TableHandler::new())),
Rc::new(RefCell::new(StyledTextHandler)),
Rc::new(RefCell::new(structure::rustdoc::RustdocChromeRemover)),
Rc::new(RefCell::new(structure::rustdoc::RustdocHeadingHandler)),
Rc::new(RefCell::new(structure::rustdoc::RustdocCodeHandler)),
Rc::new(RefCell::new(structure::rustdoc::RustdocItemHandler)),
]
}
#[test]
fn test_main_heading_buttons_get_removed() {
let html = indoc! {r##"
@ -93,7 +118,7 @@ mod tests {
.trim();
assert_eq!(
convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
expected
)
}
@ -113,7 +138,7 @@ mod tests {
.trim();
assert_eq!(
convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
expected
)
}
@ -159,7 +184,7 @@ mod tests {
.trim();
assert_eq!(
convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
expected
)
}
@ -178,7 +203,7 @@ mod tests {
.trim();
assert_eq!(
convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
expected
)
}
@ -220,7 +245,7 @@ mod tests {
.trim();
assert_eq!(
convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
expected
)
}
@ -252,7 +277,7 @@ mod tests {
.trim();
assert_eq!(
convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
expected
)
}
@ -288,7 +313,7 @@ mod tests {
.trim();
assert_eq!(
convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
expected
)
}
@ -342,7 +367,7 @@ mod tests {
.trim();
assert_eq!(
convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
expected
)
}

View file

@ -1,4 +1,6 @@
use std::cell::RefCell;
use std::collections::VecDeque;
use std::rc::Rc;
use std::sync::OnceLock;
use anyhow::Result;
@ -22,6 +24,8 @@ pub enum StartTagOutcome {
Skip,
}
/// A shared, interiorly mutable handle to a tag handler.
///
/// Handlers are reference-counted so that a caller can keep its own handle to
/// a handler (for example, to read state it accumulated) while the same
/// handler is also present in the list passed to the writer.
pub type TagHandler = Rc<RefCell<dyn HandleTag>>;
pub struct MarkdownWriter {
current_element_stack: VecDeque<HtmlElement>,
pub(crate) markdown: String,
@ -60,12 +64,8 @@ impl MarkdownWriter {
self.push_str("\n\n");
}
pub fn run(
mut self,
root_node: &Handle,
mut handlers: Vec<Box<dyn HandleTag>>,
) -> Result<String> {
self.visit_node(&root_node, &mut handlers)?;
pub fn run(mut self, root_node: &Handle, handlers: &mut Vec<TagHandler>) -> Result<String> {
self.visit_node(&root_node, handlers)?;
Ok(Self::prettify_markdown(self.markdown))
}
@ -76,7 +76,7 @@ impl MarkdownWriter {
markdown.trim().to_string()
}
fn visit_node(&mut self, node: &Handle, handlers: &mut [Box<dyn HandleTag>]) -> Result<()> {
fn visit_node(&mut self, node: &Handle, handlers: &mut [TagHandler]) -> Result<()> {
let mut current_element = None;
match node.data {
@ -128,14 +128,10 @@ impl MarkdownWriter {
Ok(())
}
fn start_tag(
&mut self,
tag: &HtmlElement,
handlers: &mut [Box<dyn HandleTag>],
) -> StartTagOutcome {
fn start_tag(&mut self, tag: &HtmlElement, handlers: &mut [TagHandler]) -> StartTagOutcome {
for handler in handlers {
if handler.should_handle(tag.tag.as_str()) {
match handler.handle_tag_start(tag, self) {
if handler.borrow().should_handle(tag.tag.as_str()) {
match handler.borrow_mut().handle_tag_start(tag, self) {
StartTagOutcome::Continue => {}
StartTagOutcome::Skip => return StartTagOutcome::Skip,
}
@ -145,17 +141,17 @@ impl MarkdownWriter {
StartTagOutcome::Continue
}
fn end_tag(&mut self, tag: &HtmlElement, handlers: &mut [Box<dyn HandleTag>]) {
fn end_tag(&mut self, tag: &HtmlElement, handlers: &mut [TagHandler]) {
for handler in handlers {
if handler.should_handle(tag.tag.as_str()) {
handler.handle_tag_end(tag, self);
if handler.borrow().should_handle(tag.tag.as_str()) {
handler.borrow_mut().handle_tag_end(tag, self);
}
}
}
fn visit_text(&mut self, text: String, handlers: &mut [Box<dyn HandleTag>]) -> Result<()> {
fn visit_text(&mut self, text: String, handlers: &mut [TagHandler]) -> Result<()> {
for handler in handlers {
match handler.handle_text(&text, self) {
match handler.borrow_mut().handle_text(&text, self) {
HandlerOutcome::Handled => return Ok(()),
HandlerOutcome::NoOp => {}
}

View file

@ -1,3 +1,6 @@
use indexmap::IndexMap;
use strum::{EnumIter, IntoEnumIterator};
use crate::html_element::HtmlElement;
use crate::markdown_writer::{HandleTag, HandlerOutcome, MarkdownWriter, StartTagOutcome};
@ -203,3 +206,132 @@ impl HandleTag for RustdocChromeRemover {
StartTagOutcome::Continue
}
}
/// The kinds of crate items that can be recognized in rustdoc's HTML output.
///
/// Each kind is associated with a class name (see `RustdocItemKind::class`)
/// that is looked for on item links.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Clone, Copy, EnumIter)]
pub enum RustdocItemKind {
/// A module (class `mod`).
Mod,
/// A macro (class `macro`).
Macro,
/// A struct (class `struct`).
Struct,
/// An enum (class `enum`).
Enum,
/// A constant (class `constant`).
Constant,
/// A trait (class `trait`).
Trait,
/// A function (class `fn`).
Function,
/// A type alias (class `type`).
TypeAlias,
/// An attribute macro (class `attr`).
AttributeMacro,
/// A derive macro (class `derive`).
DeriveMacro,
}
impl RustdocItemKind {
const fn class(&self) -> &'static str {
match self {
Self::Mod => "mod",
Self::Macro => "macro",
Self::Struct => "struct",
Self::Enum => "enum",
Self::Constant => "constant",
Self::Trait => "trait",
Self::Function => "fn",
Self::TypeAlias => "type",
Self::AttributeMacro => "attr",
Self::DeriveMacro => "derive",
}
}
}
/// A single crate item discovered in rustdoc output.
#[derive(Debug, Clone)]
pub struct RustdocItem {
/// What kind of item this is.
pub kind: RustdocItemKind,
/// The item's name, as extracted from the `href` of the link pointing at it.
pub name: String,
}
impl RustdocItem {
    /// Returns the relative path of this item's documentation page.
    ///
    /// Modules map to `<name>/index.html`; every other kind maps to
    /// `<class>.<name>.html`, where `<class>` is the kind's class name.
    pub fn url_path(&self) -> String {
        match self.kind {
            RustdocItemKind::Mod => format!("{}/index.html", self.name),
            // Listed explicitly (no `_` arm) so that adding a variant forces a
            // decision about its URL shape.
            kind @ (RustdocItemKind::Macro
            | RustdocItemKind::Struct
            | RustdocItemKind::Enum
            | RustdocItemKind::Constant
            | RustdocItemKind::Trait
            | RustdocItemKind::Function
            | RustdocItemKind::TypeAlias
            | RustdocItemKind::AttributeMacro
            | RustdocItemKind::DeriveMacro) => {
                format!("{}.{}.html", kind.class(), self.name)
            }
        }
    }
}
/// A tag handler that records the crate items linked from a rustdoc page.
pub struct RustdocItemCollector {
/// The collected items, keyed by kind and name so that repeated links to the
/// same item are stored only once.
pub items: IndexMap<(RustdocItemKind, String), RustdocItem>,
}
impl RustdocItemCollector {
    /// Creates a collector that has not recorded any items yet.
    pub fn new() -> Self {
        Self {
            items: IndexMap::new(),
        }
    }

    /// Attempts to interpret `tag` as a link to a crate item.
    ///
    /// Returns `None` when the element is not an `<a>`, has no `href`
    /// attribute, has an `href` of exactly `#`, or carries none of the
    /// classes known to `RustdocItemKind`.
    fn parse_item(tag: &HtmlElement) -> Option<RustdocItem> {
        if tag.tag.as_str() != "a" {
            return None;
        }

        let href = tag.attr("href")?;
        if href == "#" {
            return None;
        }

        // The first kind (in declaration order) whose class is present wins.
        let kind = RustdocItemKind::iter().find(|kind| tag.has_class(kind.class()))?;
        let name = Self::item_name(&href, kind.class()).to_owned();

        Some(RustdocItem { kind, name })
    }

    /// Extracts the item name from a link target such as `struct.Foo.html`
    /// or `foo/index.html`.
    ///
    /// The `<class>.` prefix and the `/index.html` / `.html` suffixes are each
    /// removed at most once. Repeated stripping (as `trim_start_matches` /
    /// `trim_end_matches` do) would mangle items whose name equals their
    /// class: `derive.derive.html` must yield `derive`, not `html`.
    fn item_name<'a>(href: &'a str, class: &str) -> &'a str {
        let name = href
            .strip_prefix(class)
            .and_then(|rest| rest.strip_prefix('.'))
            .unwrap_or(href);
        let name = name.strip_suffix("/index.html").unwrap_or(name);
        name.strip_suffix(".html").unwrap_or(name)
    }
}
impl HandleTag for RustdocItemCollector {
    /// The collector only looks at `<a>` elements.
    fn should_handle(&self, tag: &str) -> bool {
        matches!(tag, "a")
    }

    /// Records the item linked by an `<a>` element.
    ///
    /// Links are ignored when any element on the writer's current element
    /// stack has an `id` starting with `reexport.`. Conversion always
    /// continues; this handler never skips an element.
    fn handle_tag_start(
        &mut self,
        tag: &HtmlElement,
        writer: &mut MarkdownWriter,
    ) -> StartTagOutcome {
        if tag.tag.as_str() != "a" {
            return StartTagOutcome::Continue;
        }

        let inside_reexport = writer.current_element_stack().iter().any(|entry| {
            entry
                .attr("id")
                .is_some_and(|id| id.starts_with("reexport."))
        });
        if inside_reexport {
            return StartTagOutcome::Continue;
        }

        if let Some(item) = Self::parse_item(tag) {
            let key = (item.kind, item.name.clone());
            self.items.insert(key, item);
        }

        StartTagOutcome::Continue
    }
}

View file

@ -144,20 +144,23 @@ impl HandleTag for WikipediaCodeHandler {
#[cfg(test)]
mod tests {
use std::cell::RefCell;
use std::rc::Rc;
use indoc::indoc;
use pretty_assertions::assert_eq;
use crate::{convert_html_to_markdown, markdown};
use crate::{convert_html_to_markdown, markdown, TagHandler};
use super::*;
fn wikipedia_handlers() -> Vec<Box<dyn HandleTag>> {
fn wikipedia_handlers() -> Vec<TagHandler> {
vec![
Box::new(markdown::ParagraphHandler),
Box::new(markdown::HeadingHandler),
Box::new(markdown::ListHandler),
Box::new(markdown::StyledTextHandler),
Box::new(WikipediaChromeRemover),
Rc::new(RefCell::new(markdown::ParagraphHandler)),
Rc::new(RefCell::new(markdown::HeadingHandler)),
Rc::new(RefCell::new(markdown::ListHandler)),
Rc::new(RefCell::new(markdown::StyledTextHandler)),
Rc::new(RefCell::new(WikipediaChromeRemover)),
]
}
@ -173,7 +176,7 @@ mod tests {
.trim();
assert_eq!(
convert_html_to_markdown(html.as_bytes(), wikipedia_handlers()).unwrap(),
convert_html_to_markdown(html.as_bytes(), &mut wikipedia_handlers()).unwrap(),
expected
)
}