Add tag handler for collecting crate items from rustdoc output (#12903)

This PR adds a tag handler for collecting crate items from rustdoc's
HTML output.

This will serve as the foundation for getting more insight into a
crate's contents.

Release Notes:

- N/A
This commit is contained in:
Marshall Bowers 2024-06-11 15:56:37 -04:00 committed by GitHub
parent 57b87be3a0
commit 8ccd2a0c99
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 237 additions and 75 deletions

View file

@ -5,7 +5,9 @@ pub mod markdown;
mod markdown_writer;
pub mod structure;
use std::cell::RefCell;
use std::io::Read;
use std::rc::Rc;
use anyhow::{Context, Result};
use html5ever::driver::ParseOpts;
@ -19,13 +21,11 @@ use crate::markdown::{
};
use crate::markdown_writer::MarkdownWriter;
pub use crate::markdown_writer::HandleTag;
pub use crate::markdown_writer::{HandleTag, TagHandler};
use crate::structure::rustdoc::RustdocItem;
/// Converts the provided HTML to Markdown.
pub fn convert_html_to_markdown(
html: impl Read,
handlers: Vec<Box<dyn HandleTag>>,
) -> Result<String> {
pub fn convert_html_to_markdown(html: impl Read, handlers: &mut Vec<TagHandler>) -> Result<String> {
let dom = parse_html(html).context("failed to parse HTML")?;
let markdown_writer = MarkdownWriter::new();
@ -37,21 +37,32 @@ pub fn convert_html_to_markdown(
}
/// Converts the provided rustdoc HTML to Markdown.
pub fn convert_rustdoc_to_markdown(html: impl Read) -> Result<String> {
convert_html_to_markdown(
html,
vec![
Box::new(ParagraphHandler),
Box::new(HeadingHandler),
Box::new(ListHandler),
Box::new(TableHandler::new()),
Box::new(StyledTextHandler),
Box::new(structure::rustdoc::RustdocChromeRemover),
Box::new(structure::rustdoc::RustdocHeadingHandler),
Box::new(structure::rustdoc::RustdocCodeHandler),
Box::new(structure::rustdoc::RustdocItemHandler),
],
)
pub fn convert_rustdoc_to_markdown(html: impl Read) -> Result<(String, Vec<RustdocItem>)> {
    // The collector is shared: one handle goes into the handler list so it sees
    // every tag during conversion, and this handle is kept so the collected
    // items can be read back out once conversion has finished.
    let collector = Rc::new(RefCell::new(structure::rustdoc::RustdocItemCollector::new()));

    // Handlers are listed in the same order as before; the collector goes last.
    let mut tag_handlers: Vec<TagHandler> = vec![
        Rc::new(RefCell::new(ParagraphHandler)),
        Rc::new(RefCell::new(HeadingHandler)),
        Rc::new(RefCell::new(ListHandler)),
        Rc::new(RefCell::new(TableHandler::new())),
        Rc::new(RefCell::new(StyledTextHandler)),
        Rc::new(RefCell::new(structure::rustdoc::RustdocChromeRemover)),
        Rc::new(RefCell::new(structure::rustdoc::RustdocHeadingHandler)),
        Rc::new(RefCell::new(structure::rustdoc::RustdocCodeHandler)),
        Rc::new(RefCell::new(structure::rustdoc::RustdocItemHandler)),
        collector.clone(),
    ];

    let markdown = convert_html_to_markdown(html, &mut tag_handlers)?;

    // Copy the collected items out of the shared cell so the caller receives
    // owned values rather than a borrow tied to the collector.
    let collected = collector.borrow();
    let items: Vec<RustdocItem> = collected.items.values().cloned().collect();

    Ok((markdown, items))
}
fn parse_html(mut html: impl Read) -> Result<RcDom> {
@ -77,6 +88,20 @@ mod tests {
use super::*;
/// Builds a fresh set of the rustdoc tag handlers for use in tests.
///
/// This is the same handler list, in the same order, that
/// `convert_rustdoc_to_markdown` uses, minus the item collector.
fn rustdoc_handlers() -> Vec<TagHandler> {
    let mut handlers: Vec<TagHandler> = Vec::new();

    // General-purpose HTML handlers.
    handlers.push(Rc::new(RefCell::new(ParagraphHandler)));
    handlers.push(Rc::new(RefCell::new(HeadingHandler)));
    handlers.push(Rc::new(RefCell::new(ListHandler)));
    handlers.push(Rc::new(RefCell::new(TableHandler::new())));
    handlers.push(Rc::new(RefCell::new(StyledTextHandler)));

    // rustdoc-specific handlers.
    handlers.push(Rc::new(RefCell::new(structure::rustdoc::RustdocChromeRemover)));
    handlers.push(Rc::new(RefCell::new(structure::rustdoc::RustdocHeadingHandler)));
    handlers.push(Rc::new(RefCell::new(structure::rustdoc::RustdocCodeHandler)));
    handlers.push(Rc::new(RefCell::new(structure::rustdoc::RustdocItemHandler)));

    handlers
}
#[test]
fn test_main_heading_buttons_get_removed() {
let html = indoc! {r##"
@ -93,7 +118,7 @@ mod tests {
.trim();
assert_eq!(
convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
expected
)
}
@ -113,7 +138,7 @@ mod tests {
.trim();
assert_eq!(
convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
expected
)
}
@ -159,7 +184,7 @@ mod tests {
.trim();
assert_eq!(
convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
expected
)
}
@ -178,7 +203,7 @@ mod tests {
.trim();
assert_eq!(
convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
expected
)
}
@ -220,7 +245,7 @@ mod tests {
.trim();
assert_eq!(
convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
expected
)
}
@ -252,7 +277,7 @@ mod tests {
.trim();
assert_eq!(
convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
expected
)
}
@ -288,7 +313,7 @@ mod tests {
.trim();
assert_eq!(
convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
expected
)
}
@ -342,7 +367,7 @@ mod tests {
.trim();
assert_eq!(
convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
expected
)
}