ZIm/crates/html_to_markdown/src/html_element.rs
Marshall Bowers 2d9479667f
Make HTML to Markdown conversion more pluggable (#12653)
This PR overhauls the HTML to Markdown conversion functionality in order
to make it more pluggable. This will ultimately allow for supporting a
variety of different HTML input structures (both natively and via
extensions).

As part of this, the `rustdoc_to_markdown` crate has been renamed to
`html_to_markdown`.

The `MarkdownWriter` now accepts a list of trait objects that can be
used to drive the conversion of the HTML into Markdown. Right now we
have some generic handler implementations for going from plain HTML
elements to their Markdown equivalents, as well as some rustdoc-specific
ones.

Release Notes:

- N/A
2024-06-04 16:14:26 -04:00

75 lines
2.6 KiB
Rust

use std::cell::RefCell;
use std::collections::HashSet;
use std::sync::OnceLock;
use html5ever::Attribute;
/// Returns a [`HashSet`] containing the HTML elements that are inline by default.
///
/// [MDN: List of "inline" elements](https://yari-demos.prod.mdn.mozit.cloud/en-US/docs/Web/HTML/Inline_elements)
fn inline_elements() -> &'static HashSet<&'static str> {
static INLINE_ELEMENTS: OnceLock<HashSet<&str>> = OnceLock::new();
&INLINE_ELEMENTS.get_or_init(|| {
HashSet::from_iter([
"a", "abbr", "acronym", "audio", "b", "bdi", "bdo", "big", "br", "button", "canvas",
"cite", "code", "data", "datalist", "del", "dfn", "em", "embed", "i", "iframe", "img",
"input", "ins", "kbd", "label", "map", "mark", "meter", "noscript", "object", "output",
"picture", "progress", "q", "ruby", "s", "samp", "script", "select", "slot", "small",
"span", "strong", "sub", "sup", "svg", "template", "textarea", "time", "tt", "u",
"var", "video", "wbr",
])
})
}
#[derive(Debug, Clone)]
pub struct HtmlElement {
pub(crate) tag: String,
pub(crate) attrs: RefCell<Vec<Attribute>>,
}
impl HtmlElement {
/// Returns whether this [`HtmlElement`] is an inline element.
pub fn is_inline(&self) -> bool {
inline_elements().contains(self.tag.as_str())
}
/// Returns the attribute with the specified name.
pub fn attr(&self, name: &str) -> Option<String> {
self.attrs
.borrow()
.iter()
.find(|attr| attr.name.local.to_string() == name)
.map(|attr| attr.value.to_string())
}
/// Returns the list of classes on this [`HtmlElement`].
pub fn classes(&self) -> Vec<String> {
self.attrs
.borrow()
.iter()
.find(|attr| attr.name.local.to_string() == "class")
.map(|attr| {
attr.value
.split(' ')
.map(|class| class.trim().to_string())
.collect::<Vec<_>>()
})
.unwrap_or_default()
}
/// Returns whether this [`HtmlElement`] has the specified class.
pub fn has_class(&self, class: &str) -> bool {
self.has_any_classes(&[class])
}
/// Returns whether this [`HtmlElement`] has any of the specified classes.
pub fn has_any_classes(&self, classes: &[&str]) -> bool {
self.attrs.borrow().iter().any(|attr| {
attr.name.local.to_string() == "class"
&& attr
.value
.split(' ')
.any(|class| classes.contains(&class.trim()))
})
}
}