Make HTML to Markdown conversion more pluggable (#12653)
This PR overhauls the HTML to Markdown conversion functionality in order to make it more pluggable. This will ultimately allow for supporting a variety of different HTML input structures (both natively and via extensions). As part of this, the `rustdoc_to_markdown` crate has been renamed to `html_to_markdown`. The `MarkdownWriter` now accepts a list of trait objects that can be used to drive the conversion of the HTML into Markdown. Right now we have some generic handler implementations for going from plain HTML elements to their Markdown equivalents, as well as some rustdoc-specific ones. Release Notes: - N/A
This commit is contained in:
parent
1c617474fe
commit
2d9479667f
15 changed files with 671 additions and 320 deletions
75
crates/html_to_markdown/src/html_element.rs
Normal file
75
crates/html_to_markdown/src/html_element.rs
Normal file
|
@ -0,0 +1,75 @@
|
|||
use std::cell::RefCell;
|
||||
use std::collections::HashSet;
|
||||
use std::sync::OnceLock;
|
||||
|
||||
use html5ever::Attribute;
|
||||
|
||||
/// Returns the lazily-built, process-wide set of HTML tag names that are
/// rendered inline by default.
///
/// The set is constructed on the first call and the same instance is handed
/// out on every subsequent call.
///
/// [MDN: List of "inline" elements](https://yari-demos.prod.mdn.mozit.cloud/en-US/docs/Web/HTML/Inline_elements)
fn inline_elements() -> &'static HashSet<&'static str> {
    /// Tag names considered inline, in alphabetical order.
    const INLINE_TAGS: &[&str] = &[
        "a", "abbr", "acronym", "audio", "b", "bdi", "bdo", "big", "br", "button", "canvas",
        "cite", "code", "data", "datalist", "del", "dfn", "em", "embed", "i", "iframe", "img",
        "input", "ins", "kbd", "label", "map", "mark", "meter", "noscript", "object", "output",
        "picture", "progress", "q", "ruby", "s", "samp", "script", "select", "slot", "small",
        "span", "strong", "sub", "sup", "svg", "template", "textarea", "time", "tt", "u",
        "var", "video", "wbr",
    ];

    static INLINE_ELEMENTS: OnceLock<HashSet<&'static str>> = OnceLock::new();

    // `get_or_init` already yields a `&'static` reference to the stored set.
    INLINE_ELEMENTS.get_or_init(|| INLINE_TAGS.iter().copied().collect())
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct HtmlElement {
|
||||
pub(crate) tag: String,
|
||||
pub(crate) attrs: RefCell<Vec<Attribute>>,
|
||||
}
|
||||
|
||||
impl HtmlElement {
|
||||
/// Returns whether this [`HtmlElement`] is an inline element.
|
||||
pub fn is_inline(&self) -> bool {
|
||||
inline_elements().contains(self.tag.as_str())
|
||||
}
|
||||
|
||||
/// Returns the attribute with the specified name.
|
||||
pub fn attr(&self, name: &str) -> Option<String> {
|
||||
self.attrs
|
||||
.borrow()
|
||||
.iter()
|
||||
.find(|attr| attr.name.local.to_string() == name)
|
||||
.map(|attr| attr.value.to_string())
|
||||
}
|
||||
|
||||
/// Returns the list of classes on this [`HtmlElement`].
|
||||
pub fn classes(&self) -> Vec<String> {
|
||||
self.attrs
|
||||
.borrow()
|
||||
.iter()
|
||||
.find(|attr| attr.name.local.to_string() == "class")
|
||||
.map(|attr| {
|
||||
attr.value
|
||||
.split(' ')
|
||||
.map(|class| class.trim().to_string())
|
||||
.collect::<Vec<_>>()
|
||||
})
|
||||
.unwrap_or_default()
|
||||
}
|
||||
|
||||
/// Returns whether this [`HtmlElement`] has the specified class.
|
||||
pub fn has_class(&self, class: &str) -> bool {
|
||||
self.has_any_classes(&[class])
|
||||
}
|
||||
|
||||
/// Returns whether this [`HtmlElement`] has any of the specified classes.
|
||||
pub fn has_any_classes(&self, classes: &[&str]) -> bool {
|
||||
self.attrs.borrow().iter().any(|attr| {
|
||||
attr.name.local.to_string() == "class"
|
||||
&& attr
|
||||
.value
|
||||
.split(' ')
|
||||
.any(|class| classes.contains(&class.trim()))
|
||||
})
|
||||
}
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue