Make HTML to Markdown conversion more pluggable (#12653)
This PR overhauls the HTML to Markdown conversion functionality in order to make it more pluggable. This will ultimately allow for supporting a variety of different HTML input structures (both natively and via extensions). As part of this, the `rustdoc_to_markdown` crate has been renamed to `html_to_markdown`. The `MarkdownWriter` now accepts a list of trait objects that can be used to drive the conversion of the HTML into Markdown. Right now we have some generic handler implementations for going from plain HTML elements to their Markdown equivalents, as well as some rustdoc-specific ones. Release Notes: - N/A
This commit is contained in:
parent
1c617474fe
commit
2d9479667f
15 changed files with 671 additions and 320 deletions
22
crates/html_to_markdown/Cargo.toml
Normal file
22
crates/html_to_markdown/Cargo.toml
Normal file
|
@ -0,0 +1,22 @@
|
|||
[package]
|
||||
name = "html_to_markdown"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
publish = false
|
||||
license = "GPL-3.0-or-later"
|
||||
|
||||
[lints]
|
||||
workspace = true
|
||||
|
||||
[lib]
|
||||
path = "src/html_to_markdown.rs"
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
html5ever.workspace = true
|
||||
markup5ever_rcdom.workspace = true
|
||||
regex.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
indoc.workspace = true
|
||||
pretty_assertions.workspace = true
|
1
crates/html_to_markdown/LICENSE-GPL
Symbolic link
1
crates/html_to_markdown/LICENSE-GPL
Symbolic link
|
@ -0,0 +1 @@
|
|||
../../LICENSE-GPL
|
29
crates/html_to_markdown/examples/test.rs
Normal file
29
crates/html_to_markdown/examples/test.rs
Normal file
|
@ -0,0 +1,29 @@
|
|||
use html_to_markdown::convert_rustdoc_to_markdown;
|
||||
use indoc::indoc;
|
||||
|
||||
pub fn main() {
|
||||
let html = indoc! {"
|
||||
<html>
|
||||
<body>
|
||||
<h1>Hello World</h1>
|
||||
<p>
|
||||
Here is some content.
|
||||
</p>
|
||||
<h2>Some items</h2>
|
||||
<ul>
|
||||
<li>One</li>
|
||||
<li>Two</li>
|
||||
<li>Three</li>
|
||||
</ul>
|
||||
</body>
|
||||
</html>
|
||||
"};
|
||||
// To test this out with some real input, try this:
|
||||
//
|
||||
// ```
|
||||
// let html = include_str!("/path/to/zed/target/doc/gpui/index.html");
|
||||
// ```
|
||||
let markdown = convert_rustdoc_to_markdown(html.as_bytes()).unwrap();
|
||||
|
||||
println!("{markdown}");
|
||||
}
|
75
crates/html_to_markdown/src/html_element.rs
Normal file
75
crates/html_to_markdown/src/html_element.rs
Normal file
|
@ -0,0 +1,75 @@
|
|||
use std::cell::RefCell;
|
||||
use std::collections::HashSet;
|
||||
use std::sync::OnceLock;
|
||||
|
||||
use html5ever::Attribute;
|
||||
|
||||
/// Returns a [`HashSet`] containing the HTML elements that are inline by default.
|
||||
///
|
||||
/// [MDN: List of "inline" elements](https://yari-demos.prod.mdn.mozit.cloud/en-US/docs/Web/HTML/Inline_elements)
|
||||
fn inline_elements() -> &'static HashSet<&'static str> {
|
||||
static INLINE_ELEMENTS: OnceLock<HashSet<&str>> = OnceLock::new();
|
||||
&INLINE_ELEMENTS.get_or_init(|| {
|
||||
HashSet::from_iter([
|
||||
"a", "abbr", "acronym", "audio", "b", "bdi", "bdo", "big", "br", "button", "canvas",
|
||||
"cite", "code", "data", "datalist", "del", "dfn", "em", "embed", "i", "iframe", "img",
|
||||
"input", "ins", "kbd", "label", "map", "mark", "meter", "noscript", "object", "output",
|
||||
"picture", "progress", "q", "ruby", "s", "samp", "script", "select", "slot", "small",
|
||||
"span", "strong", "sub", "sup", "svg", "template", "textarea", "time", "tt", "u",
|
||||
"var", "video", "wbr",
|
||||
])
|
||||
})
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct HtmlElement {
|
||||
pub(crate) tag: String,
|
||||
pub(crate) attrs: RefCell<Vec<Attribute>>,
|
||||
}
|
||||
|
||||
impl HtmlElement {
|
||||
/// Returns whether this [`HtmlElement`] is an inline element.
|
||||
pub fn is_inline(&self) -> bool {
|
||||
inline_elements().contains(self.tag.as_str())
|
||||
}
|
||||
|
||||
/// Returns the attribute with the specified name.
|
||||
pub fn attr(&self, name: &str) -> Option<String> {
|
||||
self.attrs
|
||||
.borrow()
|
||||
.iter()
|
||||
.find(|attr| attr.name.local.to_string() == name)
|
||||
.map(|attr| attr.value.to_string())
|
||||
}
|
||||
|
||||
/// Returns the list of classes on this [`HtmlElement`].
|
||||
pub fn classes(&self) -> Vec<String> {
|
||||
self.attrs
|
||||
.borrow()
|
||||
.iter()
|
||||
.find(|attr| attr.name.local.to_string() == "class")
|
||||
.map(|attr| {
|
||||
attr.value
|
||||
.split(' ')
|
||||
.map(|class| class.trim().to_string())
|
||||
.collect::<Vec<_>>()
|
||||
})
|
||||
.unwrap_or_default()
|
||||
}
|
||||
|
||||
/// Returns whether this [`HtmlElement`] has the specified class.
|
||||
pub fn has_class(&self, class: &str) -> bool {
|
||||
self.has_any_classes(&[class])
|
||||
}
|
||||
|
||||
/// Returns whether this [`HtmlElement`] has any of the specified classes.
|
||||
pub fn has_any_classes(&self, classes: &[&str]) -> bool {
|
||||
self.attrs.borrow().iter().any(|attr| {
|
||||
attr.name.local.to_string() == "class"
|
||||
&& attr
|
||||
.value
|
||||
.split(' ')
|
||||
.any(|class| classes.contains(&class.trim()))
|
||||
})
|
||||
}
|
||||
}
|
362
crates/html_to_markdown/src/html_to_markdown.rs
Normal file
362
crates/html_to_markdown/src/html_to_markdown.rs
Normal file
|
@ -0,0 +1,362 @@
|
|||
//! Provides conversion from rustdoc's HTML output to Markdown.
|
||||
|
||||
#![deny(missing_docs)]
|
||||
|
||||
mod html_element;
|
||||
mod markdown;
|
||||
mod markdown_writer;
|
||||
mod structure;
|
||||
|
||||
use std::io::Read;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use html5ever::driver::ParseOpts;
|
||||
use html5ever::parse_document;
|
||||
use html5ever::tendril::TendrilSink;
|
||||
use html5ever::tree_builder::TreeBuilderOpts;
|
||||
use markup5ever_rcdom::RcDom;
|
||||
|
||||
use crate::markdown::{HeadingHandler, ListHandler, ParagraphHandler, StyledTextHandler};
|
||||
use crate::markdown_writer::{HandleTag, MarkdownWriter};
|
||||
|
||||
/// Converts the provided HTML to Markdown.
|
||||
pub fn convert_html_to_markdown(html: impl Read) -> Result<String> {
|
||||
let dom = parse_html(html).context("failed to parse HTML")?;
|
||||
|
||||
let handlers: Vec<Box<dyn HandleTag>> = vec![
|
||||
Box::new(ParagraphHandler),
|
||||
Box::new(HeadingHandler),
|
||||
Box::new(ListHandler),
|
||||
Box::new(StyledTextHandler),
|
||||
Box::new(structure::rustdoc::RustdocChromeRemover),
|
||||
Box::new(structure::rustdoc::RustdocHeadingHandler),
|
||||
Box::new(structure::rustdoc::RustdocCodeHandler),
|
||||
Box::new(structure::rustdoc::RustdocTableHandler::new()),
|
||||
Box::new(structure::rustdoc::RustdocItemHandler),
|
||||
];
|
||||
|
||||
let markdown_writer = MarkdownWriter::new();
|
||||
let markdown = markdown_writer
|
||||
.run(&dom.document, handlers)
|
||||
.context("failed to convert HTML to Markdown")?;
|
||||
|
||||
Ok(markdown)
|
||||
}
|
||||
|
||||
/// Converts the provided rustdoc HTML to Markdown.
|
||||
pub fn convert_rustdoc_to_markdown(html: impl Read) -> Result<String> {
|
||||
let dom = parse_html(html).context("failed to parse rustdoc HTML")?;
|
||||
|
||||
let handlers: Vec<Box<dyn HandleTag>> = vec![
|
||||
Box::new(ParagraphHandler),
|
||||
Box::new(HeadingHandler),
|
||||
Box::new(ListHandler),
|
||||
Box::new(StyledTextHandler),
|
||||
Box::new(structure::rustdoc::RustdocChromeRemover),
|
||||
Box::new(structure::rustdoc::RustdocHeadingHandler),
|
||||
Box::new(structure::rustdoc::RustdocCodeHandler),
|
||||
Box::new(structure::rustdoc::RustdocTableHandler::new()),
|
||||
Box::new(structure::rustdoc::RustdocItemHandler),
|
||||
];
|
||||
|
||||
let markdown_writer = MarkdownWriter::new();
|
||||
let markdown = markdown_writer
|
||||
.run(&dom.document, handlers)
|
||||
.context("failed to convert rustdoc HTML to Markdown")?;
|
||||
|
||||
Ok(markdown)
|
||||
}
|
||||
|
||||
fn parse_html(mut html: impl Read) -> Result<RcDom> {
|
||||
let parse_options = ParseOpts {
|
||||
tree_builder: TreeBuilderOpts {
|
||||
drop_doctype: true,
|
||||
..Default::default()
|
||||
},
|
||||
..Default::default()
|
||||
};
|
||||
let dom = parse_document(RcDom::default(), parse_options)
|
||||
.from_utf8()
|
||||
.read_from(&mut html)
|
||||
.context("failed to parse HTML document")?;
|
||||
|
||||
Ok(dom)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use indoc::indoc;
|
||||
use pretty_assertions::assert_eq;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_main_heading_buttons_get_removed() {
|
||||
let html = indoc! {r##"
|
||||
<div class="main-heading">
|
||||
<h1>Crate <a class="mod" href="#">serde</a><button id="copy-path" title="Copy item path to clipboard">Copy item path</button></h1>
|
||||
<span class="out-of-band">
|
||||
<a class="src" href="../src/serde/lib.rs.html#1-340">source</a> · <button id="toggle-all-docs" title="collapse all docs">[<span>−</span>]</button>
|
||||
</span>
|
||||
</div>
|
||||
"##};
|
||||
let expected = indoc! {"
|
||||
# Crate serde
|
||||
"}
|
||||
.trim();
|
||||
|
||||
assert_eq!(
|
||||
convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
|
||||
expected
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_single_paragraph() {
|
||||
let html = indoc! {r#"
|
||||
<p>In particular, the last point is what sets <code>axum</code> apart from other frameworks.
|
||||
<code>axum</code> doesn’t have its own middleware system but instead uses
|
||||
<a href="https://docs.rs/tower-service/0.3.2/x86_64-unknown-linux-gnu/tower_service/trait.Service.html" title="trait tower_service::Service"><code>tower::Service</code></a>. This means <code>axum</code> gets timeouts, tracing, compression,
|
||||
authorization, and more, for free. It also enables you to share middleware with
|
||||
applications written using <a href="http://crates.io/crates/hyper"><code>hyper</code></a> or <a href="http://crates.io/crates/tonic"><code>tonic</code></a>.</p>
|
||||
"#};
|
||||
let expected = indoc! {"
|
||||
In particular, the last point is what sets `axum` apart from other frameworks. `axum` doesn’t have its own middleware system but instead uses `tower::Service`. This means `axum` gets timeouts, tracing, compression, authorization, and more, for free. It also enables you to share middleware with applications written using `hyper` or `tonic`.
|
||||
"}
|
||||
.trim();
|
||||
|
||||
assert_eq!(
|
||||
convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
|
||||
expected
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_multiple_paragraphs() {
|
||||
let html = indoc! {r##"
|
||||
<h2 id="serde"><a class="doc-anchor" href="#serde">§</a>Serde</h2>
|
||||
<p>Serde is a framework for <em><strong>ser</strong></em>ializing and <em><strong>de</strong></em>serializing Rust data
|
||||
structures efficiently and generically.</p>
|
||||
<p>The Serde ecosystem consists of data structures that know how to serialize
|
||||
and deserialize themselves along with data formats that know how to
|
||||
serialize and deserialize other things. Serde provides the layer by which
|
||||
these two groups interact with each other, allowing any supported data
|
||||
structure to be serialized and deserialized using any supported data format.</p>
|
||||
<p>See the Serde website <a href="https://serde.rs/">https://serde.rs/</a> for additional documentation and
|
||||
usage examples.</p>
|
||||
<h3 id="design"><a class="doc-anchor" href="#design">§</a>Design</h3>
|
||||
<p>Where many other languages rely on runtime reflection for serializing data,
|
||||
Serde is instead built on Rust’s powerful trait system. A data structure
|
||||
that knows how to serialize and deserialize itself is one that implements
|
||||
Serde’s <code>Serialize</code> and <code>Deserialize</code> traits (or uses Serde’s derive
|
||||
attribute to automatically generate implementations at compile time). This
|
||||
avoids any overhead of reflection or runtime type information. In fact in
|
||||
many situations the interaction between data structure and data format can
|
||||
be completely optimized away by the Rust compiler, leaving Serde
|
||||
serialization to perform the same speed as a handwritten serializer for the
|
||||
specific selection of data structure and data format.</p>
|
||||
"##};
|
||||
let expected = indoc! {"
|
||||
## Serde
|
||||
|
||||
Serde is a framework for _**ser**_ializing and _**de**_serializing Rust data structures efficiently and generically.
|
||||
|
||||
The Serde ecosystem consists of data structures that know how to serialize and deserialize themselves along with data formats that know how to serialize and deserialize other things. Serde provides the layer by which these two groups interact with each other, allowing any supported data structure to be serialized and deserialized using any supported data format.
|
||||
|
||||
See the Serde website https://serde.rs/ for additional documentation and usage examples.
|
||||
|
||||
### Design
|
||||
|
||||
Where many other languages rely on runtime reflection for serializing data, Serde is instead built on Rust’s powerful trait system. A data structure that knows how to serialize and deserialize itself is one that implements Serde’s `Serialize` and `Deserialize` traits (or uses Serde’s derive attribute to automatically generate implementations at compile time). This avoids any overhead of reflection or runtime type information. In fact in many situations the interaction between data structure and data format can be completely optimized away by the Rust compiler, leaving Serde serialization to perform the same speed as a handwritten serializer for the specific selection of data structure and data format.
|
||||
"}
|
||||
.trim();
|
||||
|
||||
assert_eq!(
|
||||
convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
|
||||
expected
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_styled_text() {
|
||||
let html = indoc! {r#"
|
||||
<p>This text is <strong>bolded</strong>.</p>
|
||||
<p>This text is <em>italicized</em>.</p>
|
||||
"#};
|
||||
let expected = indoc! {"
|
||||
This text is **bolded**.
|
||||
|
||||
This text is _italicized_.
|
||||
"}
|
||||
.trim();
|
||||
|
||||
assert_eq!(
|
||||
convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
|
||||
expected
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_rust_code_block() {
|
||||
let html = indoc! {r#"
|
||||
<pre class="rust rust-example-rendered"><code><span class="kw">use </span>axum::extract::{Path, Query, Json};
|
||||
<span class="kw">use </span>std::collections::HashMap;
|
||||
|
||||
<span class="comment">// `Path` gives you the path parameters and deserializes them.
|
||||
</span><span class="kw">async fn </span>path(Path(user_id): Path<u32>) {}
|
||||
|
||||
<span class="comment">// `Query` gives you the query parameters and deserializes them.
|
||||
</span><span class="kw">async fn </span>query(Query(params): Query<HashMap<String, String>>) {}
|
||||
|
||||
<span class="comment">// Buffer the request body and deserialize it as JSON into a
|
||||
// `serde_json::Value`. `Json` supports any type that implements
|
||||
// `serde::Deserialize`.
|
||||
</span><span class="kw">async fn </span>json(Json(payload): Json<serde_json::Value>) {}</code></pre>
|
||||
"#};
|
||||
let expected = indoc! {"
|
||||
```rs
|
||||
use axum::extract::{Path, Query, Json};
|
||||
use std::collections::HashMap;
|
||||
|
||||
// `Path` gives you the path parameters and deserializes them.
|
||||
async fn path(Path(user_id): Path<u32>) {}
|
||||
|
||||
// `Query` gives you the query parameters and deserializes them.
|
||||
async fn query(Query(params): Query<HashMap<String, String>>) {}
|
||||
|
||||
// Buffer the request body and deserialize it as JSON into a
|
||||
// `serde_json::Value`. `Json` supports any type that implements
|
||||
// `serde::Deserialize`.
|
||||
async fn json(Json(payload): Json<serde_json::Value>) {}
|
||||
```
|
||||
"}
|
||||
.trim();
|
||||
|
||||
assert_eq!(
|
||||
convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
|
||||
expected
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_toml_code_block() {
|
||||
let html = indoc! {r##"
|
||||
<h2 id="required-dependencies"><a class="doc-anchor" href="#required-dependencies">§</a>Required dependencies</h2>
|
||||
<p>To use axum there are a few dependencies you have to pull in as well:</p>
|
||||
<div class="example-wrap"><pre class="language-toml"><code>[dependencies]
|
||||
axum = "<latest-version>"
|
||||
tokio = { version = "<latest-version>", features = ["full"] }
|
||||
tower = "<latest-version>"
|
||||
</code></pre></div>
|
||||
"##};
|
||||
let expected = indoc! {r#"
|
||||
## Required dependencies
|
||||
|
||||
To use axum there are a few dependencies you have to pull in as well:
|
||||
|
||||
```toml
|
||||
[dependencies]
|
||||
axum = "<latest-version>"
|
||||
tokio = { version = "<latest-version>", features = ["full"] }
|
||||
tower = "<latest-version>"
|
||||
|
||||
```
|
||||
"#}
|
||||
.trim();
|
||||
|
||||
assert_eq!(
|
||||
convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
|
||||
expected
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_item_table() {
|
||||
let html = indoc! {r##"
|
||||
<h2 id="structs" class="section-header">Structs<a href="#structs" class="anchor">§</a></h2>
|
||||
<ul class="item-table">
|
||||
<li><div class="item-name"><a class="struct" href="struct.Error.html" title="struct axum::Error">Error</a></div><div class="desc docblock-short">Errors that can happen when using axum.</div></li>
|
||||
<li><div class="item-name"><a class="struct" href="struct.Extension.html" title="struct axum::Extension">Extension</a></div><div class="desc docblock-short">Extractor and response for extensions.</div></li>
|
||||
<li><div class="item-name"><a class="struct" href="struct.Form.html" title="struct axum::Form">Form</a><span class="stab portability" title="Available on crate feature `form` only"><code>form</code></span></div><div class="desc docblock-short">URL encoded extractor and response.</div></li>
|
||||
<li><div class="item-name"><a class="struct" href="struct.Json.html" title="struct axum::Json">Json</a><span class="stab portability" title="Available on crate feature `json` only"><code>json</code></span></div><div class="desc docblock-short">JSON Extractor / Response.</div></li>
|
||||
<li><div class="item-name"><a class="struct" href="struct.Router.html" title="struct axum::Router">Router</a></div><div class="desc docblock-short">The router type for composing handlers and services.</div></li></ul>
|
||||
<h2 id="functions" class="section-header">Functions<a href="#functions" class="anchor">§</a></h2>
|
||||
<ul class="item-table">
|
||||
<li><div class="item-name"><a class="fn" href="fn.serve.html" title="fn axum::serve">serve</a><span class="stab portability" title="Available on crate feature `tokio` and (crate features `http1` or `http2`) only"><code>tokio</code> and (<code>http1</code> or <code>http2</code>)</span></div><div class="desc docblock-short">Serve the service with the supplied listener.</div></li>
|
||||
</ul>
|
||||
"##};
|
||||
let expected = indoc! {r#"
|
||||
## Structs
|
||||
|
||||
- `Error`: Errors that can happen when using axum.
|
||||
- `Extension`: Extractor and response for extensions.
|
||||
- `Form` [`form`]: URL encoded extractor and response.
|
||||
- `Json` [`json`]: JSON Extractor / Response.
|
||||
- `Router`: The router type for composing handlers and services.
|
||||
|
||||
## Functions
|
||||
|
||||
- `serve` [`tokio` and (`http1` or `http2`)]: Serve the service with the supplied listener.
|
||||
"#}
|
||||
.trim();
|
||||
|
||||
assert_eq!(
|
||||
convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
|
||||
expected
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_table() {
|
||||
let html = indoc! {r##"
|
||||
<h2 id="feature-flags"><a class="doc-anchor" href="#feature-flags">§</a>Feature flags</h2>
|
||||
<p>axum uses a set of <a href="https://doc.rust-lang.org/cargo/reference/features.html#the-features-section">feature flags</a> to reduce the amount of compiled and
|
||||
optional dependencies.</p>
|
||||
<p>The following optional features are available:</p>
|
||||
<div><table><thead><tr><th>Name</th><th>Description</th><th>Default?</th></tr></thead><tbody>
|
||||
<tr><td><code>http1</code></td><td>Enables hyper’s <code>http1</code> feature</td><td>Yes</td></tr>
|
||||
<tr><td><code>http2</code></td><td>Enables hyper’s <code>http2</code> feature</td><td>No</td></tr>
|
||||
<tr><td><code>json</code></td><td>Enables the <a href="struct.Json.html" title="struct axum::Json"><code>Json</code></a> type and some similar convenience functionality</td><td>Yes</td></tr>
|
||||
<tr><td><code>macros</code></td><td>Enables optional utility macros</td><td>No</td></tr>
|
||||
<tr><td><code>matched-path</code></td><td>Enables capturing of every request’s router path and the <a href="extract/struct.MatchedPath.html" title="struct axum::extract::MatchedPath"><code>MatchedPath</code></a> extractor</td><td>Yes</td></tr>
|
||||
<tr><td><code>multipart</code></td><td>Enables parsing <code>multipart/form-data</code> requests with <a href="extract/struct.Multipart.html" title="struct axum::extract::Multipart"><code>Multipart</code></a></td><td>No</td></tr>
|
||||
<tr><td><code>original-uri</code></td><td>Enables capturing of every request’s original URI and the <a href="extract/struct.OriginalUri.html" title="struct axum::extract::OriginalUri"><code>OriginalUri</code></a> extractor</td><td>Yes</td></tr>
|
||||
<tr><td><code>tokio</code></td><td>Enables <code>tokio</code> as a dependency and <code>axum::serve</code>, <code>SSE</code> and <code>extract::connect_info</code> types.</td><td>Yes</td></tr>
|
||||
<tr><td><code>tower-log</code></td><td>Enables <code>tower</code>’s <code>log</code> feature</td><td>Yes</td></tr>
|
||||
<tr><td><code>tracing</code></td><td>Log rejections from built-in extractors</td><td>Yes</td></tr>
|
||||
<tr><td><code>ws</code></td><td>Enables WebSockets support via <a href="extract/ws/index.html" title="mod axum::extract::ws"><code>extract::ws</code></a></td><td>No</td></tr>
|
||||
<tr><td><code>form</code></td><td>Enables the <code>Form</code> extractor</td><td>Yes</td></tr>
|
||||
<tr><td><code>query</code></td><td>Enables the <code>Query</code> extractor</td><td>Yes</td></tr>
|
||||
</tbody></table>
|
||||
"##};
|
||||
let expected = indoc! {r#"
|
||||
## Feature flags
|
||||
|
||||
axum uses a set of feature flags to reduce the amount of compiled and optional dependencies.
|
||||
|
||||
The following optional features are available:
|
||||
|
||||
| Name | Description | Default? |
|
||||
| --- | --- | --- |
|
||||
| `http1` | Enables hyper’s `http1` feature | Yes |
|
||||
| `http2` | Enables hyper’s `http2` feature | No |
|
||||
| `json` | Enables the `Json` type and some similar convenience functionality | Yes |
|
||||
| `macros` | Enables optional utility macros | No |
|
||||
| `matched-path` | Enables capturing of every request’s router path and the `MatchedPath` extractor | Yes |
|
||||
| `multipart` | Enables parsing `multipart/form-data` requests with `Multipart` | No |
|
||||
| `original-uri` | Enables capturing of every request’s original URI and the `OriginalUri` extractor | Yes |
|
||||
| `tokio` | Enables `tokio` as a dependency and `axum::serve`, `SSE` and `extract::connect_info` types. | Yes |
|
||||
| `tower-log` | Enables `tower`’s `log` feature | Yes |
|
||||
| `tracing` | Log rejections from built-in extractors | Yes |
|
||||
| `ws` | Enables WebSockets support via `extract::ws` | No |
|
||||
| `form` | Enables the `Form` extractor | Yes |
|
||||
| `query` | Enables the `Query` extractor | Yes |
|
||||
"#}
|
||||
.trim();
|
||||
|
||||
assert_eq!(
|
||||
convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
|
||||
expected
|
||||
)
|
||||
}
|
||||
}
|
135
crates/html_to_markdown/src/markdown.rs
Normal file
135
crates/html_to_markdown/src/markdown.rs
Normal file
|
@ -0,0 +1,135 @@
|
|||
use crate::html_element::HtmlElement;
|
||||
use crate::markdown_writer::{HandleTag, MarkdownWriter, StartTagOutcome};
|
||||
|
||||
pub struct ParagraphHandler;
|
||||
|
||||
impl HandleTag for ParagraphHandler {
|
||||
fn should_handle(&self, _tag: &str) -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
fn handle_tag_start(
|
||||
&mut self,
|
||||
tag: &HtmlElement,
|
||||
writer: &mut MarkdownWriter,
|
||||
) -> StartTagOutcome {
|
||||
if tag.is_inline() && writer.is_inside("p") {
|
||||
if let Some(parent) = writer.current_element_stack().iter().last() {
|
||||
if !parent.is_inline() {
|
||||
if !(writer.markdown.ends_with(' ') || writer.markdown.ends_with('\n')) {
|
||||
writer.push_str(" ");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
match tag.tag.as_str() {
|
||||
"p" => writer.push_blank_line(),
|
||||
_ => {}
|
||||
}
|
||||
|
||||
StartTagOutcome::Continue
|
||||
}
|
||||
}
|
||||
|
||||
pub struct HeadingHandler;
|
||||
|
||||
impl HandleTag for HeadingHandler {
|
||||
fn should_handle(&self, tag: &str) -> bool {
|
||||
match tag {
|
||||
"h1" | "h2" | "h3" | "h4" | "h5" | "h6" => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
fn handle_tag_start(
|
||||
&mut self,
|
||||
tag: &HtmlElement,
|
||||
writer: &mut MarkdownWriter,
|
||||
) -> StartTagOutcome {
|
||||
match tag.tag.as_str() {
|
||||
"h1" => writer.push_str("\n\n# "),
|
||||
"h2" => writer.push_str("\n\n## "),
|
||||
"h3" => writer.push_str("\n\n### "),
|
||||
"h4" => writer.push_str("\n\n#### "),
|
||||
"h5" => writer.push_str("\n\n##### "),
|
||||
"h6" => writer.push_str("\n\n###### "),
|
||||
_ => {}
|
||||
}
|
||||
|
||||
StartTagOutcome::Continue
|
||||
}
|
||||
|
||||
fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
|
||||
match tag.tag.as_str() {
|
||||
"h1" | "h2" | "h3" | "h4" | "h5" | "h6" => writer.push_blank_line(),
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct ListHandler;
|
||||
|
||||
impl HandleTag for ListHandler {
|
||||
fn should_handle(&self, tag: &str) -> bool {
|
||||
match tag {
|
||||
"ul" | "ol" | "li" => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
fn handle_tag_start(
|
||||
&mut self,
|
||||
tag: &HtmlElement,
|
||||
writer: &mut MarkdownWriter,
|
||||
) -> StartTagOutcome {
|
||||
match tag.tag.as_str() {
|
||||
"ul" | "ol" => writer.push_newline(),
|
||||
"li" => writer.push_str("- "),
|
||||
_ => {}
|
||||
}
|
||||
|
||||
StartTagOutcome::Continue
|
||||
}
|
||||
|
||||
fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
|
||||
match tag.tag.as_str() {
|
||||
"ul" | "ol" => writer.push_newline(),
|
||||
"li" => writer.push_newline(),
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct StyledTextHandler;
|
||||
|
||||
impl HandleTag for StyledTextHandler {
|
||||
fn should_handle(&self, tag: &str) -> bool {
|
||||
match tag {
|
||||
"strong" | "em" => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
fn handle_tag_start(
|
||||
&mut self,
|
||||
tag: &HtmlElement,
|
||||
writer: &mut MarkdownWriter,
|
||||
) -> StartTagOutcome {
|
||||
match tag.tag.as_str() {
|
||||
"strong" => writer.push_str("**"),
|
||||
"em" => writer.push_str("_"),
|
||||
_ => {}
|
||||
}
|
||||
|
||||
StartTagOutcome::Continue
|
||||
}
|
||||
|
||||
fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
|
||||
match tag.tag.as_str() {
|
||||
"strong" => writer.push_str("**"),
|
||||
"em" => writer.push_str("_"),
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
198
crates/html_to_markdown/src/markdown_writer.rs
Normal file
198
crates/html_to_markdown/src/markdown_writer.rs
Normal file
|
@ -0,0 +1,198 @@
|
|||
use std::collections::VecDeque;
|
||||
use std::sync::OnceLock;
|
||||
|
||||
use anyhow::Result;
|
||||
use markup5ever_rcdom::{Handle, NodeData};
|
||||
use regex::Regex;
|
||||
|
||||
use crate::html_element::HtmlElement;
|
||||
|
||||
fn empty_line_regex() -> &'static Regex {
|
||||
static REGEX: OnceLock<Regex> = OnceLock::new();
|
||||
REGEX.get_or_init(|| Regex::new(r"^\s*$").unwrap())
|
||||
}
|
||||
|
||||
fn more_than_three_newlines_regex() -> &'static Regex {
|
||||
static REGEX: OnceLock<Regex> = OnceLock::new();
|
||||
REGEX.get_or_init(|| Regex::new(r"\n{3,}").unwrap())
|
||||
}
|
||||
|
||||
pub enum StartTagOutcome {
|
||||
Continue,
|
||||
Skip,
|
||||
}
|
||||
|
||||
pub struct MarkdownWriter {
|
||||
current_element_stack: VecDeque<HtmlElement>,
|
||||
pub(crate) markdown: String,
|
||||
}
|
||||
|
||||
impl MarkdownWriter {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
current_element_stack: VecDeque::new(),
|
||||
markdown: String::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn current_element_stack(&self) -> &VecDeque<HtmlElement> {
|
||||
&self.current_element_stack
|
||||
}
|
||||
|
||||
pub fn is_inside(&self, tag: &str) -> bool {
|
||||
self.current_element_stack
|
||||
.iter()
|
||||
.any(|parent_element| parent_element.tag == tag)
|
||||
}
|
||||
|
||||
/// Appends the given string slice onto the end of the Markdown output.
|
||||
pub fn push_str(&mut self, str: &str) {
|
||||
self.markdown.push_str(str);
|
||||
}
|
||||
|
||||
/// Appends a newline to the end of the Markdown output.
|
||||
pub fn push_newline(&mut self) {
|
||||
self.push_str("\n");
|
||||
}
|
||||
|
||||
/// Appends a blank line to the end of the Markdown output.
|
||||
pub fn push_blank_line(&mut self) {
|
||||
self.push_str("\n\n");
|
||||
}
|
||||
|
||||
pub fn run(
|
||||
mut self,
|
||||
root_node: &Handle,
|
||||
mut handlers: Vec<Box<dyn HandleTag>>,
|
||||
) -> Result<String> {
|
||||
self.visit_node(&root_node, &mut handlers)?;
|
||||
Ok(Self::prettify_markdown(self.markdown))
|
||||
}
|
||||
|
||||
fn prettify_markdown(markdown: String) -> String {
|
||||
let markdown = empty_line_regex().replace_all(&markdown, "");
|
||||
let markdown = more_than_three_newlines_regex().replace_all(&markdown, "\n\n");
|
||||
|
||||
markdown.trim().to_string()
|
||||
}
|
||||
|
||||
fn visit_node(&mut self, node: &Handle, handlers: &mut [Box<dyn HandleTag>]) -> Result<()> {
|
||||
let mut current_element = None;
|
||||
|
||||
match node.data {
|
||||
NodeData::Document
|
||||
| NodeData::Doctype { .. }
|
||||
| NodeData::ProcessingInstruction { .. }
|
||||
| NodeData::Comment { .. } => {
|
||||
// Currently left unimplemented, as we're not interested in this data
|
||||
// at this time.
|
||||
}
|
||||
NodeData::Element {
|
||||
ref name,
|
||||
ref attrs,
|
||||
..
|
||||
} => {
|
||||
let tag_name = name.local.to_string();
|
||||
if !tag_name.is_empty() {
|
||||
current_element = Some(HtmlElement {
|
||||
tag: tag_name,
|
||||
attrs: attrs.clone(),
|
||||
});
|
||||
}
|
||||
}
|
||||
NodeData::Text { ref contents } => {
|
||||
let text = contents.borrow().to_string();
|
||||
self.visit_text(text, handlers)?;
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(current_element) = current_element.as_ref() {
|
||||
match self.start_tag(¤t_element, handlers) {
|
||||
StartTagOutcome::Continue => {}
|
||||
StartTagOutcome::Skip => return Ok(()),
|
||||
}
|
||||
|
||||
self.current_element_stack
|
||||
.push_back(current_element.clone());
|
||||
}
|
||||
|
||||
for child in node.children.borrow().iter() {
|
||||
self.visit_node(child, handlers)?;
|
||||
}
|
||||
|
||||
if let Some(current_element) = current_element {
|
||||
self.current_element_stack.pop_back();
|
||||
self.end_tag(¤t_element, handlers);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn start_tag(
|
||||
&mut self,
|
||||
tag: &HtmlElement,
|
||||
handlers: &mut [Box<dyn HandleTag>],
|
||||
) -> StartTagOutcome {
|
||||
for handler in handlers {
|
||||
if handler.should_handle(tag.tag.as_str()) {
|
||||
match handler.handle_tag_start(tag, self) {
|
||||
StartTagOutcome::Continue => {}
|
||||
StartTagOutcome::Skip => return StartTagOutcome::Skip,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
StartTagOutcome::Continue
|
||||
}
|
||||
|
||||
fn end_tag(&mut self, tag: &HtmlElement, handlers: &mut [Box<dyn HandleTag>]) {
|
||||
for handler in handlers {
|
||||
if handler.should_handle(tag.tag.as_str()) {
|
||||
handler.handle_tag_end(tag, self);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn visit_text(&mut self, text: String, handlers: &mut [Box<dyn HandleTag>]) -> Result<()> {
|
||||
for handler in handlers {
|
||||
match handler.handle_text(&text, self) {
|
||||
HandlerOutcome::Handled => return Ok(()),
|
||||
HandlerOutcome::NoOp => {}
|
||||
}
|
||||
}
|
||||
|
||||
let text = text
|
||||
.trim_matches(|char| char == '\n' || char == '\r')
|
||||
.replace('\n', " ");
|
||||
|
||||
self.push_str(&text);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub enum HandlerOutcome {
|
||||
Handled,
|
||||
NoOp,
|
||||
}
|
||||
|
||||
pub trait HandleTag {
|
||||
/// Returns whether this handler should handle the given tag.
|
||||
fn should_handle(&self, tag: &str) -> bool;
|
||||
|
||||
/// Handles the start of the given tag.
|
||||
fn handle_tag_start(
|
||||
&mut self,
|
||||
_tag: &HtmlElement,
|
||||
_writer: &mut MarkdownWriter,
|
||||
) -> StartTagOutcome {
|
||||
StartTagOutcome::Continue
|
||||
}
|
||||
|
||||
/// Handles the end of the given tag.
|
||||
fn handle_tag_end(&mut self, _tag: &HtmlElement, _writer: &mut MarkdownWriter) {}
|
||||
|
||||
fn handle_text(&mut self, _text: &str, _writer: &mut MarkdownWriter) -> HandlerOutcome {
|
||||
HandlerOutcome::NoOp
|
||||
}
|
||||
}
|
1
crates/html_to_markdown/src/structure.rs
Normal file
1
crates/html_to_markdown/src/structure.rs
Normal file
|
@ -0,0 +1 @@
|
|||
pub mod rustdoc;
|
286
crates/html_to_markdown/src/structure/rustdoc.rs
Normal file
286
crates/html_to_markdown/src/structure/rustdoc.rs
Normal file
|
@ -0,0 +1,286 @@
|
|||
use crate::html_element::HtmlElement;
|
||||
use crate::markdown_writer::{HandleTag, HandlerOutcome, MarkdownWriter, StartTagOutcome};
|
||||
|
||||
pub struct RustdocHeadingHandler;
|
||||
|
||||
impl HandleTag for RustdocHeadingHandler {
|
||||
fn should_handle(&self, _tag: &str) -> bool {
|
||||
// We're only handling text, so we don't need to visit any tags.
|
||||
false
|
||||
}
|
||||
|
||||
fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
|
||||
if writer.is_inside("h1")
|
||||
|| writer.is_inside("h2")
|
||||
|| writer.is_inside("h3")
|
||||
|| writer.is_inside("h4")
|
||||
|| writer.is_inside("h5")
|
||||
|| writer.is_inside("h6")
|
||||
{
|
||||
let text = text
|
||||
.trim_matches(|char| char == '\n' || char == '\r' || char == '§')
|
||||
.replace('\n', " ");
|
||||
writer.push_str(&text);
|
||||
|
||||
return HandlerOutcome::Handled;
|
||||
}
|
||||
|
||||
HandlerOutcome::NoOp
|
||||
}
|
||||
}
|
||||
|
||||
pub struct RustdocCodeHandler;
|
||||
|
||||
impl HandleTag for RustdocCodeHandler {
|
||||
fn should_handle(&self, tag: &str) -> bool {
|
||||
match tag {
|
||||
"pre" | "code" => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
fn handle_tag_start(
|
||||
&mut self,
|
||||
tag: &HtmlElement,
|
||||
writer: &mut MarkdownWriter,
|
||||
) -> StartTagOutcome {
|
||||
match tag.tag.as_str() {
|
||||
"code" => {
|
||||
if !writer.is_inside("pre") {
|
||||
writer.push_str("`");
|
||||
}
|
||||
}
|
||||
"pre" => {
|
||||
let classes = tag.classes();
|
||||
let is_rust = classes.iter().any(|class| class == "rust");
|
||||
let language = is_rust
|
||||
.then(|| "rs")
|
||||
.or_else(|| {
|
||||
classes.iter().find_map(|class| {
|
||||
if let Some((_, language)) = class.split_once("language-") {
|
||||
Some(language.trim())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
})
|
||||
.unwrap_or("");
|
||||
|
||||
writer.push_str(&format!("\n\n```{language}\n"));
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
StartTagOutcome::Continue
|
||||
}
|
||||
|
||||
fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
|
||||
match tag.tag.as_str() {
|
||||
"code" => {
|
||||
if !writer.is_inside("pre") {
|
||||
writer.push_str("`");
|
||||
}
|
||||
}
|
||||
"pre" => writer.push_str("\n```\n"),
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
|
||||
if writer.is_inside("pre") {
|
||||
writer.push_str(&text);
|
||||
return HandlerOutcome::Handled;
|
||||
}
|
||||
|
||||
HandlerOutcome::NoOp
|
||||
}
|
||||
}
|
||||
|
||||
pub struct RustdocTableHandler {
|
||||
/// The number of columns in the current `<table>`.
|
||||
current_table_columns: usize,
|
||||
is_first_th: bool,
|
||||
is_first_td: bool,
|
||||
}
|
||||
|
||||
impl RustdocTableHandler {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
current_table_columns: 0,
|
||||
is_first_th: true,
|
||||
is_first_td: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl HandleTag for RustdocTableHandler {
|
||||
fn should_handle(&self, tag: &str) -> bool {
|
||||
match tag {
|
||||
"table" | "thead" | "tbody" | "tr" | "th" | "td" => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
fn handle_tag_start(
|
||||
&mut self,
|
||||
tag: &HtmlElement,
|
||||
writer: &mut MarkdownWriter,
|
||||
) -> StartTagOutcome {
|
||||
match tag.tag.as_str() {
|
||||
"thead" => writer.push_blank_line(),
|
||||
"tr" => writer.push_newline(),
|
||||
"th" => {
|
||||
self.current_table_columns += 1;
|
||||
if self.is_first_th {
|
||||
self.is_first_th = false;
|
||||
} else {
|
||||
writer.push_str(" ");
|
||||
}
|
||||
writer.push_str("| ");
|
||||
}
|
||||
"td" => {
|
||||
if self.is_first_td {
|
||||
self.is_first_td = false;
|
||||
} else {
|
||||
writer.push_str(" ");
|
||||
}
|
||||
writer.push_str("| ");
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
StartTagOutcome::Continue
|
||||
}
|
||||
|
||||
fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
|
||||
match tag.tag.as_str() {
|
||||
"thead" => {
|
||||
writer.push_newline();
|
||||
for ix in 0..self.current_table_columns {
|
||||
if ix > 0 {
|
||||
writer.push_str(" ");
|
||||
}
|
||||
writer.push_str("| ---");
|
||||
}
|
||||
writer.push_str(" |");
|
||||
self.is_first_th = true;
|
||||
}
|
||||
"tr" => {
|
||||
writer.push_str(" |");
|
||||
self.is_first_td = true;
|
||||
}
|
||||
"table" => {
|
||||
self.current_table_columns = 0;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const RUSTDOC_ITEM_NAME_CLASS: &str = "item-name";
|
||||
|
||||
pub struct RustdocItemHandler;
|
||||
|
||||
impl RustdocItemHandler {
|
||||
/// Returns whether we're currently inside of an `.item-name` element, which
|
||||
/// rustdoc uses to display Rust items in a list.
|
||||
fn is_inside_item_name(writer: &MarkdownWriter) -> bool {
|
||||
writer
|
||||
.current_element_stack()
|
||||
.iter()
|
||||
.any(|element| element.has_class(RUSTDOC_ITEM_NAME_CLASS))
|
||||
}
|
||||
}
|
||||
|
||||
impl HandleTag for RustdocItemHandler {
|
||||
fn should_handle(&self, tag: &str) -> bool {
|
||||
match tag {
|
||||
"div" | "span" => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
fn handle_tag_start(
|
||||
&mut self,
|
||||
tag: &HtmlElement,
|
||||
writer: &mut MarkdownWriter,
|
||||
) -> StartTagOutcome {
|
||||
match tag.tag.as_str() {
|
||||
"div" | "span" => {
|
||||
if Self::is_inside_item_name(writer) && tag.has_class("stab") {
|
||||
writer.push_str(" [");
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
StartTagOutcome::Continue
|
||||
}
|
||||
|
||||
fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
|
||||
match tag.tag.as_str() {
|
||||
"div" | "span" => {
|
||||
if tag.has_class(RUSTDOC_ITEM_NAME_CLASS) {
|
||||
writer.push_str(": ");
|
||||
}
|
||||
|
||||
if Self::is_inside_item_name(writer) && tag.has_class("stab") {
|
||||
writer.push_str("]");
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
|
||||
if Self::is_inside_item_name(writer)
|
||||
&& !writer.is_inside("span")
|
||||
&& !writer.is_inside("code")
|
||||
{
|
||||
writer.push_str(&format!("`{text}`"));
|
||||
return HandlerOutcome::Handled;
|
||||
}
|
||||
|
||||
HandlerOutcome::NoOp
|
||||
}
|
||||
}
|
||||
|
||||
pub struct RustdocChromeRemover;
|
||||
|
||||
impl HandleTag for RustdocChromeRemover {
|
||||
fn should_handle(&self, tag: &str) -> bool {
|
||||
match tag {
|
||||
"head" | "script" | "nav" | "summary" | "button" | "div" | "span" => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
fn handle_tag_start(
|
||||
&mut self,
|
||||
tag: &HtmlElement,
|
||||
_writer: &mut MarkdownWriter,
|
||||
) -> StartTagOutcome {
|
||||
match tag.tag.as_str() {
|
||||
"head" | "script" | "nav" => return StartTagOutcome::Skip,
|
||||
"summary" => {
|
||||
if tag.has_class("hideme") {
|
||||
return StartTagOutcome::Skip;
|
||||
}
|
||||
}
|
||||
"button" => {
|
||||
if tag.attr("id").as_deref() == Some("copy-path") {
|
||||
return StartTagOutcome::Skip;
|
||||
}
|
||||
}
|
||||
"div" | "span" => {
|
||||
let classes_to_skip = ["nav-container", "sidebar-elems", "out-of-band"];
|
||||
if tag.has_any_classes(&classes_to_skip) {
|
||||
return StartTagOutcome::Skip;
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
StartTagOutcome::Continue
|
||||
}
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue