Add rustdoc_to_markdown
crate (#12445)
This PR adds a new crate for converting rustdoc output to Markdown. We're leveraging Servo's `html5ever` to parse the HTML content, and then walking the DOM nodes to convert it to a Markdown string. The Markdown output will continue to be refined, but it's in a place where it should be reasonable. Release Notes: - N/A
This commit is contained in:
parent
a22cd95f9d
commit
5bcb9ed017
7 changed files with 420 additions and 17 deletions
20
crates/rustdoc_to_markdown/Cargo.toml
Normal file
20
crates/rustdoc_to_markdown/Cargo.toml
Normal file
|
@ -0,0 +1,20 @@
|
|||
[package]
|
||||
name = "rustdoc_to_markdown"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
publish = false
|
||||
license = "GPL-3.0-or-later"
|
||||
|
||||
[lints]
|
||||
workspace = true
|
||||
|
||||
[lib]
|
||||
path = "src/rustdoc_to_markdown.rs"
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
html5ever.workspace = true
|
||||
markup5ever_rcdom.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
indoc.workspace = true
|
1
crates/rustdoc_to_markdown/LICENSE-GPL
Symbolic link
1
crates/rustdoc_to_markdown/LICENSE-GPL
Symbolic link
|
@ -0,0 +1 @@
|
|||
../../LICENSE-GPL
|
29
crates/rustdoc_to_markdown/examples/test.rs
Normal file
29
crates/rustdoc_to_markdown/examples/test.rs
Normal file
|
@ -0,0 +1,29 @@
|
|||
use indoc::indoc;
|
||||
use rustdoc_to_markdown::convert_rustdoc_to_markdown;
|
||||
|
||||
pub fn main() {
|
||||
let html = indoc! {"
|
||||
<html>
|
||||
<body>
|
||||
<h1>Hello World</h1>
|
||||
<p>
|
||||
Here is some content.
|
||||
</p>
|
||||
<h2>Some items</h2>
|
||||
<ul>
|
||||
<li>One</li>
|
||||
<li>Two</li>
|
||||
<li>Three</li>
|
||||
</ul>
|
||||
</body>
|
||||
</html>
|
||||
"};
|
||||
// To test this out with some real input, try this:
|
||||
//
|
||||
// ```
|
||||
// let html = include_str!("/path/to/zed/target/doc/gpui/index.html");
|
||||
// ```
|
||||
let markdown = convert_rustdoc_to_markdown(html).unwrap();
|
||||
|
||||
println!("{markdown}");
|
||||
}
|
201
crates/rustdoc_to_markdown/src/markdown_writer.rs
Normal file
201
crates/rustdoc_to_markdown/src/markdown_writer.rs
Normal file
|
@ -0,0 +1,201 @@
|
|||
use std::cell::RefCell;
|
||||
use std::collections::VecDeque;
|
||||
|
||||
use anyhow::Result;
|
||||
use html5ever::Attribute;
|
||||
use markup5ever_rcdom::{Handle, NodeData};
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct HtmlElement {
|
||||
tag: String,
|
||||
attrs: RefCell<Vec<Attribute>>,
|
||||
}
|
||||
|
||||
/// What the tree walk should do after an element's start tag was handled.
enum StartTagOutcome {
    /// Keep going: descend into the element's children.
    Continue,
    /// Ignore the element entirely, including all of its children.
    Skip,
}
|
||||
|
||||
pub struct MarkdownWriter {
|
||||
current_element_stack: VecDeque<HtmlElement>,
|
||||
/// The Markdown output.
|
||||
markdown: String,
|
||||
}
|
||||
|
||||
impl MarkdownWriter {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
current_element_stack: VecDeque::new(),
|
||||
markdown: String::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn is_inside(&self, tag: &str) -> bool {
|
||||
self.current_element_stack
|
||||
.iter()
|
||||
.any(|parent_element| parent_element.tag == tag)
|
||||
}
|
||||
|
||||
fn is_inside_heading(&self) -> bool {
|
||||
["h1", "h2", "h3", "h4", "h5", "h6"]
|
||||
.into_iter()
|
||||
.any(|heading| self.is_inside(heading))
|
||||
}
|
||||
|
||||
/// Appends the given string slice onto the end of the Markdown output.
|
||||
fn push_str(&mut self, str: &str) {
|
||||
self.markdown.push_str(str);
|
||||
}
|
||||
|
||||
/// Appends a newline to the end of the Markdown output.
|
||||
fn push_newline(&mut self) {
|
||||
self.push_str("\n");
|
||||
}
|
||||
|
||||
pub fn run(mut self, root_node: &Handle) -> Result<String> {
|
||||
self.visit_node(&root_node)?;
|
||||
Ok(self.markdown.trim().to_string())
|
||||
}
|
||||
|
||||
fn visit_node(&mut self, node: &Handle) -> Result<()> {
|
||||
let mut current_element = None;
|
||||
|
||||
match node.data {
|
||||
NodeData::Document
|
||||
| NodeData::Doctype { .. }
|
||||
| NodeData::ProcessingInstruction { .. }
|
||||
| NodeData::Comment { .. } => {
|
||||
// Currently left unimplemented, as we're not interested in this data
|
||||
// at this time.
|
||||
}
|
||||
NodeData::Element {
|
||||
ref name,
|
||||
ref attrs,
|
||||
..
|
||||
} => {
|
||||
let tag_name = name.local.to_string();
|
||||
if !tag_name.is_empty() {
|
||||
current_element = Some(HtmlElement {
|
||||
tag: tag_name,
|
||||
attrs: attrs.clone(),
|
||||
});
|
||||
}
|
||||
}
|
||||
NodeData::Text { ref contents } => {
|
||||
let text = contents.borrow().to_string();
|
||||
self.visit_text(text)?;
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(current_element) = current_element.as_ref() {
|
||||
match self.start_tag(¤t_element) {
|
||||
StartTagOutcome::Continue => {}
|
||||
StartTagOutcome::Skip => return Ok(()),
|
||||
}
|
||||
|
||||
self.current_element_stack
|
||||
.push_back(current_element.clone());
|
||||
}
|
||||
|
||||
for child in node.children.borrow().iter() {
|
||||
self.visit_node(child)?;
|
||||
}
|
||||
|
||||
self.current_element_stack.pop_back();
|
||||
|
||||
if let Some(current_element) = current_element {
|
||||
self.end_tag(¤t_element);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn start_tag(&mut self, tag: &HtmlElement) -> StartTagOutcome {
|
||||
match tag.tag.as_str() {
|
||||
"head" | "script" | "nav" => return StartTagOutcome::Skip,
|
||||
"h1" => self.push_str("\n# "),
|
||||
"h2" => self.push_str("\n## "),
|
||||
"h3" => self.push_str("\n### "),
|
||||
"h4" => self.push_str("\n#### "),
|
||||
"h5" => self.push_str("\n##### "),
|
||||
"h6" => self.push_str("\n###### "),
|
||||
"code" => {
|
||||
if !self.is_inside("pre") {
|
||||
self.push_str("`")
|
||||
}
|
||||
}
|
||||
"pre" => self.push_str("\n```\n"),
|
||||
"ul" | "ol" => self.push_newline(),
|
||||
"li" => self.push_str("- "),
|
||||
"summary" => {
|
||||
if tag.attrs.borrow().iter().any(|attr| {
|
||||
attr.name.local.to_string() == "class" && attr.value.to_string() == "hideme"
|
||||
}) {
|
||||
return StartTagOutcome::Skip;
|
||||
}
|
||||
}
|
||||
"div" | "span" => {
|
||||
if tag.attrs.borrow().iter().any(|attr| {
|
||||
attr.name.local.to_string() == "class"
|
||||
&& attr.value.to_string() == "sidebar-elems"
|
||||
}) {
|
||||
return StartTagOutcome::Skip;
|
||||
}
|
||||
|
||||
if tag.attrs.borrow().iter().any(|attr| {
|
||||
attr.name.local.to_string() == "class"
|
||||
&& attr.value.to_string() == "out-of-band"
|
||||
}) {
|
||||
return StartTagOutcome::Skip;
|
||||
}
|
||||
|
||||
if tag.attrs.borrow().iter().any(|attr| {
|
||||
attr.name.local.to_string() == "class" && attr.value.to_string() == "item-name"
|
||||
}) {
|
||||
self.push_str("`");
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
StartTagOutcome::Continue
|
||||
}
|
||||
|
||||
fn end_tag(&mut self, tag: &HtmlElement) {
|
||||
match tag.tag.as_str() {
|
||||
"h1" | "h2" | "h3" | "h4" | "h5" | "h6" => self.push_str("\n\n"),
|
||||
"code" => {
|
||||
if !self.is_inside("pre") {
|
||||
self.push_str("`")
|
||||
}
|
||||
}
|
||||
"pre" => self.push_str("\n```\n"),
|
||||
"ul" | "ol" => self.push_newline(),
|
||||
"li" => self.push_newline(),
|
||||
"div" => {
|
||||
if tag.attrs.borrow().iter().any(|attr| {
|
||||
attr.name.local.to_string() == "class" && attr.value.to_string() == "item-name"
|
||||
}) {
|
||||
self.push_str("`: ");
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
fn visit_text(&mut self, text: String) -> Result<()> {
|
||||
if self.is_inside("pre") {
|
||||
self.push_str(&text);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if self.is_inside_heading() && self.is_inside("a") {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let trimmed_text = text.trim_matches(|char| char == '\n' || char == '\r' || char == '§');
|
||||
self.push_str(trimmed_text);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
36
crates/rustdoc_to_markdown/src/rustdoc_to_markdown.rs
Normal file
36
crates/rustdoc_to_markdown/src/rustdoc_to_markdown.rs
Normal file
|
@ -0,0 +1,36 @@
|
|||
//! Provides conversion from rustdoc's HTML output to Markdown.
|
||||
|
||||
#![deny(missing_docs)]
|
||||
|
||||
mod markdown_writer;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use html5ever::driver::ParseOpts;
|
||||
use html5ever::parse_document;
|
||||
use html5ever::tendril::TendrilSink;
|
||||
use html5ever::tree_builder::TreeBuilderOpts;
|
||||
use markup5ever_rcdom::RcDom;
|
||||
|
||||
use crate::markdown_writer::MarkdownWriter;
|
||||
|
||||
/// Converts the provided rustdoc HTML to Markdown.
|
||||
pub fn convert_rustdoc_to_markdown(html: &str) -> Result<String> {
|
||||
let parse_options = ParseOpts {
|
||||
tree_builder: TreeBuilderOpts {
|
||||
drop_doctype: true,
|
||||
..Default::default()
|
||||
},
|
||||
..Default::default()
|
||||
};
|
||||
let dom = parse_document(RcDom::default(), parse_options)
|
||||
.from_utf8()
|
||||
.read_from(&mut html.as_bytes())
|
||||
.context("failed to parse rustdoc HTML")?;
|
||||
|
||||
let markdown_writer = MarkdownWriter::new();
|
||||
let markdown = markdown_writer
|
||||
.run(&dom.document)
|
||||
.context("failed to convert rustdoc to HTML")?;
|
||||
|
||||
Ok(markdown)
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue