Add rustdoc_to_markdown crate (#12445)

This PR adds a new crate for converting rustdoc output to Markdown.

We're leveraging Servo's `html5ever` to parse the HTML content, and
then walking the DOM nodes to convert it to a Markdown string.

The Markdown output will continue to be refined, but it's in a place
where it should be reasonable.

Release Notes:

- N/A
This commit is contained in:
Marshall Bowers 2024-05-29 16:05:16 -04:00 committed by GitHub
parent a22cd95f9d
commit 5bcb9ed017
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 420 additions and 17 deletions

View file

@ -0,0 +1,20 @@
[package]
name = "rustdoc_to_markdown"
version = "0.1.0"
edition = "2021"
publish = false
license = "GPL-3.0-or-later"
[lints]
workspace = true
[lib]
path = "src/rustdoc_to_markdown.rs"
[dependencies]
anyhow.workspace = true
html5ever.workspace = true
markup5ever_rcdom.workspace = true
[dev-dependencies]
indoc.workspace = true

View file

@ -0,0 +1 @@
../../LICENSE-GPL

View file

@ -0,0 +1,29 @@
use indoc::indoc;
use rustdoc_to_markdown::convert_rustdoc_to_markdown;
/// Converts a small, hard-coded HTML document to Markdown and prints the
/// result to stdout.
///
/// Panics if the conversion returns an error.
pub fn main() {
    // To test this out with some real input, replace the sample below with:
    //
    // ```
    // let sample_html = include_str!("/path/to/zed/target/doc/gpui/index.html");
    // ```
    let sample_html = indoc! {"
<html>
<body>
<h1>Hello World</h1>
<p>
Here is some content.
</p>
<h2>Some items</h2>
<ul>
<li>One</li>
<li>Two</li>
<li>Three</li>
</ul>
</body>
</html>
"};

    println!("{}", convert_rustdoc_to_markdown(sample_html).unwrap());
}

View file

@ -0,0 +1,201 @@
use std::cell::RefCell;
use std::collections::VecDeque;
use anyhow::Result;
use html5ever::Attribute;
use markup5ever_rcdom::{Handle, NodeData};
/// An HTML element recorded while walking the DOM: its tag name together
/// with its attributes.
#[derive(Debug, Clone)]
struct HtmlElement {
/// The element's local tag name (e.g. `"h1"`, `"pre"`, `"div"`).
tag: String,
/// The element's attributes, cloned from the DOM node.
attrs: RefCell<Vec<Attribute>>,
}
/// What `visit_node` should do with an element after `start_tag` has
/// processed its opening tag.
enum StartTagOutcome {
/// Keep going: the element is pushed onto the element stack and its
/// children are visited.
Continue,
/// Stop here: `visit_node` returns immediately, so the element's children
/// are not visited and `end_tag` is not called for it.
Skip,
}
/// Walks a parsed HTML DOM and accumulates a Markdown rendering of it.
pub struct MarkdownWriter {
/// Elements pushed by `visit_node` while their children are being visited;
/// consulted by `is_inside` to answer "are we inside a `<tag>`?" queries.
current_element_stack: VecDeque<HtmlElement>,
/// The Markdown output.
markdown: String,
}
impl MarkdownWriter {
/// Creates a writer with an empty element stack and no Markdown output.
pub fn new() -> Self {
    let current_element_stack = VecDeque::new();
    let markdown = String::new();

    Self {
        current_element_stack,
        markdown,
    }
}
/// Returns whether any element currently on the element stack has the given
/// tag name.
fn is_inside(&self, tag: &str) -> bool {
    for open_element in &self.current_element_stack {
        if open_element.tag == tag {
            return true;
        }
    }
    false
}
/// Returns whether any element currently on the element stack is a heading
/// (`h1` through `h6`).
fn is_inside_heading(&self) -> bool {
    self.current_element_stack.iter().any(|open_element| {
        matches!(
            open_element.tag.as_str(),
            "h1" | "h2" | "h3" | "h4" | "h5" | "h6"
        )
    })
}
/// Adds `str` to the end of the Markdown produced so far.
fn push_str(&mut self, str: &str) {
    self.markdown += str;
}
/// Adds a single `\n` to the end of the Markdown produced so far.
fn push_newline(&mut self) {
    self.markdown.push('\n');
}
/// Consumes the writer, walks the DOM starting at `root_node`, and returns
/// the accumulated Markdown with leading and trailing whitespace removed.
///
/// Any error raised while visiting a node is propagated to the caller.
pub fn run(mut self, root_node: &Handle) -> Result<String> {
    self.visit_node(root_node)?;

    let trimmed = self.markdown.trim();
    Ok(trimmed.to_owned())
}
/// Visits `node` and, recursively, all of its children, appending the
/// Markdown for each to the output.
///
/// Element nodes are passed to `start_tag` before their children are visited
/// and to `end_tag` afterwards. While its children are being visited, an
/// element sits on `current_element_stack` so that `is_inside` can see it.
/// If `start_tag` returns `StartTagOutcome::Skip`, the element and its whole
/// subtree are skipped. Text nodes are forwarded to `visit_text`.
fn visit_node(&mut self, node: &Handle) -> Result<()> {
    let mut current_element = None;

    match node.data {
        NodeData::Document
        | NodeData::Doctype { .. }
        | NodeData::ProcessingInstruction { .. }
        | NodeData::Comment { .. } => {
            // Currently left unimplemented, as we're not interested in this data
            // at this time.
        }
        NodeData::Element {
            ref name,
            ref attrs,
            ..
        } => {
            let tag_name = name.local.to_string();
            if !tag_name.is_empty() {
                current_element = Some(HtmlElement {
                    tag: tag_name,
                    attrs: attrs.clone(),
                });
            }
        }
        NodeData::Text { ref contents } => {
            let text = contents.borrow().to_string();
            self.visit_text(text)?;
        }
    }

    if let Some(current_element) = current_element.as_ref() {
        match self.start_tag(current_element) {
            StartTagOutcome::Continue => {}
            StartTagOutcome::Skip => return Ok(()),
        }

        self.current_element_stack
            .push_back(current_element.clone());
    }

    for child in node.children.borrow().iter() {
        self.visit_node(child)?;
    }

    if let Some(current_element) = current_element {
        // Pop only when this node pushed itself above. Popping unconditionally
        // would let text, comment and document nodes remove their *parent*
        // from the stack, so later `is_inside` checks (e.g. `code` inside
        // `pre`) would give the wrong answer.
        self.current_element_stack.pop_back();
        self.end_tag(&current_element);
    }

    Ok(())
}
/// Handles the opening of `tag`: writes whatever Markdown introduces the
/// element and reports whether its subtree should be visited.
///
/// Returns `StartTagOutcome::Skip` for `head`, `script` and `nav`, for a
/// `summary` whose `class` is exactly `hideme`, and for a `div`/`span` whose
/// `class` is exactly `sidebar-elems` or `out-of-band`. Everything else
/// returns `StartTagOutcome::Continue`.
fn start_tag(&mut self, tag: &HtmlElement) -> StartTagOutcome {
    // True when the element has a `class` attribute whose entire value equals
    // `wanted` (no splitting on whitespace is performed).
    let has_class = |wanted: &str| {
        tag.attrs.borrow().iter().any(|attribute| {
            &*attribute.name.local == "class" && &*attribute.value == wanted
        })
    };

    let opening = match tag.tag.as_str() {
        "head" | "script" | "nav" => return StartTagOutcome::Skip,
        "summary" if has_class("hideme") => return StartTagOutcome::Skip,
        "div" | "span" if has_class("sidebar-elems") || has_class("out-of-band") => {
            return StartTagOutcome::Skip;
        }
        "div" | "span" if has_class("item-name") => Some("`"),
        // Inline code gets a backtick; code inside `pre` is already fenced.
        "code" if !self.is_inside("pre") => Some("`"),
        "h1" => Some("\n# "),
        "h2" => Some("\n## "),
        "h3" => Some("\n### "),
        "h4" => Some("\n#### "),
        "h5" => Some("\n##### "),
        "h6" => Some("\n###### "),
        "pre" => Some("\n```\n"),
        "ul" | "ol" => Some("\n"),
        "li" => Some("- "),
        _ => None,
    };

    if let Some(opening) = opening {
        self.push_str(opening);
    }

    StartTagOutcome::Continue
}
/// Handles the closing of `tag`: writes whatever Markdown terminates the
/// element. Called by `visit_node` after the element's children have been
/// visited.
fn end_tag(&mut self, tag: &HtmlElement) {
    match tag.tag.as_str() {
        "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => self.push_str("\n\n"),
        "code" => {
            // Inline code is wrapped in backticks; code inside `pre` is
            // already covered by the surrounding fence.
            if !self.is_inside("pre") {
                self.push_str("`")
            }
        }
        "pre" => self.push_str("\n```\n"),
        "ul" | "ol" | "li" => self.push_newline(),
        // `start_tag` opens a backtick for both `div` and `span` elements
        // with the `item-name` class, so both must be closed here; handling
        // only `div` would leave an unbalanced backtick for a `span`.
        "div" | "span" => {
            if tag.attrs.borrow().iter().any(|attr| {
                attr.name.local.to_string() == "class" && attr.value.to_string() == "item-name"
            }) {
                self.push_str("`: ");
            }
        }
        _ => {}
    }
}
/// Appends the contents of a text node to the Markdown output.
///
/// Text inside `pre` is written verbatim. Text inside a link (`a`) that is
/// itself inside a heading is dropped. All other text has leading and
/// trailing `\n`, `\r` and `§` characters stripped before being written.
fn visit_text(&mut self, text: String) -> Result<()> {
    let output: &str = if self.is_inside("pre") {
        &text
    } else if self.is_inside_heading() && self.is_inside("a") {
        return Ok(());
    } else {
        text.trim_matches(['\n', '\r', '§'])
    };

    self.markdown.push_str(output);
    Ok(())
}
}

View file

@ -0,0 +1,36 @@
//! Provides conversion from rustdoc's HTML output to Markdown.
#![deny(missing_docs)]
mod markdown_writer;
use anyhow::{Context, Result};
use html5ever::driver::ParseOpts;
use html5ever::parse_document;
use html5ever::tendril::TendrilSink;
use html5ever::tree_builder::TreeBuilderOpts;
use markup5ever_rcdom::RcDom;
use crate::markdown_writer::MarkdownWriter;
/// Converts the provided rustdoc HTML to Markdown.
///
/// The HTML is parsed into a DOM with `html5ever` (dropping the doctype),
/// and the DOM is then walked by a [`MarkdownWriter`] to produce the output.
///
/// # Errors
///
/// Returns an error if the HTML cannot be read into a DOM, or if walking the
/// DOM to produce Markdown fails.
pub fn convert_rustdoc_to_markdown(html: &str) -> Result<String> {
    let parse_options = ParseOpts {
        tree_builder: TreeBuilderOpts {
            drop_doctype: true,
            ..Default::default()
        },
        ..Default::default()
    };

    let dom = parse_document(RcDom::default(), parse_options)
        .from_utf8()
        .read_from(&mut html.as_bytes())
        .context("failed to parse rustdoc HTML")?;

    // The target format is Markdown; the previous message said "HTML".
    MarkdownWriter::new()
        .run(&dom.document)
        .context("failed to convert rustdoc to Markdown")
}