Add basic Wikipedia support to /fetch (#12777)

This PR extends the `/fetch` slash command with initial support for
Wikipedia's HTML structure.

Release Notes:

- N/A
This commit is contained in:
Marshall Bowers 2024-06-07 12:03:43 -04:00 committed by GitHub
parent a910f192db
commit 9174858225
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 171 additions and 41 deletions

View file

@ -1,11 +1,9 @@
//! Provides conversion from rustdoc's HTML output to Markdown.
#![deny(missing_docs)]
mod html_element;
mod markdown;
pub mod markdown;
mod markdown_writer;
mod structure;
pub mod structure;
use std::io::Read;
@ -19,24 +17,17 @@ use markup5ever_rcdom::RcDom;
use crate::markdown::{
HeadingHandler, ListHandler, ParagraphHandler, StyledTextHandler, TableHandler,
};
use crate::markdown_writer::{HandleTag, MarkdownWriter};
use crate::markdown_writer::MarkdownWriter;
pub use crate::markdown_writer::HandleTag;
/// Converts the provided HTML to Markdown.
pub fn convert_html_to_markdown(html: impl Read) -> Result<String> {
pub fn convert_html_to_markdown(
html: impl Read,
handlers: Vec<Box<dyn HandleTag>>,
) -> Result<String> {
let dom = parse_html(html).context("failed to parse HTML")?;
let handlers: Vec<Box<dyn HandleTag>> = vec![
Box::new(ParagraphHandler),
Box::new(HeadingHandler),
Box::new(ListHandler),
Box::new(TableHandler::new()),
Box::new(StyledTextHandler),
Box::new(structure::rustdoc::RustdocChromeRemover),
Box::new(structure::rustdoc::RustdocHeadingHandler),
Box::new(structure::rustdoc::RustdocCodeHandler),
Box::new(structure::rustdoc::RustdocItemHandler),
];
let markdown_writer = MarkdownWriter::new();
let markdown = markdown_writer
.run(&dom.document, handlers)
@ -47,26 +38,20 @@ pub fn convert_html_to_markdown(html: impl Read) -> Result<String> {
/// Converts the provided rustdoc HTML to Markdown.
pub fn convert_rustdoc_to_markdown(html: impl Read) -> Result<String> {
let dom = parse_html(html).context("failed to parse rustdoc HTML")?;
let handlers: Vec<Box<dyn HandleTag>> = vec![
Box::new(ParagraphHandler),
Box::new(HeadingHandler),
Box::new(ListHandler),
Box::new(TableHandler::new()),
Box::new(StyledTextHandler),
Box::new(structure::rustdoc::RustdocChromeRemover),
Box::new(structure::rustdoc::RustdocHeadingHandler),
Box::new(structure::rustdoc::RustdocCodeHandler),
Box::new(structure::rustdoc::RustdocItemHandler),
];
let markdown_writer = MarkdownWriter::new();
let markdown = markdown_writer
.run(&dom.document, handlers)
.context("failed to convert rustdoc HTML to Markdown")?;
Ok(markdown)
convert_html_to_markdown(
html,
vec![
Box::new(ParagraphHandler),
Box::new(HeadingHandler),
Box::new(ListHandler),
Box::new(TableHandler::new()),
Box::new(StyledTextHandler),
Box::new(structure::rustdoc::RustdocChromeRemover),
Box::new(structure::rustdoc::RustdocHeadingHandler),
Box::new(structure::rustdoc::RustdocCodeHandler),
Box::new(structure::rustdoc::RustdocItemHandler),
],
)
}
fn parse_html(mut html: impl Read) -> Result<RcDom> {