Add basic Wikipedia support to /fetch
(#12777)
This PR extends the `/fetch` slash command with initial support for Wikipedia's HTML structure. Release Notes: - N/A
This commit is contained in:
parent
a910f192db
commit
9174858225
5 changed files with 171 additions and 41 deletions
|
@ -1,11 +1,9 @@
|
|||
//! Provides conversion from rustdoc's HTML output to Markdown.
|
||||
|
||||
#![deny(missing_docs)]
|
||||
|
||||
mod html_element;
|
||||
mod markdown;
|
||||
pub mod markdown;
|
||||
mod markdown_writer;
|
||||
mod structure;
|
||||
pub mod structure;
|
||||
|
||||
use std::io::Read;
|
||||
|
||||
|
@ -19,24 +17,17 @@ use markup5ever_rcdom::RcDom;
|
|||
use crate::markdown::{
|
||||
HeadingHandler, ListHandler, ParagraphHandler, StyledTextHandler, TableHandler,
|
||||
};
|
||||
use crate::markdown_writer::{HandleTag, MarkdownWriter};
|
||||
use crate::markdown_writer::MarkdownWriter;
|
||||
|
||||
pub use crate::markdown_writer::HandleTag;
|
||||
|
||||
/// Converts the provided HTML to Markdown.
|
||||
pub fn convert_html_to_markdown(html: impl Read) -> Result<String> {
|
||||
pub fn convert_html_to_markdown(
|
||||
html: impl Read,
|
||||
handlers: Vec<Box<dyn HandleTag>>,
|
||||
) -> Result<String> {
|
||||
let dom = parse_html(html).context("failed to parse HTML")?;
|
||||
|
||||
let handlers: Vec<Box<dyn HandleTag>> = vec![
|
||||
Box::new(ParagraphHandler),
|
||||
Box::new(HeadingHandler),
|
||||
Box::new(ListHandler),
|
||||
Box::new(TableHandler::new()),
|
||||
Box::new(StyledTextHandler),
|
||||
Box::new(structure::rustdoc::RustdocChromeRemover),
|
||||
Box::new(structure::rustdoc::RustdocHeadingHandler),
|
||||
Box::new(structure::rustdoc::RustdocCodeHandler),
|
||||
Box::new(structure::rustdoc::RustdocItemHandler),
|
||||
];
|
||||
|
||||
let markdown_writer = MarkdownWriter::new();
|
||||
let markdown = markdown_writer
|
||||
.run(&dom.document, handlers)
|
||||
|
@ -47,26 +38,20 @@ pub fn convert_html_to_markdown(html: impl Read) -> Result<String> {
|
|||
|
||||
/// Converts the provided rustdoc HTML to Markdown.
|
||||
pub fn convert_rustdoc_to_markdown(html: impl Read) -> Result<String> {
|
||||
let dom = parse_html(html).context("failed to parse rustdoc HTML")?;
|
||||
|
||||
let handlers: Vec<Box<dyn HandleTag>> = vec![
|
||||
Box::new(ParagraphHandler),
|
||||
Box::new(HeadingHandler),
|
||||
Box::new(ListHandler),
|
||||
Box::new(TableHandler::new()),
|
||||
Box::new(StyledTextHandler),
|
||||
Box::new(structure::rustdoc::RustdocChromeRemover),
|
||||
Box::new(structure::rustdoc::RustdocHeadingHandler),
|
||||
Box::new(structure::rustdoc::RustdocCodeHandler),
|
||||
Box::new(structure::rustdoc::RustdocItemHandler),
|
||||
];
|
||||
|
||||
let markdown_writer = MarkdownWriter::new();
|
||||
let markdown = markdown_writer
|
||||
.run(&dom.document, handlers)
|
||||
.context("failed to convert rustdoc HTML to Markdown")?;
|
||||
|
||||
Ok(markdown)
|
||||
convert_html_to_markdown(
|
||||
html,
|
||||
vec![
|
||||
Box::new(ParagraphHandler),
|
||||
Box::new(HeadingHandler),
|
||||
Box::new(ListHandler),
|
||||
Box::new(TableHandler::new()),
|
||||
Box::new(StyledTextHandler),
|
||||
Box::new(structure::rustdoc::RustdocChromeRemover),
|
||||
Box::new(structure::rustdoc::RustdocHeadingHandler),
|
||||
Box::new(structure::rustdoc::RustdocCodeHandler),
|
||||
Box::new(structure::rustdoc::RustdocItemHandler),
|
||||
],
|
||||
)
|
||||
}
|
||||
|
||||
fn parse_html(mut html: impl Read) -> Result<RcDom> {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue