Handle Wikipedia code blocks in `/fetch` command (#12780)
This PR extends the `/fetch` command with support for Wikipedia code blocks. Release Notes: - N/A
This commit is contained in:
parent
9174858225
commit
834089feb1
3 changed files with 107 additions and 4 deletions
|
@ -43,12 +43,15 @@ impl FetchSlashCommand {
|
||||||
Box::new(markdown::ListHandler),
|
Box::new(markdown::ListHandler),
|
||||||
Box::new(markdown::TableHandler::new()),
|
Box::new(markdown::TableHandler::new()),
|
||||||
Box::new(markdown::StyledTextHandler),
|
Box::new(markdown::StyledTextHandler),
|
||||||
Box::new(markdown::CodeHandler),
|
|
||||||
];
|
];
|
||||||
if url.contains("wikipedia.org") {
|
if url.contains("wikipedia.org") {
|
||||||
use html_to_markdown::structure::wikipedia;
|
use html_to_markdown::structure::wikipedia;
|
||||||
|
|
||||||
handlers.push(Box::new(wikipedia::WikipediaChromeRemover));
|
handlers.push(Box::new(wikipedia::WikipediaChromeRemover));
|
||||||
|
handlers.push(Box::new(wikipedia::WikipediaInfoboxHandler));
|
||||||
|
handlers.push(Box::new(wikipedia::WikipediaCodeHandler::new()));
|
||||||
|
} else {
|
||||||
|
handlers.push(Box::new(markdown::CodeHandler));
|
||||||
}
|
}
|
||||||
|
|
||||||
convert_html_to_markdown(&body[..], handlers)
|
convert_html_to_markdown(&body[..], handlers)
|
||||||
|
|
|
@ -162,7 +162,7 @@ impl MarkdownWriter {
|
||||||
}
|
}
|
||||||
|
|
||||||
let text = text
|
let text = text
|
||||||
.trim_matches(|char| char == '\n' || char == '\r')
|
.trim_matches(|char| char == '\n' || char == '\r' || char == '\t')
|
||||||
.replace('\n', " ");
|
.replace('\n', " ");
|
||||||
|
|
||||||
self.push_str(&text);
|
self.push_str(&text);
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
use crate::html_element::HtmlElement;
|
use crate::html_element::HtmlElement;
|
||||||
use crate::markdown_writer::{MarkdownWriter, StartTagOutcome};
|
use crate::markdown_writer::{HandlerOutcome, MarkdownWriter, StartTagOutcome};
|
||||||
use crate::HandleTag;
|
use crate::HandleTag;
|
||||||
|
|
||||||
pub struct WikipediaChromeRemover;
|
pub struct WikipediaChromeRemover;
|
||||||
|
@ -30,7 +30,7 @@ impl HandleTag for WikipediaChromeRemover {
|
||||||
return StartTagOutcome::Skip;
|
return StartTagOutcome::Skip;
|
||||||
}
|
}
|
||||||
|
|
||||||
let classes_to_skip = ["mw-editsection", "mw-jump-link"];
|
let classes_to_skip = ["noprint", "mw-editsection", "mw-jump-link"];
|
||||||
if tag.has_any_classes(&classes_to_skip) {
|
if tag.has_any_classes(&classes_to_skip) {
|
||||||
return StartTagOutcome::Skip;
|
return StartTagOutcome::Skip;
|
||||||
}
|
}
|
||||||
|
@ -42,6 +42,106 @@ impl HandleTag for WikipediaChromeRemover {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub struct WikipediaInfoboxHandler;
|
||||||
|
|
||||||
|
impl HandleTag for WikipediaInfoboxHandler {
|
||||||
|
fn should_handle(&self, tag: &str) -> bool {
|
||||||
|
tag == "table"
|
||||||
|
}
|
||||||
|
|
||||||
|
fn handle_tag_start(
|
||||||
|
&mut self,
|
||||||
|
tag: &HtmlElement,
|
||||||
|
_writer: &mut MarkdownWriter,
|
||||||
|
) -> StartTagOutcome {
|
||||||
|
match tag.tag.as_str() {
|
||||||
|
"table" => {
|
||||||
|
if tag.has_class("infobox") {
|
||||||
|
return StartTagOutcome::Skip;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
|
||||||
|
StartTagOutcome::Continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Tag handler state for Wikipedia code blocks.
///
/// `language` holds the highlight language most recently extracted from a
/// `div`'s classes; it is consumed (and reset to `None`) when the next `pre`
/// element opens its fenced code block.
#[derive(Default)]
pub struct WikipediaCodeHandler {
    language: Option<String>,
}

impl WikipediaCodeHandler {
    /// Creates a handler with no pending language.
    ///
    /// Equivalent to [`WikipediaCodeHandler::default`].
    pub fn new() -> Self {
        Self::default()
    }
}
|
||||||
|
|
||||||
|
impl HandleTag for WikipediaCodeHandler {
|
||||||
|
fn should_handle(&self, tag: &str) -> bool {
|
||||||
|
match tag {
|
||||||
|
"div" | "pre" | "code" => true,
|
||||||
|
_ => false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn handle_tag_start(
|
||||||
|
&mut self,
|
||||||
|
tag: &HtmlElement,
|
||||||
|
writer: &mut MarkdownWriter,
|
||||||
|
) -> StartTagOutcome {
|
||||||
|
match tag.tag.as_str() {
|
||||||
|
"code" => {
|
||||||
|
if !writer.is_inside("pre") {
|
||||||
|
writer.push_str("`");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"div" => {
|
||||||
|
let classes = tag.classes();
|
||||||
|
self.language = classes.iter().find_map(|class| {
|
||||||
|
if let Some((_, language)) = class.split_once("mw-highlight-lang-") {
|
||||||
|
Some(language.trim().to_owned())
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
"pre" => {
|
||||||
|
writer.push_blank_line();
|
||||||
|
writer.push_str("```");
|
||||||
|
if let Some(language) = self.language.take() {
|
||||||
|
writer.push_str(&language);
|
||||||
|
}
|
||||||
|
writer.push_newline();
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
|
||||||
|
StartTagOutcome::Continue
|
||||||
|
}
|
||||||
|
|
||||||
|
fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
|
||||||
|
match tag.tag.as_str() {
|
||||||
|
"code" => {
|
||||||
|
if !writer.is_inside("pre") {
|
||||||
|
writer.push_str("`");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"pre" => writer.push_str("\n```\n"),
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
|
||||||
|
if writer.is_inside("pre") {
|
||||||
|
writer.push_str(&text);
|
||||||
|
return HandlerOutcome::Handled;
|
||||||
|
}
|
||||||
|
|
||||||
|
HandlerOutcome::NoOp
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use indoc::indoc;
|
use indoc::indoc;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue