Refactor HTML parsing so we can support other HTML element types as well

This commit is contained in:
Remco Smits 2025-08-22 20:09:45 +02:00
parent 03487fff5b
commit 85321152cf
4 changed files with 105 additions and 152 deletions

105
Cargo.lock generated
View file

@ -5117,12 +5117,6 @@ dependencies = [
"zlog",
]
[[package]]
name = "ego-tree"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2972feb8dffe7bc8c5463b1dacda1b0dfbed3710e50f977d965429692d74cd8"
[[package]]
name = "either"
version = "1.15.0"
@ -6335,15 +6329,6 @@ dependencies = [
"thread_local",
]
[[package]]
name = "fxhash"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
dependencies = [
"byteorder",
]
[[package]]
name = "generator"
version = "0.8.5"
@ -6378,15 +6363,6 @@ dependencies = [
"windows-targets 0.48.5",
]
[[package]]
name = "getopts"
version = "0.2.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cba6ae63eb948698e300f645f87c70f76630d505f23b8907cf1e193ee85048c1"
dependencies = [
"unicode-width 0.2.0",
]
[[package]]
name = "getrandom"
version = "0.2.15"
@ -7913,18 +7889,7 @@ dependencies = [
"log",
"mac",
"markup5ever 0.16.1",
"match_token 0.1.0",
]
[[package]]
name = "html5ever"
version = "0.35.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "55d958c2f74b664487a2035fe1dadb032c48718a03b63f3ab0b8537db8549ed4"
dependencies = [
"log",
"markup5ever 0.35.0",
"match_token 0.35.0",
"match_token",
]
[[package]]
@ -9965,12 +9930,13 @@ dependencies = [
"editor",
"fs",
"gpui",
"html5ever 0.27.0",
"language",
"linkify",
"log",
"markup5ever_rcdom",
"pretty_assertions",
"pulldown-cmark 0.12.2",
"scraper",
"settings",
"theme",
"ui",
@ -10004,17 +9970,6 @@ dependencies = [
"web_atoms",
]
[[package]]
name = "markup5ever"
version = "0.35.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "311fe69c934650f8f19652b3946075f0fc41ad8757dbb68f1ca14e7900ecc1c3"
dependencies = [
"log",
"tendril",
"web_atoms",
]
[[package]]
name = "markup5ever_rcdom"
version = "0.3.0"
@ -10038,17 +9993,6 @@ dependencies = [
"syn 2.0.101",
]
[[package]]
name = "match_token"
version = "0.35.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac84fd3f360fcc43dc5f5d186f02a94192761a080e8bc58621ad4d12296a58cf"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.101",
]
[[package]]
name = "matchers"
version = "0.1.0"
@ -14460,21 +14404,6 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "scraper"
version = "0.24.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5f3a24d916e78954af99281a455168d4a9515d65eca99a18da1b813689c4ad9"
dependencies = [
"cssparser",
"ego-tree",
"getopts",
"html5ever 0.35.0",
"precomputed-hash",
"selectors",
"tendril",
]
[[package]]
name = "scratch"
version = "1.0.8"
@ -14719,25 +14648,6 @@ dependencies = [
"libc",
]
[[package]]
name = "selectors"
version = "0.31.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5685b6ae43bfcf7d2e7dfcfb5d8e8f61b46442c902531e41a32a9a8bf0ee0fb6"
dependencies = [
"bitflags 2.9.0",
"cssparser",
"derive_more 2.0.1",
"fxhash",
"log",
"new_debug_unreachable",
"phf",
"phf_codegen",
"precomputed-hash",
"servo_arc",
"smallvec",
]
[[package]]
name = "self_cell"
version = "1.2.0"
@ -14924,15 +14834,6 @@ dependencies = [
"winapi",
]
[[package]]
name = "servo_arc"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "204ea332803bd95a0b60388590d59cf6468ec9becf626e2451f1d26a1d972de4"
dependencies = [
"stable_deref_trait",
]
[[package]]
name = "session"
version = "0.1.0"

View file

@ -574,7 +574,6 @@ rustls = { version = "0.23.26" }
rustls-platform-verifier = "0.5.0"
scap = { git = "https://github.com/zed-industries/scap", rev = "808aa5c45b41e8f44729d02e38fd00a2fe2722e7", default-features = false }
schemars = { version = "1.0", features = ["indexmap2"] }
scraper = "0.24.0"
semver = "1.0"
serde = { version = "1.0", features = ["derive", "rc"] }
serde_derive = { version = "1.0", features = ["deserialize_in_place"] }

View file

@ -21,12 +21,13 @@ collections.workspace = true
editor.workspace = true
fs.workspace = true
gpui.workspace = true
html5ever.workspace = true
language.workspace = true
linkify.workspace = true
log.workspace = true
markup5ever_rcdom.workspace = true
pretty_assertions.workspace = true
pulldown-cmark.workspace = true
scraper.workspace = true
settings.workspace = true
theme.workspace = true
ui.workspace = true

View file

@ -2,9 +2,11 @@ use crate::markdown_elements::*;
use async_recursion::async_recursion;
use collections::FxHashMap;
use gpui::FontWeight;
use html5ever::{ParseOpts, local_name, parse_document, tendril::TendrilSink};
use language::LanguageRegistry;
use markup5ever_rcdom::RcDom;
use pulldown_cmark::{Alignment, Event, Options, Parser, Tag, TagEnd};
use std::{ops::Range, path::PathBuf, sync::Arc, vec};
use std::{cell::RefCell, ops::Range, path::PathBuf, rc::Rc, sync::Arc, vec};
use ui::{px, relative};
pub async fn parse_markdown(
@ -757,10 +759,19 @@ impl<'a> MarkdownParser<'a> {
let source_range = source_range.clone();
match current {
Event::Html(html) => {
let fragment = scraper::Html::parse_fragment(html);
let mut cursor = std::io::Cursor::new(html.as_bytes());
let Some(dom) = parse_document(RcDom::default(), ParseOpts::default())
.from_utf8()
.read_from(&mut cursor)
.ok()
else {
self.cursor += 1;
continue;
};
self.cursor += 1;
elements.extend(self.parse_html_image(fragment, source_range));
self.parse_html_node(source_range, &dom.document, &mut elements);
}
Event::End(TagEnd::CodeBlock) => {
self.cursor += 1;
@ -775,6 +786,92 @@ impl<'a> MarkdownParser<'a> {
elements
}
/// Returns an owned copy of the value of the first attribute in `attrs`
/// whose local name equals `name`, or `None` when no attribute matches.
fn attr_value(
    attrs: &RefCell<Vec<html5ever::Attribute>>,
    name: html5ever::LocalName,
) -> Option<String> {
    attrs
        .borrow()
        .iter()
        .find(|attribute| attribute.name.local == name)
        .map(|attribute| attribute.value.to_string())
}
/// Recursively walks an HTML DOM `node` and appends the markdown elements it
/// produces to `elements`.
///
/// * Document nodes and every element other than `<img>` contribute nothing
///   themselves; only their children are visited.
/// * Text nodes are pushed as a paragraph holding a single text chunk with
///   empty highlights and regions.
/// * `<img>` elements are pushed as an image when they carry a `src`
///   attribute that `Image::identify` accepts. `alt`, `width` and `height`
///   are applied when present (and, for the two lengths, when
///   `Self::parse_length` can parse them).
/// * Doctype, comment and processing-instruction nodes are ignored.
///
/// Every produced element is tagged with `source_range`.
fn parse_html_node(
    &self,
    source_range: Range<usize>,
    node: &Rc<markup5ever_rcdom::Node>,
    elements: &mut Vec<ParsedMarkdownElement>,
) {
    match &node.data {
        markup5ever_rcdom::NodeData::Document => {
            self.consume_children(source_range, node, elements);
        }
        markup5ever_rcdom::NodeData::Text { contents } => {
            elements.push(ParsedMarkdownElement::Paragraph(vec![
                MarkdownParagraphChunk::Text(ParsedMarkdownText {
                    source_range,
                    contents: contents.borrow().to_string(),
                    highlights: Vec::default(),
                    region_ranges: Vec::default(),
                    regions: Vec::default(),
                }),
            ]));
        }
        markup5ever_rcdom::NodeData::Element { name, attrs, .. } => {
            // Anything that is not an <img> is treated as a transparent
            // container: descend into it and stop.
            if local_name!("img") != name.local {
                self.consume_children(source_range, node, elements);
                return;
            }

            // An image without a usable `src` produces no element at all.
            let Some(src) = Self::attr_value(attrs, local_name!("src")) else {
                return;
            };
            // `attr_value` already hands back an owned `String`, so it is
            // moved into `identify` instead of being cloned again.
            let Some(mut image) =
                Image::identify(src, source_range, self.file_location_directory.clone())
            else {
                return;
            };

            if let Some(alt) = Self::attr_value(attrs, local_name!("alt")) {
                image.set_alt_text(alt.into());
            }

            if let Some(width) = Self::attr_value(attrs, local_name!("width"))
                .and_then(|width| Self::parse_length(&width))
            {
                image.set_width(width);
            }

            if let Some(height) = Self::attr_value(attrs, local_name!("height"))
                .and_then(|height| Self::parse_length(&height))
            {
                image.set_height(height);
            }

            elements.push(ParsedMarkdownElement::Image(image));
        }
        // These node kinds carry nothing that is rendered. They are listed
        // explicitly so the match stays exhaustive without a catch-all.
        markup5ever_rcdom::NodeData::Doctype { .. }
        | markup5ever_rcdom::NodeData::Comment { .. }
        | markup5ever_rcdom::NodeData::ProcessingInstruction { .. } => {}
    }
}
/// Runs `parse_html_node` on each direct child of `node`, in order, passing
/// every child its own clone of `source_range` and the shared `elements`
/// output vector.
fn consume_children(
    &self,
    source_range: Range<usize>,
    node: &Rc<markup5ever_rcdom::Node>,
    elements: &mut Vec<ParsedMarkdownElement>,
) {
    let children = node.children.borrow();
    for child in children.iter() {
        self.parse_html_node(source_range.clone(), child, elements);
    }
}
/// Parses the width/height attribute value of an html element (e.g. img element)
fn parse_length(value: &str) -> Option<ui::DefiniteLength> {
if value.ends_with("px") {
@ -797,51 +894,6 @@ impl<'a> MarkdownParser<'a> {
.map(|value| px(value).into())
}
}
/// Builds one image element for every `<img>` in `html` that has a `src`
/// attribute accepted by `Image::identify`.
///
/// `alt`, `width` and `height` are applied to the image when present (and,
/// for the two lengths, when `Self::parse_length` can parse them). Images
/// without a usable `src` are skipped. Every image is tagged with a clone of
/// `source_range`.
fn parse_html_image(
    &self,
    html: scraper::Html,
    source_range: Range<usize>,
) -> Vec<ParsedMarkdownElement> {
    let img_selector = scraper::Selector::parse("img").unwrap();

    html.select(&img_selector)
        .filter_map(|img| {
            let src = img.attr("src")?;
            let mut image = Image::identify(
                src.to_string(),
                source_range.clone(),
                self.file_location_directory.clone(),
            )?;

            if let Some(alt) = img.attr("alt") {
                image.set_alt_text(alt.to_string().into());
            }

            if let Some(width) = img.attr("width").and_then(Self::parse_length) {
                image.set_width(width);
            }

            if let Some(height) = img.attr("height").and_then(Self::parse_length) {
                image.set_height(height);
            }

            Some(ParsedMarkdownElement::Image(image))
        })
        .collect()
}
}
#[cfg(test)]