gleam: Improve indexing of HexDocs (#13787)
This PR improves the indexing of HexDocs content for Gleam packages. We now index each of the modules in the package instead of just the root. Release Notes: - N/A
This commit is contained in:
parent
f024fcff3d
commit
98699a65c1
2 changed files with 211 additions and 37 deletions
|
@ -1,7 +1,6 @@
|
||||||
use html_to_markdown::{convert_html_to_markdown, TagHandler};
|
mod hexdocs;
|
||||||
use std::cell::RefCell;
|
|
||||||
use std::fs;
|
use std::fs;
|
||||||
use std::rc::Rc;
|
|
||||||
use zed::lsp::CompletionKind;
|
use zed::lsp::CompletionKind;
|
||||||
use zed::{
|
use zed::{
|
||||||
CodeLabel, CodeLabelSpan, HttpRequest, KeyValueStore, LanguageServerId, SlashCommand,
|
CodeLabel, CodeLabelSpan, HttpRequest, KeyValueStore, LanguageServerId, SlashCommand,
|
||||||
|
@ -9,6 +8,8 @@ use zed::{
|
||||||
};
|
};
|
||||||
use zed_extension_api::{self as zed, Result};
|
use zed_extension_api::{self as zed, Result};
|
||||||
|
|
||||||
|
use crate::hexdocs::convert_hexdocs_to_markdown;
|
||||||
|
|
||||||
struct GleamExtension {
|
struct GleamExtension {
|
||||||
cached_binary_path: Option<String>,
|
cached_binary_path: Option<String>,
|
||||||
}
|
}
|
||||||
|
@ -191,19 +192,7 @@ impl zed::Extension for GleamExtension {
|
||||||
),
|
),
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
let mut handlers: Vec<TagHandler> = vec![
|
let (markdown, _modules) = convert_hexdocs_to_markdown(response.body.as_bytes())?;
|
||||||
Rc::new(RefCell::new(
|
|
||||||
html_to_markdown::markdown::WebpageChromeRemover,
|
|
||||||
)),
|
|
||||||
Rc::new(RefCell::new(html_to_markdown::markdown::ParagraphHandler)),
|
|
||||||
Rc::new(RefCell::new(html_to_markdown::markdown::HeadingHandler)),
|
|
||||||
Rc::new(RefCell::new(html_to_markdown::markdown::ListHandler)),
|
|
||||||
Rc::new(RefCell::new(html_to_markdown::markdown::TableHandler::new())),
|
|
||||||
Rc::new(RefCell::new(html_to_markdown::markdown::StyledTextHandler)),
|
|
||||||
];
|
|
||||||
|
|
||||||
let markdown = convert_html_to_markdown(response.body.as_bytes(), &mut handlers)
|
|
||||||
.map_err(|err| format!("failed to convert docs to Markdown {err}"))?;
|
|
||||||
|
|
||||||
let mut text = String::new();
|
let mut text = String::new();
|
||||||
text.push_str(&markdown);
|
text.push_str(&markdown);
|
||||||
|
@ -244,27 +233,7 @@ impl zed::Extension for GleamExtension {
|
||||||
database: &KeyValueStore,
|
database: &KeyValueStore,
|
||||||
) -> Result<(), String> {
|
) -> Result<(), String> {
|
||||||
match provider.as_str() {
|
match provider.as_str() {
|
||||||
"gleam-hexdocs" => {
|
"gleam-hexdocs" => hexdocs::index(package, database),
|
||||||
let response = zed::fetch(&HttpRequest {
|
|
||||||
url: format!("https://hexdocs.pm/{package}"),
|
|
||||||
})?;
|
|
||||||
|
|
||||||
let mut handlers: Vec<TagHandler> = vec![
|
|
||||||
Rc::new(RefCell::new(
|
|
||||||
html_to_markdown::markdown::WebpageChromeRemover,
|
|
||||||
)),
|
|
||||||
Rc::new(RefCell::new(html_to_markdown::markdown::ParagraphHandler)),
|
|
||||||
Rc::new(RefCell::new(html_to_markdown::markdown::HeadingHandler)),
|
|
||||||
Rc::new(RefCell::new(html_to_markdown::markdown::ListHandler)),
|
|
||||||
Rc::new(RefCell::new(html_to_markdown::markdown::TableHandler::new())),
|
|
||||||
Rc::new(RefCell::new(html_to_markdown::markdown::StyledTextHandler)),
|
|
||||||
];
|
|
||||||
|
|
||||||
let markdown = convert_html_to_markdown(response.body.as_bytes(), &mut handlers)
|
|
||||||
.map_err(|err| format!("failed to convert docs to Markdown {err}"))?;
|
|
||||||
|
|
||||||
Ok(database.insert(&package, &markdown)?)
|
|
||||||
}
|
|
||||||
_ => Ok(()),
|
_ => Ok(()),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
205
extensions/gleam/src/hexdocs.rs
Normal file
205
extensions/gleam/src/hexdocs.rs
Normal file
|
@ -0,0 +1,205 @@
|
||||||
|
use std::cell::RefCell;
|
||||||
|
use std::collections::BTreeSet;
|
||||||
|
use std::io::Read;
|
||||||
|
use std::rc::Rc;
|
||||||
|
|
||||||
|
use html_to_markdown::markdown::{
|
||||||
|
HeadingHandler, ListHandler, ParagraphHandler, StyledTextHandler, TableHandler,
|
||||||
|
};
|
||||||
|
use html_to_markdown::{
|
||||||
|
convert_html_to_markdown, HandleTag, HandlerOutcome, HtmlElement, MarkdownWriter,
|
||||||
|
StartTagOutcome, TagHandler,
|
||||||
|
};
|
||||||
|
use zed_extension_api::{self as zed, HttpRequest, KeyValueStore, Result};
|
||||||
|
|
||||||
|
pub fn index(package: String, database: &KeyValueStore) -> Result<()> {
|
||||||
|
let response = zed::fetch(&HttpRequest {
|
||||||
|
url: format!("https://hexdocs.pm/{package}"),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let (package_root_markdown, modules) = convert_hexdocs_to_markdown(response.body.as_bytes())?;
|
||||||
|
|
||||||
|
database.insert(&package, &package_root_markdown)?;
|
||||||
|
|
||||||
|
for module in modules {
|
||||||
|
let response = zed::fetch(&HttpRequest {
|
||||||
|
url: format!("https://hexdocs.pm/{package}/{module}.html"),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let (markdown, _modules) = convert_hexdocs_to_markdown(response.body.as_bytes())?;
|
||||||
|
|
||||||
|
database.insert(&module, &markdown)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn convert_hexdocs_to_markdown(html: impl Read) -> Result<(String, Vec<String>)> {
|
||||||
|
let module_collector = Rc::new(RefCell::new(GleamModuleCollector::new()));
|
||||||
|
|
||||||
|
let mut handlers: Vec<TagHandler> = vec![
|
||||||
|
module_collector.clone(),
|
||||||
|
Rc::new(RefCell::new(GleamChromeRemover)),
|
||||||
|
Rc::new(RefCell::new(NavSkipper::new(ParagraphHandler))),
|
||||||
|
Rc::new(RefCell::new(NavSkipper::new(HeadingHandler))),
|
||||||
|
Rc::new(RefCell::new(NavSkipper::new(ListHandler))),
|
||||||
|
Rc::new(RefCell::new(NavSkipper::new(TableHandler::new()))),
|
||||||
|
Rc::new(RefCell::new(NavSkipper::new(StyledTextHandler))),
|
||||||
|
];
|
||||||
|
|
||||||
|
let markdown = convert_html_to_markdown(html, &mut handlers)
|
||||||
|
.map_err(|err| format!("failed to convert docs to Markdown {err}"))?;
|
||||||
|
|
||||||
|
let modules = module_collector
|
||||||
|
.borrow()
|
||||||
|
.modules
|
||||||
|
.iter()
|
||||||
|
.cloned()
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
|
Ok((markdown, modules))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A higher-order handler that skips all content from the `nav`.
|
||||||
|
///
|
||||||
|
/// We still need to traverse the `nav` for collecting information, but
|
||||||
|
/// we don't want to include any of its content in the resulting Markdown.
|
||||||
|
pub struct NavSkipper<T: HandleTag> {
|
||||||
|
handler: T,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: HandleTag> NavSkipper<T> {
|
||||||
|
pub fn new(handler: T) -> Self {
|
||||||
|
Self { handler }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: HandleTag> HandleTag for NavSkipper<T> {
|
||||||
|
fn should_handle(&self, tag: &str) -> bool {
|
||||||
|
tag == "nav" || self.handler.should_handle(tag)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn handle_tag_start(
|
||||||
|
&mut self,
|
||||||
|
tag: &HtmlElement,
|
||||||
|
writer: &mut MarkdownWriter,
|
||||||
|
) -> StartTagOutcome {
|
||||||
|
if writer.is_inside("nav") {
|
||||||
|
return StartTagOutcome::Continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
self.handler.handle_tag_start(tag, writer)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
|
||||||
|
if writer.is_inside("nav") {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
self.handler.handle_tag_end(tag, writer)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
|
||||||
|
if writer.is_inside("nav") {
|
||||||
|
return HandlerOutcome::Handled;
|
||||||
|
}
|
||||||
|
|
||||||
|
self.handler.handle_text(text, writer)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct GleamChromeRemover;
|
||||||
|
|
||||||
|
impl HandleTag for GleamChromeRemover {
|
||||||
|
fn should_handle(&self, tag: &str) -> bool {
|
||||||
|
match tag {
|
||||||
|
"head" | "script" | "style" | "svg" | "header" | "footer" | "a" => true,
|
||||||
|
_ => false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn handle_tag_start(
|
||||||
|
&mut self,
|
||||||
|
tag: &HtmlElement,
|
||||||
|
_writer: &mut MarkdownWriter,
|
||||||
|
) -> StartTagOutcome {
|
||||||
|
match tag.tag() {
|
||||||
|
"head" | "script" | "style" | "svg" | "header" | "footer" => {
|
||||||
|
return StartTagOutcome::Skip;
|
||||||
|
}
|
||||||
|
"a" => {
|
||||||
|
if tag.attr("onclick").is_some() {
|
||||||
|
return StartTagOutcome::Skip;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
|
||||||
|
StartTagOutcome::Continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct GleamModuleCollector {
|
||||||
|
modules: BTreeSet<String>,
|
||||||
|
has_seen_modules_header: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl GleamModuleCollector {
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
modules: BTreeSet::new(),
|
||||||
|
has_seen_modules_header: false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn parse_module(tag: &HtmlElement) -> Option<String> {
|
||||||
|
if tag.tag() != "a" {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
let href = tag.attr("href")?;
|
||||||
|
if href.starts_with('#') || href.starts_with("https://") || href.starts_with("../") {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
let module_name = href.trim_start_matches("./").trim_end_matches(".html");
|
||||||
|
|
||||||
|
Some(module_name.to_owned())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl HandleTag for GleamModuleCollector {
|
||||||
|
fn should_handle(&self, tag: &str) -> bool {
|
||||||
|
match tag {
|
||||||
|
"h2" | "a" => true,
|
||||||
|
_ => false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn handle_tag_start(
|
||||||
|
&mut self,
|
||||||
|
tag: &HtmlElement,
|
||||||
|
writer: &mut MarkdownWriter,
|
||||||
|
) -> StartTagOutcome {
|
||||||
|
match tag.tag() {
|
||||||
|
"a" => {
|
||||||
|
if self.has_seen_modules_header && writer.is_inside("li") {
|
||||||
|
if let Some(module_name) = Self::parse_module(tag) {
|
||||||
|
self.modules.insert(module_name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
|
||||||
|
StartTagOutcome::Continue
|
||||||
|
}
|
||||||
|
|
||||||
|
fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
|
||||||
|
if writer.is_inside("nav") && writer.is_inside("h2") && text == "Modules" {
|
||||||
|
self.has_seen_modules_header = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
HandlerOutcome::NoOp
|
||||||
|
}
|
||||||
|
}
|
Loading…
Add table
Add a link
Reference in a new issue