From f4b361f04ddb15184407ba0988cf8f0c9c44d1ff Mon Sep 17 00:00:00 2001 From: claytonrcarter Date: Mon, 26 May 2025 14:00:05 -0400 Subject: [PATCH] language: Select language based on longest matching path extension (#29716) Closes #8408 Closes #10997 This is a reboot of [my original PR](https://github.com/zed-industries/zed/pull/11697) from last year. I believe that I've addressed all the comments raised in that original review, but Zed has changed a lot in the past year, so I'm sure there will be some new stuff to consider too. - updates the language matching and lookup to consider not just "does the suffix/glob match" but also "... and is it the longest such match" - adds a new `LanguageCustomFileTypes` struct to pass user globs from settings to the registry - _minor/unrelated:_ updates a test for the JS extension that wasn't actually testing what is intended to - _minor/unrelated:_ removed 2 redundant path extensions from the JS lang extension **Languages that may use this** - Laravel Blade templates use the `blade.php` compound extension - [apparently](https://github.com/zed-industries/zed/issues/10765#issuecomment-2091293304) Angular uses `component.html` - see also https://github.com/zed-industries/extensions/issues/169 - _hypothetically_ someone could publish a "JS test" extension w/ custom highlights and/or snippets; many JS tests use `test.js` or `spec.js` **Verifying these changes** I added a number of assertions for this new behavior, and I also confirmed that the (recently patched) [Laravel Blade extension](https://github.com/bajrangCoder/zed-laravel-blade) opens as expected for `blade.php` files, whereas on `main` it does not. cc @maxbrunsfeld (reviewed my original PR last year), @osiewicz and @MrSubidubi (have recently been in this part of the code) Release Notes: - Added support for "compound" file extensions in language extensions, such `blade.php` and `component.html`. Closes #8408 and #10997. --- crates/language/src/buffer_tests.rs | 61 ++++++- crates/language/src/language_registry.rs | 166 ++++++++++++++------ crates/languages/src/typescript/config.toml | 2 +- 3 files changed, 177 insertions(+), 52 deletions(-) diff --git a/crates/language/src/buffer_tests.rs b/crates/language/src/buffer_tests.rs index f76d41577a..fd9db25ea7 100644 --- a/crates/language/src/buffer_tests.rs +++ b/crates/language/src/buffer_tests.rs @@ -83,6 +83,17 @@ fn test_select_language(cx: &mut App) { }, Some(tree_sitter_rust::LANGUAGE.into()), ))); + registry.add(Arc::new(Language::new( + LanguageConfig { + name: "Rust with longer extension".into(), + matcher: LanguageMatcher { + path_suffixes: vec!["longer.rs".to_string()], + ..Default::default() + }, + ..Default::default() + }, + Some(tree_sitter_rust::LANGUAGE.into()), + ))); registry.add(Arc::new(Language::new( LanguageConfig { name: LanguageName::new("Make"), @@ -109,6 +120,14 @@ fn test_select_language(cx: &mut App) { Some("Make".into()) ); + // matching longer, compound extension, part of which could also match another lang + assert_eq!( + registry + .language_for_file(&file("src/lib.longer.rs"), None, cx) + .map(|l| l.name()), + Some("Rust with longer extension".into()) + ); + // matching filename assert_eq!( registry @@ -181,7 +200,11 @@ async fn test_language_for_file_with_custom_file_types(cx: &mut TestAppContext) init_settings(cx, |settings| { settings.file_types.extend([ ("TypeScript".into(), vec!["js".into()]), - ("C++".into(), vec!["c".into()]), + ( + "JavaScript".into(), + vec!["*longer.ts".into(), "ecmascript".into()], + ), + ("C++".into(), vec!["c".into(), "*.dev".into()]), ( "Dockerfile".into(), vec!["Dockerfile".into(), "Dockerfile.*".into()], @@ -204,7 +227,7 @@ async fn test_language_for_file_with_custom_file_types(cx: &mut TestAppContext) LanguageConfig { name: "TypeScript".into(), matcher: LanguageMatcher { - path_suffixes: vec!["js".to_string()], + path_suffixes: vec!["ts".to_string(), "ts.ecmascript".to_string()], ..Default::default() }, ..Default::default() @@ -237,6 +260,21 @@ async fn test_language_for_file_with_custom_file_types(cx: &mut TestAppContext) languages.add(Arc::new(Language::new(config, None))); } + // matches system-provided lang extension + let language = cx + .read(|cx| languages.language_for_file(&file("foo.ts"), None, cx)) + .unwrap(); + assert_eq!(language.name(), "TypeScript".into()); + let language = cx + .read(|cx| languages.language_for_file(&file("foo.ts.ecmascript"), None, cx)) + .unwrap(); + assert_eq!(language.name(), "TypeScript".into()); + let language = cx + .read(|cx| languages.language_for_file(&file("foo.cpp"), None, cx)) + .unwrap(); + assert_eq!(language.name(), "C++".into()); + + // user configured lang extension, same length as system-provided let language = cx .read(|cx| languages.language_for_file(&file("foo.js"), None, cx)) .unwrap(); @@ -245,6 +283,25 @@ async fn test_language_for_file_with_custom_file_types(cx: &mut TestAppContext) .read(|cx| languages.language_for_file(&file("foo.c"), None, cx)) .unwrap(); assert_eq!(language.name(), "C++".into()); + + // user configured lang extension, longer than system-provided + let language = cx + .read(|cx| languages.language_for_file(&file("foo.longer.ts"), None, cx)) + .unwrap(); + assert_eq!(language.name(), "JavaScript".into()); + + // user configured lang extension, shorter than system-provided + let language = cx + .read(|cx| languages.language_for_file(&file("foo.ecmascript"), None, cx)) + .unwrap(); + assert_eq!(language.name(), "JavaScript".into()); + + // user configured glob matches + let language = cx + .read(|cx| languages.language_for_file(&file("c-plus-plus.dev"), None, cx)) + .unwrap(); + assert_eq!(language.name(), "C++".into()); + // should match Dockerfile.* => Dockerfile, not *.dev => C++ let language = cx .read(|cx| languages.language_for_file(&file("Dockerfile.dev"), None, cx)) .unwrap(); diff --git a/crates/language/src/language_registry.rs b/crates/language/src/language_registry.rs index 46874d86a7..a860a75fbb 100644 --- a/crates/language/src/language_registry.rs +++ b/crates/language/src/language_registry.rs @@ -16,8 +16,6 @@ use futures::{ }; use globset::GlobSet; use gpui::{App, BackgroundExecutor, SharedString}; -use itertools::FoldWhile::{Continue, Done}; -use itertools::Itertools; use lsp::LanguageServerId; use parking_lot::{Mutex, RwLock}; use postage::watch; @@ -173,18 +171,12 @@ impl AvailableLanguage { } } -#[derive(Copy, Clone, Default, PartialEq, Eq, PartialOrd, Ord)] +#[derive(Copy, Clone, Default)] enum LanguageMatchPrecedence { #[default] Undetermined, - PathOrContent, - UserConfigured, -} - -impl LanguageMatchPrecedence { - fn best_possible_match(&self) -> bool { - *self == LanguageMatchPrecedence::UserConfigured - } + PathOrContent(usize), + UserConfigured(usize), } enum AvailableGrammar { @@ -626,9 +618,14 @@ impl LanguageRegistry { ) -> impl Future>> + use<> { let name = UniCase::new(name); let rx = self.get_or_load_language(|language_name, _, current_best_match| { - (current_best_match < LanguageMatchPrecedence::PathOrContent - && UniCase::new(&language_name.0) == name) - .then_some(LanguageMatchPrecedence::PathOrContent) + match current_best_match { + LanguageMatchPrecedence::Undetermined if UniCase::new(&language_name.0) == name => { + Some(LanguageMatchPrecedence::PathOrContent(name.len())) + } + LanguageMatchPrecedence::Undetermined + | LanguageMatchPrecedence::UserConfigured(_) + | LanguageMatchPrecedence::PathOrContent(_) => None, + } }); async move { rx.await? } } @@ -655,13 +652,23 @@ impl LanguageRegistry { ) -> impl Future>> { let string = UniCase::new(string); let rx = self.get_or_load_language(|name, config, current_best_match| { - (current_best_match < LanguageMatchPrecedence::PathOrContent - && (UniCase::new(&name.0) == string + let name_matches = || { + UniCase::new(&name.0) == string || config .path_suffixes .iter() - .any(|suffix| UniCase::new(suffix) == string))) - .then_some(LanguageMatchPrecedence::PathOrContent) + .any(|suffix| UniCase::new(suffix) == string) + }; + + match current_best_match { + LanguageMatchPrecedence::Undetermined => { + name_matches().then_some(LanguageMatchPrecedence::PathOrContent(string.len())) + } + LanguageMatchPrecedence::PathOrContent(len) => (string.len() > len + && name_matches()) + .then_some(LanguageMatchPrecedence::PathOrContent(string.len())), + LanguageMatchPrecedence::UserConfigured(_) => None, + } }); async move { rx.await? } } @@ -717,10 +724,9 @@ impl LanguageRegistry { // and no other extension which is not the desired behavior here, // as we want `.zshrc` to result in extension being `Some("zshrc")` let extension = filename.and_then(|filename| filename.split('.').next_back()); - let path_suffixes = [extension, filename, path.to_str()]; - let path_suffixes_candidates = path_suffixes + let path_suffixes = [extension, filename, path.to_str()] .iter() - .filter_map(|suffix| suffix.map(globset::Candidate::new)) + .filter_map(|suffix| suffix.map(|suffix| (suffix, globset::Candidate::new(suffix)))) .collect::>(); let content = LazyCell::new(|| { content.map(|content| { @@ -731,20 +737,37 @@ impl LanguageRegistry { }); self.find_matching_language(move |language_name, config, current_best_match| { let path_matches_default_suffix = || { - config - .path_suffixes - .iter() - .any(|suffix| path_suffixes.contains(&Some(suffix.as_str()))) + let len = + config + .path_suffixes + .iter() + .fold(0, |acc: usize, path_suffix: &String| { + let ext = ".".to_string() + path_suffix; + + let matched_suffix_len = path_suffixes + .iter() + .find(|(suffix, _)| suffix.ends_with(&ext) || suffix == path_suffix) + .map(|(suffix, _)| suffix.len()); + + match matched_suffix_len { + Some(len) => acc.max(len), + None => acc, + } + }); + (len > 0).then_some(len) }; + let path_matches_custom_suffix = || { user_file_types .and_then(|types| types.get(language_name.as_ref())) - .map_or(false, |custom_suffixes| { - path_suffixes_candidates + .map_or(None, |custom_suffixes| { + path_suffixes .iter() - .any(|suffix| custom_suffixes.is_match_candidate(suffix)) + .find(|(_, candidate)| custom_suffixes.is_match_candidate(candidate)) + .map(|(suffix, _)| suffix.len()) }) }; + let content_matches = || { config.first_line_pattern.as_ref().map_or(false, |pattern| { content @@ -756,17 +779,29 @@ impl LanguageRegistry { // Only return a match for the given file if we have a better match than // the current one. match current_best_match { - LanguageMatchPrecedence::PathOrContent | LanguageMatchPrecedence::Undetermined - if path_matches_custom_suffix() => - { - Some(LanguageMatchPrecedence::UserConfigured) + LanguageMatchPrecedence::PathOrContent(current_len) => { + if let Some(len) = path_matches_custom_suffix() { + // >= because user config should win tie with system ext len + (len >= current_len).then_some(LanguageMatchPrecedence::UserConfigured(len)) + } else if let Some(len) = path_matches_default_suffix() { + // >= because user config should win tie with system ext len + (len >= current_len).then_some(LanguageMatchPrecedence::PathOrContent(len)) + } else { + None + } } - LanguageMatchPrecedence::Undetermined - if path_matches_default_suffix() || content_matches() => - { - Some(LanguageMatchPrecedence::PathOrContent) + LanguageMatchPrecedence::Undetermined => { + if let Some(len) = path_matches_custom_suffix() { + Some(LanguageMatchPrecedence::UserConfigured(len)) + } else if let Some(len) = path_matches_default_suffix() { + Some(LanguageMatchPrecedence::PathOrContent(len)) + } else if content_matches() { + Some(LanguageMatchPrecedence::PathOrContent(1)) + } else { + None + } } - _ => None, + LanguageMatchPrecedence::UserConfigured(_) => None, } }) } @@ -784,28 +819,61 @@ impl LanguageRegistry { .available_languages .iter() .rev() - .fold_while(None, |best_language_match, language| { + .fold(None, |best_language_match, language| { let current_match_type = best_language_match .as_ref() .map_or(LanguageMatchPrecedence::default(), |(_, score)| *score); let language_score = callback(&language.name, &language.matcher, current_match_type); - debug_assert!( - language_score.is_none_or(|new_score| new_score > current_match_type), - "Matching callback should only return a better match than the current one" - ); - match language_score { - Some(new_score) if new_score.best_possible_match() => { - Done(Some((language.clone(), new_score))) + match (language_score, current_match_type) { + // no current best, so our candidate is better + ( + Some( + LanguageMatchPrecedence::PathOrContent(_) + | LanguageMatchPrecedence::UserConfigured(_), + ), + LanguageMatchPrecedence::Undetermined, + ) => language_score.map(|new_score| (language.clone(), new_score)), + + // our candidate is better only if the name is longer + ( + Some(LanguageMatchPrecedence::PathOrContent(new_len)), + LanguageMatchPrecedence::PathOrContent(current_len), + ) + | ( + Some(LanguageMatchPrecedence::UserConfigured(new_len)), + LanguageMatchPrecedence::UserConfigured(current_len), + ) + | ( + Some(LanguageMatchPrecedence::PathOrContent(new_len)), + LanguageMatchPrecedence::UserConfigured(current_len), + ) => { + if new_len > current_len { + language_score.map(|new_score| (language.clone(), new_score)) + } else { + best_language_match + } } - Some(new_score) if current_match_type < new_score => { - Continue(Some((language.clone(), new_score))) + + // our candidate is better if the name is longer or equal to + ( + Some(LanguageMatchPrecedence::UserConfigured(new_len)), + LanguageMatchPrecedence::PathOrContent(current_len), + ) => { + if new_len >= current_len { + language_score.map(|new_score| (language.clone(), new_score)) + } else { + best_language_match + } + } + + // no candidate, use current best + (None, _) | (Some(LanguageMatchPrecedence::Undetermined), _) => { + best_language_match } - _ => Continue(best_language_match), } }) - .into_inner() .map(|(available_language, _)| available_language); drop(state); available_language diff --git a/crates/languages/src/typescript/config.toml b/crates/languages/src/typescript/config.toml index 10134066ab..aca294df85 100644 --- a/crates/languages/src/typescript/config.toml +++ b/crates/languages/src/typescript/config.toml @@ -1,6 +1,6 @@ name = "TypeScript" grammar = "typescript" -path_suffixes = ["ts", "cts", "d.cts", "d.mts", "mts"] +path_suffixes = ["ts", "cts", "mts"] first_line_pattern = '^#!.*\b(?:deno run|ts-node|bun|tsx)\b' line_comments = ["// "] block_comment = ["/*", "*/"]