From 8db0333b043a2d47539850a5ab4a1473c26fc628 Mon Sep 17 00:00:00 2001
From: Umesh Yadav <23421535+imumesh18@users.noreply.github.com>
Date: Mon, 12 May 2025 20:13:14 +0530
Subject: [PATCH] Fix out-of-bounds panic in fuzzy matcher with Unicode/multibyte characters (#30546)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR fixes a crash in the fuzzy matcher that occurred when handling Unicode or multibyte characters (such as Turkish `İ` or `ş`). The issue was caused by the matcher attempting to index beyond the end of internal arrays when lowercased Unicode characters expanded into multiple codepoints, resulting in an out-of-bounds panic.

#### Root Cause

The loop in `recursive_score_match` used an upper bound (`limit`) derived from `self.last_positions[query_idx]`, which could exceed the actual length of the arrays being indexed, especially with multibyte Unicode input.

#### Solution

The fix clamps the loop’s upper bound to the maximum valid index for the arrays being accessed:

```rust
let max_valid_index = (prefix.len() + path_lowercased.len()).saturating_sub(1);
let safe_limit = limit.min(max_valid_index);
for j in path_idx..=safe_limit { ... }
```

This ensures all indexing is safe and prevents panics.

Closes #30269 Release Notes: - N/A --------- Signed-off-by: Umesh Yadav --- crates/fuzzy/src/matcher.rs | 109 ++++++++++++++++++++++++++++++++++-- 1 file changed, 104 insertions(+), 5 deletions(-) diff --git a/crates/fuzzy/src/matcher.rs b/crates/fuzzy/src/matcher.rs index 0fe5ff098d..ba27be1505 100644 --- a/crates/fuzzy/src/matcher.rs +++ b/crates/fuzzy/src/matcher.rs @@ -158,7 +158,6 @@ impl<'a> Matcher<'a> { if score <= 0.0 { return 0.0; } - let path_len = prefix.len() + path.len(); let mut cur_start = 0; let mut byte_ix = 0; @@ -173,8 +172,17 @@ impl<'a> Matcher<'a> { byte_ix += ch.len_utf8(); char_ix += 1; } - cur_start = match_char_ix + 1; + self.match_positions[i] = byte_ix; + + let matched_ch = prefix + .get(match_char_ix) + .or_else(|| path.get(match_char_ix - prefix.len())) + .unwrap(); + byte_ix += matched_ch.len_utf8(); + + cur_start = match_char_ix + 1; + char_ix = match_char_ix + 1; } score @@ -209,8 +217,11 @@ impl<'a> Matcher<'a> { let query_char = self.lowercase_query[query_idx]; let limit = self.last_positions[query_idx]; + let max_valid_index = (prefix.len() + path_lowercased.len()).saturating_sub(1); + let safe_limit = limit.min(max_valid_index); + let mut last_slash = 0; - for j in path_idx..=limit { + for j in path_idx..=safe_limit { let extra_lowercase_chars_count = extra_lowercase_chars .iter() .take_while(|(i, _)| i < &&j) @@ -218,10 +229,15 @@ impl<'a> Matcher<'a> { .sum::(); let j_regular = j - extra_lowercase_chars_count; - let path_char = if j_regular < prefix.len() { + let path_char = if j < prefix.len() { lowercase_prefix[j] } else { - path_lowercased[j - prefix.len()] + let path_index = j - prefix.len(); + if path_index < path_lowercased.len() { + path_lowercased[path_index] + } else { + continue; + } }; let is_path_sep = path_char == MAIN_SEPARATOR; @@ -490,6 +506,89 @@ mod tests { ); } + #[test] + fn match_unicode_path_entries() { + let mixed_unicode_paths = vec![ + "İolu/oluş", + "İstanbul/code", + "Athens/Şanlıurfa", + 
"Çanakkale/scripts", + "paris/Düzce_İl", + "Berlin_Önemli_Ğündem", + "KİTAPLIK/london/dosya", + "tokyo/kyoto/fuji", + "new_york/san_francisco", + ]; + + assert_eq!( + match_single_path_query("İo/oluş", false, &mixed_unicode_paths), + vec![("İolu/oluş", vec![0, 2, 4, 6, 8, 10, 12])] + ); + + assert_eq!( + match_single_path_query("İst/code", false, &mixed_unicode_paths), + vec![("İstanbul/code", vec![0, 2, 4, 6, 8, 10, 12, 14])] + ); + + assert_eq!( + match_single_path_query("athens/şa", false, &mixed_unicode_paths), + vec![("Athens/Şanlıurfa", vec![0, 1, 2, 3, 4, 5, 6, 7, 9])] + ); + + assert_eq!( + match_single_path_query("BerlinÖĞ", false, &mixed_unicode_paths), + vec![("Berlin_Önemli_Ğündem", vec![0, 1, 2, 3, 4, 5, 7, 15])] + ); + + assert_eq!( + match_single_path_query("tokyo/fuji", false, &mixed_unicode_paths), + vec![("tokyo/kyoto/fuji", vec![0, 1, 2, 3, 4, 5, 12, 13, 14, 15])] + ); + + let mixed_script_paths = vec![ + "résumé_Москва", + "naïve_київ_implementation", + "café_北京_app", + "東京_über_driver", + "déjà_vu_cairo", + "seoul_piñata_game", + "voilà_istanbul_result", + ]; + + assert_eq!( + match_single_path_query("résmé", false, &mixed_script_paths), + vec![("résumé_Москва", vec![0, 1, 3, 5, 6])] + ); + + assert_eq!( + match_single_path_query("café北京", false, &mixed_script_paths), + vec![("café_北京_app", vec![0, 1, 2, 3, 6, 9])] + ); + + assert_eq!( + match_single_path_query("ista", false, &mixed_script_paths), + vec![("voilà_istanbul_result", vec![7, 8, 9, 10])] + ); + + let complex_paths = vec![ + "document_📚_library", + "project_👨‍👩‍👧‍👦_family", + "flags_🇯🇵🇺🇸🇪🇺_world", + "code_😀😃😄😁_happy", + "photo_👩‍👩‍👧‍👦_album", + ]; + + assert_eq!( + match_single_path_query("doc📚lib", false, &complex_paths), + vec![("document_📚_library", vec![0, 1, 2, 9, 14, 15, 16])] + ); + + assert_eq!( + match_single_path_query("codehappy", false, &complex_paths), + vec![("code_😀😃😄😁_happy", vec![0, 1, 2, 3, 22, 23, 24, 25, 26])] + ); + } + fn match_single_path_query<'a>( query: 
&str, smart_case: bool,