Fix out-of-bounds panic in fuzzy matcher with Unicode/multibyte characters (#30546)
This PR fixes a crash in the fuzzy matcher that occurred when handling Unicode or multibyte characters (such as Turkish `İ` or `ş`). The matcher attempted to index beyond the end of its internal arrays when a lowercased Unicode character expanded into multiple codepoints, resulting in an out-of-bounds panic.

#### Root Cause

The loop in `recursive_score_match` used an upper bound (`limit`) derived from `self.last_positions[query_idx]`, which could exceed the actual length of the arrays being indexed, especially with multibyte Unicode input.

#### Solution

The fix clamps the loop's upper bound to the maximum valid index for the arrays being accessed:

```rust
let max_valid_index = (prefix.len() + path_lowercased.len()).saturating_sub(1);
let safe_limit = limit.min(max_valid_index);
for j in path_idx..=safe_limit { ... }
```

This ensures all indexing is safe and prevents panics.

Closes #30269

Release Notes:

- N/A

---------

Signed-off-by: Umesh Yadav <git@umesh.dev>
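As background for the root cause (a minimal illustrative sketch, not part of this change; the `İstanbul` example is chosen only for demonstration): Rust's full Unicode lowercasing can expand a single character into several codepoints, so a lowercased path can contain more `char`s than the original, and an index valid for one array is not automatically valid for the other.

```rust
fn main() {
    // U+0130 (İ) lowercases to 'i' followed by U+0307 (combining dot above),
    // i.e. one char becomes two.
    let lowered: Vec<char> = 'İ'.to_lowercase().collect();
    assert_eq!(lowered, vec!['i', '\u{307}']);

    // The lowercased string therefore has more chars than the original,
    // which is why an unclamped index derived from one can overrun the other.
    let original: Vec<char> = "İstanbul".chars().collect();
    let lowercased: Vec<char> = "İstanbul".to_lowercase().chars().collect();
    assert_eq!(original.len(), 8);
    assert_eq!(lowercased.len(), 9);
}
```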
Parent: a13c8b70dd
Commit: 8db0333b04
1 changed file with 104 additions and 5 deletions
```diff
@@ -158,7 +158,6 @@ impl<'a> Matcher<'a> {
         if score <= 0.0 {
             return 0.0;
         }
-
         let path_len = prefix.len() + path.len();
         let mut cur_start = 0;
         let mut byte_ix = 0;
@@ -173,8 +172,17 @@ impl<'a> Matcher<'a> {
                 byte_ix += ch.len_utf8();
                 char_ix += 1;
             }
-            cur_start = match_char_ix + 1;
             self.match_positions[i] = byte_ix;
+
+            let matched_ch = prefix
+                .get(match_char_ix)
+                .or_else(|| path.get(match_char_ix - prefix.len()))
+                .unwrap();
+            byte_ix += matched_ch.len_utf8();
+
+            cur_start = match_char_ix + 1;
+            char_ix = match_char_ix + 1;
         }
 
         score
@@ -209,8 +217,11 @@ impl<'a> Matcher<'a> {
         let query_char = self.lowercase_query[query_idx];
         let limit = self.last_positions[query_idx];
 
+        let max_valid_index = (prefix.len() + path_lowercased.len()).saturating_sub(1);
+        let safe_limit = limit.min(max_valid_index);
+
         let mut last_slash = 0;
-        for j in path_idx..=limit {
+        for j in path_idx..=safe_limit {
             let extra_lowercase_chars_count = extra_lowercase_chars
                 .iter()
                 .take_while(|(i, _)| i < &&j)
@@ -218,10 +229,15 @@ impl<'a> Matcher<'a> {
                 .sum::<usize>();
             let j_regular = j - extra_lowercase_chars_count;
 
-            let path_char = if j_regular < prefix.len() {
+            let path_char = if j < prefix.len() {
                 lowercase_prefix[j]
             } else {
-                path_lowercased[j - prefix.len()]
+                let path_index = j - prefix.len();
+                if path_index < path_lowercased.len() {
+                    path_lowercased[path_index]
+                } else {
+                    continue;
+                }
             };
             let is_path_sep = path_char == MAIN_SEPARATOR;
 
@@ -490,6 +506,89 @@ mod tests {
         );
     }
 
+    #[test]
+    fn match_unicode_path_entries() {
+        let mixed_unicode_paths = vec![
+            "İolu/oluş",
+            "İstanbul/code",
+            "Athens/Şanlıurfa",
+            "Çanakkale/scripts",
+            "paris/Düzce_İl",
+            "Berlin_Önemli_Ğündem",
+            "KİTAPLIK/london/dosya",
+            "tokyo/kyoto/fuji",
+            "new_york/san_francisco",
+        ];
+
+        assert_eq!(
+            match_single_path_query("İo/oluş", false, &mixed_unicode_paths),
+            vec![("İolu/oluş", vec![0, 2, 4, 6, 8, 10, 12])]
+        );
+
+        assert_eq!(
+            match_single_path_query("İst/code", false, &mixed_unicode_paths),
+            vec![("İstanbul/code", vec![0, 2, 4, 6, 8, 10, 12, 14])]
+        );
+
+        assert_eq!(
+            match_single_path_query("athens/şa", false, &mixed_unicode_paths),
+            vec![("Athens/Şanlıurfa", vec![0, 1, 2, 3, 4, 5, 6, 7, 9])]
+        );
+
+        assert_eq!(
+            match_single_path_query("BerlinÖĞ", false, &mixed_unicode_paths),
+            vec![("Berlin_Önemli_Ğündem", vec![0, 1, 2, 3, 4, 5, 7, 15])]
+        );
+
+        assert_eq!(
+            match_single_path_query("tokyo/fuji", false, &mixed_unicode_paths),
+            vec![("tokyo/kyoto/fuji", vec![0, 1, 2, 3, 4, 5, 12, 13, 14, 15])]
+        );
+
+        let mixed_script_paths = vec![
+            "résumé_Москва",
+            "naïve_київ_implementation",
+            "café_北京_app",
+            "東京_über_driver",
+            "déjà_vu_cairo",
+            "seoul_piñata_game",
+            "voilà_istanbul_result",
+        ];
+
+        assert_eq!(
+            match_single_path_query("résmé", false, &mixed_script_paths),
+            vec![("résumé_Москва", vec![0, 1, 3, 5, 6])]
+        );
+
+        assert_eq!(
+            match_single_path_query("café北京", false, &mixed_script_paths),
+            vec![("café_北京_app", vec![0, 1, 2, 3, 6, 9])]
+        );
+
+        assert_eq!(
+            match_single_path_query("ista", false, &mixed_script_paths),
+            vec![("voilà_istanbul_result", vec![7, 8, 9, 10])]
+        );
+
+        let complex_paths = vec![
+            "document_📚_library",
+            "project_👨👩👧👦_family",
+            "flags_🇯🇵🇺🇸🇪🇺_world",
+            "code_😀😃😄😁_happy",
+            "photo_👩👩👧👦_album",
+        ];
+
+        assert_eq!(
+            match_single_path_query("doc📚lib", false, &complex_paths),
+            vec![("document_📚_library", vec![0, 1, 2, 9, 14, 15, 16])]
+        );
+
+        assert_eq!(
+            match_single_path_query("codehappy", false, &complex_paths),
+            vec![("code_😀😃😄😁_happy", vec![0, 1, 2, 3, 22, 23, 24, 25, 26])]
+        );
+    }
+
     fn match_single_path_query<'a>(
         query: &str,
         smart_case: bool,
```