Fuzzy-match lines when applying edits from the assistant (#12056)

This uses Jaro-Winkler similarity for now, which seemed to produce
pretty good results in my tests. We can easily swap it with something
else if needed.

Release Notes:

- N/A
This commit is contained in:
Antonio Scandurra 2024-05-20 17:02:15 +02:00 committed by GitHub
parent 0b8c1680fb
commit 3a79aa85f4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 104 additions and 75 deletions

17
Cargo.lock generated
View file

@ -368,6 +368,7 @@ dependencies = [
"serde_json", "serde_json",
"settings", "settings",
"smol", "smol",
"strsim 0.11.1",
"telemetry_events", "telemetry_events",
"theme", "theme",
"tiktoken-rs", "tiktoken-rs",
@ -1684,7 +1685,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c2f7349907b712260e64b0afe2f84692af14a454be26187d9df565c7f69266a" checksum = "4c2f7349907b712260e64b0afe2f84692af14a454be26187d9df565c7f69266a"
dependencies = [ dependencies = [
"memchr", "memchr",
"regex-automata 0.3.8", "regex-automata 0.3.9",
"serde", "serde",
] ]
@ -2094,7 +2095,7 @@ dependencies = [
"bitflags 1.3.2", "bitflags 1.3.2",
"clap_lex 0.2.4", "clap_lex 0.2.4",
"indexmap 1.9.3", "indexmap 1.9.3",
"strsim", "strsim 0.10.0",
"termcolor", "termcolor",
"textwrap", "textwrap",
] ]
@ -2118,7 +2119,7 @@ dependencies = [
"anstream", "anstream",
"anstyle", "anstyle",
"clap_lex 0.5.1", "clap_lex 0.5.1",
"strsim", "strsim 0.10.0",
] ]
[[package]] [[package]]
@ -8141,9 +8142,9 @@ dependencies = [
[[package]] [[package]]
name = "regex-automata" name = "regex-automata"
version = "0.3.8" version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795" checksum = "59b23e92ee4318893fa3fe3e6fb365258efbfe6ac6ab30f090cdcbb7aa37efa9"
[[package]] [[package]]
name = "regex-automata" name = "regex-automata"
@ -9783,6 +9784,12 @@ version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
[[package]]
name = "strsim"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
[[package]] [[package]]
name = "strum" name = "strum"
version = "0.25.0" version = "0.25.0"

View file

@ -40,6 +40,7 @@ serde.workspace = true
serde_json.workspace = true serde_json.workspace = true
settings.workspace = true settings.workspace = true
smol.workspace = true smol.workspace = true
strsim = "0.11"
telemetry_events.workspace = true telemetry_events.workspace = true
theme.workspace = true theme.workspace = true
tiktoken-rs.workspace = true tiktoken-rs.workspace = true

View file

@ -3058,9 +3058,9 @@ impl ConversationEditor {
.entry(buffer) .entry(buffer)
.or_insert(Vec::<(Range<language::Anchor>, _)>::new()); .or_insert(Vec::<(Range<language::Anchor>, _)>::new());
for suggestion in suggestions { for suggestion in suggestions {
let ranges = if let Some(range) =
fuzzy_search_lines(snapshot.as_rope(), &suggestion.old_text); fuzzy_search_lines(snapshot.as_rope(), &suggestion.old_text)
if let Some(range) = ranges.first() { {
let edit_start = snapshot.anchor_after(range.start); let edit_start = snapshot.anchor_after(range.start);
let edit_end = snapshot.anchor_before(range.end); let edit_end = snapshot.anchor_before(range.end);
if let Err(ix) = edits.binary_search_by(|(range, _)| { if let Err(ix) = edits.binary_search_by(|(range, _)| {

View file

@ -6,51 +6,75 @@ use std::ops::Range;
/// ///
/// Returns a vector of ranges of byte offsets in the buffer corresponding /// Returns a vector of ranges of byte offsets in the buffer corresponding
/// to the entire lines of the buffer. /// to the entire lines of the buffer.
pub fn fuzzy_search_lines(haystack: &Rope, needle: &str) -> Vec<Range<usize>> { pub fn fuzzy_search_lines(haystack: &Rope, needle: &str) -> Option<Range<usize>> {
let mut matches = Vec::new(); const SIMILARITY_THRESHOLD: f64 = 0.8;
let mut best_match: Option<(Range<usize>, f64)> = None; // (range, score)
let mut haystack_lines = haystack.chunks().lines(); let mut haystack_lines = haystack.chunks().lines();
let mut haystack_line_start = 0; let mut haystack_line_start = 0;
while let Some(haystack_line) = haystack_lines.next() { while let Some(mut haystack_line) = haystack_lines.next() {
let next_haystack_line_start = haystack_line_start + haystack_line.len() + 1; let next_haystack_line_start = haystack_line_start + haystack_line.len() + 1;
let mut trimmed_needle_lines = needle.lines().map(|line| line.trim()); let mut advanced_to_next_haystack_line = false;
if Some(haystack_line.trim()) == trimmed_needle_lines.next() {
let mut matched = true;
let match_start = haystack_line_start; let match_start = haystack_line_start;
let mut match_end = next_haystack_line_start; let mut match_end = next_haystack_line_start;
let matched = loop { let mut match_score = 0.0;
match (haystack_lines.next(), trimmed_needle_lines.next()) { let mut needle_lines = needle.lines().peekable();
(Some(haystack_line), Some(needle_line)) => { while let Some(needle_line) = needle_lines.next() {
// Haystack line differs from needle line: not a match. let similarity = line_similarity(haystack_line, needle_line);
if haystack_line.trim() == needle_line { if similarity >= SIMILARITY_THRESHOLD {
match_end = haystack_lines.offset(); match_end = haystack_lines.offset();
match_score += similarity;
if needle_lines.peek().is_some() {
if let Some(next_haystack_line) = haystack_lines.next() {
advanced_to_next_haystack_line = true;
haystack_line = next_haystack_line;
} else { } else {
break false; matched = false;
break;
} }
} else {
break;
} }
// We exhausted the haystack but not the query: not a match. } else {
(None, Some(_)) => break false, matched = false;
// We exhausted the query: it's a match. break;
(_, None) => break true,
} }
};
if matched {
matches.push(match_start..match_end)
} }
// Advance to the next line. if matched
&& best_match
.as_ref()
.map(|(_, best_score)| match_score > *best_score)
.unwrap_or(true)
{
best_match = Some((match_start..match_end, match_score));
}
if advanced_to_next_haystack_line {
haystack_lines.seek(next_haystack_line_start); haystack_lines.seek(next_haystack_line_start);
} }
haystack_line_start = next_haystack_line_start; haystack_line_start = next_haystack_line_start;
} }
matches
best_match.map(|(range, _)| range)
}
/// Calculates the similarity between two lines, ignoring leading and trailing whitespace,
/// using the Jaro-Winkler similarity metric.
///
/// Returns a value between 0.0 and 1.0, where 1.0 indicates an exact match.
fn line_similarity(line1: &str, line2: &str) -> f64 {
strsim::jaro_winkler(line1.trim(), line2.trim())
} }
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use super::*; use super::*;
use gpui::{AppContext, Context as _}; use gpui::{AppContext, Context as _};
use language::{Buffer, OffsetRangeExt}; use language::Buffer;
use unindent::Unindent as _; use unindent::Unindent as _;
use util::test::marked_text_ranges; use util::test::marked_text_ranges;
@ -79,17 +103,11 @@ mod test {
); );
» »
assert_eq!( « assert_eq!(
"something", "something",
"else", "else",
); );
»
if b {
« assert_eq!(
1 + 2,
3,
);
» }
} }
"# "#
.unindent(), .unindent(),
@ -99,7 +117,7 @@ mod test {
let buffer = cx.new_model(|cx| Buffer::local(&text, cx)); let buffer = cx.new_model(|cx| Buffer::local(&text, cx));
let snapshot = buffer.read_with(cx, |buffer, _| buffer.snapshot()); let snapshot = buffer.read_with(cx, |buffer, _| buffer.snapshot());
let actual_ranges = fuzzy_search_lines( let actual_range = fuzzy_search_lines(
snapshot.as_rope(), snapshot.as_rope(),
&" &"
assert_eq!( assert_eq!(
@ -108,22 +126,11 @@ mod test {
); );
" "
.unindent(), .unindent(),
); )
assert_eq!( .unwrap();
actual_ranges, assert_eq!(actual_range, expected_ranges[0]);
expected_ranges,
"actual: {:?}, expected: {:?}",
actual_ranges
.iter()
.map(|range| range.to_point(&snapshot))
.collect::<Vec<_>>(),
expected_ranges
.iter()
.map(|range| range.to_point(&snapshot))
.collect::<Vec<_>>()
);
let actual_ranges = fuzzy_search_lines( let actual_range = fuzzy_search_lines(
snapshot.as_rope(), snapshot.as_rope(),
&" &"
assert_eq!( assert_eq!(
@ -132,19 +139,33 @@ mod test {
); );
" "
.unindent(), .unindent(),
); )
.unwrap();
assert_eq!(actual_range, expected_ranges[0]);
let actual_range = fuzzy_search_lines(
snapshot.as_rope(),
&"
asst_eq!(
\"something\",
\"els\"
)
"
.unindent(),
)
.unwrap();
assert_eq!(actual_range, expected_ranges[1]);
let actual_range = fuzzy_search_lines(
snapshot.as_rope(),
&"
assert_eq!( assert_eq!(
actual_ranges, 2 + 1,
expected_ranges, 3,
"actual: {:?}, expected: {:?}",
actual_ranges
.iter()
.map(|range| range.to_point(&snapshot))
.collect::<Vec<_>>(),
expected_ranges
.iter()
.map(|range| range.to_point(&snapshot))
.collect::<Vec<_>>()
); );
"
.unindent(),
);
assert_eq!(actual_range, None);
} }
} }