From 0fdad0c0d6216618b59e269a04fbe81e5b758bfa Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 19 Feb 2025 16:56:01 -0800 Subject: [PATCH] Use line-based and word-based diff when reloading and formatting buffers (#25129) Closes https://github.com/zed-industries/zed/issues/10122 Closes https://github.com/zed-industries/zed/issues/25034 When formatting buffers or reloading them after they change on disk, we performed a diff between the buffer's current contents and the new content. We need this diff in order to preserve the positions of cursors and other decorations when updating the buffer's text. In order to handle changes within lines, we would previously compute a *character-wise* diff. This was extremely expensive for large files. This PR gets rid of the character-wise diff, and instead performs a normal line-wise diff. Then, for certain replace hunks, we compute a secondary word-based diff. Also, I've switched to the [`imara-diff`](https://github.com/pascalkuthe/imara-diff) crate, instead of `similar`. Release Notes: - Fixed a hang that could occur when large files were changed on disk or formatted. 
--- Cargo.lock | 22 +- Cargo.toml | 2 +- crates/assistant/Cargo.toml | 1 - crates/assistant/src/inline_assistant.rs | 63 ++---- crates/assistant2/Cargo.toml | 6 +- crates/assistant2/src/buffer_codegen.rs | 63 ++---- crates/editor/Cargo.toml | 1 - crates/editor/src/editor.rs | 63 ++---- crates/editor/src/element.rs | 3 +- crates/language/Cargo.toml | 2 +- crates/language/src/buffer.rs | 58 +---- crates/language/src/buffer_tests.rs | 45 +++- crates/language/src/language.rs | 2 + crates/language/src/text_diff.rs | 274 +++++++++++++++++++++++ crates/project/Cargo.toml | 1 - crates/project/src/lsp_store.rs | 58 ++--- crates/project/src/project_tests.rs | 30 ++- crates/zeta/Cargo.toml | 1 - crates/zeta/src/zeta.rs | 77 +------ 19 files changed, 429 insertions(+), 343 deletions(-) create mode 100644 crates/language/src/text_diff.rs diff --git a/Cargo.lock b/Cargo.lock index 6e674784ad..412008d161 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -416,7 +416,6 @@ dependencies = [ "serde", "serde_json_lenient", "settings", - "similar", "smol", "streaming_diff", "telemetry", @@ -482,7 +481,6 @@ dependencies = [ "serde", "serde_json", "settings", - "similar", "smol", "streaming_diff", "telemetry_events", @@ -4064,7 +4062,6 @@ dependencies = [ "serde", "serde_json", "settings", - "similar", "smallvec", "smol", "snippet", @@ -6376,6 +6373,15 @@ version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "edcd27d72f2f071c64249075f42e205ff93c9a4c5f6c6da53e79ed9f9832c285" +[[package]] +name = "imara-diff" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17d34b7d42178945f775e84bc4c36dde7c1c6cdfea656d3354d009056f2bb3d2" +dependencies = [ + "hashbrown 0.15.2", +] + [[package]] name = "imgref" version = "1.11.0" @@ -6868,6 +6874,7 @@ dependencies = [ "globset", "gpui", "http_client", + "imara-diff", "indoc", "itertools 0.14.0", "log", @@ -6882,7 +6889,6 @@ dependencies = [ "serde", "serde_json", "settings", 
- "similar", "smallvec", "smol", "streaming-iterator", @@ -10129,7 +10135,6 @@ dependencies = [ "sha2", "shellexpand 2.1.2", "shlex", - "similar", "smol", "snippet", "snippet_provider", @@ -12270,12 +12275,6 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" -[[package]] -name = "similar" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ad1d488a557b235fc46dae55512ffbfc429d2482b08b4d9435ab07384ca8aec" - [[package]] name = "simple_asn1" version = "0.6.2" @@ -17140,7 +17139,6 @@ dependencies = [ "serde", "serde_json", "settings", - "similar", "telemetry", "telemetry_events", "theme", diff --git a/Cargo.toml b/Cargo.toml index 4b787d5e75..c93c56486d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -421,6 +421,7 @@ hyper = "0.14" http = "1.1" ignore = "0.4.22" image = "0.25.1" +imara-diff = "0.1.8" indexmap = { version = "2.7.0", features = ["serde"] } indoc = "2" inventory = "0.3.19" @@ -499,7 +500,6 @@ sha2 = "0.10" shellexpand = "2.1.0" shlex = "1.3.0" signal-hook = "0.3.17" -similar = "1.3" simplelog = "0.12.2" smallvec = { version = "1.6", features = ["union"] } smol = "2.0" diff --git a/crates/assistant/Cargo.toml b/crates/assistant/Cargo.toml index 2ad0e7e082..1bb7cbc1ae 100644 --- a/crates/assistant/Cargo.toml +++ b/crates/assistant/Cargo.toml @@ -59,7 +59,6 @@ search.workspace = true semantic_index.workspace = true serde.workspace = true settings.workspace = true -similar.workspace = true smol.workspace = true streaming_diff.workspace = true telemetry.workspace = true diff --git a/crates/assistant/src/inline_assistant.rs b/crates/assistant/src/inline_assistant.rs index 554db12645..cb4ada5f08 100644 --- a/crates/assistant/src/inline_assistant.rs +++ b/crates/assistant/src/inline_assistant.rs @@ -30,7 +30,7 @@ use gpui::{ EventEmitter, FocusHandle, Focusable, FontWeight, Global, HighlightStyle, Subscription, 
Task, TextStyle, UpdateGlobal, WeakEntity, Window, }; -use language::{Buffer, IndentKind, Point, Selection, TransactionId}; +use language::{line_diff, Buffer, IndentKind, Point, Selection, TransactionId}; use language_model::{ LanguageModel, LanguageModelRegistry, LanguageModelRequest, LanguageModelRequestMessage, LanguageModelTextStream, Role, @@ -3350,52 +3350,29 @@ impl CodegenAlternative { ) .collect::(); - let mut old_row = old_range.start.row; - let mut new_row = new_range.start.row; - let batch_diff = - similar::TextDiff::from_lines(old_text.as_str(), new_text.as_str()); - + let old_start_row = old_range.start.row; + let new_start_row = new_range.start.row; let mut deleted_row_ranges: Vec<(Anchor, RangeInclusive)> = Vec::new(); let mut inserted_row_ranges = Vec::new(); - for change in batch_diff.iter_all_changes() { - let line_count = change.value().lines().count() as u32; - match change.tag() { - similar::ChangeTag::Equal => { - old_row += line_count; - new_row += line_count; - } - similar::ChangeTag::Delete => { - let old_end_row = old_row + line_count - 1; - let new_row = new_snapshot.anchor_before(Point::new(new_row, 0)); - - if let Some((_, last_deleted_row_range)) = - deleted_row_ranges.last_mut() - { - if *last_deleted_row_range.end() + 1 == old_row { - *last_deleted_row_range = - *last_deleted_row_range.start()..=old_end_row; - } else { - deleted_row_ranges.push((new_row, old_row..=old_end_row)); - } - } else { - deleted_row_ranges.push((new_row, old_row..=old_end_row)); - } - - old_row += line_count; - } - similar::ChangeTag::Insert => { - let new_end_row = new_row + line_count - 1; - let start = new_snapshot.anchor_before(Point::new(new_row, 0)); - let end = new_snapshot.anchor_before(Point::new( - new_end_row, - new_snapshot.line_len(MultiBufferRow(new_end_row)), - )); - inserted_row_ranges.push(start..end); - new_row += line_count; - } + for (old_rows, new_rows) in line_diff(&old_text, &new_text) { + let old_rows = old_start_row + 
old_rows.start..old_start_row + old_rows.end; + let new_rows = new_start_row + new_rows.start..new_start_row + new_rows.end; + if !old_rows.is_empty() { + deleted_row_ranges.push(( + new_snapshot.anchor_before(Point::new(new_rows.start, 0)), + old_rows.start..=old_rows.end - 1, + )); + } + if !new_rows.is_empty() { + let start = new_snapshot.anchor_before(Point::new(new_rows.start, 0)); + let new_end_row = new_rows.end - 1; + let end = new_snapshot.anchor_before(Point::new( + new_end_row, + new_snapshot.line_len(MultiBufferRow(new_end_row)), + )); + inserted_row_ranges.push(start..end); } } - (deleted_row_ranges, inserted_row_ranges) }) .await; diff --git a/crates/assistant2/Cargo.toml b/crates/assistant2/Cargo.toml index f53595cd0c..9a74a5e2fe 100644 --- a/crates/assistant2/Cargo.toml +++ b/crates/assistant2/Cargo.toml @@ -62,7 +62,6 @@ rope.workspace = true serde.workspace = true serde_json.workspace = true settings.workspace = true -similar.workspace = true smol.workspace = true streaming_diff.workspace = true telemetry_events.workspace = true @@ -79,5 +78,10 @@ workspace.workspace = true zed_actions.workspace = true [dev-dependencies] +editor = { workspace = true, features = ["test-support"] } +gpui = { workspace = true, "features" = ["test-support"] } +language = { workspace = true, "features" = ["test-support"] } +language_model = { workspace = true, "features" = ["test-support"] } +project = { workspace = true, features = ["test-support"] } rand.workspace = true indoc.workspace = true diff --git a/crates/assistant2/src/buffer_codegen.rs b/crates/assistant2/src/buffer_codegen.rs index 9c36cc69dc..4e62f9549d 100644 --- a/crates/assistant2/src/buffer_codegen.rs +++ b/crates/assistant2/src/buffer_codegen.rs @@ -7,7 +7,7 @@ use collections::HashSet; use editor::{Anchor, AnchorRangeExt, MultiBuffer, MultiBufferSnapshot, ToOffset as _, ToPoint}; use futures::{channel::mpsc, future::LocalBoxFuture, join, SinkExt, Stream, StreamExt}; use gpui::{App, AppContext as _, 
Context, Entity, EventEmitter, Subscription, Task}; -use language::{Buffer, IndentKind, Point, TransactionId}; +use language::{line_diff, Buffer, IndentKind, Point, TransactionId}; use language_model::{ LanguageModel, LanguageModelRegistry, LanguageModelRequest, LanguageModelRequestMessage, LanguageModelTextStream, Role, @@ -827,52 +827,29 @@ impl CodegenAlternative { ) .collect::(); - let mut old_row = old_range.start.row; - let mut new_row = new_range.start.row; - let batch_diff = - similar::TextDiff::from_lines(old_text.as_str(), new_text.as_str()); - + let old_start_row = old_range.start.row; + let new_start_row = new_range.start.row; let mut deleted_row_ranges: Vec<(Anchor, RangeInclusive)> = Vec::new(); let mut inserted_row_ranges = Vec::new(); - for change in batch_diff.iter_all_changes() { - let line_count = change.value().lines().count() as u32; - match change.tag() { - similar::ChangeTag::Equal => { - old_row += line_count; - new_row += line_count; - } - similar::ChangeTag::Delete => { - let old_end_row = old_row + line_count - 1; - let new_row = new_snapshot.anchor_before(Point::new(new_row, 0)); - - if let Some((_, last_deleted_row_range)) = - deleted_row_ranges.last_mut() - { - if *last_deleted_row_range.end() + 1 == old_row { - *last_deleted_row_range = - *last_deleted_row_range.start()..=old_end_row; - } else { - deleted_row_ranges.push((new_row, old_row..=old_end_row)); - } - } else { - deleted_row_ranges.push((new_row, old_row..=old_end_row)); - } - - old_row += line_count; - } - similar::ChangeTag::Insert => { - let new_end_row = new_row + line_count - 1; - let start = new_snapshot.anchor_before(Point::new(new_row, 0)); - let end = new_snapshot.anchor_before(Point::new( - new_end_row, - new_snapshot.line_len(MultiBufferRow(new_end_row)), - )); - inserted_row_ranges.push(start..end); - new_row += line_count; - } + for (old_rows, new_rows) in line_diff(&old_text, &new_text) { + let old_rows = old_start_row + old_rows.start..old_start_row + 
old_rows.end; + let new_rows = new_start_row + new_rows.start..new_start_row + new_rows.end; + if !old_rows.is_empty() { + deleted_row_ranges.push(( + new_snapshot.anchor_before(Point::new(new_rows.start, 0)), + old_rows.start..=old_rows.end - 1, + )); + } + if !new_rows.is_empty() { + let start = new_snapshot.anchor_before(Point::new(new_rows.start, 0)); + let new_end_row = new_rows.end - 1; + let end = new_snapshot.anchor_before(Point::new( + new_end_row, + new_snapshot.line_len(MultiBufferRow(new_end_row)), + )); + inserted_row_ranges.push(start..end); } } - (deleted_row_ranges, inserted_row_ranges) }) .await; diff --git a/crates/editor/Cargo.toml b/crates/editor/Cargo.toml index abc8f0b6f0..08d626c25f 100644 --- a/crates/editor/Cargo.toml +++ b/crates/editor/Cargo.toml @@ -66,7 +66,6 @@ schemars.workspace = true serde.workspace = true serde_json.workspace = true settings.workspace = true -similar.workspace = true smallvec.workspace = true smol.workspace = true snippet.workspace = true diff --git a/crates/editor/src/editor.rs b/crates/editor/src/editor.rs index 1afd49fba2..e30b369bd3 100644 --- a/crates/editor/src/editor.rs +++ b/crates/editor/src/editor.rs @@ -100,10 +100,10 @@ use language::{ language_settings::{ self, all_language_settings, language_settings, InlayHintSettings, RewrapBehavior, }, - point_from_lsp, AutoindentMode, BracketPair, Buffer, Capability, CharKind, CodeLabel, - CursorShape, Diagnostic, DiskState, EditPredictionsMode, EditPreview, HighlightedText, - IndentKind, IndentSize, Language, OffsetRangeExt, Point, Selection, SelectionGoal, TextObject, - TransactionId, TreeSitterOptions, + point_from_lsp, text_diff_with_options, AutoindentMode, BracketPair, Buffer, Capability, + CharKind, CodeLabel, CursorShape, Diagnostic, DiffOptions, DiskState, EditPredictionsMode, + EditPreview, HighlightedText, IndentKind, IndentSize, Language, OffsetRangeExt, Point, + Selection, SelectionGoal, TextObject, TransactionId, TreeSitterOptions, }; use 
language::{point_to_lsp, BufferRow, CharClassifier, Runnable, RunnableRange}; use linked_editing_ranges::refresh_linked_ranges; @@ -112,7 +112,6 @@ use persistence::DB; pub use proposed_changes_editor::{ ProposedChangeLocation, ProposedChangesEditor, ProposedChangesEditorToolbar, }; -use similar::{ChangeTag, TextDiff}; use std::iter::Peekable; use task::{ResolvedTask, TaskTemplate, TaskVariables}; @@ -202,7 +201,7 @@ pub(crate) const CURSORS_VISIBLE_FOR: Duration = Duration::from_millis(2000); #[doc(hidden)] pub const CODE_ACTIONS_DEBOUNCE_TIMEOUT: Duration = Duration::from_millis(250); -pub(crate) const FORMAT_TIMEOUT: Duration = Duration::from_secs(2); +pub(crate) const FORMAT_TIMEOUT: Duration = Duration::from_secs(5); pub(crate) const SCROLL_CENTER_TOP_BOTTOM_DEBOUNCE_TIMEOUT: Duration = Duration::from_secs(1); pub(crate) const EDIT_PREDICTION_KEY_CONTEXT: &str = "edit_prediction"; @@ -7829,6 +7828,7 @@ impl Editor { } let start = Point::new(start_row, 0); + let start_offset = start.to_offset(&buffer); let end = Point::new(end_row, buffer.line_len(MultiBufferRow(end_row))); let selection_text = buffer.text_for_range(start..end).collect::(); let Some(lines_without_prefixes) = selection_text @@ -7858,44 +7858,21 @@ impl Editor { // TODO: should always use char-based diff while still supporting cursor behavior that // matches vim. 
- let diff = match is_vim_mode { - IsVimMode::Yes => TextDiff::from_lines(&selection_text, &wrapped_text), - IsVimMode::No => TextDiff::from_chars(&selection_text, &wrapped_text), - }; - let mut offset = start.to_offset(&buffer); - let mut moved_since_edit = true; + let mut diff_options = DiffOptions::default(); + if is_vim_mode == IsVimMode::Yes { + diff_options.max_word_diff_len = 0; + diff_options.max_word_diff_line_count = 0; + } else { + diff_options.max_word_diff_len = usize::MAX; + diff_options.max_word_diff_line_count = usize::MAX; + } - for change in diff.iter_all_changes() { - let value = change.value(); - match change.tag() { - ChangeTag::Equal => { - offset += value.len(); - moved_since_edit = true; - } - ChangeTag::Delete => { - let start = buffer.anchor_after(offset); - let end = buffer.anchor_before(offset + value.len()); - - if moved_since_edit { - edits.push((start..end, String::new())); - } else { - edits.last_mut().unwrap().0.end = end; - } - - offset += value.len(); - moved_since_edit = false; - } - ChangeTag::Insert => { - if moved_since_edit { - let anchor = buffer.anchor_after(offset); - edits.push((anchor..anchor, value.to_string())); - } else { - edits.last_mut().unwrap().1.push_str(value); - } - - moved_since_edit = false; - } - } + for (old_range, new_text) in + text_diff_with_options(&selection_text, &wrapped_text, diff_options) + { + let edit_start = buffer.anchor_after(start_offset + old_range.start); + let edit_end = buffer.anchor_after(start_offset + old_range.end); + edits.push((edit_start..edit_end, new_text)); } rewrapped_row_ranges.push(start_row..=end_row); diff --git a/crates/editor/src/element.rs b/crates/editor/src/element.rs index 93d60624e2..ec99900ba8 100644 --- a/crates/editor/src/element.rs +++ b/crates/editor/src/element.rs @@ -8409,7 +8409,6 @@ mod tests { use gpui::{TestAppContext, VisualTestContext}; use language::language_settings; use log::info; - use similar::DiffableStr; use std::num::NonZeroU32; use 
util::test::sample_text; @@ -8709,7 +8708,7 @@ mod tests { state .line_numbers .get(&MultiBufferRow(0)) - .and_then(|line_number| line_number.shaped_line.text.as_str()), + .map(|line_number| line_number.shaped_line.text.as_ref()), Some("1") ); } diff --git a/crates/language/Cargo.toml b/crates/language/Cargo.toml index 9e5fd55c96..c2e4c35454 100644 --- a/crates/language/Cargo.toml +++ b/crates/language/Cargo.toml @@ -37,6 +37,7 @@ fuzzy.workspace = true globset.workspace = true gpui.workspace = true http_client.workspace = true +imara-diff.workspace = true itertools.workspace = true log.workspace = true lsp.workspace = true @@ -49,7 +50,6 @@ schemars.workspace = true serde.workspace = true serde_json.workspace = true settings.workspace = true -similar.workspace = true smallvec.workspace = true smol.workspace = true streaming-iterator.workspace = true diff --git a/crates/language/src/buffer.rs b/crates/language/src/buffer.rs index c23eeae533..192519367b 100644 --- a/crates/language/src/buffer.rs +++ b/crates/language/src/buffer.rs @@ -12,6 +12,7 @@ use crate::{ SyntaxMapMatches, SyntaxSnapshot, ToTreeSitterPoint, }, task_context::RunnableRange, + text_diff::text_diff, LanguageScope, Outline, OutlineConfig, RunnableCapture, RunnableTag, TextObject, TreeSitterOptions, }; @@ -32,7 +33,6 @@ use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use serde_json::Value; use settings::WorktreeId; -use similar::{ChangeTag, TextDiff}; use smallvec::SmallVec; use smol::future::yield_now; use std::{ @@ -1792,61 +1792,7 @@ impl Buffer { let old_text = old_text.to_string(); let line_ending = LineEnding::detect(&new_text); LineEnding::normalize(&mut new_text); - - let diff = TextDiff::from_chars(old_text.as_str(), new_text.as_str()); - let empty: Arc = Arc::default(); - - let mut edits = Vec::new(); - let mut old_offset = 0; - let mut new_offset = 0; - let mut last_edit: Option<(Range, Range)> = None; - for change in diff.iter_all_changes().map(Some).chain([None]) { - if 
let Some(change) = &change { - let len = change.value().len(); - match change.tag() { - ChangeTag::Equal => { - old_offset += len; - new_offset += len; - } - ChangeTag::Delete => { - let old_end_offset = old_offset + len; - if let Some((last_old_range, _)) = &mut last_edit { - last_old_range.end = old_end_offset; - } else { - last_edit = - Some((old_offset..old_end_offset, new_offset..new_offset)); - } - old_offset = old_end_offset; - } - ChangeTag::Insert => { - let new_end_offset = new_offset + len; - if let Some((_, last_new_range)) = &mut last_edit { - last_new_range.end = new_end_offset; - } else { - last_edit = - Some((old_offset..old_offset, new_offset..new_end_offset)); - } - new_offset = new_end_offset; - } - } - } - - if let Some((old_range, new_range)) = &last_edit { - if old_offset > old_range.end - || new_offset > new_range.end - || change.is_none() - { - let text = if new_range.is_empty() { - empty.clone() - } else { - new_text[new_range.clone()].into() - }; - edits.push((old_range.clone(), text)); - last_edit.take(); - } - } - } - + let edits = text_diff(&old_text, &new_text); Diff { base_version, line_ending, diff --git a/crates/language/src/buffer_tests.rs b/crates/language/src/buffer_tests.rs index 84aef374ca..1b3b39dcc4 100644 --- a/crates/language/src/buffer_tests.rs +++ b/crates/language/src/buffer_tests.rs @@ -25,6 +25,7 @@ use text::{BufferId, LineEnding}; use text::{Point, ToPoint}; use theme::ActiveTheme; use unindent::Unindent as _; +use util::test::marked_text_offsets; use util::{assert_set_eq, post_inc, test::marked_text_ranges, RandomCharIter}; pub static TRAILING_WHITESPACE_REGEX: LazyLock = LazyLock::new(|| { @@ -354,24 +355,44 @@ fn test_edit_events(cx: &mut gpui::App) { #[gpui::test] async fn test_apply_diff(cx: &mut TestAppContext) { - let text = "a\nbb\nccc\ndddd\neeeee\nffffff\n"; + let (text, offsets) = marked_text_offsets( + "one two three\nfour fiˇve six\nseven eightˇ nine\nten eleven twelve\n", + ); let buffer = cx.new(|cx| 
Buffer::local(text, cx)); - let anchor = buffer.update(cx, |buffer, _| buffer.anchor_before(Point::new(3, 3))); - - let text = "a\nccc\ndddd\nffffff\n"; - let diff = buffer.update(cx, |b, cx| b.diff(text.into(), cx)).await; - buffer.update(cx, |buffer, cx| { - buffer.apply_diff(diff, cx).unwrap(); - assert_eq!(buffer.text(), text); - assert_eq!(anchor.to_point(buffer), Point::new(2, 3)); + let anchors = buffer.update(cx, |buffer, _| { + offsets + .iter() + .map(|offset| buffer.anchor_before(offset)) + .collect::>() }); - let text = "a\n1\n\nccc\ndd2dd\nffffff\n"; - let diff = buffer.update(cx, |b, cx| b.diff(text.into(), cx)).await; + let (text, offsets) = marked_text_offsets( + "one two three\n{\nfour FIVEˇ six\n}\nseven AND EIGHTˇ nine\nten eleven twelve\n", + ); + + let diff = buffer.update(cx, |b, cx| b.diff(text.clone(), cx)).await; buffer.update(cx, |buffer, cx| { buffer.apply_diff(diff, cx).unwrap(); assert_eq!(buffer.text(), text); - assert_eq!(anchor.to_point(buffer), Point::new(4, 4)); + let actual_offsets = anchors + .iter() + .map(|anchor| anchor.to_offset(buffer)) + .collect::>(); + assert_eq!(actual_offsets, offsets); + }); + + let (text, offsets) = + marked_text_offsets("one two three\n{\nˇ}\nseven AND EIGHTEENˇ nine\nten eleven twelve\n"); + + let diff = buffer.update(cx, |b, cx| b.diff(text.clone(), cx)).await; + buffer.update(cx, |buffer, cx| { + buffer.apply_diff(diff, cx).unwrap(); + assert_eq!(buffer.text(), text); + let actual_offsets = anchors + .iter() + .map(|anchor| anchor.to_offset(buffer)) + .collect::>(); + assert_eq!(actual_offsets, offsets); }); } diff --git a/crates/language/src/language.rs b/crates/language/src/language.rs index b6c5bf6225..89efb27ec4 100644 --- a/crates/language/src/language.rs +++ b/crates/language/src/language.rs @@ -15,6 +15,7 @@ mod outline; pub mod proto; mod syntax_map; mod task_context; +mod text_diff; mod toolchain; #[cfg(test)] @@ -62,6 +63,7 @@ use std::{num::NonZeroU32, sync::OnceLock}; use 
syntax_map::{QueryCursorHandle, SyntaxSnapshot}; use task::RunnableTag; pub use task_context::{ContextProvider, RunnableRange}; +pub use text_diff::{line_diff, text_diff, text_diff_with_options, unified_diff, DiffOptions}; use theme::SyntaxTheme; pub use toolchain::{LanguageToolchainStore, Toolchain, ToolchainList, ToolchainLister}; use tree_sitter::{self, wasmtime, Query, QueryCursor, WasmStore}; diff --git a/crates/language/src/text_diff.rs b/crates/language/src/text_diff.rs new file mode 100644 index 0000000000..83c4aba3bb --- /dev/null +++ b/crates/language/src/text_diff.rs @@ -0,0 +1,274 @@ +use crate::{CharClassifier, CharKind, LanguageScope}; +use imara_diff::{ + diff, + intern::{InternedInput, Token}, + sources::lines_with_terminator, + Algorithm, UnifiedDiffBuilder, +}; +use std::{iter, ops::Range, sync::Arc}; + +const MAX_WORD_DIFF_LEN: usize = 512; +const MAX_WORD_DIFF_LINE_COUNT: usize = 8; + +/// Computes a diff between two strings, returning a unified diff string. +pub fn unified_diff(old_text: &str, new_text: &str) -> String { + let input = InternedInput::new(old_text, new_text); + diff( + Algorithm::Histogram, + &input, + UnifiedDiffBuilder::new(&input), + ) +} + +/// Computes a diff between two strings, returning a vector of old and new row +/// ranges. +pub fn line_diff(old_text: &str, new_text: &str) -> Vec<(Range, Range)> { + let mut edits = Vec::new(); + let input = InternedInput::new( + lines_with_terminator(old_text), + lines_with_terminator(new_text), + ); + diff_internal(&input, |_, _, old_rows, new_rows| { + edits.push((old_rows, new_rows)); + }); + edits +} + +/// Computes a diff between two strings, returning a vector of edits. +/// +/// The edits are represented as tuples of byte ranges and replacement strings. +/// +/// Internally, this function first performs a line-based diff, and then performs a second +/// word-based diff within hunks that replace small numbers of lines. 
+pub fn text_diff(old_text: &str, new_text: &str) -> Vec<(Range, Arc)> { + text_diff_with_options(old_text, new_text, DiffOptions::default()) +} + +pub struct DiffOptions { + pub language_scope: Option, + pub max_word_diff_len: usize, + pub max_word_diff_line_count: usize, +} + +impl Default for DiffOptions { + fn default() -> Self { + Self { + language_scope: Default::default(), + max_word_diff_len: MAX_WORD_DIFF_LEN, + max_word_diff_line_count: MAX_WORD_DIFF_LINE_COUNT, + } + } +} + +/// Computes a diff between two strings, using a specific language scope's +/// word characters for word-level diffing. +pub fn text_diff_with_options( + old_text: &str, + new_text: &str, + options: DiffOptions, +) -> Vec<(Range, Arc)> { + let empty: Arc = Arc::default(); + let mut edits = Vec::new(); + let mut hunk_input = InternedInput::default(); + let input = InternedInput::new( + lines_with_terminator(old_text), + lines_with_terminator(new_text), + ); + diff_internal( + &input, + |old_byte_range, new_byte_range, old_rows, new_rows| { + if should_perform_word_diff_within_hunk( + &old_rows, + &old_byte_range, + &new_rows, + &new_byte_range, + &options, + ) { + let old_offset = old_byte_range.start; + let new_offset = new_byte_range.start; + hunk_input.clear(); + hunk_input.update_before(tokenize( + &old_text[old_byte_range.clone()], + options.language_scope.clone(), + )); + hunk_input.update_after(tokenize( + &new_text[new_byte_range.clone()], + options.language_scope.clone(), + )); + diff_internal(&hunk_input, |old_byte_range, new_byte_range, _, _| { + let old_byte_range = + old_offset + old_byte_range.start..old_offset + old_byte_range.end; + let new_byte_range = + new_offset + new_byte_range.start..new_offset + new_byte_range.end; + let replacement_text = if new_byte_range.is_empty() { + empty.clone() + } else { + new_text[new_byte_range.clone()].into() + }; + edits.push((old_byte_range, replacement_text)); + }); + } else { + let replacement_text = if new_byte_range.is_empty() 
{ + empty.clone() + } else { + new_text[new_byte_range.clone()].into() + }; + edits.push((old_byte_range.clone(), replacement_text)); + } + }, + ); + edits +} + +fn should_perform_word_diff_within_hunk( + old_row_range: &Range, + old_byte_range: &Range, + new_row_range: &Range, + new_byte_range: &Range, + options: &DiffOptions, +) -> bool { + !old_byte_range.is_empty() + && !new_byte_range.is_empty() + && old_byte_range.len() <= options.max_word_diff_len + && new_byte_range.len() <= options.max_word_diff_len + && old_row_range.len() <= options.max_word_diff_line_count + && new_row_range.len() <= options.max_word_diff_line_count +} + +fn diff_internal( + input: &InternedInput<&str>, + mut on_change: impl FnMut(Range, Range, Range, Range), +) { + let mut old_offset = 0; + let mut new_offset = 0; + let mut old_token_ix = 0; + let mut new_token_ix = 0; + diff( + Algorithm::Histogram, + input, + |old_tokens: Range, new_tokens: Range| { + old_offset += token_len( + &input, + &input.before[old_token_ix as usize..old_tokens.start as usize], + ); + new_offset += token_len( + &input, + &input.after[new_token_ix as usize..new_tokens.start as usize], + ); + let old_len = token_len( + &input, + &input.before[old_tokens.start as usize..old_tokens.end as usize], + ); + let new_len = token_len( + &input, + &input.after[new_tokens.start as usize..new_tokens.end as usize], + ); + let old_byte_range = old_offset..old_offset + old_len; + let new_byte_range = new_offset..new_offset + new_len; + old_token_ix = old_tokens.end; + new_token_ix = new_tokens.end; + old_offset = old_byte_range.end; + new_offset = new_byte_range.end; + on_change(old_byte_range, new_byte_range, old_tokens, new_tokens); + }, + ); +} + +fn tokenize(text: &str, language_scope: Option) -> impl Iterator { + let classifier = CharClassifier::new(language_scope).for_completion(true); + let mut chars = text.char_indices(); + let mut prev = None; + let mut start_ix = 0; + iter::from_fn(move || { + while let Some((ix, c)) 
= chars.next() { + let mut token = None; + let kind = classifier.kind(c); + if let Some((prev_char, prev_kind)) = prev { + if kind != prev_kind || (kind == CharKind::Punctuation && c != prev_char) { + token = Some(&text[start_ix..ix]); + start_ix = ix; + } + } + prev = Some((c, kind)); + if token.is_some() { + return token; + } + } + if start_ix < text.len() { + let token = &text[start_ix..]; + start_ix = text.len(); + return Some(token); + } + None + }) +} + +fn token_len(input: &InternedInput<&str>, tokens: &[Token]) -> usize { + tokens + .iter() + .map(|token| input.interner[*token].len()) + .sum() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_tokenize() { + let text = ""; + assert_eq!(tokenize(text, None).collect::>(), Vec::<&str>::new()); + + let text = " "; + assert_eq!(tokenize(text, None).collect::>(), vec![" "]); + + let text = "one"; + assert_eq!(tokenize(text, None).collect::>(), vec!["one"]); + + let text = "one\n"; + assert_eq!(tokenize(text, None).collect::>(), vec!["one", "\n"]); + + let text = "one.two(three)"; + assert_eq!( + tokenize(text, None).collect::>(), + vec!["one", ".", "two", "(", "three", ")"] + ); + + let text = "one two three()"; + assert_eq!( + tokenize(text, None).collect::>(), + vec!["one", " ", "two", " ", "three", "(", ")"] + ); + + let text = " one\n two three"; + assert_eq!( + tokenize(text, None).collect::>(), + vec![" ", "one", "\n ", "two", " ", "three"] + ); + } + + #[test] + fn test_text_diff() { + let old_text = "one two three"; + let new_text = "one TWO three"; + assert_eq!(text_diff(old_text, new_text), [(4..7, "TWO".into()),]); + + let old_text = "one\ntwo\nthree\n"; + let new_text = "one\ntwo\nAND\nTHEN\nthree\n"; + assert_eq!( + text_diff(old_text, new_text), + [(8..8, "AND\nTHEN\n".into()),] + ); + + let old_text = "one two\nthree four five\nsix seven eight nine\nten\n"; + let new_text = "one two\nthree FOUR five\nsix SEVEN eight nine\nten\nELEVEN\n"; + assert_eq!( + text_diff(old_text, 
new_text), + [ + (14..18, "FOUR".into()), + (28..33, "SEVEN".into()), + (49..49, "ELEVEN\n".into()) + ] + ); + } +} diff --git a/crates/project/Cargo.toml b/crates/project/Cargo.toml index ea5a852f78..d2d571a81c 100644 --- a/crates/project/Cargo.toml +++ b/crates/project/Cargo.toml @@ -64,7 +64,6 @@ settings.workspace = true sha2.workspace = true shellexpand.workspace = true shlex.workspace = true -similar = "1.3" smol.workspace = true snippet.workspace = true snippet_provider.workspace = true diff --git a/crates/project/src/lsp_store.rs b/crates/project/src/lsp_store.rs index 427ebd1b59..632b96c8b7 100644 --- a/crates/project/src/lsp_store.rs +++ b/crates/project/src/lsp_store.rs @@ -63,7 +63,6 @@ use rpc::{ use serde::Serialize; use settings::{Settings, SettingsLocation, SettingsStore}; use sha2::{Digest, Sha256}; -use similar::{ChangeTag, TextDiff}; use smol::channel::Sender; use snippet::Snippet; use std::{ @@ -110,15 +109,13 @@ pub enum LspFormatTarget { Ranges(BTreeMap>>), } -// proto::RegisterBufferWithLanguageServer {} - pub type OpenLspBufferHandle = Entity>; // Currently, formatting operations are represented differently depending on // whether they come from a language server or an external command. 
#[derive(Debug)] pub enum FormatOperation { - Lsp(Vec<(Range, String)>), + Lsp(Vec<(Range, Arc)>), External(Diff), Prettier(Diff), } @@ -1388,7 +1385,7 @@ impl LocalLspStore { language_server: &Arc, settings: &LanguageSettings, cx: &mut AsyncApp, - ) -> Result, String)>> { + ) -> Result, Arc)>> { let capabilities = &language_server.capabilities(); let range_formatting_provider = capabilities.document_range_formatting_provider.as_ref(); if range_formatting_provider.map_or(false, |provider| provider == &OneOf::Left(false)) { @@ -1459,7 +1456,7 @@ impl LocalLspStore { language_server: &Arc, settings: &LanguageSettings, cx: &mut AsyncApp, - ) -> Result, String)>> { + ) -> Result, Arc)>> { let uri = lsp::Url::from_file_path(abs_path) .map_err(|_| anyhow!("failed to convert abs path to uri"))?; let text_document = lsp::TextDocumentIdentifier::new(uri); @@ -2144,7 +2141,7 @@ impl LocalLspStore { server_id: LanguageServerId, version: Option, cx: &mut Context, - ) -> Task, String)>>> { + ) -> Task, Arc)>>> { let snapshot = self.buffer_snapshot_for_lsp_version(buffer, server_id, version, cx); cx.background_spawn(async move { let snapshot = snapshot?; @@ -2192,48 +2189,23 @@ impl LocalLspStore { // we can identify the changes more precisely, preserving the locations // of any anchors positioned in the unchanged regions. 
if range.end.row > range.start.row { - let mut offset = range.start.to_offset(&snapshot); + let offset = range.start.to_offset(&snapshot); let old_text = snapshot.text_for_range(range).collect::(); - - let diff = TextDiff::from_lines(old_text.as_str(), &new_text); - let mut moved_since_edit = true; - for change in diff.iter_all_changes() { - let tag = change.tag(); - let value = change.value(); - match tag { - ChangeTag::Equal => { - offset += value.len(); - moved_since_edit = true; - } - ChangeTag::Delete => { - let start = snapshot.anchor_after(offset); - let end = snapshot.anchor_before(offset + value.len()); - if moved_since_edit { - edits.push((start..end, String::new())); - } else { - edits.last_mut().unwrap().0.end = end; - } - offset += value.len(); - moved_since_edit = false; - } - ChangeTag::Insert => { - if moved_since_edit { - let anchor = snapshot.anchor_after(offset); - edits.push((anchor..anchor, value.to_string())); - } else { - edits.last_mut().unwrap().1.push_str(value); - } - moved_since_edit = false; - } - } - } + let range_edits = language::text_diff(old_text.as_str(), &new_text); + edits.extend(range_edits.into_iter().map(|(range, replacement)| { + ( + snapshot.anchor_after(offset + range.start) + ..snapshot.anchor_before(offset + range.end), + replacement, + ) + })); } else if range.end == range.start { let anchor = snapshot.anchor_after(range.start); - edits.push((anchor..anchor, new_text)); + edits.push((anchor..anchor, new_text.into())); } else { let edit_start = snapshot.anchor_after(range.start); let edit_end = snapshot.anchor_before(range.end); - edits.push((edit_start..edit_end, new_text)); + edits.push((edit_start..edit_end, new_text.into())); } } diff --git a/crates/project/src/project_tests.rs b/crates/project/src/project_tests.rs index 1a3d753a21..150fc9fa56 100644 --- a/crates/project/src/project_tests.rs +++ b/crates/project/src/project_tests.rs @@ -25,7 +25,11 @@ use std::{mem, num::NonZeroU32, ops::Range, task::Poll}; use 
task::{ResolvedTask, TaskContext}; use unindent::Unindent as _; use util::{ - assert_set_eq, path, paths::PathMatcher, separator, test::TempTree, uri, TryFutureExt as _, + assert_set_eq, path, + paths::PathMatcher, + separator, + test::{marked_text_offsets, TempTree}, + uri, TryFutureExt as _, }; #[gpui::test] @@ -3625,7 +3629,8 @@ async fn test_buffer_is_dirty(cx: &mut gpui::TestAppContext) { async fn test_buffer_file_changes_on_disk(cx: &mut gpui::TestAppContext) { init_test(cx); - let initial_contents = "aaa\nbbbbb\nc\n"; + let (initial_contents, initial_offsets) = + marked_text_offsets("one twoˇ\nthree ˇfourˇ five\nsixˇ seven\n"); let fs = FakeFs::new(cx.executor()); fs.insert_tree( path!("/dir"), @@ -3640,8 +3645,9 @@ async fn test_buffer_file_changes_on_disk(cx: &mut gpui::TestAppContext) { .await .unwrap(); - let anchors = (0..3) - .map(|row| buffer.update(cx, |b, _| b.anchor_before(Point::new(row, 1)))) + let anchors = initial_offsets + .iter() + .map(|offset| buffer.update(cx, |b, _| b.anchor_before(offset))) .collect::>(); // Change the file on disk, adding two new lines of text, and removing @@ -3650,10 +3656,12 @@ async fn test_buffer_file_changes_on_disk(cx: &mut gpui::TestAppContext) { assert!(!buffer.is_dirty()); assert!(!buffer.has_conflict()); }); - let new_contents = "AAAA\naaa\nBB\nbbbbb\n"; + + let (new_contents, new_offsets) = + marked_text_offsets("oneˇ\nthree ˇFOURˇ five\nsixtyˇ seven\n"); fs.save( path!("/dir/the-file").as_ref(), - &new_contents.into(), + &new_contents.as_str().into(), LineEnding::Unix, ) .await @@ -3668,14 +3676,11 @@ async fn test_buffer_file_changes_on_disk(cx: &mut gpui::TestAppContext) { assert!(!buffer.is_dirty()); assert!(!buffer.has_conflict()); - let anchor_positions = anchors + let anchor_offsets = anchors .iter() - .map(|anchor| anchor.to_point(&*buffer)) + .map(|anchor| anchor.to_offset(&*buffer)) .collect::>(); - assert_eq!( - anchor_positions, - [Point::new(1, 1), Point::new(3, 1), Point::new(3, 5)] - ); + 
assert_eq!(anchor_offsets, new_offsets); }); // Modify the buffer @@ -3698,6 +3703,7 @@ async fn test_buffer_file_changes_on_disk(cx: &mut gpui::TestAppContext) { // marked as having a conflict. cx.executor().run_until_parked(); buffer.update(cx, |buffer, _| { + assert_eq!(buffer.text(), " ".to_string() + &new_contents); assert!(buffer.has_conflict()); }); } diff --git a/crates/zeta/Cargo.toml b/crates/zeta/Cargo.toml index f8f0bb4da3..515624962a 100644 --- a/crates/zeta/Cargo.toml +++ b/crates/zeta/Cargo.toml @@ -45,7 +45,6 @@ release_channel.workspace = true serde.workspace = true serde_json.workspace = true settings.workspace = true -similar.workspace = true telemetry.workspace = true telemetry_events.workspace = true theme.workspace = true diff --git a/crates/zeta/src/zeta.rs b/crates/zeta/src/zeta.rs index f7f335469c..82fd0e9991 100644 --- a/crates/zeta/src/zeta.rs +++ b/crates/zeta/src/zeta.rs @@ -29,8 +29,7 @@ use gpui::{ use http_client::{HttpClient, Method}; use input_excerpt::excerpt_for_cursor_position; use language::{ - Anchor, Buffer, BufferSnapshot, CharClassifier, CharKind, EditPreview, OffsetRangeExt, - ToOffset, ToPoint, + text_diff, Anchor, Buffer, BufferSnapshot, EditPreview, OffsetRangeExt, ToOffset, ToPoint, }; use language_models::LlmApiToken; use postage::watch; @@ -919,77 +918,18 @@ and then another offset: usize, snapshot: &BufferSnapshot, ) -> Vec<(Range, String)> { - fn tokenize(text: &str) -> Vec<&str> { - let classifier = CharClassifier::new(None).for_completion(true); - let mut chars = text.chars().peekable(); - let mut prev_ch = chars.peek().copied(); - let mut tokens = Vec::new(); - let mut start = 0; - let mut end = 0; - while let Some(ch) = chars.next() { - let prev_kind = prev_ch.map(|ch| classifier.kind(ch)); - let kind = classifier.kind(ch); - if Some(kind) != prev_kind || (kind == CharKind::Punctuation && Some(ch) != prev_ch) - { - tokens.push(&text[start..end]); - start = end; - } - end += ch.len_utf8(); - prev_ch = Some(ch); 
- } - tokens.push(&text[start..end]); - tokens - } - - let old_tokens = tokenize(&old_text); - let new_tokens = tokenize(new_text); - - let diff = similar::TextDiffConfig::default() - .algorithm(similar::Algorithm::Patience) - .diff_slices(&old_tokens, &new_tokens); - let mut edits: Vec<(Range, String)> = Vec::new(); - let mut old_start = offset; - for change in diff.iter_all_changes() { - let value = change.value(); - match change.tag() { - similar::ChangeTag::Equal => { - old_start += value.len(); - } - similar::ChangeTag::Delete => { - let old_end = old_start + value.len(); - if let Some((last_old_range, _)) = edits.last_mut() { - if last_old_range.end == old_start { - last_old_range.end = old_end; - } else { - edits.push((old_start..old_end, String::new())); - } - } else { - edits.push((old_start..old_end, String::new())); - } - old_start = old_end; - } - similar::ChangeTag::Insert => { - if let Some((last_old_range, last_new_text)) = edits.last_mut() { - if last_old_range.end == old_start { - last_new_text.push_str(value); - } else { - edits.push((old_start..old_start, value.into())); - } - } else { - edits.push((old_start..old_start, value.into())); - } - } - } - } - - edits + text_diff(&old_text, &new_text) .into_iter() .map(|(mut old_range, new_text)| { + old_range.start += offset; + old_range.end += offset; + let prefix_len = common_prefix( snapshot.chars_for_range(old_range.clone()), new_text.chars(), ); old_range.start += prefix_len; + let suffix_len = common_prefix( snapshot.reversed_chars_for_range(old_range.clone()), new_text[prefix_len..].chars().rev(), @@ -1248,10 +1188,7 @@ impl Event { writeln!(prompt, "User renamed {:?} to {:?}\n", old_path, new_path).unwrap(); } - let diff = - similar::TextDiff::from_lines(&old_snapshot.text(), &new_snapshot.text()) - .unified_diff() - .to_string(); + let diff = language::unified_diff(&old_snapshot.text(), &new_snapshot.text()); if !diff.is_empty() { write!( prompt,