use crate::{CharClassifier, CharKind, LanguageScope};
use imara_diff::{
    Algorithm, UnifiedDiffBuilder, diff,
    intern::{InternedInput, Token},
    sources::lines_with_terminator,
};
use std::{iter, ops::Range, sync::Arc};

/// The maximum size, in bytes, that a hunk may have on each side and still be
/// refined with a word-level diff.
const MAX_WORD_DIFF_LEN: usize = 512;
/// The maximum number of lines that a hunk may span on each side and still be
/// refined with a word-level diff.
const MAX_WORD_DIFF_LINE_COUNT: usize = 8;

/// Computes a diff between two strings, returning a unified diff string.
pub fn unified_diff(old_text: &str, new_text: &str) -> String {
    let input = InternedInput::new(old_text, new_text);
    diff(
        Algorithm::Histogram,
        &input,
        UnifiedDiffBuilder::new(&input),
    )
}

/// Computes a diff between two strings, returning a vector of old and new row
/// ranges.
pub fn line_diff(old_text: &str, new_text: &str) -> Vec<(Range<u32>, Range<u32>)> {
    let mut edits = Vec::new();
    let input = InternedInput::new(
        lines_with_terminator(old_text),
        lines_with_terminator(new_text),
    );
    diff_internal(&input, |_, _, old_rows, new_rows| {
        edits.push((old_rows, new_rows));
    });
    edits
}

/// Computes a diff between two strings, returning a vector of edits.
///
/// The edits are represented as tuples of byte ranges and replacement strings.
///
/// Internally, this function first performs a line-based diff, and then
/// performs a second word-based diff within hunks that replace small numbers
/// of lines.
pub fn text_diff(old_text: &str, new_text: &str) -> Vec<(Range<usize>, Arc<str>)> {
    text_diff_with_options(old_text, new_text, DiffOptions::default())
}

pub struct DiffOptions {
    pub language_scope: Option<LanguageScope>,
    pub max_word_diff_len: usize,
    pub max_word_diff_line_count: usize,
}

impl Default for DiffOptions {
    fn default() -> Self {
        Self {
            language_scope: Default::default(),
            max_word_diff_len: MAX_WORD_DIFF_LEN,
            max_word_diff_line_count: MAX_WORD_DIFF_LINE_COUNT,
        }
    }
}
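
// An illustrative sketch (this module and its threshold values are ours, for
// demonstration only): `max_word_diff_len` and `max_word_diff_line_count`
// gate the word-level refinement pass. A hunk that exceeds either budget is
// emitted as a single whole-hunk edit instead.
#[cfg(test)]
mod diff_options_example {
    use super::*;

    #[test]
    fn test_word_diff_thresholds() {
        let old_text = "one two three";
        let new_text = "one TWO three";

        // Default thresholds: the hunk is small enough to refine, so only the
        // changed word is replaced.
        assert_eq!(
            text_diff_with_options(old_text, new_text, DiffOptions::default()),
            [(4..7, "TWO".into())]
        );

        // A zero byte budget disqualifies every hunk from word diffing, so
        // the edit spans the entire changed line.
        assert_eq!(
            text_diff_with_options(
                old_text,
                new_text,
                DiffOptions {
                    max_word_diff_len: 0,
                    ..DiffOptions::default()
                }
            ),
            [(0..13, "one TWO three".into())]
        );
    }
}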

/// Computes a diff between two strings, using a specific language scope's
/// word characters for word-level diffing.
pub fn text_diff_with_options(
    old_text: &str,
    new_text: &str,
    options: DiffOptions,
) -> Vec<(Range<usize>, Arc<str>)> {
    let empty: Arc<str> = Arc::default();
    let mut edits = Vec::new();
    let mut hunk_input = InternedInput::default();
    let input = InternedInput::new(
        lines_with_terminator(old_text),
        lines_with_terminator(new_text),
    );
    diff_internal(
        &input,
        |old_byte_range, new_byte_range, old_rows, new_rows| {
            if should_perform_word_diff_within_hunk(
                &old_rows,
                &old_byte_range,
                &new_rows,
                &new_byte_range,
                &options,
            ) {
                // Re-diff this small hunk at word granularity, mapping the
                // word-level byte ranges back to offsets in the full texts.
                let old_offset = old_byte_range.start;
                let new_offset = new_byte_range.start;
                hunk_input.clear();
                hunk_input.update_before(tokenize(
                    &old_text[old_byte_range.clone()],
                    options.language_scope.clone(),
                ));
                hunk_input.update_after(tokenize(
                    &new_text[new_byte_range.clone()],
                    options.language_scope.clone(),
                ));
                diff_internal(&hunk_input, |old_byte_range, new_byte_range, _, _| {
                    let old_byte_range =
                        old_offset + old_byte_range.start..old_offset + old_byte_range.end;
                    let new_byte_range =
                        new_offset + new_byte_range.start..new_offset + new_byte_range.end;
                    let replacement_text = if new_byte_range.is_empty() {
                        empty.clone()
                    } else {
                        new_text[new_byte_range.clone()].into()
                    };
                    edits.push((old_byte_range, replacement_text));
                });
            } else {
                // The hunk doesn't qualify for word diffing; emit it verbatim.
                let replacement_text = if new_byte_range.is_empty() {
                    empty.clone()
                } else {
                    new_text[new_byte_range.clone()].into()
                };
                edits.push((old_byte_range.clone(), replacement_text));
            }
        },
    );
    edits
}

fn should_perform_word_diff_within_hunk(
    old_row_range: &Range<u32>,
    old_byte_range: &Range<usize>,
    new_row_range: &Range<u32>,
    new_byte_range: &Range<usize>,
    options: &DiffOptions,
) -> bool {
    !old_byte_range.is_empty()
        && !new_byte_range.is_empty()
        && old_byte_range.len() <= options.max_word_diff_len
        && new_byte_range.len() <= options.max_word_diff_len
        && old_row_range.len() <= options.max_word_diff_line_count
        && new_row_range.len() <= options.max_word_diff_line_count
}

fn diff_internal(
    input: &InternedInput<&str>,
    mut on_change: impl FnMut(Range<usize>, Range<usize>, Range<u32>, Range<u32>),
) {
    let mut old_offset = 0;
    let mut new_offset = 0;
    let mut old_token_ix = 0;
    let mut new_token_ix = 0;
    diff(
        Algorithm::Histogram,
        input,
        |old_tokens: Range<u32>, new_tokens: Range<u32>| {
            // Advance the byte offsets across the unchanged tokens that
            // precede this hunk, then measure the hunk itself.
            old_offset += token_len(
                &input,
                &input.before[old_token_ix as usize..old_tokens.start as usize],
            );
            new_offset += token_len(
                &input,
                &input.after[new_token_ix as usize..new_tokens.start as usize],
            );
            let old_len = token_len(
                &input,
                &input.before[old_tokens.start as usize..old_tokens.end as usize],
            );
            let new_len = token_len(
                &input,
                &input.after[new_tokens.start as usize..new_tokens.end as usize],
            );
            let old_byte_range = old_offset..old_offset + old_len;
            let new_byte_range = new_offset..new_offset + new_len;
            old_token_ix = old_tokens.end;
            new_token_ix = new_tokens.end;
            old_offset = old_byte_range.end;
            new_offset = new_byte_range.end;
            on_change(old_byte_range, new_byte_range, old_tokens, new_tokens);
        },
    );
}
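
// An illustrative sketch (ours, not part of the original test suite):
// `line_diff` reports each hunk as a pair of row ranges, and an empty range
// marks the insertion or deletion point on the side that has no rows.
#[cfg(test)]
mod line_diff_example {
    use super::*;

    #[test]
    fn test_line_diff_rows() {
        let old_text = "one\ntwo\nthree\n";
        let new_text = "one\nTWO\nthree\nfour\n";
        assert_eq!(
            line_diff(old_text, new_text),
            // Row 1 is rewritten; row 3 of the new text is a pure insertion.
            [(1..2, 1..2), (3..3, 3..4)]
        );
    }
}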

/// Splits `text` into runs of characters with the same [`CharKind`], treating
/// each punctuation character as its own token.
fn tokenize(text: &str, language_scope: Option<LanguageScope>) -> impl Iterator<Item = &str> {
    let classifier = CharClassifier::new(language_scope).for_completion(true);
    let mut chars = text.char_indices();
    let mut prev = None;
    let mut start_ix = 0;
    iter::from_fn(move || {
        while let Some((ix, c)) = chars.next() {
            let mut token = None;
            let kind = classifier.kind(c);
            if let Some((prev_char, prev_kind)) = prev {
                if kind != prev_kind || (kind == CharKind::Punctuation && c != prev_char) {
                    token = Some(&text[start_ix..ix]);
                    start_ix = ix;
                }
            }
            prev = Some((c, kind));
            if token.is_some() {
                return token;
            }
        }
        // The loop only yields a token upon seeing the first character of the
        // next run, so flush the final run once the input is exhausted.
        if start_ix < text.len() {
            let token = &text[start_ix..];
            start_ix = text.len();
            return Some(token);
        }
        None
    })
}

fn token_len(input: &InternedInput<&str>, tokens: &[Token]) -> usize {
    tokens
        .iter()
        .map(|token| input.interner[*token].len())
        .sum()
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_tokenize() {
        let text = "";
        assert_eq!(tokenize(text, None).collect::<Vec<_>>(), Vec::<&str>::new());

        let text = " ";
        assert_eq!(tokenize(text, None).collect::<Vec<_>>(), vec![" "]);

        let text = "one";
        assert_eq!(tokenize(text, None).collect::<Vec<_>>(), vec!["one"]);

        let text = "one\n";
        assert_eq!(tokenize(text, None).collect::<Vec<_>>(), vec!["one", "\n"]);

        let text = "one.two(three)";
        assert_eq!(
            tokenize(text, None).collect::<Vec<_>>(),
            vec!["one", ".", "two", "(", "three", ")"]
        );

        let text = "one two three()";
        assert_eq!(
            tokenize(text, None).collect::<Vec<_>>(),
            vec!["one", " ", "two", " ", "three", "(", ")"]
        );

        let text = " one\n two three";
        assert_eq!(
            tokenize(text, None).collect::<Vec<_>>(),
            vec![" ", "one", "\n ", "two", " ", "three"]
        );
    }

    #[test]
    fn test_text_diff() {
        let old_text = "one two three";
        let new_text = "one TWO three";
        assert_eq!(text_diff(old_text, new_text), [(4..7, "TWO".into())]);

        let old_text = "one\ntwo\nthree\n";
        let new_text = "one\ntwo\nAND\nTHEN\nthree\n";
        assert_eq!(
            text_diff(old_text, new_text),
            [(8..8, "AND\nTHEN\n".into())]
        );

        let old_text = "one two\nthree four five\nsix seven eight nine\nten\n";
        let new_text = "one two\nthree FOUR five\nsix SEVEN eight nine\nten\nELEVEN\n";
        assert_eq!(
            text_diff(old_text, new_text),
            [
                (14..18, "FOUR".into()),
                (28..33, "SEVEN".into()),
                (49..49, "ELEVEN\n".into())
            ]
        );
    }
}
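
// An illustrative sketch of how a caller might consume `text_diff` output
// (the helper below is hypothetical, not part of this module's API): edits
// are ordered by their position in `old_text` and non-overlapping, so
// applying them back to front reconstructs `new_text` without any offset
// adjustment.
#[cfg(test)]
mod apply_edits_example {
    use super::*;
    use std::{ops::Range, sync::Arc};

    fn apply_edits(old_text: &str, edits: &[(Range<usize>, Arc<str>)]) -> String {
        let mut text = old_text.to_string();
        for (range, replacement) in edits.iter().rev() {
            text.replace_range(range.clone(), replacement);
        }
        text
    }

    #[test]
    fn test_edits_round_trip() {
        let old_text = "one two\nthree four five\n";
        let new_text = "one two\nthree FOUR five\nsix\n";
        let edits = text_diff(old_text, new_text);
        assert_eq!(apply_edits(old_text, &edits), new_text);
    }
}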