Support multi-byte characters in diff

This commit is contained in:
Antonio Scandurra 2023-08-23 09:32:38 +02:00
parent a9871a7a70
commit c2935056e8

View file

@@ -1,6 +1,7 @@
use std::{ use std::{
cmp, cmp,
fmt::{self, Debug}, fmt::{self, Debug},
ops::Range,
}; };
use collections::BinaryHeap; use collections::BinaryHeap;
@@ -71,8 +72,8 @@ pub enum Hunk {
} }
pub struct Diff { pub struct Diff {
old: String, old: Vec<char>,
new: String, new: Vec<char>,
scores: Matrix, scores: Matrix,
old_text_ix: usize, old_text_ix: usize,
new_text_ix: usize, new_text_ix: usize,
@@ -84,6 +85,7 @@ impl Diff {
const EQUALITY_SCORE: isize = 5; const EQUALITY_SCORE: isize = 5;
pub fn new(old: String) -> Self { pub fn new(old: String) -> Self {
let old = old.chars().collect::<Vec<_>>();
let mut scores = Matrix::new(); let mut scores = Matrix::new();
scores.resize(old.len() + 1, 1); scores.resize(old.len() + 1, 1);
for i in 0..=old.len() { for i in 0..=old.len() {
@@ -91,7 +93,7 @@ impl Diff {
} }
Self { Self {
old, old,
new: String::new(), new: Vec::new(),
scores, scores,
old_text_ix: 0, old_text_ix: 0,
new_text_ix: 0, new_text_ix: 0,
@@ -99,7 +101,7 @@ impl Diff {
} }
pub fn push_new(&mut self, text: &str) -> Vec<Hunk> { pub fn push_new(&mut self, text: &str) -> Vec<Hunk> {
self.new.push_str(text); self.new.extend(text.chars());
self.scores.resize(self.old.len() + 1, self.new.len() + 1); self.scores.resize(self.old.len() + 1, self.new.len() + 1);
for j in self.new_text_ix + 1..=self.new.len() { for j in self.new_text_ix + 1..=self.new.len() {
@@ -107,7 +109,7 @@ impl Diff {
for i in 1..=self.old.len() { for i in 1..=self.old.len() {
let insertion_score = self.scores.get(i, j - 1) + Self::INSERTION_SCORE; let insertion_score = self.scores.get(i, j - 1) + Self::INSERTION_SCORE;
let deletion_score = self.scores.get(i - 1, j) + Self::DELETION_SCORE; let deletion_score = self.scores.get(i - 1, j) + Self::DELETION_SCORE;
let equality_score = if self.old.as_bytes()[i - 1] == self.new.as_bytes()[j - 1] { let equality_score = if self.old[i - 1] == self.new[j - 1] {
self.scores.get(i - 1, j - 1) + Self::EQUALITY_SCORE self.scores.get(i - 1, j - 1) + Self::EQUALITY_SCORE
} else { } else {
isize::MIN isize::MIN
@@ -138,6 +140,7 @@ impl Diff {
} }
fn backtrack(&self, old_text_ix: usize, new_text_ix: usize) -> Vec<Hunk> { fn backtrack(&self, old_text_ix: usize, new_text_ix: usize) -> Vec<Hunk> {
let mut pending_insert: Option<Range<usize>> = None;
let mut hunks = Vec::new(); let mut hunks = Vec::new();
let mut i = old_text_ix; let mut i = old_text_ix;
let mut j = new_text_ix; let mut j = new_text_ix;
@@ -153,7 +156,7 @@ impl Diff {
None None
}; };
let equality_score = if i > self.old_text_ix && j > self.new_text_ix { let equality_score = if i > self.old_text_ix && j > self.new_text_ix {
if self.old.as_bytes()[i - 1] == self.new.as_bytes()[j - 1] { if self.old[i - 1] == self.new[j - 1] {
Some((i - 1, j - 1)) Some((i - 1, j - 1))
} else { } else {
None None
@@ -169,30 +172,44 @@ impl Diff {
.unwrap(); .unwrap();
if prev_i == i && prev_j == j - 1 { if prev_i == i && prev_j == j - 1 {
if let Some(Hunk::Insert { text }) = hunks.last_mut() { if let Some(pending_insert) = pending_insert.as_mut() {
text.insert_str(0, &self.new[prev_j..j]); pending_insert.start = prev_j;
} else { } else {
hunks.push(Hunk::Insert { pending_insert = Some(prev_j..j);
text: self.new[prev_j..j].to_string(),
})
} }
} else if prev_i == i - 1 && prev_j == j {
if let Some(Hunk::Remove { len }) = hunks.last_mut() {
*len += 1;
} else { } else {
hunks.push(Hunk::Remove { len: 1 }) if let Some(range) = pending_insert.take() {
hunks.push(Hunk::Insert {
text: self.new[range].iter().collect(),
});
}
let char_len = self.old[i - 1].len_utf8();
if prev_i == i - 1 && prev_j == j {
if let Some(Hunk::Remove { len }) = hunks.last_mut() {
*len += char_len;
} else {
hunks.push(Hunk::Remove { len: char_len })
} }
} else { } else {
if let Some(Hunk::Keep { len }) = hunks.last_mut() { if let Some(Hunk::Keep { len }) = hunks.last_mut() {
*len += 1; *len += char_len;
} else { } else {
hunks.push(Hunk::Keep { len: 1 }) hunks.push(Hunk::Keep { len: char_len })
}
} }
} }
i = prev_i; i = prev_i;
j = prev_j; j = prev_j;
} }
if let Some(range) = pending_insert.take() {
hunks.push(Hunk::Insert {
text: self.new[range].iter().collect(),
});
}
hunks.reverse(); hunks.reverse();
hunks hunks
} }