Support multi-byte characters in diff

This commit is contained in:
Antonio Scandurra 2023-08-23 09:32:38 +02:00
parent a9871a7a70
commit c2935056e8

View file

@ -1,6 +1,7 @@
use std::{
cmp,
fmt::{self, Debug},
ops::Range,
};
use collections::BinaryHeap;
@ -71,8 +72,8 @@ pub enum Hunk {
}
pub struct Diff {
old: String,
new: String,
old: Vec<char>,
new: Vec<char>,
scores: Matrix,
old_text_ix: usize,
new_text_ix: usize,
@ -84,6 +85,7 @@ impl Diff {
/// Score added to the value of the diagonal cell `(i - 1, j - 1)` when the
/// compared elements of `old` and `new` are equal.
const EQUALITY_SCORE: isize = 5;
pub fn new(old: String) -> Self {
let old = old.chars().collect::<Vec<_>>();
let mut scores = Matrix::new();
scores.resize(old.len() + 1, 1);
for i in 0..=old.len() {
@ -91,7 +93,7 @@ impl Diff {
}
Self {
old,
new: String::new(),
new: Vec::new(),
scores,
old_text_ix: 0,
new_text_ix: 0,
@ -99,7 +101,7 @@ impl Diff {
}
pub fn push_new(&mut self, text: &str) -> Vec<Hunk> {
self.new.push_str(text);
self.new.extend(text.chars());
self.scores.resize(self.old.len() + 1, self.new.len() + 1);
for j in self.new_text_ix + 1..=self.new.len() {
@ -107,7 +109,7 @@ impl Diff {
for i in 1..=self.old.len() {
let insertion_score = self.scores.get(i, j - 1) + Self::INSERTION_SCORE;
let deletion_score = self.scores.get(i - 1, j) + Self::DELETION_SCORE;
let equality_score = if self.old.as_bytes()[i - 1] == self.new.as_bytes()[j - 1] {
let equality_score = if self.old[i - 1] == self.new[j - 1] {
self.scores.get(i - 1, j - 1) + Self::EQUALITY_SCORE
} else {
isize::MIN
@ -138,6 +140,7 @@ impl Diff {
}
fn backtrack(&self, old_text_ix: usize, new_text_ix: usize) -> Vec<Hunk> {
let mut pending_insert: Option<Range<usize>> = None;
let mut hunks = Vec::new();
let mut i = old_text_ix;
let mut j = new_text_ix;
@ -153,7 +156,7 @@ impl Diff {
None
};
let equality_score = if i > self.old_text_ix && j > self.new_text_ix {
if self.old.as_bytes()[i - 1] == self.new.as_bytes()[j - 1] {
if self.old[i - 1] == self.new[j - 1] {
Some((i - 1, j - 1))
} else {
None
@ -169,30 +172,44 @@ impl Diff {
.unwrap();
if prev_i == i && prev_j == j - 1 {
if let Some(Hunk::Insert { text }) = hunks.last_mut() {
text.insert_str(0, &self.new[prev_j..j]);
if let Some(pending_insert) = pending_insert.as_mut() {
pending_insert.start = prev_j;
} else {
hunks.push(Hunk::Insert {
text: self.new[prev_j..j].to_string(),
})
}
} else if prev_i == i - 1 && prev_j == j {
if let Some(Hunk::Remove { len }) = hunks.last_mut() {
*len += 1;
} else {
hunks.push(Hunk::Remove { len: 1 })
pending_insert = Some(prev_j..j);
}
} else {
if let Some(Hunk::Keep { len }) = hunks.last_mut() {
*len += 1;
if let Some(range) = pending_insert.take() {
hunks.push(Hunk::Insert {
text: self.new[range].iter().collect(),
});
}
let char_len = self.old[i - 1].len_utf8();
if prev_i == i - 1 && prev_j == j {
if let Some(Hunk::Remove { len }) = hunks.last_mut() {
*len += char_len;
} else {
hunks.push(Hunk::Remove { len: char_len })
}
} else {
hunks.push(Hunk::Keep { len: 1 })
if let Some(Hunk::Keep { len }) = hunks.last_mut() {
*len += char_len;
} else {
hunks.push(Hunk::Keep { len: char_len })
}
}
}
i = prev_i;
j = prev_j;
}
if let Some(range) = pending_insert.take() {
hunks.push(Hunk::Insert {
text: self.new[range].iter().collect(),
});
}
hunks.reverse();
hunks
}