From 4431ef1870d097c22b8a94827abffe2493314519 Mon Sep 17 00:00:00 2001 From: Antonio Scandurra Date: Wed, 30 Oct 2024 10:59:03 +0100 Subject: [PATCH] Speed up point translation in the Rope (#19913) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This pull request introduces an index of Unicode code points, newlines, and UTF-16 code units. Benchmarks are worth a thousand words: ``` push/4096 time: [467.06 µs 470.07 µs 473.24 µs] thrpt: [8.2543 MiB/s 8.3100 MiB/s 8.3635 MiB/s] change: time: [-4.1462% -3.0990% -2.0527%] (p = 0.00 < 0.05) thrpt: [+2.0957% +3.1981% +4.3255%] Performance has improved. Found 3 outliers among 100 measurements (3.00%) 1 (1.00%) low mild 2 (2.00%) high mild push/65536 time: [1.4650 ms 1.4796 ms 1.4922 ms] thrpt: [41.885 MiB/s 42.242 MiB/s 42.664 MiB/s] change: time: [-3.2871% -2.3489% -1.4555%] (p = 0.00 < 0.05) thrpt: [+1.4770% +2.4054% +3.3988%] Performance has improved. Found 6 outliers among 100 measurements (6.00%) 3 (3.00%) low severe 3 (3.00%) low mild append/4096 time: [729.00 ns 730.57 ns 732.14 ns] thrpt: [5.2103 GiB/s 5.2215 GiB/s 5.2327 GiB/s] change: time: [-81.884% -81.836% -81.790%] (p = 0.00 < 0.05) thrpt: [+449.16% +450.53% +452.01%] Performance has improved. Found 11 outliers among 100 measurements (11.00%) 3 (3.00%) low mild 6 (6.00%) high mild 2 (2.00%) high severe append/65536 time: [504.44 ns 505.58 ns 506.77 ns] thrpt: [120.44 GiB/s 120.72 GiB/s 121.00 GiB/s] change: time: [-94.833% -94.807% -94.782%] (p = 0.00 < 0.05) thrpt: [+1816.3% +1825.8% +1835.5%] Performance has improved. Found 4 outliers among 100 measurements (4.00%) 3 (3.00%) high mild 1 (1.00%) high severe slice/4096 time: [29.661 µs 29.733 µs 29.816 µs] thrpt: [131.01 MiB/s 131.38 MiB/s 131.70 MiB/s] change: time: [-48.833% -48.533% -48.230%] (p = 0.00 < 0.05) thrpt: [+93.161% +94.298% +95.440%] Performance has improved. 
slice/65536 time: [588.00 µs 590.22 µs 592.17 µs] thrpt: [105.54 MiB/s 105.89 MiB/s 106.29 MiB/s] change: time: [-45.599% -45.347% -45.099%] (p = 0.00 < 0.05) thrpt: [+82.147% +82.971% +83.821%] Performance has improved. Found 2 outliers among 100 measurements (2.00%) 1 (1.00%) low severe 1 (1.00%) high mild bytes_in_range/4096 time: [3.8630 µs 3.8811 µs 3.8994 µs] thrpt: [1001.8 MiB/s 1006.5 MiB/s 1011.2 MiB/s] change: time: [+0.0600% +0.6000% +1.1833%] (p = 0.03 < 0.05) thrpt: [-1.1695% -0.5964% -0.0600%] Change within noise threshold. bytes_in_range/65536 time: [98.178 µs 98.545 µs 98.931 µs] thrpt: [631.75 MiB/s 634.23 MiB/s 636.60 MiB/s] change: time: [-0.6513% +0.7537% +2.2265%] (p = 0.30 > 0.05) thrpt: [-2.1780% -0.7481% +0.6555%] No change in performance detected. Found 11 outliers among 100 measurements (11.00%) 8 (8.00%) high mild 3 (3.00%) high severe chars/4096 time: [878.91 ns 879.45 ns 880.06 ns] thrpt: [4.3346 GiB/s 4.3376 GiB/s 4.3403 GiB/s] change: time: [+9.1679% +9.4000% +9.6304%] (p = 0.00 < 0.05) thrpt: [-8.7844% -8.5923% -8.3979%] Performance has regressed. Found 8 outliers among 100 measurements (8.00%) 1 (1.00%) low severe 1 (1.00%) low mild 3 (3.00%) high mild 3 (3.00%) high severe chars/65536 time: [15.615 µs 15.691 µs 15.757 µs] thrpt: [3.8735 GiB/s 3.8899 GiB/s 3.9087 GiB/s] change: time: [+5.4902% +5.9345% +6.4044%] (p = 0.00 < 0.05) thrpt: [-6.0190% -5.6021% -5.2045%] Performance has regressed. Found 2 outliers among 100 measurements (2.00%) 2 (2.00%) low mild clip_point/4096 time: [29.677 µs 29.835 µs 30.019 µs] thrpt: [130.13 MiB/s 130.93 MiB/s 131.63 MiB/s] change: time: [-46.306% -45.866% -45.436%] (p = 0.00 < 0.05) thrpt: [+83.272% +84.728% +86.240%] Performance has improved. 
Found 11 outliers among 100 measurements (11.00%) 3 (3.00%) high mild 8 (8.00%) high severe clip_point/65536 time: [1.5933 ms 1.6116 ms 1.6311 ms] thrpt: [38.318 MiB/s 38.782 MiB/s 39.226 MiB/s] change: time: [-30.388% -29.598% -28.717%] (p = 0.00 < 0.05) thrpt: [+40.286% +42.040% +43.653%] Performance has improved. Found 3 outliers among 100 measurements (3.00%) 3 (3.00%) high mild running 0 tests test result: ok. 0 passed; 0 failed; 0 ignored; 0 measured; 7 filtered out; finished in 0.00s point_to_offset/4096 time: [14.493 µs 14.591 µs 14.707 µs] thrpt: [265.61 MiB/s 267.72 MiB/s 269.52 MiB/s] change: time: [-71.990% -71.787% -71.588%] (p = 0.00 < 0.05) thrpt: [+251.96% +254.45% +257.01%] Performance has improved. Found 9 outliers among 100 measurements (9.00%) 5 (5.00%) high mild 4 (4.00%) high severe point_to_offset/65536 time: [700.72 µs 713.75 µs 727.26 µs] thrpt: [85.939 MiB/s 87.566 MiB/s 89.194 MiB/s] change: time: [-61.778% -61.015% -60.256%] (p = 0.00 < 0.05) thrpt: [+151.61% +156.51% +161.63%] Performance has improved. ``` Calling `Rope::chars` got slightly slower but I don't think it's a big issue (we don't really call `chars` for an entire `Rope`). In a future pull request, I want to use the tab index (which we're not yet using) and the char index to make `TabMap` a lot faster. 
Release Notes: - N/A --- Cargo.lock | 1 + Cargo.toml | 1 + crates/rope/Cargo.toml | 1 + crates/rope/benches/rope_benchmark.rs | 19 + crates/rope/src/chunk.rs | 878 ++++++++++++++++++++++++++ crates/rope/src/rope.rs | 446 ++++--------- crates/rope/src/unclipped.rs | 6 +- crates/sum_tree/Cargo.toml | 2 +- 8 files changed, 1029 insertions(+), 325 deletions(-) create mode 100644 crates/rope/src/chunk.rs diff --git a/Cargo.lock b/Cargo.lock index f473a0f74d..56d538a883 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9898,6 +9898,7 @@ dependencies = [ "gpui", "log", "rand 0.8.5", + "rayon", "smallvec", "sum_tree", "unicode-segmentation", diff --git a/Cargo.toml b/Cargo.toml index e269dd99ea..d67f78dc2b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -392,6 +392,7 @@ prost-build = "0.9" prost-types = "0.9" pulldown-cmark = { version = "0.12.0", default-features = false } rand = "0.8.5" +rayon = "1.8" regex = "1.5" repair_json = "0.1.0" reqwest = { git = "https://github.com/zed-industries/reqwest.git", rev = "fd110f6998da16bbca97b6dddda9be7827c50e29", default-features = false, features = [ diff --git a/crates/rope/Cargo.toml b/crates/rope/Cargo.toml index 309ceaf0bf..13f5979732 100644 --- a/crates/rope/Cargo.toml +++ b/crates/rope/Cargo.toml @@ -14,6 +14,7 @@ path = "src/rope.rs" [dependencies] arrayvec = "0.7.1" log.workspace = true +rayon.workspace = true smallvec.workspace = true sum_tree.workspace = true unicode-segmentation.workspace = true diff --git a/crates/rope/benches/rope_benchmark.rs b/crates/rope/benches/rope_benchmark.rs index 1f95559d77..01811c0c86 100644 --- a/crates/rope/benches/rope_benchmark.rs +++ b/crates/rope/benches/rope_benchmark.rs @@ -171,6 +171,25 @@ fn rope_benchmarks(c: &mut Criterion) { }); } group.finish(); + + let mut group = c.benchmark_group("point_to_offset"); + for size in sizes.iter() { + group.throughput(Throughput::Bytes(*size as u64)); + group.bench_with_input(BenchmarkId::from_parameter(size), &size, |b, &size| { + let rope = 
generate_random_rope(rng.clone(), *size); + + b.iter_batched( + || generate_random_rope_points(rng.clone(), &rope), + |offsets| { + for offset in offsets.iter() { + black_box(rope.point_to_offset(*offset)); + } + }, + BatchSize::SmallInput, + ); + }); + } + group.finish(); } criterion_group!(benches, rope_benchmarks); diff --git a/crates/rope/src/chunk.rs b/crates/rope/src/chunk.rs new file mode 100644 index 0000000000..0490c5a9cd --- /dev/null +++ b/crates/rope/src/chunk.rs @@ -0,0 +1,878 @@ +use crate::{OffsetUtf16, Point, PointUtf16, TextSummary, Unclipped}; +use arrayvec::ArrayString; +use std::{cmp, ops::Range}; +use sum_tree::Bias; +use unicode_segmentation::GraphemeCursor; +use util::debug_panic; + +pub(crate) const MIN_BASE: usize = if cfg!(test) { 6 } else { 64 }; +pub(crate) const MAX_BASE: usize = MIN_BASE * 2; + +#[derive(Clone, Debug, Default)] +pub struct Chunk { + chars: u128, + chars_utf16: u128, + newlines: u128, + pub text: ArrayString, +} + +impl Chunk { + #[inline(always)] + pub fn new(text: &str) -> Self { + let mut this = Chunk::default(); + this.push_str(text); + this + } + + #[inline(always)] + pub fn push_str(&mut self, text: &str) { + for (char_ix, c) in text.char_indices() { + let ix = self.text.len() + char_ix; + self.chars |= 1 << ix; + self.chars_utf16 |= 1 << ix; + self.chars_utf16 |= (c.len_utf16() as u128) << ix; + self.newlines |= ((c == '\n') as u128) << ix; + } + self.text.push_str(text); + } + + #[inline(always)] + pub fn append(&mut self, slice: ChunkSlice) { + if slice.is_empty() { + return; + }; + + let base_ix = self.text.len(); + self.chars |= slice.chars << base_ix; + self.chars_utf16 |= slice.chars_utf16 << base_ix; + self.newlines |= slice.newlines << base_ix; + self.text.push_str(&slice.text); + } + + #[inline(always)] + pub fn as_slice(&self) -> ChunkSlice { + ChunkSlice { + chars: self.chars, + chars_utf16: self.chars_utf16, + newlines: self.newlines, + text: &self.text, + } + } + + #[inline(always)] + pub fn 
slice(&self, range: Range<usize>) -> ChunkSlice { + self.as_slice().slice(range) + } +} + +#[derive(Clone, Copy, Debug)] +pub struct ChunkSlice<'a> { + chars: u128, + chars_utf16: u128, + newlines: u128, + text: &'a str, +} + +impl<'a> From<ChunkSlice<'a>> for Chunk { // `From` instead of a hand-written `Into`; `.into()` callers keep working through the blanket impl + fn from(slice: ChunkSlice<'a>) -> Self { + Chunk { + chars: slice.chars, + chars_utf16: slice.chars_utf16, + newlines: slice.newlines, + text: slice.text.try_into().unwrap(), + } + } +} + +impl<'a> ChunkSlice<'a> { + #[inline(always)] + pub fn is_empty(self) -> bool { + self.text.is_empty() + } + + #[inline(always)] + pub fn is_char_boundary(self, offset: usize) -> bool { + self.text.is_char_boundary(offset) + } + + #[inline(always)] + pub fn split_at(self, mid: usize) -> (ChunkSlice<'a>, ChunkSlice<'a>) { + if mid == MAX_BASE { + let left = self; + let right = ChunkSlice { + chars: 0, + chars_utf16: 0, + newlines: 0, + text: "", + }; + (left, right) + } else { + let mask = if mid == MAX_BASE { + u128::MAX + } else { + (1u128 << mid) - 1 + }; + let (left_text, right_text) = self.text.split_at(mid); + let left = ChunkSlice { + chars: self.chars & mask, + chars_utf16: self.chars_utf16 & mask, + newlines: self.newlines & mask, + text: left_text, + }; + let right = ChunkSlice { + chars: self.chars >> mid, + chars_utf16: self.chars_utf16 >> mid, + newlines: self.newlines >> mid, + text: right_text, + }; + (left, right) + } + } + + #[inline(always)] + pub fn slice(self, range: Range<usize>) -> Self { + let mask = if range.end == MAX_BASE { + u128::MAX + } else { + (1u128 << range.end) - 1 + }; + if range.start == MAX_BASE { + Self { + chars: 0, + chars_utf16: 0, + newlines: 0, + text: "", + } + } else { + Self { + chars: (self.chars & mask) >> range.start, + chars_utf16: (self.chars_utf16 & mask) >> range.start, + newlines: (self.newlines & mask) >> range.start, + text: &self.text[range], + } + } + } + + #[inline(always)] + pub fn text_summary(&self) -> TextSummary { + let (longest_row, longest_row_chars) = self.longest_row(); + TextSummary { + 
len: self.len(), + len_utf16: self.len_utf16(), + lines: self.lines(), + first_line_chars: self.first_line_chars(), + last_line_chars: self.last_line_chars(), + last_line_len_utf16: self.last_line_len_utf16(), + longest_row, + longest_row_chars, + } + } + + /// Get length in bytes + #[inline(always)] + pub fn len(&self) -> usize { + self.text.len() + } + + /// Get length in UTF-16 code units + #[inline(always)] + pub fn len_utf16(&self) -> OffsetUtf16 { + OffsetUtf16(self.chars_utf16.count_ones() as usize) + } + + /// Get point representing number of lines and length of last line + #[inline(always)] + pub fn lines(&self) -> Point { + let row = self.newlines.count_ones(); + let column = self.newlines.leading_zeros() - (u128::BITS - self.text.len() as u32); + Point::new(row, column) + } + + /// Get number of chars in first line + #[inline(always)] + pub fn first_line_chars(&self) -> u32 { + if self.newlines == 0 { + self.chars.count_ones() + } else { + let mask = (1u128 << self.newlines.trailing_zeros()) - 1; + (self.chars & mask).count_ones() + } + } + + /// Get number of chars in last line + #[inline(always)] + pub fn last_line_chars(&self) -> u32 { + if self.newlines == 0 { + self.chars.count_ones() + } else { + let mask = !(u128::MAX >> self.newlines.leading_zeros()); + (self.chars & mask).count_ones() + } + } + + /// Get number of UTF-16 code units in last line + #[inline(always)] + pub fn last_line_len_utf16(&self) -> u32 { + if self.newlines == 0 { + self.chars_utf16.count_ones() + } else { + let mask = !(u128::MAX >> self.newlines.leading_zeros()); + (self.chars_utf16 & mask).count_ones() + } + } + + /// Get the longest row in the chunk and its length in characters. 
+ #[inline(always)] + pub fn longest_row(&self) -> (u32, u32) { + let mut chars = self.chars; + let mut newlines = self.newlines; + let mut row = 0; + let mut longest_row = 0; + let mut longest_row_chars = 0; + while newlines > 0 { + let newline_ix = newlines.trailing_zeros(); + let row_chars = (chars & ((1 << newline_ix) - 1)).count_ones() as u8; + if row_chars > longest_row_chars { + longest_row = row; + longest_row_chars = row_chars; + } + + newlines >>= newline_ix; + newlines >>= 1; + chars >>= newline_ix; + chars >>= 1; + row += 1; + } + + let row_chars = chars.count_ones() as u8; + if row_chars > longest_row_chars { + (row, row_chars as u32) + } else { + (longest_row, longest_row_chars as u32) + } + } + + #[inline(always)] + pub fn offset_to_point(&self, offset: usize) -> Point { + let mask = if offset == MAX_BASE { + u128::MAX + } else { + (1u128 << offset) - 1 + }; + let row = (self.newlines & mask).count_ones(); + let newline_ix = u128::BITS - (self.newlines & mask).leading_zeros(); + let column = (offset - newline_ix as usize) as u32; + Point::new(row, column) + } + + #[inline(always)] + pub fn point_to_offset(&self, point: Point) -> usize { + if point.row > self.lines().row { + debug_panic!( + "point {:?} extends beyond rows for string {:?}", + point, + self.text + ); + return self.len(); + } + + let row_offset_range = self.offset_range_for_row(point.row); + if point.column > row_offset_range.len() as u32 { + debug_panic!( + "point {:?} extends beyond row for string {:?}", + point, + self.text + ); + row_offset_range.end + } else { + row_offset_range.start + point.column as usize + } + } + + #[inline(always)] + pub fn offset_to_offset_utf16(&self, offset: usize) -> OffsetUtf16 { + let mask = if offset == MAX_BASE { + u128::MAX + } else { + (1u128 << offset) - 1 + }; + OffsetUtf16((self.chars_utf16 & mask).count_ones() as usize) + } + + #[inline(always)] + pub fn offset_utf16_to_offset(&self, target: OffsetUtf16) -> usize { + if target.0 == 0 { + 0 + } 
else { + let ix = nth_set_bit(self.chars_utf16, target.0) + 1; + if ix == MAX_BASE { + MAX_BASE + } else { + let utf8_additional_len = cmp::min( + (self.chars_utf16 >> ix).trailing_zeros() as usize, + self.text.len() - ix, + ); + ix + utf8_additional_len + } + } + } + + #[inline(always)] + pub fn offset_to_point_utf16(&self, offset: usize) -> PointUtf16 { + let mask = if offset == MAX_BASE { + u128::MAX + } else { + (1u128 << offset) - 1 + }; + let row = (self.newlines & mask).count_ones(); + let newline_ix = u128::BITS - (self.newlines & mask).leading_zeros(); + let column = if newline_ix as usize == MAX_BASE { + 0 + } else { + ((self.chars_utf16 & mask) >> newline_ix).count_ones() + }; + PointUtf16::new(row, column) + } + + #[inline(always)] + pub fn point_to_point_utf16(&self, point: Point) -> PointUtf16 { + self.offset_to_point_utf16(self.point_to_offset(point)) + } + + #[inline(always)] + pub fn point_utf16_to_offset(&self, point: PointUtf16, clip: bool) -> usize { + let lines = self.lines(); + if point.row > lines.row { + if !clip { + debug_panic!( + "point {:?} is beyond this chunk's extent {:?}", + point, + self.text + ); + } + return self.len(); + } + + let row_offset_range = self.offset_range_for_row(point.row); + let line = self.slice(row_offset_range.clone()); + if point.column > line.last_line_len_utf16() { + if !clip { + debug_panic!( + "point {:?} is beyond the end of the line in chunk {:?}", + point, + self.text + ); + } + return row_offset_range.end; // clip to the row's end within the chunk; `line.len()` is relative to the row start + } + + let mut offset = row_offset_range.start; + if point.column > 0 { + offset += line.offset_utf16_to_offset(OffsetUtf16(point.column as usize)); + if !self.text.is_char_boundary(offset) { + offset -= 1; + while !self.text.is_char_boundary(offset) { + offset -= 1; + } + if !clip { + debug_panic!( + "point {:?} is within character in chunk {:?}", + point, + self.text, + ); + } + } + } + offset + } + + #[inline(always)] + pub fn unclipped_point_utf16_to_point(&self, point: Unclipped<PointUtf16>) -> Point { + let max_point 
= self.lines(); + if point.0.row > max_point.row { + return max_point; + } + + let row_offset_range = self.offset_range_for_row(point.0.row); + let line = self.slice(row_offset_range.clone()); + if point.0.column == 0 { + Point::new(point.0.row, 0) + } else if point.0.column >= line.len_utf16().0 as u32 { + Point::new(point.0.row, line.len() as u32) + } else { + let mut column = line.offset_utf16_to_offset(OffsetUtf16(point.0.column as usize)); + while !line.text.is_char_boundary(column) { + column -= 1; + } + Point::new(point.0.row, column as u32) + } + } + + #[inline(always)] + pub fn clip_point(&self, point: Point, bias: Bias) -> Point { + let max_point = self.lines(); + if point.row > max_point.row { + return max_point; + } + + let line = self.slice(self.offset_range_for_row(point.row)); + if point.column == 0 { + point + } else if point.column >= line.len() as u32 { + Point::new(point.row, line.len() as u32) + } else { + let mut column = point.column as usize; + let bytes = line.text.as_bytes(); + if bytes[column - 1] < 128 && bytes[column] < 128 { + return Point::new(point.row, column as u32); + } + + let mut grapheme_cursor = GraphemeCursor::new(column, bytes.len(), true); + loop { + if line.is_char_boundary(column) + && grapheme_cursor.is_boundary(line.text, 0).unwrap_or(false) + { + break; + } + + match bias { + Bias::Left => column -= 1, + Bias::Right => column += 1, + } + grapheme_cursor.set_cursor(column); + } + Point::new(point.row, column as u32) + } + } + + #[inline(always)] + pub fn clip_point_utf16(&self, point: Unclipped, bias: Bias) -> PointUtf16 { + let max_point = self.lines(); + if point.0.row > max_point.row { + PointUtf16::new(max_point.row, self.last_line_len_utf16()) + } else { + let line = self.slice(self.offset_range_for_row(point.0.row)); + let column = line.clip_offset_utf16(OffsetUtf16(point.0.column as usize), bias); + PointUtf16::new(point.0.row, column.0 as u32) + } + } + + #[inline(always)] + pub fn clip_offset_utf16(&self, 
target: OffsetUtf16, bias: Bias) -> OffsetUtf16 { + if target == OffsetUtf16::default() { + OffsetUtf16::default() + } else if target >= self.len_utf16() { + self.len_utf16() + } else { + let mut offset = self.offset_utf16_to_offset(target); + while !self.text.is_char_boundary(offset) { + if bias == Bias::Left { + offset -= 1; + } else { + offset += 1; + } + } + self.offset_to_offset_utf16(offset) + } + } + + #[inline(always)] + fn offset_range_for_row(&self, row: u32) -> Range { + let row_start = if row > 0 { + nth_set_bit(self.newlines, row as usize) + 1 + } else { + 0 + }; + let row_len = if row_start == MAX_BASE { + 0 + } else { + cmp::min( + (self.newlines >> row_start).trailing_zeros(), + (self.text.len() - row_start) as u32, + ) + }; + row_start..row_start + row_len as usize + } +} + +/// Finds the n-th bit that is set to 1. +#[inline(always)] +fn nth_set_bit(v: u128, n: usize) -> usize { + let low = v as u64; + let high = (v >> 64) as u64; + + let low_count = low.count_ones() as usize; + if n > low_count { + 64 + nth_set_bit_u64(high, (n - low_count) as u64) as usize + } else { + nth_set_bit_u64(low, n as u64) as usize + } +} + +#[inline(always)] +fn nth_set_bit_u64(v: u64, mut n: u64) -> u64 { + let v = v.reverse_bits(); + let mut s: u64 = 64; + + // Parallel bit count intermediates + let a = v - ((v >> 1) & (u64::MAX / 3)); + let b = (a & (u64::MAX / 5)) + ((a >> 2) & (u64::MAX / 5)); + let c = (b + (b >> 4)) & (u64::MAX / 0x11); + let d = (c + (c >> 8)) & (u64::MAX / 0x101); + + // Branchless select + let t = (d >> 32) + (d >> 48); + s -= (t.wrapping_sub(n) & 256) >> 3; + n -= t & (t.wrapping_sub(n) >> 8); + + let t = (d >> (s - 16)) & 0xff; + s -= (t.wrapping_sub(n) & 256) >> 4; + n -= t & (t.wrapping_sub(n) >> 8); + + let t = (c >> (s - 8)) & 0xf; + s -= (t.wrapping_sub(n) & 256) >> 5; + n -= t & (t.wrapping_sub(n) >> 8); + + let t = (b >> (s - 4)) & 0x7; + s -= (t.wrapping_sub(n) & 256) >> 6; + n -= t & (t.wrapping_sub(n) >> 8); + + let t = (a >> (s - 
2)) & 0x3; + s -= (t.wrapping_sub(n) & 256) >> 7; + n -= t & (t.wrapping_sub(n) >> 8); + + let t = (v >> (s - 1)) & 0x1; + s -= (t.wrapping_sub(n) & 256) >> 8; + + 65 - s - 1 +} + +#[cfg(test)] +mod tests { + use super::*; + use rand::prelude::*; + use util::RandomCharIter; + + #[gpui::test(iterations = 100)] + fn test_random_chunks(mut rng: StdRng) { + let chunk_len = rng.gen_range(0..=MAX_BASE); + let text = RandomCharIter::new(&mut rng) + .take(chunk_len) + .collect::(); + let mut ix = chunk_len; + while !text.is_char_boundary(ix) { + ix -= 1; + } + let text = &text[..ix]; + + log::info!("Chunk: {:?}", text); + let chunk = Chunk::new(&text); + verify_chunk(chunk.as_slice(), text); + + for _ in 0..10 { + let mut start = rng.gen_range(0..=chunk.text.len()); + let mut end = rng.gen_range(start..=chunk.text.len()); + while !chunk.text.is_char_boundary(start) { + start -= 1; + } + while !chunk.text.is_char_boundary(end) { + end -= 1; + } + let range = start..end; + log::info!("Range: {:?}", range); + let text_slice = &text[range.clone()]; + let chunk_slice = chunk.slice(range); + verify_chunk(chunk_slice, text_slice); + } + } + + #[gpui::test(iterations = 1000)] + fn test_nth_set_bit_random(mut rng: StdRng) { + let set_count = rng.gen_range(0..=128); + let mut set_bits = (0..128).choose_multiple(&mut rng, set_count); + set_bits.sort(); + let mut n = 0; + for ix in set_bits.iter().copied() { + n |= 1 << ix; + } + + for (mut ix, position) in set_bits.into_iter().enumerate() { + ix += 1; + assert_eq!( + nth_set_bit(n, ix), + position, + "nth_set_bit({:0128b}, {})", + n, + ix + ); + } + } + + fn verify_chunk(chunk: ChunkSlice<'_>, text: &str) { + let mut offset = 0; + let mut offset_utf16 = OffsetUtf16(0); + let mut point = Point::zero(); + let mut point_utf16 = PointUtf16::zero(); + + log::info!("Verifying chunk {:?}", text); + assert_eq!(chunk.offset_to_point(0), Point::zero()); + + for c in text.chars() { + let expected_point = chunk.offset_to_point(offset); + 
assert_eq!(point, expected_point, "mismatch at offset {}", offset); + assert_eq!( + chunk.point_to_offset(point), + offset, + "mismatch at point {:?}", + point + ); + assert_eq!( + chunk.offset_to_offset_utf16(offset), + offset_utf16, + "mismatch at offset {}", + offset + ); + assert_eq!( + chunk.offset_utf16_to_offset(offset_utf16), + offset, + "mismatch at offset_utf16 {:?}", + offset_utf16 + ); + assert_eq!( + chunk.point_to_point_utf16(point), + point_utf16, + "mismatch at point {:?}", + point + ); + assert_eq!( + chunk.point_utf16_to_offset(point_utf16, false), + offset, + "mismatch at point_utf16 {:?}", + point_utf16 + ); + assert_eq!( + chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)), + point, + "mismatch for unclipped_point_utf16_to_point at {:?}", + point_utf16 + ); + + assert_eq!( + chunk.clip_point(point, Bias::Left), + point, + "incorrect left clip at {:?}", + point + ); + assert_eq!( + chunk.clip_point(point, Bias::Right), + point, + "incorrect right clip at {:?}", + point + ); + + for i in 1..c.len_utf8() { + let test_point = Point::new(point.row, point.column + i as u32); + assert_eq!( + chunk.clip_point(test_point, Bias::Left), + point, + "incorrect left clip within multi-byte char at {:?}", + test_point + ); + assert_eq!( + chunk.clip_point(test_point, Bias::Right), + Point::new(point.row, point.column + c.len_utf8() as u32), + "incorrect right clip within multi-byte char at {:?}", + test_point + ); + } + + for i in 1..c.len_utf16() { + let test_point = Unclipped(PointUtf16::new( + point_utf16.row, + point_utf16.column + i as u32, + )); + assert_eq!( + chunk.unclipped_point_utf16_to_point(test_point), + point, + "incorrect unclipped_point_utf16_to_point within multi-byte char at {:?}", + test_point + ); + assert_eq!( + chunk.clip_point_utf16(test_point, Bias::Left), + point_utf16, + "incorrect left clip_point_utf16 within multi-byte char at {:?}", + test_point + ); + assert_eq!( + chunk.clip_point_utf16(test_point, Bias::Right), + 
PointUtf16::new(point_utf16.row, point_utf16.column + c.len_utf16() as u32), + "incorrect right clip_point_utf16 within multi-byte char at {:?}", + test_point + ); + + let test_offset = OffsetUtf16(offset_utf16.0 + i); + assert_eq!( + chunk.clip_offset_utf16(test_offset, Bias::Left), + offset_utf16, + "incorrect left clip_offset_utf16 within multi-byte char at {:?}", + test_offset + ); + assert_eq!( + chunk.clip_offset_utf16(test_offset, Bias::Right), + OffsetUtf16(offset_utf16.0 + c.len_utf16()), + "incorrect right clip_offset_utf16 within multi-byte char at {:?}", + test_offset + ); + } + + if c == '\n' { + point.row += 1; + point.column = 0; + point_utf16.row += 1; + point_utf16.column = 0; + } else { + point.column += c.len_utf8() as u32; + point_utf16.column += c.len_utf16() as u32; + } + + offset += c.len_utf8(); + offset_utf16.0 += c.len_utf16(); + } + + let final_point = chunk.offset_to_point(offset); + assert_eq!(point, final_point, "mismatch at final offset {}", offset); + assert_eq!( + chunk.point_to_offset(point), + offset, + "mismatch at point {:?}", + point + ); + assert_eq!( + chunk.offset_to_offset_utf16(offset), + offset_utf16, + "mismatch at offset {}", + offset + ); + assert_eq!( + chunk.offset_utf16_to_offset(offset_utf16), + offset, + "mismatch at offset_utf16 {:?}", + offset_utf16 + ); + assert_eq!( + chunk.point_to_point_utf16(point), + point_utf16, + "mismatch at final point {:?}", + point + ); + assert_eq!( + chunk.point_utf16_to_offset(point_utf16, false), + offset, + "mismatch at final point_utf16 {:?}", + point_utf16 + ); + assert_eq!( + chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)), + point, + "mismatch for unclipped_point_utf16_to_point at final point {:?}", + point_utf16 + ); + assert_eq!( + chunk.clip_point(point, Bias::Left), + point, + "incorrect left clip at final point {:?}", + point + ); + assert_eq!( + chunk.clip_point(point, Bias::Right), + point, + "incorrect right clip at final point {:?}", + point + ); + 
assert_eq!( + chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Left), + point_utf16, + "incorrect left clip_point_utf16 at final point {:?}", + point_utf16 + ); + assert_eq!( + chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Right), + point_utf16, + "incorrect right clip_point_utf16 at final point {:?}", + point_utf16 + ); + assert_eq!( + chunk.clip_offset_utf16(offset_utf16, Bias::Left), + offset_utf16, + "incorrect left clip_offset_utf16 at final offset {:?}", + offset_utf16 + ); + assert_eq!( + chunk.clip_offset_utf16(offset_utf16, Bias::Right), + offset_utf16, + "incorrect right clip_offset_utf16 at final offset {:?}", + offset_utf16 + ); + + // Verify length methods + assert_eq!(chunk.len(), text.len()); + assert_eq!( + chunk.len_utf16().0, + text.chars().map(|c| c.len_utf16()).sum::() + ); + + // Verify line counting + let lines = chunk.lines(); + let mut newline_count = 0; + let mut last_line_len = 0; + for c in text.chars() { + if c == '\n' { + newline_count += 1; + last_line_len = 0; + } else { + last_line_len += c.len_utf8() as u32; + } + } + assert_eq!(lines, Point::new(newline_count, last_line_len)); + + // Verify first/last line chars + if !text.is_empty() { + let first_line = text.split('\n').next().unwrap(); + assert_eq!(chunk.first_line_chars(), first_line.chars().count() as u32); + + let last_line = text.split('\n').last().unwrap(); + assert_eq!(chunk.last_line_chars(), last_line.chars().count() as u32); + assert_eq!( + chunk.last_line_len_utf16(), + last_line.chars().map(|c| c.len_utf16() as u32).sum::() + ); + } + + // Verify longest row + let (longest_row, longest_chars) = chunk.longest_row(); + let mut max_chars = 0; + let mut current_row = 0; + let mut current_chars = 0; + let mut max_row = 0; + + for c in text.chars() { + if c == '\n' { + if current_chars > max_chars { + max_chars = current_chars; + max_row = current_row; + } + current_row += 1; + current_chars = 0; + } else { + current_chars += 1; + } + } + + if current_chars > 
max_chars { + max_chars = current_chars; + max_row = current_row; + } + + assert_eq!((max_row, max_chars as u32), (longest_row, longest_chars)); + } +} diff --git a/crates/rope/src/rope.rs b/crates/rope/src/rope.rs index 68ff7d5c69..89cb1e7b63 100644 --- a/crates/rope/src/rope.rs +++ b/crates/rope/src/rope.rs @@ -1,9 +1,11 @@ +mod chunk; mod offset_utf16; mod point; mod point_utf16; mod unclipped; -use arrayvec::ArrayString; +use chunk::{Chunk, ChunkSlice}; +use rayon::iter::{IntoParallelIterator, ParallelIterator as _}; use smallvec::SmallVec; use std::{ cmp, fmt, io, mem, @@ -11,20 +13,12 @@ use std::{ str, }; use sum_tree::{Bias, Dimension, SumTree}; -use unicode_segmentation::GraphemeCursor; -use util::debug_panic; pub use offset_utf16::OffsetUtf16; pub use point::Point; pub use point_utf16::PointUtf16; pub use unclipped::Unclipped; -#[cfg(test)] -const CHUNK_BASE: usize = 6; - -#[cfg(not(test))] -const CHUNK_BASE: usize = 64; - #[derive(Clone, Default)] pub struct Rope { chunks: SumTree, @@ -36,18 +30,25 @@ impl Rope { } pub fn append(&mut self, rope: Rope) { - let mut chunks = rope.chunks.cursor::<()>(&()); - chunks.next(&()); - if let Some(chunk) = chunks.item() { - if self.chunks.last().map_or(false, |c| c.0.len() < CHUNK_BASE) - || chunk.0.len() < CHUNK_BASE + if let Some(chunk) = rope.chunks.first() { + if self + .chunks + .last() + .map_or(false, |c| c.text.len() < chunk::MIN_BASE) + || chunk.text.len() < chunk::MIN_BASE { - self.push(&chunk.0); + self.push_chunk(chunk.as_slice()); + + let mut chunks = rope.chunks.cursor::<()>(&()); chunks.next(&()); + chunks.next(&()); + self.chunks.append(chunks.suffix(&()), &()); + self.check_invariants(); + return; } } - self.chunks.append(chunks.suffix(&()), &()); + self.chunks.append(rope.chunks.clone(), &()); self.check_invariants(); } @@ -77,11 +78,13 @@ impl Rope { pub fn push(&mut self, mut text: &str) { self.chunks.update_last( |last_chunk| { - let split_ix = if last_chunk.0.len() + text.len() <= 2 * 
CHUNK_BASE { + let split_ix = if last_chunk.text.len() + text.len() <= chunk::MAX_BASE { text.len() } else { - let mut split_ix = - cmp::min(CHUNK_BASE.saturating_sub(last_chunk.0.len()), text.len()); + let mut split_ix = cmp::min( + chunk::MIN_BASE.saturating_sub(last_chunk.text.len()), + text.len(), + ); while !text.is_char_boundary(split_ix) { split_ix += 1; } @@ -89,7 +92,7 @@ impl Rope { }; let (suffix, remainder) = text.split_at(split_ix); - last_chunk.0.push_str(suffix); + last_chunk.push_str(suffix); text = remainder; }, &(), @@ -101,12 +104,12 @@ impl Rope { let mut new_chunks = SmallVec::<[_; 16]>::new(); while !text.is_empty() { - let mut split_ix = cmp::min(2 * CHUNK_BASE, text.len()); + let mut split_ix = cmp::min(chunk::MAX_BASE, text.len()); while !text.is_char_boundary(split_ix) { split_ix -= 1; } let (chunk, remainder) = text.split_at(split_ix); - new_chunks.push(Chunk(ArrayString::from(chunk).unwrap())); + new_chunks.push(chunk); text = remainder; } @@ -116,9 +119,11 @@ impl Rope { const PARALLEL_THRESHOLD: usize = 4 * (2 * sum_tree::TREE_BASE); if new_chunks.len() >= PARALLEL_THRESHOLD { - self.chunks.par_extend(new_chunks.into_vec(), &()); + self.chunks + .par_extend(new_chunks.into_vec().into_par_iter().map(Chunk::new), &()); } else { - self.chunks.extend(new_chunks, &()); + self.chunks + .extend(new_chunks.into_iter().map(Chunk::new), &()); } self.check_invariants(); @@ -135,7 +140,7 @@ impl Rope { // a chunk ends with 3 bytes of a 4-byte character. These 3 bytes end up being stored in the following chunk, thus wasting // 3 bytes of storage in current chunk. // For example, a 1024-byte string can occupy between 32 (full ASCII, 1024/32) and 36 (full 4-byte UTF-8, 1024 / 29 rounded up) chunks. 
- const MIN_CHUNK_SIZE: usize = 2 * CHUNK_BASE - 3; + const MIN_CHUNK_SIZE: usize = chunk::MAX_BASE - 3; // We also round up the capacity up by one, for a good measure; we *really* don't want to realloc here, as we assume that the # of characters // we're working with there is large. @@ -143,12 +148,12 @@ impl Rope { let mut new_chunks = Vec::with_capacity(capacity); while !text.is_empty() { - let mut split_ix = cmp::min(2 * CHUNK_BASE, text.len()); + let mut split_ix = cmp::min(chunk::MAX_BASE, text.len()); while !text.is_char_boundary(split_ix) { split_ix -= 1; } let (chunk, remainder) = text.split_at(split_ix); - new_chunks.push(Chunk(ArrayString::from(chunk).unwrap())); + new_chunks.push(chunk); text = remainder; } @@ -158,13 +163,44 @@ impl Rope { const PARALLEL_THRESHOLD: usize = 4 * (2 * sum_tree::TREE_BASE); if new_chunks.len() >= PARALLEL_THRESHOLD { - self.chunks.par_extend(new_chunks, &()); + self.chunks + .par_extend(new_chunks.into_par_iter().map(Chunk::new), &()); } else { - self.chunks.extend(new_chunks, &()); + self.chunks + .extend(new_chunks.into_iter().map(Chunk::new), &()); } self.check_invariants(); } + + fn push_chunk(&mut self, mut chunk: ChunkSlice) { + self.chunks.update_last( + |last_chunk| { + let split_ix = if last_chunk.text.len() + chunk.len() <= chunk::MAX_BASE { + chunk.len() + } else { + let mut split_ix = cmp::min( + chunk::MIN_BASE.saturating_sub(last_chunk.text.len()), + chunk.len(), + ); + while !chunk.is_char_boundary(split_ix) { + split_ix += 1; + } + split_ix + }; + + let (suffix, remainder) = chunk.split_at(split_ix); + last_chunk.append(suffix); + chunk = remainder; + }, + &(), + ); + + if !chunk.is_empty() { + self.chunks.push(chunk.into(), &()); + } + } + pub fn push_front(&mut self, text: &str) { let suffix = mem::replace(self, Rope::from(text)); self.append(suffix); @@ -178,7 +214,7 @@ impl Rope { let mut chunks = self.chunks.cursor::<()>(&()).peekable(); while let Some(chunk) = chunks.next() { if 
chunks.peek().is_some() { - assert!(chunk.0.len() + 3 >= CHUNK_BASE); + assert!(chunk.text.len() + 3 >= chunk::MIN_BASE); } } } @@ -250,7 +286,7 @@ impl Rope { let overshoot = offset - cursor.start().0; cursor.start().1 + cursor.item().map_or(Default::default(), |chunk| { - chunk.offset_to_offset_utf16(overshoot) + chunk.as_slice().offset_to_offset_utf16(overshoot) }) } @@ -263,7 +299,7 @@ impl Rope { let overshoot = offset - cursor.start().0; cursor.start().1 + cursor.item().map_or(Default::default(), |chunk| { - chunk.offset_utf16_to_offset(overshoot) + chunk.as_slice().offset_utf16_to_offset(overshoot) }) } @@ -275,9 +311,9 @@ impl Rope { cursor.seek(&offset, Bias::Left, &()); let overshoot = offset - cursor.start().0; cursor.start().1 - + cursor - .item() - .map_or(Point::zero(), |chunk| chunk.offset_to_point(overshoot)) + + cursor.item().map_or(Point::zero(), |chunk| { + chunk.as_slice().offset_to_point(overshoot) + }) } pub fn offset_to_point_utf16(&self, offset: usize) -> PointUtf16 { @@ -289,7 +325,7 @@ impl Rope { let overshoot = offset - cursor.start().0; cursor.start().1 + cursor.item().map_or(PointUtf16::zero(), |chunk| { - chunk.offset_to_point_utf16(overshoot) + chunk.as_slice().offset_to_point_utf16(overshoot) }) } @@ -302,7 +338,7 @@ impl Rope { let overshoot = point - cursor.start().0; cursor.start().1 + cursor.item().map_or(PointUtf16::zero(), |chunk| { - chunk.point_to_point_utf16(overshoot) + chunk.as_slice().point_to_point_utf16(overshoot) }) } @@ -316,7 +352,7 @@ impl Rope { cursor.start().1 + cursor .item() - .map_or(0, |chunk| chunk.point_to_offset(overshoot)) + .map_or(0, |chunk| chunk.as_slice().point_to_offset(overshoot)) } pub fn point_utf16_to_offset(&self, point: PointUtf16) -> usize { @@ -335,9 +371,9 @@ impl Rope { cursor.seek(&point, Bias::Left, &()); let overshoot = point - cursor.start().0; cursor.start().1 - + cursor - .item() - .map_or(0, |chunk| chunk.point_utf16_to_offset(overshoot, clip)) + + cursor.item().map_or(0, |chunk| { 
+ chunk.as_slice().point_utf16_to_offset(overshoot, clip) + }) } pub fn unclipped_point_utf16_to_point(&self, point: Unclipped) -> Point { @@ -349,7 +385,7 @@ impl Rope { let overshoot = Unclipped(point.0 - cursor.start().0); cursor.start().1 + cursor.item().map_or(Point::zero(), |chunk| { - chunk.unclipped_point_utf16_to_point(overshoot) + chunk.as_slice().unclipped_point_utf16_to_point(overshoot) }) } @@ -358,7 +394,7 @@ impl Rope { cursor.seek(&offset, Bias::Left, &()); if let Some(chunk) = cursor.item() { let mut ix = offset - cursor.start(); - while !chunk.0.is_char_boundary(ix) { + while !chunk.text.is_char_boundary(ix) { match bias { Bias::Left => { ix -= 1; @@ -381,7 +417,7 @@ impl Rope { cursor.seek(&offset, Bias::Right, &()); if let Some(chunk) = cursor.item() { let overshoot = offset - cursor.start(); - *cursor.start() + chunk.clip_offset_utf16(overshoot, bias) + *cursor.start() + chunk.as_slice().clip_offset_utf16(overshoot, bias) } else { self.summary().len_utf16 } @@ -392,7 +428,7 @@ impl Rope { cursor.seek(&point, Bias::Right, &()); if let Some(chunk) = cursor.item() { let overshoot = point - cursor.start(); - *cursor.start() + chunk.clip_point(overshoot, bias) + *cursor.start() + chunk.as_slice().clip_point(overshoot, bias) } else { self.summary().lines } @@ -403,7 +439,7 @@ impl Rope { cursor.seek(&point.0, Bias::Right, &()); if let Some(chunk) = cursor.item() { let overshoot = Unclipped(point.0 - cursor.start()); - *cursor.start() + chunk.clip_point_utf16(overshoot, bias) + *cursor.start() + chunk.as_slice().clip_point_utf16(overshoot, bias) } else { self.summary().lines_utf16() } @@ -500,7 +536,7 @@ impl<'a> Cursor<'a> { if let Some(start_chunk) = self.chunks.item() { let start_ix = self.offset - self.chunks.start(); let end_ix = cmp::min(end_offset, self.chunks.end(&())) - self.chunks.start(); - slice.push(&start_chunk.0[start_ix..end_ix]); + slice.push_chunk(start_chunk.slice(start_ix..end_ix)); } if end_offset > self.chunks.end(&()) { @@ 
-510,7 +546,7 @@ impl<'a> Cursor<'a> { }); if let Some(end_chunk) = self.chunks.item() { let end_ix = end_offset - self.chunks.start(); - slice.push(&end_chunk.0[..end_ix]); + slice.push_chunk(end_chunk.slice(0..end_ix)); } } @@ -525,9 +561,7 @@ impl<'a> Cursor<'a> { if let Some(start_chunk) = self.chunks.item() { let start_ix = self.offset - self.chunks.start(); let end_ix = cmp::min(end_offset, self.chunks.end(&())) - self.chunks.start(); - summary.add_assign(&D::from_text_summary(&TextSummary::from( - &start_chunk.0[start_ix..end_ix], - ))); + summary.add_assign(&D::from_chunk(start_chunk.slice(start_ix..end_ix))); } if end_offset > self.chunks.end(&()) { @@ -535,9 +569,7 @@ impl<'a> Cursor<'a> { summary.add_assign(&self.chunks.summary(&end_offset, Bias::Right, &())); if let Some(end_chunk) = self.chunks.item() { let end_ix = end_offset - self.chunks.start(); - summary.add_assign(&D::from_text_summary(&TextSummary::from( - &end_chunk.0[..end_ix], - ))); + summary.add_assign(&D::from_chunk(end_chunk.slice(0..end_ix))); } } @@ -678,11 +710,11 @@ impl<'a> Chunks<'a> { if let Some(chunk) = self.chunks.item() { let mut end_ix = self.offset - *self.chunks.start(); - if chunk.0.as_bytes()[end_ix - 1] == b'\n' { + if chunk.text.as_bytes()[end_ix - 1] == b'\n' { end_ix -= 1; } - if let Some(newline_ix) = chunk.0[..end_ix].rfind('\n') { + if let Some(newline_ix) = chunk.text[..end_ix].rfind('\n') { self.offset = *self.chunks.start() + newline_ix + 1; if self.offset_is_valid() { return true; @@ -694,7 +726,7 @@ impl<'a> Chunks<'a> { .search_backward(|summary| summary.text.lines.row > 0, &()); self.offset = *self.chunks.start(); if let Some(chunk) = self.chunks.item() { - if let Some(newline_ix) = chunk.0.rfind('\n') { + if let Some(newline_ix) = chunk.text.rfind('\n') { self.offset += newline_ix + 1; if self.offset_is_valid() { if self.offset == self.chunks.end(&()) { @@ -731,7 +763,7 @@ impl<'a> Chunks<'a> { slice_start..slice_end }; - Some(&chunk.0[slice_range]) + 
Some(&chunk.text[slice_range]) } pub fn lines(self) -> Lines<'a> { @@ -798,7 +830,7 @@ impl<'a> Bytes<'a> { } let start = self.range.start.saturating_sub(chunk_start); let end = self.range.end - chunk_start; - Some(&chunk.0.as_bytes()[start..chunk.0.len().min(end)]) + Some(&chunk.text.as_bytes()[start..chunk.text.len().min(end)]) } } @@ -902,265 +934,13 @@ impl<'a> Lines<'a> { } } -#[derive(Clone, Debug, Default)] -struct Chunk(ArrayString<{ 2 * CHUNK_BASE }>); - -impl Chunk { - fn offset_to_offset_utf16(&self, target: usize) -> OffsetUtf16 { - let mut offset = 0; - let mut offset_utf16 = OffsetUtf16(0); - for ch in self.0.chars() { - if offset >= target { - break; - } - - offset += ch.len_utf8(); - offset_utf16.0 += ch.len_utf16(); - } - offset_utf16 - } - - fn offset_utf16_to_offset(&self, target: OffsetUtf16) -> usize { - let mut offset_utf16 = OffsetUtf16(0); - let mut offset = 0; - for ch in self.0.chars() { - if offset_utf16 >= target { - break; - } - - offset += ch.len_utf8(); - offset_utf16.0 += ch.len_utf16(); - } - offset - } - - fn offset_to_point(&self, target: usize) -> Point { - let mut offset = 0; - let mut point = Point::new(0, 0); - for ch in self.0.chars() { - if offset >= target { - break; - } - - if ch == '\n' { - point.row += 1; - point.column = 0; - } else { - point.column += ch.len_utf8() as u32; - } - offset += ch.len_utf8(); - } - point - } - - fn offset_to_point_utf16(&self, target: usize) -> PointUtf16 { - let mut offset = 0; - let mut point = PointUtf16::new(0, 0); - for ch in self.0.chars() { - if offset >= target { - break; - } - - if ch == '\n' { - point.row += 1; - point.column = 0; - } else { - point.column += ch.len_utf16() as u32; - } - offset += ch.len_utf8(); - } - point - } - - fn point_to_offset(&self, target: Point) -> usize { - let mut offset = 0; - let mut point = Point::new(0, 0); - - for ch in self.0.chars() { - if point >= target { - if point > target { - debug_panic!("point {target:?} is inside of character {ch:?}"); - 
} - break; - } - - if ch == '\n' { - point.row += 1; - point.column = 0; - - if point.row > target.row { - debug_panic!( - "point {target:?} is beyond the end of a line with length {}", - point.column - ); - break; - } - } else { - point.column += ch.len_utf8() as u32; - } - - offset += ch.len_utf8(); - } - - offset - } - - fn point_to_point_utf16(&self, target: Point) -> PointUtf16 { - let mut point = Point::zero(); - let mut point_utf16 = PointUtf16::new(0, 0); - for ch in self.0.chars() { - if point >= target { - break; - } - - if ch == '\n' { - point_utf16.row += 1; - point_utf16.column = 0; - point.row += 1; - point.column = 0; - } else { - point_utf16.column += ch.len_utf16() as u32; - point.column += ch.len_utf8() as u32; - } - } - point_utf16 - } - - fn point_utf16_to_offset(&self, target: PointUtf16, clip: bool) -> usize { - let mut offset = 0; - let mut point = PointUtf16::new(0, 0); - - for ch in self.0.chars() { - if point == target { - break; - } - - if ch == '\n' { - point.row += 1; - point.column = 0; - - if point.row > target.row { - if !clip { - debug_panic!( - "point {target:?} is beyond the end of a line with length {}", - point.column - ); - } - // Return the offset of the newline - return offset; - } - } else { - point.column += ch.len_utf16() as u32; - } - - if point > target { - if !clip { - debug_panic!("point {target:?} is inside of codepoint {ch:?}"); - } - // Return the offset of the codepoint which we have landed within, bias left - return offset; - } - - offset += ch.len_utf8(); - } - - offset - } - - fn unclipped_point_utf16_to_point(&self, target: Unclipped) -> Point { - let mut point = Point::zero(); - let mut point_utf16 = PointUtf16::zero(); - - for ch in self.0.chars() { - if point_utf16 == target.0 { - break; - } - - if point_utf16 > target.0 { - // If the point is past the end of a line or inside of a code point, - // return the last valid point before the target. 
- return point; - } - - if ch == '\n' { - point_utf16 += PointUtf16::new(1, 0); - point += Point::new(1, 0); - } else { - point_utf16 += PointUtf16::new(0, ch.len_utf16() as u32); - point += Point::new(0, ch.len_utf8() as u32); - } - } - - point - } - - fn clip_point(&self, target: Point, bias: Bias) -> Point { - for (row, line) in self.0.split('\n').enumerate() { - if row == target.row as usize { - let bytes = line.as_bytes(); - let mut column = target.column.min(bytes.len() as u32) as usize; - if column == 0 - || column == bytes.len() - || (bytes[column - 1] < 128 && bytes[column] < 128) - { - return Point::new(row as u32, column as u32); - } - - let mut grapheme_cursor = GraphemeCursor::new(column, bytes.len(), true); - loop { - if line.is_char_boundary(column) - && grapheme_cursor.is_boundary(line, 0).unwrap_or(false) - { - break; - } - - match bias { - Bias::Left => column -= 1, - Bias::Right => column += 1, - } - grapheme_cursor.set_cursor(column); - } - return Point::new(row as u32, column as u32); - } - } - unreachable!() - } - - fn clip_point_utf16(&self, target: Unclipped, bias: Bias) -> PointUtf16 { - for (row, line) in self.0.split('\n').enumerate() { - if row == target.0.row as usize { - let mut code_units = line.encode_utf16(); - let mut column = code_units.by_ref().take(target.0.column as usize).count(); - if char::decode_utf16(code_units).next().transpose().is_err() { - match bias { - Bias::Left => column -= 1, - Bias::Right => column += 1, - } - } - return PointUtf16::new(row as u32, column as u32); - } - } - unreachable!() - } - - fn clip_offset_utf16(&self, target: OffsetUtf16, bias: Bias) -> OffsetUtf16 { - let mut code_units = self.0.encode_utf16(); - let mut offset = code_units.by_ref().take(target.0).count(); - if char::decode_utf16(code_units).next().transpose().is_err() { - match bias { - Bias::Left => offset -= 1, - Bias::Right => offset += 1, - } - } - OffsetUtf16(offset) - } -} - impl sum_tree::Item for Chunk { type Summary = 
ChunkSummary; fn summary(&self, _cx: &()) -> Self::Summary { - ChunkSummary::from(self.0.as_str()) + ChunkSummary { + text: self.as_slice().text_summary(), + } } } @@ -1169,14 +949,6 @@ pub struct ChunkSummary { text: TextSummary, } -impl<'a> From<&'a str> for ChunkSummary { - fn from(text: &'a str) -> Self { - Self { - text: TextSummary::from(text), - } - } -} - impl sum_tree::Summary for ChunkSummary { type Context = (); @@ -1323,6 +1095,7 @@ impl std::ops::AddAssign for TextSummary { pub trait TextDimension: 'static + for<'a> Dimension<'a, ChunkSummary> { fn from_text_summary(summary: &TextSummary) -> Self; + fn from_chunk(chunk: ChunkSlice) -> Self; fn add_assign(&mut self, other: &Self); } @@ -1334,6 +1107,10 @@ impl TextDimension for (D1, D2) { ) } + fn from_chunk(chunk: ChunkSlice) -> Self { + (D1::from_chunk(chunk), D2::from_chunk(chunk)) + } + fn add_assign(&mut self, other: &Self) { self.0.add_assign(&other.0); self.1.add_assign(&other.1); @@ -1355,6 +1132,10 @@ impl TextDimension for TextSummary { summary.clone() } + fn from_chunk(chunk: ChunkSlice) -> Self { + chunk.text_summary() + } + fn add_assign(&mut self, other: &Self) { *self += other; } @@ -1375,6 +1156,10 @@ impl TextDimension for usize { summary.len } + fn from_chunk(chunk: ChunkSlice) -> Self { + chunk.len() + } + fn add_assign(&mut self, other: &Self) { *self += other; } @@ -1395,6 +1180,10 @@ impl TextDimension for OffsetUtf16 { summary.len_utf16 } + fn from_chunk(chunk: ChunkSlice) -> Self { + chunk.len_utf16() + } + fn add_assign(&mut self, other: &Self) { *self += other; } @@ -1415,6 +1204,10 @@ impl TextDimension for Point { summary.lines } + fn from_chunk(chunk: ChunkSlice) -> Self { + chunk.lines() + } + fn add_assign(&mut self, other: &Self) { *self += other; } @@ -1435,6 +1228,13 @@ impl TextDimension for PointUtf16 { summary.lines_utf16() } + fn from_chunk(chunk: ChunkSlice) -> Self { + PointUtf16 { + row: chunk.lines().row, + column: chunk.last_line_len_utf16(), + } + } + fn 
add_assign(&mut self, other: &Self) { *self += other; } @@ -1919,7 +1719,7 @@ mod tests { fn text(&self) -> String { let mut text = String::new(); for chunk in self.chunks.cursor::<()>(&()) { - text.push_str(&chunk.0); + text.push_str(&chunk.text); } text } diff --git a/crates/rope/src/unclipped.rs b/crates/rope/src/unclipped.rs index b3427e2cb9..679901875c 100644 --- a/crates/rope/src/unclipped.rs +++ b/crates/rope/src/unclipped.rs @@ -1,4 +1,4 @@ -use crate::{ChunkSummary, TextDimension, TextSummary}; +use crate::{chunk::ChunkSlice, ChunkSummary, TextDimension, TextSummary}; use std::ops::{Add, AddAssign, Sub, SubAssign}; #[derive(Debug, Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] @@ -27,6 +27,10 @@ impl TextDimension for Unclipped { Unclipped(T::from_text_summary(summary)) } + fn from_chunk(chunk: ChunkSlice) -> Self { + Unclipped(T::from_chunk(chunk)) + } + fn add_assign(&mut self, other: &Self) { TextDimension::add_assign(&mut self.0, &other.0); } diff --git a/crates/sum_tree/Cargo.toml b/crates/sum_tree/Cargo.toml index b370e6df18..06ca955767 100644 --- a/crates/sum_tree/Cargo.toml +++ b/crates/sum_tree/Cargo.toml @@ -14,7 +14,7 @@ doctest = false [dependencies] arrayvec = "0.7.1" -rayon = "1.8" +rayon.workspace = true log.workspace = true [dev-dependencies]