Add a debug_assert! to verify utf8_char_boundary

Revise some comments
Inline a variable
2025-07-02 16:21:21 -04:00 · 2025-07-02 16:21:21 -04:00 · 2025-07-02 16:21:21 -04:00 · 2025-07-02 16:21:21 -04:00 · 2025-07-02 16:21:21 -04:00 · 2025-07-02 16:21:21 -04:00
1 changed files with 265 additions and 9 deletions
--- a/crates/editor/src/display_map/inlay_map.rs
+++ b/crates/editor/src/display_map/inlay_map.rs
@ -296,12 +296,10 @@ impl<'a> Iterator for InlayChunks<'a> {
                    *chunk = self.buffer_chunks.next().unwrap();
                }
-                let (prefix, suffix) = chunk.text.split_at(
+                let (prefix, suffix) = chunk.text.split_at(utf8_char_boundary(
-                    chunk
+                    chunk.text,
-                        .text
+                    self.transforms.end(&()).0.0 - self.output_offset.0,
-                        .len()
+                ));
                        .min(self.transforms.end(&()).0.0 - self.output_offset.0),
                );
                chunk.text = suffix;
                self.output_offset.0 += prefix.len();
@ -391,8 +389,10 @@ impl<'a> Iterator for InlayChunks<'a> {
                let inlay_chunk = self
                    .inlay_chunk
                    .get_or_insert_with(|| inlay_chunks.next().unwrap());
-                let (chunk, remainder) =
+                let (chunk, remainder) = inlay_chunk.split_at(utf8_char_boundary(
-                    inlay_chunk.split_at(inlay_chunk.len().min(next_inlay_highlight_endpoint));
+                    inlay_chunk,
                    next_inlay_highlight_endpoint,
                ));
                *inlay_chunk = remainder;
                if inlay_chunk.is_empty() {
                    self.inlay_chunk = None;
@ -412,7 +412,7 @@ impl<'a> Iterator for InlayChunks<'a> {
            }
        };
-        if self.output_offset == self.transforms.end(&()).0 {
+        if self.output_offset >= self.transforms.end(&()).0 {
            self.inlay_chunks = None;
            self.transforms.next(&());
        }
@ -1143,6 +1143,56 @@ fn push_isomorphic(sum_tree: &mut SumTree<Transform>, summary: TextSummary) {
    }
 }
 /// Rounds `byte_index` down to the nearest UTF-8 `char` boundary in `text`, so
 /// the result can always be passed to `str::split_at` without panicking.
 ///
 /// We round *down* rather than up so that the returned index never exceeds the
 /// requested one: a caller asking to split after `n` bytes never receives more
 /// than `n` bytes in the prefix.
 ///
 /// An index at or beyond `text.len()` is clamped to `text.len()`. The end of a
 /// string is itself a boundary, so a caller asking for "the whole string or
 /// more" gets the whole string instead of everything except its final `char`.
 ///
 /// # Panics
 ///
 /// Panics if given an empty slice.
 #[inline(always)]
 fn utf8_char_boundary(text: &str, byte_index: usize) -> usize {
    // Clamp to the string's length (not `len - 1`): `text.len()` is a valid
    // split point and must remain reachable.
    let start_byte_index = byte_index.min(text.len());
    assert!(
        !text.is_empty(),
        "Tried to find UTF-8 char boundary at index {start_byte_index} in a string with length {}",
        text.len()
    );

    let mut boundary = start_byte_index;
    // Index 0 is always a boundary, so this loop terminates before `boundary`
    // could underflow.
    while !text.is_char_boundary(boundary) {
        boundary -= 1;
        // A UTF-8 scalar value has at most 3 continuation bytes, so we should
        // never step back more than 3 times. If we do, the &str was invalid
        // UTF-8, which should never happen!
        debug_assert!(
            start_byte_index - boundary < 4,
            "Looked back {} bytes without finding a UTF-8 boundary - the given string must be malformed",
            (start_byte_index - boundary).saturating_add(1)
        );
    }
    boundary
 }
 #[cfg(test)]
 mod tests {
    use super::*;
@ -1882,4 +1932,210 @@ mod tests {
        cx.set_global(store);
        theme::init(theme::LoadThemes::JustBase, cx);
    }
    /// Builds a highlight map holding exactly one entry: a default-styled
    /// highlight covering `highlight_range` of the inlay `inlay_id` anchored at
    /// `position`, registered under the `TypeId` of `()`.
    fn create_inlay_highlights(
        inlay_id: InlayId,
        highlight_range: Range<usize>,
        position: Anchor,
    ) -> TreeMap<TypeId, TreeMap<InlayId, (HighlightStyle, InlayHighlight)>> {
        let mut by_inlay = TreeMap::default();
        let mut by_type = TreeMap::default();

        let highlight = InlayHighlight {
            inlay: inlay_id,
            range: highlight_range,
            inlay_position: position,
        };
        by_inlay.insert(inlay_id, (HighlightStyle::default(), highlight));

        by_type.insert(TypeId::of::<()>(), by_inlay);
        by_type
    }
    #[gpui::test]
    fn test_inlay_utf8_boundary_panic_fix(cx: &mut App) {
        init_test(cx);

        // Regression test for https://github.com/zed-industries/zed/issues/33641
        //
        // An inlay highlight whose end offset lands inside a multi-byte
        // character used to panic when the inlay text was split for
        // highlighting. Here the inlay ends in '…' (3 bytes: E2 80 A6, at byte
        // offsets 12, 13 and 14) and the highlight asks for bytes 0..13.
        let inlay_text = "SortingDirec…";

        let buffer = MultiBuffer::build_simple("fn main() {}\n", cx);
        let (mut inlay_map, _) = InlayMap::new(buffer.read(cx).snapshot(cx));
        let position = buffer.read(cx).snapshot(cx).anchor_before(Point::new(0, 5));

        let (inlay_snapshot, _) = inlay_map.splice(
            &[],
            vec![Inlay {
                id: InlayId::Hint(0),
                position,
                text: text::Rope::from(inlay_text),
                color: None,
            }],
        );

        let inlay_highlights = create_inlay_highlights(InlayId::Hint(0), 0..13, position);

        // Walking every chunk of the snapshot is the step that used to panic.
        let whole_snapshot = InlayOffset(0)..InlayOffset(inlay_snapshot.len().0);
        let chunks = inlay_snapshot
            .chunks(
                whole_snapshot,
                false,
                crate::display_map::Highlights {
                    text_highlights: None,
                    inlay_highlights: Some(&inlay_highlights),
                    styles: crate::display_map::HighlightStyles::default(),
                },
            )
            .collect::<Vec<_>>();

        // The concatenated chunks must reproduce the buffer text with the inlay
        // spliced in at column 5.
        let full_text = chunks.iter().map(|c| c.chunk.text).collect::<String>();
        assert_eq!(full_text, "fn maSortingDirec…in() {}\n");

        // Exactly one highlighted inlay chunk is expected, and it must contain
        // the ellipsis in full.
        let highlighted_chunks = chunks
            .iter()
            .filter(|c| c.chunk.is_inlay && c.chunk.highlight_style.is_some())
            .collect::<Vec<_>>();
        assert_eq!(highlighted_chunks.len(), 1);
        assert_eq!(highlighted_chunks[0].chunk.text, "SortingDirec…");
    }
    #[gpui::test]
    fn test_inlay_utf8_boundaries(cx: &mut App) {
        init_test(cx);

        /// One table row: an inlay's text, the byte range to highlight within
        /// it, and the text we expect to come back highlighted.
        struct Case {
            text: &'static str,
            range: Range<usize>,
            expected: &'static str,
            description: &'static str,
        }

        let cases = [
            Case {
                text: "Hello👋World",
                range: 0..7,
                expected: "Hello👋",
                description: "Emoji boundary - rounds up to include full emoji",
            },
            Case {
                text: "Test→End",
                range: 0..5,
                expected: "Test→",
                description: "Arrow boundary - rounds up to include full arrow",
            },
            Case {
                text: "café",
                range: 0..4,
                expected: "café",
                description: "Accented char boundary - rounds up to include full é",
            },
            Case {
                text: "🎨🎭🎪",
                range: 0..5,
                expected: "🎨🎭",
                description: "Multiple emojis - partial highlight",
            },
            Case {
                text: "普通话",
                range: 0..4,
                expected: "普通",
                description: "Chinese characters - partial highlight",
            },
            Case {
                text: "Hello",
                range: 0..2,
                expected: "He",
                description: "ASCII only - no adjustment needed",
            },
            Case {
                text: "👋",
                range: 0..1,
                expected: "👋",
                description: "Single emoji - partial byte range includes whole char",
            },
            Case {
                text: "Test",
                range: 0..0,
                expected: "",
                description: "Empty range",
            },
            Case {
                text: "🎨ABC",
                range: 2..5,
                expected: "A",
                description: "Range starting mid-emoji skips the emoji",
            },
        ];

        for case in cases {
            // Fresh buffer per case, with the inlay inserted between "te" and "st".
            let buffer = MultiBuffer::build_simple("test", cx);
            let (mut inlay_map, _) = InlayMap::new(buffer.read(cx).snapshot(cx));
            let position = buffer.read(cx).snapshot(cx).anchor_before(Point::new(0, 2));

            let (inlay_snapshot, _) = inlay_map.splice(
                &[],
                vec![Inlay {
                    id: InlayId::Hint(0),
                    position,
                    text: text::Rope::from(case.text),
                    color: None,
                }],
            );

            let inlay_highlights =
                create_inlay_highlights(InlayId::Hint(0), case.range.clone(), position);

            let whole_snapshot = InlayOffset(0)..InlayOffset(inlay_snapshot.len().0);
            let chunks = inlay_snapshot
                .chunks(
                    whole_snapshot,
                    false,
                    crate::display_map::Highlights {
                        text_highlights: None,
                        inlay_highlights: Some(&inlay_highlights),
                        styles: crate::display_map::HighlightStyles::default(),
                    },
                )
                .collect::<Vec<_>>();

            // All chunks together must reproduce the buffer text with the inlay
            // spliced in.
            let full_text = chunks.iter().map(|c| c.chunk.text).collect::<String>();
            assert_eq!(
                full_text,
                format!("te{}st", case.text),
                "Full text mismatch for case: {}",
                case.description
            );

            // Only chunks that belong to the inlay and carry a highlight style
            // count towards the highlighted text.
            let highlighted_text = chunks
                .iter()
                .filter(|c| c.chunk.is_inlay && c.chunk.highlight_style.is_some())
                .map(|c| c.chunk.text)
                .collect::<String>();
            assert_eq!(
                highlighted_text, case.expected,
                "Highlighted text mismatch for case: {} (text: '{}', range: {:?})",
                case.description, case.text, case.range
            );
        }
    }
 }
Author	SHA1	Message	Date
Richard Feldman	211f20f41f	Add a debug_assert! to verify utf8_char_boundary	2025-07-02 16:21:21 -04:00
Richard Feldman	6107e7c604	Revise some comments	2025-07-02 16:21:21 -04:00
Richard Feldman	53ce77a0f7	Inline a variable	2025-07-02 16:21:21 -04:00
Richard Feldman	b69a09892b	Delete a redundant test	2025-07-02 16:21:21 -04:00
Richard Feldman	7e152e0439	Make a test name more concise	2025-07-02 16:21:21 -04:00
Richard Feldman	45fd87e63a	Placate spell checker	2025-07-02 16:21:21 -04:00
Richard Feldman	6e19923c27	Revise utf8_char_boundary	2025-07-02 16:21:21 -04:00
Richard Feldman	92fb7656c4	Only split inlay chunks at character boundaries	2025-07-02 16:21:21 -04:00
Richard Feldman	2de99369f4	Reproduce #33641 in a test	2025-07-02 16:21:21 -04:00