Fix text wrapping in commit message editors (#31030)

Don't hard wrap interactively; instead, soft wrap in `Bounded` mode
(editor width or 72 chars, whichever is smaller), and then hard wrap
before sending the commit message to git.

This also makes the soft wrap mode and width for commit messages
configurable in language settings.

Previously we didn't support soft wrap modes other than `EditorWidth` in
auto-height editors; I tried to add support for this by analogy with
code that was already there, and it seems to work pretty well.

Closes #27508

Release Notes:

- Fixed confusing wrapping behavior in commit message editors.
This commit is contained in:
Cole Miller 2025-05-26 09:11:56 -04:00 committed by GitHub
parent a58c48f629
commit f2601ce52c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 396 additions and 376 deletions

3
Cargo.lock generated
View file

@ -4729,7 +4729,6 @@ dependencies = [
"tree-sitter-rust", "tree-sitter-rust",
"tree-sitter-typescript", "tree-sitter-typescript",
"ui", "ui",
"unicode-script",
"unicode-segmentation", "unicode-segmentation",
"unindent", "unindent",
"url", "url",
@ -17106,6 +17105,8 @@ dependencies = [
"tempfile", "tempfile",
"tendril", "tendril",
"unicase", "unicase",
"unicode-script",
"unicode-segmentation",
"util_macros", "util_macros",
"walkdir", "walkdir",
"workspace-hack", "workspace-hack",

View file

@ -1431,7 +1431,9 @@
"language_servers": ["erlang-ls", "!elp", "..."] "language_servers": ["erlang-ls", "!elp", "..."]
}, },
"Git Commit": { "Git Commit": {
"allow_rewrap": "anywhere" "allow_rewrap": "anywhere",
"preferred_line_length": 72,
"soft_wrap": "bounded"
}, },
"Go": { "Go": {
"code_actions_on_format": { "code_actions_on_format": {

View file

@ -82,7 +82,6 @@ tree-sitter-rust = { workspace = true, optional = true }
tree-sitter-typescript = { workspace = true, optional = true } tree-sitter-typescript = { workspace = true, optional = true }
tree-sitter-python = { workspace = true, optional = true } tree-sitter-python = { workspace = true, optional = true }
unicode-segmentation.workspace = true unicode-segmentation.workspace = true
unicode-script.workspace = true
unindent = { workspace = true, optional = true } unindent = { workspace = true, optional = true }
ui.workspace = true ui.workspace = true
url.workspace = true url.workspace = true

View file

@ -201,7 +201,7 @@ use ui::{
ButtonSize, ButtonStyle, ContextMenu, Disclosure, IconButton, IconButtonShape, IconName, ButtonSize, ButtonStyle, ContextMenu, Disclosure, IconButton, IconButtonShape, IconName,
IconSize, Indicator, Key, Tooltip, h_flex, prelude::*, IconSize, Indicator, Key, Tooltip, h_flex, prelude::*,
}; };
use util::{RangeExt, ResultExt, TryFutureExt, maybe, post_inc}; use util::{RangeExt, ResultExt, TryFutureExt, maybe, post_inc, wrap_with_prefix};
use workspace::{ use workspace::{
CollaboratorId, Item as WorkspaceItem, ItemId, ItemNavHistory, OpenInTerminal, OpenTerminal, CollaboratorId, Item as WorkspaceItem, ItemId, ItemNavHistory, OpenInTerminal, OpenTerminal,
RestoreOnStartupBehavior, SERIALIZATION_THROTTLE_TIME, SplitDirection, TabBarSettings, Toast, RestoreOnStartupBehavior, SERIALIZATION_THROTTLE_TIME, SplitDirection, TabBarSettings, Toast,
@ -19440,347 +19440,6 @@ fn update_uncommitted_diff_for_buffer(
}) })
} }
/// Counts the display width of `text`, expanding each tab to the next tab
/// stop.
///
/// `offset` is the column at which `text` begins — it determines where tab
/// stops fall — but is subtracted back out, so the result is the width
/// contributed by `text` alone. Every non-tab char counts as one column.
fn char_len_with_expanded_tabs(offset: usize, text: &str, tab_size: NonZeroU32) -> usize {
    let stop = tab_size.get() as usize;
    let end = text.chars().fold(offset, |col, ch| match ch {
        // A tab advances to the next multiple of the tab size.
        '\t' => col + (stop - col % stop),
        _ => col + 1,
    });
    end - offset
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Tab expansion measured at several offsets and tab sizes.
    #[test]
    fn test_string_size_with_expanded_tabs() {
        let tab = |width| NonZeroU32::new(width).unwrap();
        assert_eq!(char_len_with_expanded_tabs(0, "", tab(4)), 0);
        assert_eq!(char_len_with_expanded_tabs(0, "hello", tab(4)), 5);
        assert_eq!(char_len_with_expanded_tabs(0, "\thello", tab(4)), 9);
        assert_eq!(char_len_with_expanded_tabs(0, "abc\tab", tab(4)), 6);
        assert_eq!(char_len_with_expanded_tabs(0, "hello\t", tab(4)), 8);
        assert_eq!(char_len_with_expanded_tabs(0, "\t\t", tab(8)), 16);
        assert_eq!(char_len_with_expanded_tabs(0, "x\t", tab(8)), 8);
        assert_eq!(char_len_with_expanded_tabs(7, "x\t", tab(8)), 9);
    }
}
/// Splits a string into runs of text that should stick together when
/// wrapping, interleaved with runs of whitespace. Tokens are yielded by the
/// `Iterator` implementation.
struct WordBreakingTokenizer<'a> {
    // The not-yet-tokenized tail of the original string.
    input: &'a str,
}

impl<'a> WordBreakingTokenizer<'a> {
    /// Creates a tokenizer positioned at the start of `input`.
    fn new(input: &'a str) -> Self {
        WordBreakingTokenizer { input }
    }
}
fn is_char_ideographic(ch: char) -> bool {
use unicode_script::Script::*;
use unicode_script::UnicodeScript;
matches!(ch.script(), Han | Tangut | Yi)
}
fn is_grapheme_ideographic(text: &str) -> bool {
text.chars().any(is_char_ideographic)
}
/// True if any scalar value inside the grapheme cluster is whitespace.
fn is_grapheme_whitespace(text: &str) -> bool {
    text.chars().any(char::is_whitespace)
}
/// True if `text` begins with trailing CJK punctuation that must not start a
/// line and should therefore stay glued to the ideograph before it.
///
/// NOTE(review): several char literals in this list were stripped to empty
/// `''` during extraction (which would not compile); they have been
/// reconstructed here as the fullwidth punctuation set — confirm against the
/// original source.
fn should_stay_with_preceding_ideograph(text: &str) -> bool {
    text.chars().next().map_or(false, |ch| {
        matches!(ch, '。' | '、' | '，' | '？' | '！' | '：' | '；' | '…')
    })
}
/// One token produced by `WordBreakingTokenizer`.
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
enum WordBreakToken<'a> {
    /// A run of non-whitespace text that should wrap as a unit;
    /// `grapheme_len` is its length in grapheme clusters.
    Word { token: &'a str, grapheme_len: usize },
    /// A run of non-newline whitespace, with its grapheme-cluster count.
    InlineWhitespace { token: &'a str, grapheme_len: usize },
    /// A single `\n`.
    Newline,
}
impl<'a> Iterator for WordBreakingTokenizer<'a> {
    /// Yields a span, the count of graphemes in the token, and whether it was
    /// whitespace. Note that it also breaks at word boundaries.
    type Item = WordBreakToken<'a>;

    fn next(&mut self) -> Option<Self::Item> {
        use unicode_segmentation::UnicodeSegmentation;
        if self.input.is_empty() {
            return None;
        }

        // Walk graphemes, growing `offset` (byte length of the token so far)
        // and `grapheme_len` (its length in grapheme clusters).
        let mut iter = self.input.graphemes(true).peekable();
        let mut offset = 0;
        let mut grapheme_len = 0;
        if let Some(first_grapheme) = iter.next() {
            let is_newline = first_grapheme == "\n";
            let is_whitespace = is_grapheme_whitespace(first_grapheme);
            offset += first_grapheme.len();
            grapheme_len += 1;
            if is_grapheme_ideographic(first_grapheme) && !is_whitespace {
                // Ideographs are emitted one at a time, except that a
                // following punctuation mark that cannot start a line is
                // glued onto the preceding ideograph.
                if let Some(grapheme) = iter.peek().copied() {
                    if should_stay_with_preceding_ideograph(grapheme) {
                        offset += grapheme.len();
                        grapheme_len += 1;
                    }
                }
            } else {
                // Non-ideographic text: accumulate graphemes until a word
                // boundary, a whitespace/non-whitespace transition, or a
                // newline transition.
                let mut words = self.input[offset..].split_word_bound_indices().peekable();
                let mut next_word_bound = words.peek().copied();
                // Skip the boundary at position 0 — it is this token's start.
                if next_word_bound.map_or(false, |(i, _)| i == 0) {
                    next_word_bound = words.next();
                }
                while let Some(grapheme) = iter.peek().copied() {
                    // NOTE(review): `i` is an index into `self.input[offset..]`
                    // while `offset` is absolute — confirm this comparison is
                    // intended in the original source.
                    if next_word_bound.map_or(false, |(i, _)| i == offset) {
                        break;
                    };
                    if is_grapheme_whitespace(grapheme) != is_whitespace
                        || (grapheme == "\n") != is_newline
                    {
                        break;
                    };
                    offset += grapheme.len();
                    grapheme_len += 1;
                    iter.next();
                }
            }

            // Detach the token from the input and classify it.
            let token = &self.input[..offset];
            self.input = &self.input[offset..];
            if token == "\n" {
                Some(WordBreakToken::Newline)
            } else if is_whitespace {
                Some(WordBreakToken::InlineWhitespace {
                    token,
                    grapheme_len,
                })
            } else {
                Some(WordBreakToken::Word {
                    token,
                    grapheme_len,
                })
            }
        } else {
            None
        }
    }
}
// Table-driven check of the tokenizer over ASCII, punctuation, and CJK input.
//
// NOTE(review): several CJK characters in the vectors below were stripped to
// empty strings ("") during extraction, and some doubled spaces appear
// collapsed (grapheme_len 2 with a one-char literal); reconstruct from the
// original source before relying on these expectations.
#[test]
fn test_word_breaking_tokenizer() {
    let tests: &[(&str, &[WordBreakToken<'static>])] = &[
        ("", &[]),
        // NOTE(review): grapheme_len 2 suggests the literal was two spaces.
        (" ", &[whitespace(" ", 2)]),
        ("Ʒ", &[word("Ʒ", 1)]),
        ("Ǽ", &[word("Ǽ", 1)]),
        // NOTE(review): stripped character — original token lost.
        ("", &[word("", 1)]),
        ("⋑⋑", &[word("⋑⋑", 2)]),
        (
            // Ideographs split per character; "理" keeps its trailing comma.
            // NOTE(review): the single-char tokens were stripped here.
            "原理,进而",
            &[word("", 1), word("理,", 2), word("", 1), word("", 1)],
        ),
        (
            "hello world",
            &[word("hello", 5), whitespace(" ", 1), word("world", 5)],
        ),
        (
            "hello, world",
            &[word("hello,", 6), whitespace(" ", 1), word("world", 5)],
        ),
        (
            // NOTE(review): leading whitespace likely two spaces originally.
            " hello world",
            &[
                whitespace(" ", 2),
                word("hello", 5),
                whitespace(" ", 1),
                word("world", 5),
            ],
        ),
        (
            // Mixed CJK, inline whitespace, and a newline.
            // NOTE(review): the per-character word tokens were stripped.
            "这是什么 \n 钢笔",
            &[
                word("", 1),
                word("", 1),
                word("", 1),
                word("", 1),
                whitespace(" ", 1),
                newline(),
                whitespace(" ", 1),
                word("", 1),
                word("", 1),
            ],
        ),
        // NOTE(review): the leading whitespace grapheme was stripped.
        ("mutton", &[whitespace("", 1), word("mutton", 6)]),
    ];

    // Shorthand constructors for expected tokens.
    fn word(token: &'static str, grapheme_len: usize) -> WordBreakToken<'static> {
        WordBreakToken::Word {
            token,
            grapheme_len,
        }
    }
    fn whitespace(token: &'static str, grapheme_len: usize) -> WordBreakToken<'static> {
        WordBreakToken::InlineWhitespace {
            token,
            grapheme_len,
        }
    }
    fn newline() -> WordBreakToken<'static> {
        WordBreakToken::Newline
    }

    for (input, result) in tests {
        assert_eq!(
            WordBreakingTokenizer::new(input)
                .collect::<Vec<_>>()
                .as_slice(),
            *result,
        );
    }
}
/// Hard-wraps `unwrapped_text` at `wrap_column`, prepending `line_prefix` to
/// every output line.
///
/// Widths are measured in grapheme clusters; tabs in the prefix are expanded
/// via `char_len_with_expanded_tabs`. When `preserve_existing_whitespace` is
/// false, whitespace runs (including newlines) collapse to a single space and
/// break points are re-chosen from scratch; when true, existing whitespace
/// and hard newlines are kept as written.
fn wrap_with_prefix(
    line_prefix: String,
    unwrapped_text: String,
    wrap_column: usize,
    tab_size: NonZeroU32,
    preserve_existing_whitespace: bool,
) -> String {
    let line_prefix_len = char_len_with_expanded_tabs(0, &line_prefix, tab_size);
    let mut wrapped_text = String::new();
    let mut current_line = line_prefix.clone();

    let tokenizer = WordBreakingTokenizer::new(&unwrapped_text);
    let mut current_line_len = line_prefix_len;
    // Whether the previous token was whitespace — lets consecutive
    // whitespace tokens collapse to one.
    let mut in_whitespace = false;
    for token in tokenizer {
        let have_preceding_whitespace = in_whitespace;
        match token {
            WordBreakToken::Word {
                token,
                grapheme_len,
            } => {
                in_whitespace = false;
                // Break before a word that overflows a non-empty line; a word
                // longer than the column is emitted unbroken on its own line.
                if current_line_len + grapheme_len > wrap_column
                    && current_line_len != line_prefix_len
                {
                    wrapped_text.push_str(current_line.trim_end());
                    wrapped_text.push('\n');
                    current_line.truncate(line_prefix.len());
                    current_line_len = line_prefix_len;
                }
                current_line.push_str(token);
                current_line_len += grapheme_len;
            }
            WordBreakToken::InlineWhitespace {
                mut token,
                mut grapheme_len,
            } => {
                in_whitespace = true;
                // In collapse mode, consecutive whitespace tokens are merged.
                if have_preceding_whitespace && !preserve_existing_whitespace {
                    continue;
                }
                // Collapse any whitespace run to a single space unless asked
                // to preserve it verbatim.
                if !preserve_existing_whitespace {
                    token = " ";
                    grapheme_len = 1;
                }
                if current_line_len + grapheme_len > wrap_column {
                    wrapped_text.push_str(current_line.trim_end());
                    wrapped_text.push('\n');
                    current_line.truncate(line_prefix.len());
                    current_line_len = line_prefix_len;
                } else if current_line_len != line_prefix_len || preserve_existing_whitespace {
                    // Leading whitespace on a fresh line is dropped in
                    // collapse mode.
                    current_line.push_str(token);
                    current_line_len += grapheme_len;
                }
            }
            WordBreakToken::Newline => {
                in_whitespace = true;
                if preserve_existing_whitespace {
                    // Keep the hard break exactly where it was.
                    wrapped_text.push_str(current_line.trim_end());
                    wrapped_text.push('\n');
                    current_line.truncate(line_prefix.len());
                    current_line_len = line_prefix_len;
                } else if have_preceding_whitespace {
                    continue;
                } else if current_line_len + 1 > wrap_column && current_line_len != line_prefix_len
                {
                    // The newline-as-space would overflow: break here instead.
                    wrapped_text.push_str(current_line.trim_end());
                    wrapped_text.push('\n');
                    current_line.truncate(line_prefix.len());
                    current_line_len = line_prefix_len;
                } else if current_line_len != line_prefix_len {
                    // In collapse mode a newline behaves like a single space.
                    current_line.push(' ');
                    current_line_len += 1;
                }
            }
        }
    }
    if !current_line.is_empty() {
        wrapped_text.push_str(&current_line);
    }
    wrapped_text
}
// Checks `wrap_with_prefix` against ASCII, tab, prefixed, and CJK inputs.
#[test]
fn test_wrap_with_prefix() {
    // A single word longer than the wrap column is not broken mid-word.
    assert_eq!(
        wrap_with_prefix(
            "# ".to_string(),
            "abcdefg".to_string(),
            4,
            NonZeroU32::new(4).unwrap(),
            false,
        ),
        "# abcdefg"
    );
    // Leading tab collapses away; the space becomes a break point.
    assert_eq!(
        wrap_with_prefix(
            "".to_string(),
            "\thello world".to_string(),
            8,
            NonZeroU32::new(4).unwrap(),
            false,
        ),
        "hello\nworld"
    );
    // The prefix is re-applied to every wrapped line.
    assert_eq!(
        wrap_with_prefix(
            "// ".to_string(),
            "xx \nyy zz aa bb cc".to_string(),
            12,
            NonZeroU32::new(4).unwrap(),
            false,
        ),
        "// xx yy zz\n// aa bb cc"
    );
    // Ideographic text wraps per character. NOTE(review): the expected
    // string lost its trailing 笔 in extraction; restored here — it follows
    // from the input and wrap_column 3 (这是什 / 么 钢 / 笔).
    assert_eq!(
        wrap_with_prefix(
            String::new(),
            "这是什么 \n 钢笔".to_string(),
            3,
            NonZeroU32::new(4).unwrap(),
            false,
        ),
        "这是什\n么 钢\n笔"
    );
}
pub trait CollaborationHub { pub trait CollaborationHub {
fn collaborators<'a>(&self, cx: &'a App) -> &'a HashMap<PeerId, Collaborator>; fn collaborators<'a>(&self, cx: &'a App) -> &'a HashMap<PeerId, Collaborator>;
fn user_participant_indices<'a>(&self, cx: &'a App) -> &'a HashMap<u64, ParticipantIndex>; fn user_participant_indices<'a>(&self, cx: &'a App) -> &'a HashMap<u64, ParticipantIndex>;

View file

@ -7396,10 +7396,7 @@ impl Element for EditorElement {
editor.gutter_dimensions = gutter_dimensions; editor.gutter_dimensions = gutter_dimensions;
editor.set_visible_line_count(bounds.size.height / line_height, window, cx); editor.set_visible_line_count(bounds.size.height / line_height, window, cx);
if matches!( if matches!(editor.mode, EditorMode::Minimap { .. }) {
editor.mode,
EditorMode::AutoHeight { .. } | EditorMode::Minimap { .. }
) {
snapshot snapshot
} else { } else {
let wrap_width_for = |column: u32| (column as f32 * em_advance).ceil(); let wrap_width_for = |column: u32| (column as f32 * em_advance).ceil();
@ -9390,6 +9387,7 @@ fn compute_auto_height_layout(
let font_size = style.text.font_size.to_pixels(window.rem_size()); let font_size = style.text.font_size.to_pixels(window.rem_size());
let line_height = style.text.line_height_in_pixels(window.rem_size()); let line_height = style.text.line_height_in_pixels(window.rem_size());
let em_width = window.text_system().em_width(font_id, font_size).unwrap(); let em_width = window.text_system().em_width(font_id, font_size).unwrap();
let em_advance = window.text_system().em_advance(font_id, font_size).unwrap();
let mut snapshot = editor.snapshot(window, cx); let mut snapshot = editor.snapshot(window, cx);
let gutter_dimensions = snapshot let gutter_dimensions = snapshot
@ -9406,10 +9404,18 @@ fn compute_auto_height_layout(
let overscroll = size(em_width, px(0.)); let overscroll = size(em_width, px(0.));
let editor_width = text_width - gutter_dimensions.margin - overscroll.width - em_width; let editor_width = text_width - gutter_dimensions.margin - overscroll.width - em_width;
if !matches!(editor.soft_wrap_mode(cx), SoftWrap::None) { let content_offset = point(gutter_dimensions.margin, Pixels::ZERO);
if editor.set_wrap_width(Some(editor_width), cx) { let editor_content_width = editor_width - content_offset.x;
snapshot = editor.snapshot(window, cx); let wrap_width_for = |column: u32| (column as f32 * em_advance).ceil();
} let wrap_width = match editor.soft_wrap_mode(cx) {
SoftWrap::GitDiff => None,
SoftWrap::None => Some(wrap_width_for(MAX_LINE_LEN as u32 / 2)),
SoftWrap::EditorWidth => Some(editor_content_width),
SoftWrap::Column(column) => Some(wrap_width_for(column)),
SoftWrap::Bounded(column) => Some(editor_content_width.min(wrap_width_for(column))),
};
if editor.set_wrap_width(wrap_width, cx) {
snapshot = editor.snapshot(window, cx);
} }
let scroll_height = (snapshot.max_point().row().next_row().0 as f32) * line_height; let scroll_height = (snapshot.max_point().row().next_row().0 as f32) * line_height;

View file

@ -54,6 +54,7 @@ use project::{
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use settings::{Settings as _, SettingsStore}; use settings::{Settings as _, SettingsStore};
use std::future::Future; use std::future::Future;
use std::num::NonZeroU32;
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::{collections::HashSet, sync::Arc, time::Duration, usize}; use std::{collections::HashSet, sync::Arc, time::Duration, usize};
use strum::{IntoEnumIterator, VariantNames}; use strum::{IntoEnumIterator, VariantNames};
@ -62,7 +63,7 @@ use ui::{
Checkbox, ContextMenu, ElevationIndex, PopoverMenu, Scrollbar, ScrollbarState, SplitButton, Checkbox, ContextMenu, ElevationIndex, PopoverMenu, Scrollbar, ScrollbarState, SplitButton,
Tooltip, prelude::*, Tooltip, prelude::*,
}; };
use util::{ResultExt, TryFutureExt, maybe}; use util::{ResultExt, TryFutureExt, maybe, wrap_with_prefix};
use workspace::AppState; use workspace::AppState;
use notifications::status_toast::{StatusToast, ToastIcon}; use notifications::status_toast::{StatusToast, ToastIcon};
@ -382,7 +383,6 @@ pub(crate) fn commit_message_editor(
commit_editor.set_show_gutter(false, cx); commit_editor.set_show_gutter(false, cx);
commit_editor.set_show_wrap_guides(false, cx); commit_editor.set_show_wrap_guides(false, cx);
commit_editor.set_show_indent_guides(false, cx); commit_editor.set_show_indent_guides(false, cx);
commit_editor.set_hard_wrap(Some(72), cx);
let placeholder = placeholder.unwrap_or("Enter commit message".into()); let placeholder = placeholder.unwrap_or("Enter commit message".into());
commit_editor.set_placeholder_text(placeholder, cx); commit_editor.set_placeholder_text(placeholder, cx);
commit_editor commit_editor
@ -1484,8 +1484,22 @@ impl GitPanel {
fn custom_or_suggested_commit_message(&self, cx: &mut Context<Self>) -> Option<String> { fn custom_or_suggested_commit_message(&self, cx: &mut Context<Self>) -> Option<String> {
let message = self.commit_editor.read(cx).text(cx); let message = self.commit_editor.read(cx).text(cx);
let width = self
.commit_editor
.read(cx)
.buffer()
.read(cx)
.language_settings(cx)
.preferred_line_length as usize;
if !message.trim().is_empty() { if !message.trim().is_empty() {
let message = wrap_with_prefix(
String::new(),
message,
width,
NonZeroU32::new(8).unwrap(), // tab size doesn't matter when prefix is empty
false,
);
return Some(message); return Some(message);
} }

View file

@ -666,7 +666,7 @@ pub struct CodeLabel {
pub filter_range: Range<usize>, pub filter_range: Range<usize>,
} }
#[derive(Clone, Deserialize, JsonSchema)] #[derive(Clone, Debug, Deserialize, JsonSchema)]
pub struct LanguageConfig { pub struct LanguageConfig {
/// Human-readable name of the language. /// Human-readable name of the language.
pub name: LanguageName, pub name: LanguageName,
@ -777,7 +777,7 @@ pub struct LanguageMatcher {
} }
/// The configuration for JSX tag auto-closing. /// The configuration for JSX tag auto-closing.
#[derive(Clone, Deserialize, JsonSchema)] #[derive(Clone, Debug, Deserialize, JsonSchema)]
pub struct JsxTagAutoCloseConfig { pub struct JsxTagAutoCloseConfig {
/// The name of the node for a opening tag /// The name of the node for a opening tag
pub open_tag_node_name: String, pub open_tag_node_name: String,
@ -810,7 +810,7 @@ pub struct JsxTagAutoCloseConfig {
} }
/// The configuration for documentation block for this language. /// The configuration for documentation block for this language.
#[derive(Clone, Deserialize, JsonSchema)] #[derive(Clone, Debug, Deserialize, JsonSchema)]
pub struct DocumentationConfig { pub struct DocumentationConfig {
/// A start tag of documentation block. /// A start tag of documentation block.
pub start: Arc<str>, pub start: Arc<str>,

View file

@ -37,6 +37,8 @@ smol.workspace = true
take-until.workspace = true take-until.workspace = true
tempfile.workspace = true tempfile.workspace = true
unicase.workspace = true unicase.workspace = true
unicode-script.workspace = true
unicode-segmentation.workspace = true
util_macros = { workspace = true, optional = true } util_macros = { workspace = true, optional = true }
walkdir.workspace = true walkdir.workspace = true
workspace-hack.workspace = true workspace-hack.workspace = true

View file

@ -14,6 +14,7 @@ use anyhow::Result;
use futures::Future; use futures::Future;
use itertools::Either; use itertools::Either;
use regex::Regex; use regex::Regex;
use std::num::NonZeroU32;
use std::sync::{LazyLock, OnceLock}; use std::sync::{LazyLock, OnceLock};
use std::{ use std::{
borrow::Cow, borrow::Cow,
@ -183,29 +184,208 @@ pub fn truncate_lines_to_byte_limit(s: &str, max_bytes: usize) -> &str {
truncate_to_byte_limit(s, max_bytes) truncate_to_byte_limit(s, max_bytes)
} }
#[test] fn char_len_with_expanded_tabs(offset: usize, text: &str, tab_size: NonZeroU32) -> usize {
fn test_truncate_lines_to_byte_limit() { let tab_size = tab_size.get() as usize;
let text = "Line 1\nLine 2\nLine 3\nLine 4"; let mut width = offset;
// Limit that includes all lines for ch in text.chars() {
assert_eq!(truncate_lines_to_byte_limit(text, 100), text); width += if ch == '\t' {
tab_size - (width % tab_size)
} else {
1
};
}
// Exactly the first line width - offset
assert_eq!(truncate_lines_to_byte_limit(text, 7), "Line 1\n"); }
// Limit between lines /// Tokenizes a string into runs of text that should stick together, or that is whitespace.
assert_eq!(truncate_lines_to_byte_limit(text, 13), "Line 1\n"); struct WordBreakingTokenizer<'a> {
assert_eq!(truncate_lines_to_byte_limit(text, 20), "Line 1\nLine 2\n"); input: &'a str,
}
// Limit before first newline impl<'a> WordBreakingTokenizer<'a> {
assert_eq!(truncate_lines_to_byte_limit(text, 6), "Line "); fn new(input: &'a str) -> Self {
Self { input }
}
}
// Test with non-ASCII characters fn is_char_ideographic(ch: char) -> bool {
let text_utf8 = "Line 1\nLíne 2\nLine 3"; use unicode_script::Script::*;
assert_eq!( use unicode_script::UnicodeScript;
truncate_lines_to_byte_limit(text_utf8, 15), matches!(ch.script(), Han | Tangut | Yi)
"Line 1\nLíne 2\n" }
);
fn is_grapheme_ideographic(text: &str) -> bool {
text.chars().any(is_char_ideographic)
}
fn is_grapheme_whitespace(text: &str) -> bool {
text.chars().any(|x| x.is_whitespace())
}
fn should_stay_with_preceding_ideograph(text: &str) -> bool {
text.chars().next().map_or(false, |ch| {
matches!(ch, '。' | '、' | '' | '' | '' | '' | '' | '…')
})
}
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
enum WordBreakToken<'a> {
Word { token: &'a str, grapheme_len: usize },
InlineWhitespace { token: &'a str, grapheme_len: usize },
Newline,
}
impl<'a> Iterator for WordBreakingTokenizer<'a> {
/// Yields a span, the count of graphemes in the token, and whether it was
/// whitespace. Note that it also breaks at word boundaries.
type Item = WordBreakToken<'a>;
fn next(&mut self) -> Option<Self::Item> {
use unicode_segmentation::UnicodeSegmentation;
if self.input.is_empty() {
return None;
}
let mut iter = self.input.graphemes(true).peekable();
let mut offset = 0;
let mut grapheme_len = 0;
if let Some(first_grapheme) = iter.next() {
let is_newline = first_grapheme == "\n";
let is_whitespace = is_grapheme_whitespace(first_grapheme);
offset += first_grapheme.len();
grapheme_len += 1;
if is_grapheme_ideographic(first_grapheme) && !is_whitespace {
if let Some(grapheme) = iter.peek().copied() {
if should_stay_with_preceding_ideograph(grapheme) {
offset += grapheme.len();
grapheme_len += 1;
}
}
} else {
let mut words = self.input[offset..].split_word_bound_indices().peekable();
let mut next_word_bound = words.peek().copied();
if next_word_bound.map_or(false, |(i, _)| i == 0) {
next_word_bound = words.next();
}
while let Some(grapheme) = iter.peek().copied() {
if next_word_bound.map_or(false, |(i, _)| i == offset) {
break;
};
if is_grapheme_whitespace(grapheme) != is_whitespace
|| (grapheme == "\n") != is_newline
{
break;
};
offset += grapheme.len();
grapheme_len += 1;
iter.next();
}
}
let token = &self.input[..offset];
self.input = &self.input[offset..];
if token == "\n" {
Some(WordBreakToken::Newline)
} else if is_whitespace {
Some(WordBreakToken::InlineWhitespace {
token,
grapheme_len,
})
} else {
Some(WordBreakToken::Word {
token,
grapheme_len,
})
}
} else {
None
}
}
}
pub fn wrap_with_prefix(
line_prefix: String,
unwrapped_text: String,
wrap_column: usize,
tab_size: NonZeroU32,
preserve_existing_whitespace: bool,
) -> String {
let line_prefix_len = char_len_with_expanded_tabs(0, &line_prefix, tab_size);
let mut wrapped_text = String::new();
let mut current_line = line_prefix.clone();
let tokenizer = WordBreakingTokenizer::new(&unwrapped_text);
let mut current_line_len = line_prefix_len;
let mut in_whitespace = false;
for token in tokenizer {
let have_preceding_whitespace = in_whitespace;
match token {
WordBreakToken::Word {
token,
grapheme_len,
} => {
in_whitespace = false;
if current_line_len + grapheme_len > wrap_column
&& current_line_len != line_prefix_len
{
wrapped_text.push_str(current_line.trim_end());
wrapped_text.push('\n');
current_line.truncate(line_prefix.len());
current_line_len = line_prefix_len;
}
current_line.push_str(token);
current_line_len += grapheme_len;
}
WordBreakToken::InlineWhitespace {
mut token,
mut grapheme_len,
} => {
in_whitespace = true;
if have_preceding_whitespace && !preserve_existing_whitespace {
continue;
}
if !preserve_existing_whitespace {
token = " ";
grapheme_len = 1;
}
if current_line_len + grapheme_len > wrap_column {
wrapped_text.push_str(current_line.trim_end());
wrapped_text.push('\n');
current_line.truncate(line_prefix.len());
current_line_len = line_prefix_len;
} else if current_line_len != line_prefix_len || preserve_existing_whitespace {
current_line.push_str(token);
current_line_len += grapheme_len;
}
}
WordBreakToken::Newline => {
in_whitespace = true;
if preserve_existing_whitespace {
wrapped_text.push_str(current_line.trim_end());
wrapped_text.push('\n');
current_line.truncate(line_prefix.len());
current_line_len = line_prefix_len;
} else if have_preceding_whitespace {
continue;
} else if current_line_len + 1 > wrap_column && current_line_len != line_prefix_len
{
wrapped_text.push_str(current_line.trim_end());
wrapped_text.push('\n');
current_line.truncate(line_prefix.len());
current_line_len = line_prefix_len;
} else if current_line_len != line_prefix_len {
current_line.push(' ');
current_line_len += 1;
}
}
}
}
if !current_line.is_empty() {
wrapped_text.push_str(&current_line);
}
wrapped_text
} }
pub fn post_inc<T: From<u8> + AddAssign<T> + Copy>(value: &mut T) -> T { pub fn post_inc<T: From<u8> + AddAssign<T> + Copy>(value: &mut T) -> T {
@ -1302,4 +1482,161 @@ Line 3"#
(0..8).collect::<Vec<usize>>() (0..8).collect::<Vec<usize>>()
); );
} }
// Byte-limit truncation keeps only whole lines once past the first newline;
// within the first line it cuts at the byte limit directly.
#[test]
fn test_truncate_lines_to_byte_limit() {
    let text = "Line 1\nLine 2\nLine 3\nLine 4";

    // Limit that includes all lines
    assert_eq!(truncate_lines_to_byte_limit(text, 100), text);

    // Exactly the first line
    assert_eq!(truncate_lines_to_byte_limit(text, 7), "Line 1\n");

    // Limit between lines
    assert_eq!(truncate_lines_to_byte_limit(text, 13), "Line 1\n");
    assert_eq!(truncate_lines_to_byte_limit(text, 20), "Line 1\nLine 2\n");

    // Limit before first newline
    assert_eq!(truncate_lines_to_byte_limit(text, 6), "Line ");

    // Test with non-ASCII characters (í is two bytes in UTF-8)
    let text_utf8 = "Line 1\nLíne 2\nLine 3";
    assert_eq!(
        truncate_lines_to_byte_limit(text_utf8, 15),
        "Line 1\nLíne 2\n"
    );
}
// Tab expansion: each \t advances to the next multiple of the tab size,
// with tab stops counted from `offset`.
#[test]
fn test_string_size_with_expanded_tabs() {
    let nz = |val| NonZeroU32::new(val).unwrap();
    assert_eq!(char_len_with_expanded_tabs(0, "", nz(4)), 0);
    assert_eq!(char_len_with_expanded_tabs(0, "hello", nz(4)), 5);
    assert_eq!(char_len_with_expanded_tabs(0, "\thello", nz(4)), 9);
    assert_eq!(char_len_with_expanded_tabs(0, "abc\tab", nz(4)), 6);
    assert_eq!(char_len_with_expanded_tabs(0, "hello\t", nz(4)), 8);
    assert_eq!(char_len_with_expanded_tabs(0, "\t\t", nz(8)), 16);
    assert_eq!(char_len_with_expanded_tabs(0, "x\t", nz(8)), 8);
    // Starting at column 7, the tab only advances one column to the stop at 8.
    assert_eq!(char_len_with_expanded_tabs(7, "x\t", nz(8)), 9);
}
// Table-driven check of the tokenizer over ASCII, punctuation, and CJK input.
//
// NOTE(review): several CJK characters in the vectors below were stripped to
// empty strings ("") during extraction, and some doubled spaces appear
// collapsed (grapheme_len 2 with a one-char literal); reconstruct from the
// original source before relying on these expectations.
#[test]
fn test_word_breaking_tokenizer() {
    let tests: &[(&str, &[WordBreakToken<'static>])] = &[
        ("", &[]),
        // NOTE(review): grapheme_len 2 suggests the literal was two spaces.
        (" ", &[whitespace(" ", 2)]),
        ("Ʒ", &[word("Ʒ", 1)]),
        ("Ǽ", &[word("Ǽ", 1)]),
        // NOTE(review): stripped character — original token lost.
        ("", &[word("", 1)]),
        ("⋑⋑", &[word("⋑⋑", 2)]),
        (
            // Ideographs split per character; "理" keeps its trailing comma.
            // NOTE(review): the single-char tokens were stripped here.
            "原理,进而",
            &[word("", 1), word("理,", 2), word("", 1), word("", 1)],
        ),
        (
            "hello world",
            &[word("hello", 5), whitespace(" ", 1), word("world", 5)],
        ),
        (
            "hello, world",
            &[word("hello,", 6), whitespace(" ", 1), word("world", 5)],
        ),
        (
            // NOTE(review): leading whitespace likely two spaces originally.
            " hello world",
            &[
                whitespace(" ", 2),
                word("hello", 5),
                whitespace(" ", 1),
                word("world", 5),
            ],
        ),
        (
            // Mixed CJK, inline whitespace, and a newline.
            // NOTE(review): the per-character word tokens were stripped.
            "这是什么 \n 钢笔",
            &[
                word("", 1),
                word("", 1),
                word("", 1),
                word("", 1),
                whitespace(" ", 1),
                newline(),
                whitespace(" ", 1),
                word("", 1),
                word("", 1),
            ],
        ),
        // NOTE(review): the leading whitespace grapheme was stripped.
        ("mutton", &[whitespace("", 1), word("mutton", 6)]),
    ];

    // Shorthand constructors for expected tokens.
    fn word(token: &'static str, grapheme_len: usize) -> WordBreakToken<'static> {
        WordBreakToken::Word {
            token,
            grapheme_len,
        }
    }
    fn whitespace(token: &'static str, grapheme_len: usize) -> WordBreakToken<'static> {
        WordBreakToken::InlineWhitespace {
            token,
            grapheme_len,
        }
    }
    fn newline() -> WordBreakToken<'static> {
        WordBreakToken::Newline
    }

    for (input, result) in tests {
        assert_eq!(
            WordBreakingTokenizer::new(input)
                .collect::<Vec<_>>()
                .as_slice(),
            *result,
        );
    }
}
// Checks `wrap_with_prefix` against ASCII, tab, prefixed, and CJK inputs.
#[test]
fn test_wrap_with_prefix() {
    // A single word longer than the wrap column is not broken mid-word.
    assert_eq!(
        wrap_with_prefix(
            "# ".to_string(),
            "abcdefg".to_string(),
            4,
            NonZeroU32::new(4).unwrap(),
            false,
        ),
        "# abcdefg"
    );
    // Leading tab collapses away; the space becomes a break point.
    assert_eq!(
        wrap_with_prefix(
            "".to_string(),
            "\thello world".to_string(),
            8,
            NonZeroU32::new(4).unwrap(),
            false,
        ),
        "hello\nworld"
    );
    // The prefix is re-applied to every wrapped line.
    assert_eq!(
        wrap_with_prefix(
            "// ".to_string(),
            "xx \nyy zz aa bb cc".to_string(),
            12,
            NonZeroU32::new(4).unwrap(),
            false,
        ),
        "// xx yy zz\n// aa bb cc"
    );
    // Ideographic text wraps per character. NOTE(review): the expected
    // string lost its trailing 笔 in extraction; restored here — it follows
    // from the input and wrap_column 3 (这是什 / 么 钢 / 笔).
    assert_eq!(
        wrap_with_prefix(
            String::new(),
            "这是什么 \n 钢笔".to_string(),
            3,
            NonZeroU32::new(4).unwrap(),
            false,
        ),
        "这是什\n么 钢\n笔"
    );
}
} }