Revert "Fix text wrapping in commit message editors (#31030)" (#31587)

This reverts commit f2601ce52c.

Release Notes:

- N/A
This commit is contained in:
Cole Miller 2025-05-28 10:16:34 -04:00 committed by GitHub
parent 2c4b75ab30
commit 218e8d09c5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 376 additions and 396 deletions

View file

@ -37,8 +37,6 @@ smol.workspace = true
take-until.workspace = true
tempfile.workspace = true
unicase.workspace = true
unicode-script.workspace = true
unicode-segmentation.workspace = true
util_macros = { workspace = true, optional = true }
walkdir.workspace = true
workspace-hack.workspace = true

View file

@ -14,7 +14,6 @@ use anyhow::Result;
use futures::Future;
use itertools::Either;
use regex::Regex;
use std::num::NonZeroU32;
use std::sync::{LazyLock, OnceLock};
use std::{
borrow::Cow,
@ -184,208 +183,29 @@ pub fn truncate_lines_to_byte_limit(s: &str, max_bytes: usize) -> &str {
truncate_to_byte_limit(s, max_bytes)
}
fn char_len_with_expanded_tabs(offset: usize, text: &str, tab_size: NonZeroU32) -> usize {
let tab_size = tab_size.get() as usize;
let mut width = offset;
#[test]
fn test_truncate_lines_to_byte_limit() {
let text = "Line 1\nLine 2\nLine 3\nLine 4";
for ch in text.chars() {
width += if ch == '\t' {
tab_size - (width % tab_size)
} else {
1
};
}
// Limit that includes all lines
assert_eq!(truncate_lines_to_byte_limit(text, 100), text);
width - offset
}
// Exactly the first line
assert_eq!(truncate_lines_to_byte_limit(text, 7), "Line 1\n");
/// Tokenizes a string into runs of text that should stick together, or that is whitespace.
struct WordBreakingTokenizer<'a> {
input: &'a str,
}
// Limit between lines
assert_eq!(truncate_lines_to_byte_limit(text, 13), "Line 1\n");
assert_eq!(truncate_lines_to_byte_limit(text, 20), "Line 1\nLine 2\n");
impl<'a> WordBreakingTokenizer<'a> {
fn new(input: &'a str) -> Self {
Self { input }
}
}
// Limit before first newline
assert_eq!(truncate_lines_to_byte_limit(text, 6), "Line ");
fn is_char_ideographic(ch: char) -> bool {
use unicode_script::Script::*;
use unicode_script::UnicodeScript;
matches!(ch.script(), Han | Tangut | Yi)
}
fn is_grapheme_ideographic(text: &str) -> bool {
text.chars().any(is_char_ideographic)
}
fn is_grapheme_whitespace(text: &str) -> bool {
text.chars().any(|x| x.is_whitespace())
}
fn should_stay_with_preceding_ideograph(text: &str) -> bool {
text.chars().next().map_or(false, |ch| {
matches!(ch, '。' | '、' | '' | '' | '' | '' | '' | '…')
})
}
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
enum WordBreakToken<'a> {
Word { token: &'a str, grapheme_len: usize },
InlineWhitespace { token: &'a str, grapheme_len: usize },
Newline,
}
impl<'a> Iterator for WordBreakingTokenizer<'a> {
/// Yields a span, the count of graphemes in the token, and whether it was
/// whitespace. Note that it also breaks at word boundaries.
type Item = WordBreakToken<'a>;
fn next(&mut self) -> Option<Self::Item> {
use unicode_segmentation::UnicodeSegmentation;
if self.input.is_empty() {
return None;
}
let mut iter = self.input.graphemes(true).peekable();
let mut offset = 0;
let mut grapheme_len = 0;
if let Some(first_grapheme) = iter.next() {
let is_newline = first_grapheme == "\n";
let is_whitespace = is_grapheme_whitespace(first_grapheme);
offset += first_grapheme.len();
grapheme_len += 1;
if is_grapheme_ideographic(first_grapheme) && !is_whitespace {
if let Some(grapheme) = iter.peek().copied() {
if should_stay_with_preceding_ideograph(grapheme) {
offset += grapheme.len();
grapheme_len += 1;
}
}
} else {
let mut words = self.input[offset..].split_word_bound_indices().peekable();
let mut next_word_bound = words.peek().copied();
if next_word_bound.map_or(false, |(i, _)| i == 0) {
next_word_bound = words.next();
}
while let Some(grapheme) = iter.peek().copied() {
if next_word_bound.map_or(false, |(i, _)| i == offset) {
break;
};
if is_grapheme_whitespace(grapheme) != is_whitespace
|| (grapheme == "\n") != is_newline
{
break;
};
offset += grapheme.len();
grapheme_len += 1;
iter.next();
}
}
let token = &self.input[..offset];
self.input = &self.input[offset..];
if token == "\n" {
Some(WordBreakToken::Newline)
} else if is_whitespace {
Some(WordBreakToken::InlineWhitespace {
token,
grapheme_len,
})
} else {
Some(WordBreakToken::Word {
token,
grapheme_len,
})
}
} else {
None
}
}
}
pub fn wrap_with_prefix(
line_prefix: String,
unwrapped_text: String,
wrap_column: usize,
tab_size: NonZeroU32,
preserve_existing_whitespace: bool,
) -> String {
let line_prefix_len = char_len_with_expanded_tabs(0, &line_prefix, tab_size);
let mut wrapped_text = String::new();
let mut current_line = line_prefix.clone();
let tokenizer = WordBreakingTokenizer::new(&unwrapped_text);
let mut current_line_len = line_prefix_len;
let mut in_whitespace = false;
for token in tokenizer {
let have_preceding_whitespace = in_whitespace;
match token {
WordBreakToken::Word {
token,
grapheme_len,
} => {
in_whitespace = false;
if current_line_len + grapheme_len > wrap_column
&& current_line_len != line_prefix_len
{
wrapped_text.push_str(current_line.trim_end());
wrapped_text.push('\n');
current_line.truncate(line_prefix.len());
current_line_len = line_prefix_len;
}
current_line.push_str(token);
current_line_len += grapheme_len;
}
WordBreakToken::InlineWhitespace {
mut token,
mut grapheme_len,
} => {
in_whitespace = true;
if have_preceding_whitespace && !preserve_existing_whitespace {
continue;
}
if !preserve_existing_whitespace {
token = " ";
grapheme_len = 1;
}
if current_line_len + grapheme_len > wrap_column {
wrapped_text.push_str(current_line.trim_end());
wrapped_text.push('\n');
current_line.truncate(line_prefix.len());
current_line_len = line_prefix_len;
} else if current_line_len != line_prefix_len || preserve_existing_whitespace {
current_line.push_str(token);
current_line_len += grapheme_len;
}
}
WordBreakToken::Newline => {
in_whitespace = true;
if preserve_existing_whitespace {
wrapped_text.push_str(current_line.trim_end());
wrapped_text.push('\n');
current_line.truncate(line_prefix.len());
current_line_len = line_prefix_len;
} else if have_preceding_whitespace {
continue;
} else if current_line_len + 1 > wrap_column && current_line_len != line_prefix_len
{
wrapped_text.push_str(current_line.trim_end());
wrapped_text.push('\n');
current_line.truncate(line_prefix.len());
current_line_len = line_prefix_len;
} else if current_line_len != line_prefix_len {
current_line.push(' ');
current_line_len += 1;
}
}
}
}
if !current_line.is_empty() {
wrapped_text.push_str(&current_line);
}
wrapped_text
// Test with non-ASCII characters
let text_utf8 = "Line 1\nLíne 2\nLine 3";
assert_eq!(
truncate_lines_to_byte_limit(text_utf8, 15),
"Line 1\nLíne 2\n"
);
}
pub fn post_inc<T: From<u8> + AddAssign<T> + Copy>(value: &mut T) -> T {
@ -1581,163 +1401,6 @@ Line 3"#
);
}
#[test]
fn test_truncate_lines_to_byte_limit() {
let text = "Line 1\nLine 2\nLine 3\nLine 4";
// Limit that includes all lines
assert_eq!(truncate_lines_to_byte_limit(text, 100), text);
// Exactly the first line
assert_eq!(truncate_lines_to_byte_limit(text, 7), "Line 1\n");
// Limit between lines
assert_eq!(truncate_lines_to_byte_limit(text, 13), "Line 1\n");
assert_eq!(truncate_lines_to_byte_limit(text, 20), "Line 1\nLine 2\n");
// Limit before first newline
assert_eq!(truncate_lines_to_byte_limit(text, 6), "Line ");
// Test with non-ASCII characters
let text_utf8 = "Line 1\nLíne 2\nLine 3";
assert_eq!(
truncate_lines_to_byte_limit(text_utf8, 15),
"Line 1\nLíne 2\n"
);
}
#[test]
fn test_string_size_with_expanded_tabs() {
let nz = |val| NonZeroU32::new(val).unwrap();
assert_eq!(char_len_with_expanded_tabs(0, "", nz(4)), 0);
assert_eq!(char_len_with_expanded_tabs(0, "hello", nz(4)), 5);
assert_eq!(char_len_with_expanded_tabs(0, "\thello", nz(4)), 9);
assert_eq!(char_len_with_expanded_tabs(0, "abc\tab", nz(4)), 6);
assert_eq!(char_len_with_expanded_tabs(0, "hello\t", nz(4)), 8);
assert_eq!(char_len_with_expanded_tabs(0, "\t\t", nz(8)), 16);
assert_eq!(char_len_with_expanded_tabs(0, "x\t", nz(8)), 8);
assert_eq!(char_len_with_expanded_tabs(7, "x\t", nz(8)), 9);
}
#[test]
fn test_word_breaking_tokenizer() {
let tests: &[(&str, &[WordBreakToken<'static>])] = &[
("", &[]),
(" ", &[whitespace(" ", 2)]),
("Ʒ", &[word("Ʒ", 1)]),
("Ǽ", &[word("Ǽ", 1)]),
("", &[word("", 1)]),
("⋑⋑", &[word("⋑⋑", 2)]),
(
"原理,进而",
&[word("", 1), word("理,", 2), word("", 1), word("", 1)],
),
(
"hello world",
&[word("hello", 5), whitespace(" ", 1), word("world", 5)],
),
(
"hello, world",
&[word("hello,", 6), whitespace(" ", 1), word("world", 5)],
),
(
" hello world",
&[
whitespace(" ", 2),
word("hello", 5),
whitespace(" ", 1),
word("world", 5),
],
),
(
"这是什么 \n 钢笔",
&[
word("", 1),
word("", 1),
word("", 1),
word("", 1),
whitespace(" ", 1),
newline(),
whitespace(" ", 1),
word("", 1),
word("", 1),
],
),
("mutton", &[whitespace("", 1), word("mutton", 6)]),
];
fn word(token: &'static str, grapheme_len: usize) -> WordBreakToken<'static> {
WordBreakToken::Word {
token,
grapheme_len,
}
}
fn whitespace(token: &'static str, grapheme_len: usize) -> WordBreakToken<'static> {
WordBreakToken::InlineWhitespace {
token,
grapheme_len,
}
}
fn newline() -> WordBreakToken<'static> {
WordBreakToken::Newline
}
for (input, result) in tests {
assert_eq!(
WordBreakingTokenizer::new(input)
.collect::<Vec<_>>()
.as_slice(),
*result,
);
}
}
#[test]
fn test_wrap_with_prefix() {
assert_eq!(
wrap_with_prefix(
"# ".to_string(),
"abcdefg".to_string(),
4,
NonZeroU32::new(4).unwrap(),
false,
),
"# abcdefg"
);
assert_eq!(
wrap_with_prefix(
"".to_string(),
"\thello world".to_string(),
8,
NonZeroU32::new(4).unwrap(),
false,
),
"hello\nworld"
);
assert_eq!(
wrap_with_prefix(
"// ".to_string(),
"xx \nyy zz aa bb cc".to_string(),
12,
NonZeroU32::new(4).unwrap(),
false,
),
"// xx yy zz\n// aa bb cc"
);
assert_eq!(
wrap_with_prefix(
String::new(),
"这是什么 \n 钢笔".to_string(),
3,
NonZeroU32::new(4).unwrap(),
false,
),
"这是什\n么 钢\n"
);
}
#[test]
fn test_split_with_ranges() {
let input = "hi";