This reverts commit f2601ce52c.
Release Notes:
- N/A
parent 2c4b75ab30
commit 218e8d09c5
9 changed files with 376 additions and 396 deletions
@@ -82,6 +82,7 @@ tree-sitter-rust = { workspace = true, optional = true }
 tree-sitter-typescript = { workspace = true, optional = true }
 tree-sitter-python = { workspace = true, optional = true }
 unicode-segmentation.workspace = true
+unicode-script.workspace = true
 unindent = { workspace = true, optional = true }
 ui.workspace = true
 url.workspace = true
@@ -201,7 +201,7 @@ use ui::{
     ButtonSize, ButtonStyle, ContextMenu, Disclosure, IconButton, IconButtonShape, IconName,
     IconSize, Indicator, Key, Tooltip, h_flex, prelude::*,
 };
-use util::{RangeExt, ResultExt, TryFutureExt, maybe, post_inc, wrap_with_prefix};
+use util::{RangeExt, ResultExt, TryFutureExt, maybe, post_inc};
 use workspace::{
     CollaboratorId, Item as WorkspaceItem, ItemId, ItemNavHistory, OpenInTerminal, OpenTerminal,
     RestoreOnStartupBehavior, SERIALIZATION_THROTTLE_TIME, SplitDirection, TabBarSettings, Toast,
@@ -19587,6 +19587,347 @@ fn update_uncommitted_diff_for_buffer(
     })
 }

+fn char_len_with_expanded_tabs(offset: usize, text: &str, tab_size: NonZeroU32) -> usize {
+    let tab_size = tab_size.get() as usize;
+    let mut width = offset;
+
+    for ch in text.chars() {
+        width += if ch == '\t' {
+            tab_size - (width % tab_size)
+        } else {
+            1
+        };
+    }
+
+    width - offset
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_string_size_with_expanded_tabs() {
+        let nz = |val| NonZeroU32::new(val).unwrap();
+        assert_eq!(char_len_with_expanded_tabs(0, "", nz(4)), 0);
+        assert_eq!(char_len_with_expanded_tabs(0, "hello", nz(4)), 5);
+        assert_eq!(char_len_with_expanded_tabs(0, "\thello", nz(4)), 9);
+        assert_eq!(char_len_with_expanded_tabs(0, "abc\tab", nz(4)), 6);
+        assert_eq!(char_len_with_expanded_tabs(0, "hello\t", nz(4)), 8);
+        assert_eq!(char_len_with_expanded_tabs(0, "\t\t", nz(8)), 16);
+        assert_eq!(char_len_with_expanded_tabs(0, "x\t", nz(8)), 8);
+        assert_eq!(char_len_with_expanded_tabs(7, "x\t", nz(8)), 9);
+    }
+}
+
+/// Tokenizes a string into runs of text that should stick together, or that is whitespace.
+struct WordBreakingTokenizer<'a> {
+    input: &'a str,
+}
+
+impl<'a> WordBreakingTokenizer<'a> {
+    fn new(input: &'a str) -> Self {
+        Self { input }
+    }
+}
+
+fn is_char_ideographic(ch: char) -> bool {
+    use unicode_script::Script::*;
+    use unicode_script::UnicodeScript;
+    matches!(ch.script(), Han | Tangut | Yi)
+}
+
+fn is_grapheme_ideographic(text: &str) -> bool {
+    text.chars().any(is_char_ideographic)
+}
+
+fn is_grapheme_whitespace(text: &str) -> bool {
+    text.chars().any(|x| x.is_whitespace())
+}
+
+fn should_stay_with_preceding_ideograph(text: &str) -> bool {
+    text.chars().next().map_or(false, |ch| {
+        matches!(ch, '。' | '、' | ',' | '?' | '!' | ':' | ';' | '…')
+    })
+}
+
+#[derive(PartialEq, Eq, Debug, Clone, Copy)]
+enum WordBreakToken<'a> {
+    Word { token: &'a str, grapheme_len: usize },
+    InlineWhitespace { token: &'a str, grapheme_len: usize },
+    Newline,
+}
+
+impl<'a> Iterator for WordBreakingTokenizer<'a> {
+    /// Yields a span, the count of graphemes in the token, and whether it was
+    /// whitespace. Note that it also breaks at word boundaries.
+    type Item = WordBreakToken<'a>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        use unicode_segmentation::UnicodeSegmentation;
+        if self.input.is_empty() {
+            return None;
+        }
+
+        let mut iter = self.input.graphemes(true).peekable();
+        let mut offset = 0;
+        let mut grapheme_len = 0;
+        if let Some(first_grapheme) = iter.next() {
+            let is_newline = first_grapheme == "\n";
+            let is_whitespace = is_grapheme_whitespace(first_grapheme);
+            offset += first_grapheme.len();
+            grapheme_len += 1;
+            if is_grapheme_ideographic(first_grapheme) && !is_whitespace {
+                if let Some(grapheme) = iter.peek().copied() {
+                    if should_stay_with_preceding_ideograph(grapheme) {
+                        offset += grapheme.len();
+                        grapheme_len += 1;
+                    }
+                }
+            } else {
+                let mut words = self.input[offset..].split_word_bound_indices().peekable();
+                let mut next_word_bound = words.peek().copied();
+                if next_word_bound.map_or(false, |(i, _)| i == 0) {
+                    next_word_bound = words.next();
+                }
+                while let Some(grapheme) = iter.peek().copied() {
+                    if next_word_bound.map_or(false, |(i, _)| i == offset) {
+                        break;
+                    };
+                    if is_grapheme_whitespace(grapheme) != is_whitespace
+                        || (grapheme == "\n") != is_newline
+                    {
+                        break;
+                    };
+                    offset += grapheme.len();
+                    grapheme_len += 1;
+                    iter.next();
+                }
+            }
+            let token = &self.input[..offset];
+            self.input = &self.input[offset..];
+            if token == "\n" {
+                Some(WordBreakToken::Newline)
+            } else if is_whitespace {
+                Some(WordBreakToken::InlineWhitespace {
+                    token,
+                    grapheme_len,
+                })
+            } else {
+                Some(WordBreakToken::Word {
+                    token,
+                    grapheme_len,
+                })
+            }
+        } else {
+            None
+        }
+    }
+}
+
+#[test]
+fn test_word_breaking_tokenizer() {
+    let tests: &[(&str, &[WordBreakToken<'static>])] = &[
+        ("", &[]),
+        ("  ", &[whitespace("  ", 2)]),
+        ("Ʒ", &[word("Ʒ", 1)]),
+        ("Ǽ", &[word("Ǽ", 1)]),
+        ("⋑", &[word("⋑", 1)]),
+        ("⋑⋑", &[word("⋑⋑", 2)]),
+        (
+            "原理,进而",
+            &[word("原", 1), word("理,", 2), word("进", 1), word("而", 1)],
+        ),
+        (
+            "hello world",
+            &[word("hello", 5), whitespace(" ", 1), word("world", 5)],
+        ),
+        (
+            "hello, world",
+            &[word("hello,", 6), whitespace(" ", 1), word("world", 5)],
+        ),
+        (
+            "  hello world",
+            &[
+                whitespace("  ", 2),
+                word("hello", 5),
+                whitespace(" ", 1),
+                word("world", 5),
+            ],
+        ),
+        (
+            "这是什么 \n 钢笔",
+            &[
+                word("这", 1),
+                word("是", 1),
+                word("什", 1),
+                word("么", 1),
+                whitespace(" ", 1),
+                newline(),
+                whitespace(" ", 1),
+                word("钢", 1),
+                word("笔", 1),
+            ],
+        ),
+        (" mutton", &[whitespace(" ", 1), word("mutton", 6)]),
+    ];
+
+    fn word(token: &'static str, grapheme_len: usize) -> WordBreakToken<'static> {
+        WordBreakToken::Word {
+            token,
+            grapheme_len,
+        }
+    }
+
+    fn whitespace(token: &'static str, grapheme_len: usize) -> WordBreakToken<'static> {
+        WordBreakToken::InlineWhitespace {
+            token,
+            grapheme_len,
+        }
+    }
+
+    fn newline() -> WordBreakToken<'static> {
+        WordBreakToken::Newline
+    }
+
+    for (input, result) in tests {
+        assert_eq!(
+            WordBreakingTokenizer::new(input)
+                .collect::<Vec<_>>()
+                .as_slice(),
+            *result,
+        );
+    }
+}
+
+fn wrap_with_prefix(
+    line_prefix: String,
+    unwrapped_text: String,
+    wrap_column: usize,
+    tab_size: NonZeroU32,
+    preserve_existing_whitespace: bool,
+) -> String {
+    let line_prefix_len = char_len_with_expanded_tabs(0, &line_prefix, tab_size);
+    let mut wrapped_text = String::new();
+    let mut current_line = line_prefix.clone();
+
+    let tokenizer = WordBreakingTokenizer::new(&unwrapped_text);
+    let mut current_line_len = line_prefix_len;
+    let mut in_whitespace = false;
+    for token in tokenizer {
+        let have_preceding_whitespace = in_whitespace;
+        match token {
+            WordBreakToken::Word {
+                token,
+                grapheme_len,
+            } => {
+                in_whitespace = false;
+                if current_line_len + grapheme_len > wrap_column
+                    && current_line_len != line_prefix_len
+                {
+                    wrapped_text.push_str(current_line.trim_end());
+                    wrapped_text.push('\n');
+                    current_line.truncate(line_prefix.len());
+                    current_line_len = line_prefix_len;
+                }
+                current_line.push_str(token);
+                current_line_len += grapheme_len;
+            }
+            WordBreakToken::InlineWhitespace {
+                mut token,
+                mut grapheme_len,
+            } => {
+                in_whitespace = true;
+                if have_preceding_whitespace && !preserve_existing_whitespace {
+                    continue;
+                }
+                if !preserve_existing_whitespace {
+                    token = " ";
+                    grapheme_len = 1;
+                }
+                if current_line_len + grapheme_len > wrap_column {
+                    wrapped_text.push_str(current_line.trim_end());
+                    wrapped_text.push('\n');
+                    current_line.truncate(line_prefix.len());
+                    current_line_len = line_prefix_len;
+                } else if current_line_len != line_prefix_len || preserve_existing_whitespace {
+                    current_line.push_str(token);
+                    current_line_len += grapheme_len;
+                }
+            }
+            WordBreakToken::Newline => {
+                in_whitespace = true;
+                if preserve_existing_whitespace {
+                    wrapped_text.push_str(current_line.trim_end());
+                    wrapped_text.push('\n');
+                    current_line.truncate(line_prefix.len());
+                    current_line_len = line_prefix_len;
+                } else if have_preceding_whitespace {
+                    continue;
+                } else if current_line_len + 1 > wrap_column && current_line_len != line_prefix_len
+                {
+                    wrapped_text.push_str(current_line.trim_end());
+                    wrapped_text.push('\n');
+                    current_line.truncate(line_prefix.len());
+                    current_line_len = line_prefix_len;
+                } else if current_line_len != line_prefix_len {
+                    current_line.push(' ');
+                    current_line_len += 1;
+                }
+            }
+        }
+    }
+
+    if !current_line.is_empty() {
+        wrapped_text.push_str(&current_line);
+    }
+    wrapped_text
+}
+
+#[test]
+fn test_wrap_with_prefix() {
+    assert_eq!(
+        wrap_with_prefix(
+            "# ".to_string(),
+            "abcdefg".to_string(),
+            4,
+            NonZeroU32::new(4).unwrap(),
+            false,
+        ),
+        "# abcdefg"
+    );
+    assert_eq!(
+        wrap_with_prefix(
+            "".to_string(),
+            "\thello world".to_string(),
+            8,
+            NonZeroU32::new(4).unwrap(),
+            false,
+        ),
+        "hello\nworld"
+    );
+    assert_eq!(
+        wrap_with_prefix(
+            "// ".to_string(),
+            "xx \nyy zz aa bb cc".to_string(),
+            12,
+            NonZeroU32::new(4).unwrap(),
+            false,
+        ),
+        "// xx yy zz\n// aa bb cc"
+    );
+    assert_eq!(
+        wrap_with_prefix(
+            String::new(),
+            "这是什么 \n 钢笔".to_string(),
+            3,
+            NonZeroU32::new(4).unwrap(),
+            false,
+        ),
+        "这是什\n么 钢\n笔"
+    );
+}
+
 pub trait CollaborationHub {
     fn collaborators<'a>(&self, cx: &'a App) -> &'a HashMap<PeerId, Collaborator>;
     fn user_participant_indices<'a>(&self, cx: &'a App) -> &'a HashMap<u64, ParticipantIndex>;
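The tab arithmetic in char_len_with_expanded_tabs above is worth tracing once: a '\t' has no fixed width, it advances the column to the next multiple of tab_size, which is why "abc\tab" measures 6 rather than 8 with a tab size of 4. A minimal standalone sketch of the same rule (editorial illustration, not part of the commit; expanded_width is a hypothetical name):

    // Each tab advances to the next multiple of `tab_size`; other chars count as 1.
    fn expanded_width(text: &str, tab_size: usize, start_column: usize) -> usize {
        let mut column = start_column;
        for ch in text.chars() {
            column += if ch == '\t' {
                tab_size - column % tab_size
            } else {
                1
            };
        }
        column - start_column
    }

    fn main() {
        // 'a' -> 1, 'b' -> 2, 'c' -> 3, '\t' -> 4, 'a' -> 5, 'b' -> 6
        assert_eq!(expanded_width("abc\tab", 4, 0), 6);
        // Starting at column 7: 'x' -> 8, then '\t' jumps to 16; 16 - 7 = 9
        assert_eq!(expanded_width("x\t", 8, 7), 9);
    }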
@@ -7607,7 +7607,10 @@ impl Element for EditorElement {
             editor.gutter_dimensions = gutter_dimensions;
             editor.set_visible_line_count(bounds.size.height / line_height, window, cx);

-            if matches!(editor.mode, EditorMode::Minimap { .. }) {
+            if matches!(
+                editor.mode,
+                EditorMode::AutoHeight { .. } | EditorMode::Minimap { .. }
+            ) {
                 snapshot
             } else {
                 let wrap_width_for = |column: u32| (column as f32 * em_advance).ceil();
@@ -9626,7 +9629,6 @@ fn compute_auto_height_layout(
     let font_size = style.text.font_size.to_pixels(window.rem_size());
     let line_height = style.text.line_height_in_pixels(window.rem_size());
     let em_width = window.text_system().em_width(font_id, font_size).unwrap();
-    let em_advance = window.text_system().em_advance(font_id, font_size).unwrap();

     let mut snapshot = editor.snapshot(window, cx);
     let gutter_dimensions = snapshot
@@ -9643,18 +9645,10 @@ fn compute_auto_height_layout(
     let overscroll = size(em_width, px(0.));

     let editor_width = text_width - gutter_dimensions.margin - overscroll.width - em_width;
-    let content_offset = point(gutter_dimensions.margin, Pixels::ZERO);
-    let editor_content_width = editor_width - content_offset.x;
-    let wrap_width_for = |column: u32| (column as f32 * em_advance).ceil();
-    let wrap_width = match editor.soft_wrap_mode(cx) {
-        SoftWrap::GitDiff => None,
-        SoftWrap::None => Some(wrap_width_for(MAX_LINE_LEN as u32 / 2)),
-        SoftWrap::EditorWidth => Some(editor_content_width),
-        SoftWrap::Column(column) => Some(wrap_width_for(column)),
-        SoftWrap::Bounded(column) => Some(editor_content_width.min(wrap_width_for(column))),
-    };
-    if editor.set_wrap_width(wrap_width, cx) {
-        snapshot = editor.snapshot(window, cx);
+    if !matches!(editor.soft_wrap_mode(cx), SoftWrap::None) {
+        if editor.set_wrap_width(Some(editor_width), cx) {
+            snapshot = editor.snapshot(window, cx);
+        }
     }

     let scroll_height = (snapshot.max_point().row().next_row().0 as f32) * line_height;
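On the wrap-width math kept as context above: wrap_width_for converts a column count into a pixel width using the font's em advance, rounding up. A small numeric sketch (editorial illustration; the 7.25 px em advance is invented):

    fn main() {
        let em_advance: f32 = 7.25; // hypothetical font metric
        let wrap_width_for = |column: u32| (column as f32 * em_advance).ceil();
        assert_eq!(wrap_width_for(80), 580.0); // an 80-column wrap becomes 580 px here
    }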
@@ -54,7 +54,6 @@ use project::{
 use serde::{Deserialize, Serialize};
 use settings::{Settings as _, SettingsStore};
 use std::future::Future;
-use std::num::NonZeroU32;
 use std::path::{Path, PathBuf};
 use std::{collections::HashSet, sync::Arc, time::Duration, usize};
 use strum::{IntoEnumIterator, VariantNames};
@@ -63,7 +62,7 @@ use ui::{
     Checkbox, ContextMenu, ElevationIndex, PopoverMenu, Scrollbar, ScrollbarState, SplitButton,
     Tooltip, prelude::*,
 };
-use util::{ResultExt, TryFutureExt, maybe, wrap_with_prefix};
+use util::{ResultExt, TryFutureExt, maybe};
 use workspace::AppState;

 use notifications::status_toast::{StatusToast, ToastIcon};
@@ -385,6 +384,7 @@ pub(crate) fn commit_message_editor(
     commit_editor.set_show_gutter(false, cx);
     commit_editor.set_show_wrap_guides(false, cx);
     commit_editor.set_show_indent_guides(false, cx);
+    commit_editor.set_hard_wrap(Some(72), cx);
     let placeholder = placeholder.unwrap_or("Enter commit message".into());
     commit_editor.set_placeholder_text(placeholder, cx);
     commit_editor
@@ -1486,22 +1486,8 @@ impl GitPanel {

     fn custom_or_suggested_commit_message(&self, cx: &mut Context<Self>) -> Option<String> {
         let message = self.commit_editor.read(cx).text(cx);
-        let width = self
-            .commit_editor
-            .read(cx)
-            .buffer()
-            .read(cx)
-            .language_settings(cx)
-            .preferred_line_length as usize;

         if !message.trim().is_empty() {
-            let message = wrap_with_prefix(
-                String::new(),
-                message,
-                width,
-                NonZeroU32::new(8).unwrap(), // tab size doesn't matter when prefix is empty
-                false,
-            );
             return Some(message);
         }

@@ -680,7 +680,7 @@ pub struct CodeLabel {
     pub filter_range: Range<usize>,
 }

-#[derive(Clone, Debug, Deserialize, JsonSchema)]
+#[derive(Clone, Deserialize, JsonSchema)]
 pub struct LanguageConfig {
     /// Human-readable name of the language.
     pub name: LanguageName,
@@ -791,7 +791,7 @@ pub struct LanguageMatcher {
 }

 /// The configuration for JSX tag auto-closing.
-#[derive(Clone, Debug, Deserialize, JsonSchema)]
+#[derive(Clone, Deserialize, JsonSchema)]
 pub struct JsxTagAutoCloseConfig {
     /// The name of the node for a opening tag
     pub open_tag_node_name: String,
@@ -824,7 +824,7 @@ pub struct JsxTagAutoCloseConfig {
 }

 /// The configuration for documentation block for this language.
-#[derive(Clone, Debug, Deserialize, JsonSchema)]
+#[derive(Clone, Deserialize, JsonSchema)]
 pub struct DocumentationConfig {
     /// A start tag of documentation block.
     pub start: Arc<str>,
@@ -37,8 +37,6 @@ smol.workspace = true
 take-until.workspace = true
 tempfile.workspace = true
 unicase.workspace = true
-unicode-script.workspace = true
-unicode-segmentation.workspace = true
 util_macros = { workspace = true, optional = true }
 walkdir.workspace = true
 workspace-hack.workspace = true
@@ -14,7 +14,6 @@ use anyhow::Result;
 use futures::Future;
 use itertools::Either;
 use regex::Regex;
-use std::num::NonZeroU32;
 use std::sync::{LazyLock, OnceLock};
 use std::{
     borrow::Cow,
@@ -184,208 +183,29 @@ pub fn truncate_lines_to_byte_limit(s: &str, max_bytes: usize) -> &str {
     truncate_to_byte_limit(s, max_bytes)
 }

-fn char_len_with_expanded_tabs(offset: usize, text: &str, tab_size: NonZeroU32) -> usize {
-    let tab_size = tab_size.get() as usize;
-    let mut width = offset;
+#[test]
+fn test_truncate_lines_to_byte_limit() {
+    let text = "Line 1\nLine 2\nLine 3\nLine 4";

-    for ch in text.chars() {
-        width += if ch == '\t' {
-            tab_size - (width % tab_size)
-        } else {
-            1
-        };
-    }
+    // Limit that includes all lines
+    assert_eq!(truncate_lines_to_byte_limit(text, 100), text);

-    width - offset
-}
+    // Exactly the first line
+    assert_eq!(truncate_lines_to_byte_limit(text, 7), "Line 1\n");

-/// Tokenizes a string into runs of text that should stick together, or that is whitespace.
-struct WordBreakingTokenizer<'a> {
-    input: &'a str,
-}
+    // Limit between lines
+    assert_eq!(truncate_lines_to_byte_limit(text, 13), "Line 1\n");
+    assert_eq!(truncate_lines_to_byte_limit(text, 20), "Line 1\nLine 2\n");

-impl<'a> WordBreakingTokenizer<'a> {
-    fn new(input: &'a str) -> Self {
-        Self { input }
-    }
-}
+    // Limit before first newline
+    assert_eq!(truncate_lines_to_byte_limit(text, 6), "Line ");

-fn is_char_ideographic(ch: char) -> bool {
-    use unicode_script::Script::*;
-    use unicode_script::UnicodeScript;
-    matches!(ch.script(), Han | Tangut | Yi)
-}
-
-fn is_grapheme_ideographic(text: &str) -> bool {
-    text.chars().any(is_char_ideographic)
-}
-
-fn is_grapheme_whitespace(text: &str) -> bool {
-    text.chars().any(|x| x.is_whitespace())
-}
-
-fn should_stay_with_preceding_ideograph(text: &str) -> bool {
-    text.chars().next().map_or(false, |ch| {
-        matches!(ch, '。' | '、' | ',' | '?' | '!' | ':' | ';' | '…')
-    })
-}
-
-#[derive(PartialEq, Eq, Debug, Clone, Copy)]
-enum WordBreakToken<'a> {
-    Word { token: &'a str, grapheme_len: usize },
-    InlineWhitespace { token: &'a str, grapheme_len: usize },
-    Newline,
-}
-
-impl<'a> Iterator for WordBreakingTokenizer<'a> {
-    /// Yields a span, the count of graphemes in the token, and whether it was
-    /// whitespace. Note that it also breaks at word boundaries.
-    type Item = WordBreakToken<'a>;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        use unicode_segmentation::UnicodeSegmentation;
-        if self.input.is_empty() {
-            return None;
-        }
-
-        let mut iter = self.input.graphemes(true).peekable();
-        let mut offset = 0;
-        let mut grapheme_len = 0;
-        if let Some(first_grapheme) = iter.next() {
-            let is_newline = first_grapheme == "\n";
-            let is_whitespace = is_grapheme_whitespace(first_grapheme);
-            offset += first_grapheme.len();
-            grapheme_len += 1;
-            if is_grapheme_ideographic(first_grapheme) && !is_whitespace {
-                if let Some(grapheme) = iter.peek().copied() {
-                    if should_stay_with_preceding_ideograph(grapheme) {
-                        offset += grapheme.len();
-                        grapheme_len += 1;
-                    }
-                }
-            } else {
-                let mut words = self.input[offset..].split_word_bound_indices().peekable();
-                let mut next_word_bound = words.peek().copied();
-                if next_word_bound.map_or(false, |(i, _)| i == 0) {
-                    next_word_bound = words.next();
-                }
-                while let Some(grapheme) = iter.peek().copied() {
-                    if next_word_bound.map_or(false, |(i, _)| i == offset) {
-                        break;
-                    };
-                    if is_grapheme_whitespace(grapheme) != is_whitespace
-                        || (grapheme == "\n") != is_newline
-                    {
-                        break;
-                    };
-                    offset += grapheme.len();
-                    grapheme_len += 1;
-                    iter.next();
-                }
-            }
-            let token = &self.input[..offset];
-            self.input = &self.input[offset..];
-            if token == "\n" {
-                Some(WordBreakToken::Newline)
-            } else if is_whitespace {
-                Some(WordBreakToken::InlineWhitespace {
-                    token,
-                    grapheme_len,
-                })
-            } else {
-                Some(WordBreakToken::Word {
-                    token,
-                    grapheme_len,
-                })
-            }
-        } else {
-            None
-        }
-    }
-}
-
-pub fn wrap_with_prefix(
-    line_prefix: String,
-    unwrapped_text: String,
-    wrap_column: usize,
-    tab_size: NonZeroU32,
-    preserve_existing_whitespace: bool,
-) -> String {
-    let line_prefix_len = char_len_with_expanded_tabs(0, &line_prefix, tab_size);
-    let mut wrapped_text = String::new();
-    let mut current_line = line_prefix.clone();
-
-    let tokenizer = WordBreakingTokenizer::new(&unwrapped_text);
-    let mut current_line_len = line_prefix_len;
-    let mut in_whitespace = false;
-    for token in tokenizer {
-        let have_preceding_whitespace = in_whitespace;
-        match token {
-            WordBreakToken::Word {
-                token,
-                grapheme_len,
-            } => {
-                in_whitespace = false;
-                if current_line_len + grapheme_len > wrap_column
-                    && current_line_len != line_prefix_len
-                {
-                    wrapped_text.push_str(current_line.trim_end());
-                    wrapped_text.push('\n');
-                    current_line.truncate(line_prefix.len());
-                    current_line_len = line_prefix_len;
-                }
-                current_line.push_str(token);
-                current_line_len += grapheme_len;
-            }
-            WordBreakToken::InlineWhitespace {
-                mut token,
-                mut grapheme_len,
-            } => {
-                in_whitespace = true;
-                if have_preceding_whitespace && !preserve_existing_whitespace {
-                    continue;
-                }
-                if !preserve_existing_whitespace {
-                    token = " ";
-                    grapheme_len = 1;
-                }
-                if current_line_len + grapheme_len > wrap_column {
-                    wrapped_text.push_str(current_line.trim_end());
-                    wrapped_text.push('\n');
-                    current_line.truncate(line_prefix.len());
-                    current_line_len = line_prefix_len;
-                } else if current_line_len != line_prefix_len || preserve_existing_whitespace {
-                    current_line.push_str(token);
-                    current_line_len += grapheme_len;
-                }
-            }
-            WordBreakToken::Newline => {
-                in_whitespace = true;
-                if preserve_existing_whitespace {
-                    wrapped_text.push_str(current_line.trim_end());
-                    wrapped_text.push('\n');
-                    current_line.truncate(line_prefix.len());
-                    current_line_len = line_prefix_len;
-                } else if have_preceding_whitespace {
-                    continue;
-                } else if current_line_len + 1 > wrap_column && current_line_len != line_prefix_len
-                {
-                    wrapped_text.push_str(current_line.trim_end());
-                    wrapped_text.push('\n');
-                    current_line.truncate(line_prefix.len());
-                    current_line_len = line_prefix_len;
-                } else if current_line_len != line_prefix_len {
-                    current_line.push(' ');
-                    current_line_len += 1;
-                }
-            }
-        }
-    }
-
-    if !current_line.is_empty() {
-        wrapped_text.push_str(&current_line);
-    }
-    wrapped_text
+    // Test with non-ASCII characters
+    let text_utf8 = "Line 1\nLíne 2\nLine 3";
+    assert_eq!(
+        truncate_lines_to_byte_limit(text_utf8, 15),
+        "Line 1\nLíne 2\n"
+    );
 }

 pub fn post_inc<T: From<u8> + AddAssign<T> + Copy>(value: &mut T) -> T {
@@ -1581,163 +1401,6 @@ Line 3"#
     );
 }

-#[test]
-fn test_truncate_lines_to_byte_limit() {
-    let text = "Line 1\nLine 2\nLine 3\nLine 4";
-
-    // Limit that includes all lines
-    assert_eq!(truncate_lines_to_byte_limit(text, 100), text);
-
-    // Exactly the first line
-    assert_eq!(truncate_lines_to_byte_limit(text, 7), "Line 1\n");
-
-    // Limit between lines
-    assert_eq!(truncate_lines_to_byte_limit(text, 13), "Line 1\n");
-    assert_eq!(truncate_lines_to_byte_limit(text, 20), "Line 1\nLine 2\n");
-
-    // Limit before first newline
-    assert_eq!(truncate_lines_to_byte_limit(text, 6), "Line ");
-
-    // Test with non-ASCII characters
-    let text_utf8 = "Line 1\nLíne 2\nLine 3";
-    assert_eq!(
-        truncate_lines_to_byte_limit(text_utf8, 15),
-        "Line 1\nLíne 2\n"
-    );
-}
-
-#[test]
-fn test_string_size_with_expanded_tabs() {
-    let nz = |val| NonZeroU32::new(val).unwrap();
-    assert_eq!(char_len_with_expanded_tabs(0, "", nz(4)), 0);
-    assert_eq!(char_len_with_expanded_tabs(0, "hello", nz(4)), 5);
-    assert_eq!(char_len_with_expanded_tabs(0, "\thello", nz(4)), 9);
-    assert_eq!(char_len_with_expanded_tabs(0, "abc\tab", nz(4)), 6);
-    assert_eq!(char_len_with_expanded_tabs(0, "hello\t", nz(4)), 8);
-    assert_eq!(char_len_with_expanded_tabs(0, "\t\t", nz(8)), 16);
-    assert_eq!(char_len_with_expanded_tabs(0, "x\t", nz(8)), 8);
-    assert_eq!(char_len_with_expanded_tabs(7, "x\t", nz(8)), 9);
-}
-
-#[test]
-fn test_word_breaking_tokenizer() {
-    let tests: &[(&str, &[WordBreakToken<'static>])] = &[
-        ("", &[]),
-        ("  ", &[whitespace("  ", 2)]),
-        ("Ʒ", &[word("Ʒ", 1)]),
-        ("Ǽ", &[word("Ǽ", 1)]),
-        ("⋑", &[word("⋑", 1)]),
-        ("⋑⋑", &[word("⋑⋑", 2)]),
-        (
-            "原理,进而",
-            &[word("原", 1), word("理,", 2), word("进", 1), word("而", 1)],
-        ),
-        (
-            "hello world",
-            &[word("hello", 5), whitespace(" ", 1), word("world", 5)],
-        ),
-        (
-            "hello, world",
-            &[word("hello,", 6), whitespace(" ", 1), word("world", 5)],
-        ),
-        (
-            "  hello world",
-            &[
-                whitespace("  ", 2),
-                word("hello", 5),
-                whitespace(" ", 1),
-                word("world", 5),
-            ],
-        ),
-        (
-            "这是什么 \n 钢笔",
-            &[
-                word("这", 1),
-                word("是", 1),
-                word("什", 1),
-                word("么", 1),
-                whitespace(" ", 1),
-                newline(),
-                whitespace(" ", 1),
-                word("钢", 1),
-                word("笔", 1),
-            ],
-        ),
-        (" mutton", &[whitespace(" ", 1), word("mutton", 6)]),
-    ];
-
-    fn word(token: &'static str, grapheme_len: usize) -> WordBreakToken<'static> {
-        WordBreakToken::Word {
-            token,
-            grapheme_len,
-        }
-    }
-
-    fn whitespace(token: &'static str, grapheme_len: usize) -> WordBreakToken<'static> {
-        WordBreakToken::InlineWhitespace {
-            token,
-            grapheme_len,
-        }
-    }
-
-    fn newline() -> WordBreakToken<'static> {
-        WordBreakToken::Newline
-    }
-
-    for (input, result) in tests {
-        assert_eq!(
-            WordBreakingTokenizer::new(input)
-                .collect::<Vec<_>>()
-                .as_slice(),
-            *result,
-        );
-    }
-}
-
-#[test]
-fn test_wrap_with_prefix() {
-    assert_eq!(
-        wrap_with_prefix(
-            "# ".to_string(),
-            "abcdefg".to_string(),
-            4,
-            NonZeroU32::new(4).unwrap(),
-            false,
-        ),
-        "# abcdefg"
-    );
-    assert_eq!(
-        wrap_with_prefix(
-            "".to_string(),
-            "\thello world".to_string(),
-            8,
-            NonZeroU32::new(4).unwrap(),
-            false,
-        ),
-        "hello\nworld"
-    );
-    assert_eq!(
-        wrap_with_prefix(
-            "// ".to_string(),
-            "xx \nyy zz aa bb cc".to_string(),
-            12,
-            NonZeroU32::new(4).unwrap(),
-            false,
-        ),
-        "// xx yy zz\n// aa bb cc"
-    );
-    assert_eq!(
-        wrap_with_prefix(
-            String::new(),
-            "这是什么 \n 钢笔".to_string(),
-            3,
-            NonZeroU32::new(4).unwrap(),
-            false,
-        ),
-        "这是什\n么 钢\n笔"
-    );
-}
-
 #[test]
 fn test_split_with_ranges() {
     let input = "hi";