search: Add heuristic for discarding matching of binary files (#23581)

Fixes #23398 
Closes #23398

We now bail out early when searching files that we know are binary: even if
we were to find a match in them, it would be thrown away by the buffer loader.

Release Notes:

- Improved project search performance in worktrees with binary files
This commit is contained in:
Piotr Osiewicz 2025-01-23 23:15:58 +01:00 committed by GitHub
parent 35ddb432b3
commit fb63f61755
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 23 additions and 7 deletions

View file

@ -210,14 +210,17 @@ impl SearchQuery {
}
}
pub fn detect<T: Read>(&self, stream: T) -> Result<bool> {
pub(crate) fn detect(
&self,
mut reader: BufReader<Box<dyn Read + Send + Sync>>,
) -> Result<bool> {
if self.as_str().is_empty() {
return Ok(false);
}
match self {
Self::Text { search, .. } => {
let mat = search.stream_find_iter(stream).next();
let mat = search.stream_find_iter(reader).next();
match mat {
Some(Ok(_)) => Ok(true),
Some(Err(err)) => Err(err.into()),
@ -227,7 +230,6 @@ impl SearchQuery {
Self::Regex {
regex, multiline, ..
} => {
let mut reader = BufReader::new(stream);
if *multiline {
let mut text = String::new();
if let Err(err) = reader.read_to_string(&mut text) {

View file

@ -1,4 +1,5 @@
use std::{
io::{BufRead, BufReader},
path::{Path, PathBuf},
pin::pin,
sync::{atomic::AtomicUsize, Arc},
@ -985,7 +986,6 @@ impl WorktreeStore {
}
repo.change_branch(&new_branch)?;
Ok(())
});
@ -1020,6 +1020,20 @@ impl WorktreeStore {
let Some(file) = fs.open_sync(&abs_path).await.log_err() else {
continue;
};
let mut file = BufReader::new(file);
let file_start = file.fill_buf()?;
if let Err(Some(starting_position)) =
std::str::from_utf8(file_start).map_err(|e| e.error_len())
{
// Before attempting to match the file content, throw away files that have invalid UTF-8 sequences early on;
// that way we can still match files in a streaming fashion without having to look at "obviously binary" files.
return Err(anyhow!(
"Invalid UTF-8 sequence at position {starting_position}"
));
}
if query.detect(file).unwrap_or(false) {
entry.respond.send(entry.path).await?
}