search: Add heuristic for discarding matching of binary files (#23581)

Fixes #23398 
Closes #23398

We'll bail on searches of files that we know are binary (thus even if we
were to find a match in them, they'd be thrown away by buffer loader).

Release Notes:

- Improved project search performance in worktrees with binary files
This commit is contained in:
Piotr Osiewicz 2025-01-23 23:15:58 +01:00 committed by GitHub
parent 35ddb432b3
commit fb63f61755
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 23 additions and 7 deletions

View file

@ -101,7 +101,7 @@ pub trait Fs: Send + Sync {
self.remove_file(path, options).await self.remove_file(path, options).await
} }
async fn open_handle(&self, path: &Path) -> Result<Arc<dyn FileHandle>>; async fn open_handle(&self, path: &Path) -> Result<Arc<dyn FileHandle>>;
async fn open_sync(&self, path: &Path) -> Result<Box<dyn io::Read>>; async fn open_sync(&self, path: &Path) -> Result<Box<dyn io::Read + Send + Sync>>;
async fn load(&self, path: &Path) -> Result<String> { async fn load(&self, path: &Path) -> Result<String> {
Ok(String::from_utf8(self.load_bytes(path).await?)?) Ok(String::from_utf8(self.load_bytes(path).await?)?)
} }
@ -499,7 +499,7 @@ impl Fs for RealFs {
Ok(()) Ok(())
} }
async fn open_sync(&self, path: &Path) -> Result<Box<dyn io::Read>> { async fn open_sync(&self, path: &Path) -> Result<Box<dyn io::Read + Send + Sync>> {
Ok(Box::new(std::fs::File::open(path)?)) Ok(Box::new(std::fs::File::open(path)?))
} }
@ -1746,7 +1746,7 @@ impl Fs for FakeFs {
Ok(()) Ok(())
} }
async fn open_sync(&self, path: &Path) -> Result<Box<dyn io::Read>> { async fn open_sync(&self, path: &Path) -> Result<Box<dyn io::Read + Send + Sync>> {
let bytes = self.load_internal(path).await?; let bytes = self.load_internal(path).await?;
Ok(Box::new(io::Cursor::new(bytes))) Ok(Box::new(io::Cursor::new(bytes)))
} }

View file

@ -210,14 +210,17 @@ impl SearchQuery {
} }
} }
pub fn detect<T: Read>(&self, stream: T) -> Result<bool> { pub(crate) fn detect(
&self,
mut reader: BufReader<Box<dyn Read + Send + Sync>>,
) -> Result<bool> {
if self.as_str().is_empty() { if self.as_str().is_empty() {
return Ok(false); return Ok(false);
} }
match self { match self {
Self::Text { search, .. } => { Self::Text { search, .. } => {
let mat = search.stream_find_iter(stream).next(); let mat = search.stream_find_iter(reader).next();
match mat { match mat {
Some(Ok(_)) => Ok(true), Some(Ok(_)) => Ok(true),
Some(Err(err)) => Err(err.into()), Some(Err(err)) => Err(err.into()),
@ -227,7 +230,6 @@ impl SearchQuery {
Self::Regex { Self::Regex {
regex, multiline, .. regex, multiline, ..
} => { } => {
let mut reader = BufReader::new(stream);
if *multiline { if *multiline {
let mut text = String::new(); let mut text = String::new();
if let Err(err) = reader.read_to_string(&mut text) { if let Err(err) = reader.read_to_string(&mut text) {

View file

@ -1,4 +1,5 @@
use std::{ use std::{
io::{BufRead, BufReader},
path::{Path, PathBuf}, path::{Path, PathBuf},
pin::pin, pin::pin,
sync::{atomic::AtomicUsize, Arc}, sync::{atomic::AtomicUsize, Arc},
@ -985,7 +986,6 @@ impl WorktreeStore {
} }
repo.change_branch(&new_branch)?; repo.change_branch(&new_branch)?;
Ok(()) Ok(())
}); });
@ -1020,6 +1020,20 @@ impl WorktreeStore {
let Some(file) = fs.open_sync(&abs_path).await.log_err() else { let Some(file) = fs.open_sync(&abs_path).await.log_err() else {
continue; continue;
}; };
let mut file = BufReader::new(file);
let file_start = file.fill_buf()?;
if let Err(Some(starting_position)) =
std::str::from_utf8(file_start).map_err(|e| e.error_len())
{
// Before attempting to match the file content, throw away files that have invalid UTF-8 sequences early on;
// That way we can still match files in a streaming fashion without having look at "obviously binary" files.
return Err(anyhow!(
"Invalid UTF-8 sequence at position {starting_position}"
));
}
if query.detect(file).unwrap_or(false) { if query.detect(file).unwrap_or(false) {
entry.respond.send(entry.path).await? entry.respond.send(entry.path).await?
} }