agent: Don't track large and common binary files (#31352)

## Issue

The agent may run very slowly on projects that contain many or large
binary files not listed in `.gitignore`.


## Solution

Temporarily rewrite `.git/info/exclude` to ignore:
- Common binary files based on the extension
- Files larger than 2 MB

## Benchmark

I measured the time between sending an agent message in the UI ("hitting
Enter") and actually sending it to an LLM. Ideally, it should be
instant. The numbers below are for a 7.7 GB Rust project with no `.gitignore`.

Filter                            | Time
----------------------------------|-----
No filter (= before this change)  | 62 s
Exclude common file types only    | 1.46 s
Exclude files >2MB only           | 1.16 s
Exclude both                      | 0.10 s


## Planned changes:

- [x] Exclude common binary file types
- [x] Exclude large files
- [ ] Track files added by the agent so we can delete them (we can't rely
on git for that anymore)
- [ ] Don't block on waiting for a checkpoint to complete until we
really need it
- [ ] Only `git add` files that are about to change


Closes #ISSUE

Release Notes:

- Improved agent latency on repositories containing many files or large
files
This commit is contained in:
Oleksiy Syvokon 2025-05-26 14:31:25 +03:00 committed by GitHub
parent 134463f043
commit 9da9ef860b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 307 additions and 20 deletions

View file

@ -0,0 +1,91 @@
# This lists files that we don't track in checkpoints
# Compiled source and executables
*.exe
*.dll
*.so
*.dylib
*.a
*.lib
*.o
*.obj
*.elf
*.out
*.app
*.deb
*.rpm
*.dmg
*.pkg
*.msi
# Archives and compressed files
*.7z
*.zip
*.tar
*.tar.gz
*.tgz
*.tar.bz2
*.tbz2
*.tar.xz
*.txz
*.rar
*.jar
*.war
*.ear
# Media files
*.jpg
*.jpeg
*.png
*.gif
*.ico
*.svg
*.webp
*.bmp
*.tiff
*.mp3
*.mp4
*.avi
*.mov
*.wmv
*.flv
*.mkv
*.webm
*.wav
*.flac
*.aac
# Database files
*.db
*.sqlite
*.sqlite3
*.mdb
# Documents (often binary)
*.pdf
*.doc
*.docx
*.xls
*.xlsx
*.ppt
*.pptx
# IDE and editor files
.idea/
.vscode/
*.swp
*.swo
*~
.DS_Store
Thumbs.db
# Language-specific files
*.rlib
*.rmeta
*.pdb
*.class
*.egg
*.egg-info/
*.pyc
*.pto
__pycache__

View file

@ -193,6 +193,72 @@ pub enum ResetMode {
Mixed,
}
/// Temporarily overrides the contents of a git exclude file
/// (such as `.git/info/exclude`).
///
/// Extra patterns are appended with `add_excludes`; `restore_original` (or,
/// as a fallback, the `Drop` impl) puts the file back the way it was found.
pub struct GitExcludeOverride {
/// Path of the exclude file being overridden.
git_exclude_path: PathBuf,
/// Contents of the file captured by `new`; `None` when nothing was captured.
original_excludes: Option<String>,
/// Patterns added so far, newline-separated; `None` until `add_excludes`
/// is called and again after `restore_original`.
added_excludes: Option<String>,
}
impl GitExcludeOverride {
    /// Captures the current contents of the exclude file at `git_exclude_path`
    /// so they can be restored later.
    ///
    /// A missing file is remembered as "no original contents" and will be
    /// removed again on restore.
    ///
    /// # Errors
    ///
    /// Fails if the file exists but cannot be read (for example because of
    /// permissions or non-UTF-8 contents). Treating such a file as absent
    /// would make the restore step delete the user's real exclude file.
    pub async fn new(git_exclude_path: PathBuf) -> Result<Self> {
        let original_excludes = match smol::fs::read_to_string(&git_exclude_path).await {
            Ok(contents) => Some(contents),
            Err(error) if error.kind() == std::io::ErrorKind::NotFound => None,
            Err(error) => return Err(error.into()),
        };
        Ok(GitExcludeOverride {
            git_exclude_path,
            original_excludes,
            added_excludes: None,
        })
    }

    /// Appends `excludes` to the patterns added so far and rewrites the
    /// exclude file as: original contents, a marker comment, then every
    /// added pattern.
    ///
    /// # Errors
    ///
    /// Fails if the exclude file cannot be written.
    pub async fn add_excludes(&mut self, excludes: &str) -> Result<()> {
        let added = match self.added_excludes.take() {
            Some(mut already_added) => {
                already_added.push('\n');
                already_added.push_str(excludes);
                already_added
            }
            None => excludes.to_string(),
        };

        let mut content = self.original_excludes.clone().unwrap_or_default();
        content.push_str("\n\n# ====== Auto-added by Zed: =======\n");
        content.push_str(&added);
        content.push('\n');

        // Record the additions before writing: if the write fails part-way,
        // `Drop` still knows the file may need to be restored.
        self.added_excludes = Some(added);
        smol::fs::write(&self.git_exclude_path, content).await?;
        Ok(())
    }

    /// Puts the exclude file back the way `new` found it: rewrites the
    /// original contents, or removes the file if there was none.
    ///
    /// Does nothing when no excludes have been added, matching the `Drop`
    /// impl, so a file this override never touched is left alone.
    ///
    /// # Errors
    ///
    /// Fails if the file cannot be written or removed. A file that is already
    /// gone is not an error.
    pub async fn restore_original(&mut self) -> Result<()> {
        if self.added_excludes.is_none() {
            return Ok(());
        }
        match &self.original_excludes {
            Some(original) => smol::fs::write(&self.git_exclude_path, original).await?,
            // Remove directly instead of checking `exists()` first: that call
            // blocks the executor and races with the removal.
            None => match smol::fs::remove_file(&self.git_exclude_path).await {
                Ok(()) => {}
                Err(error) if error.kind() == std::io::ErrorKind::NotFound => {}
                Err(error) => return Err(error.into()),
            },
        }
        self.added_excludes = None;
        Ok(())
    }
}
impl Drop for GitExcludeOverride {
    /// Best-effort fallback restore for overrides that are dropped while
    /// excludes are still recorded as added.
    ///
    /// The restore runs on a detached background task, so its result is not
    /// observed here.
    fn drop(&mut self) {
        // Nothing recorded as added: leave the file alone.
        if self.added_excludes.is_none() {
            return;
        }

        let exclude_path = self.git_exclude_path.clone();
        let saved_contents = self.original_excludes.clone();
        let restore = async move {
            match saved_contents {
                Some(contents) => smol::fs::write(&exclude_path, contents).await,
                None => smol::fs::remove_file(&exclude_path).await,
            }
        };
        smol::spawn(restore).detach();
    }
}
pub trait GitRepository: Send + Sync {
fn reload_index(&self);
@ -1263,10 +1329,12 @@ impl GitRepository for RealGitRepository {
self.executor
.spawn(async move {
let working_directory = working_directory?;
let mut git = GitBinary::new(git_binary_path, working_directory, executor)
let mut git = GitBinary::new(git_binary_path, working_directory.clone(), executor)
.envs(checkpoint_author_envs());
git.with_temp_index(async |git| {
let head_sha = git.run(&["rev-parse", "HEAD"]).await.ok();
let mut excludes = exclude_files(git).await?;
git.run(&["add", "--all"]).await?;
let tree = git.run(&["write-tree"]).await?;
let checkpoint_sha = if let Some(head_sha) = head_sha.as_deref() {
@ -1276,6 +1344,8 @@ impl GitRepository for RealGitRepository {
git.run(&["commit-tree", &tree, "-m", "Checkpoint"]).await?
};
excludes.restore_original().await?;
Ok(GitRepositoryCheckpoint {
commit_sha: checkpoint_sha.parse()?,
})
@ -1294,7 +1364,7 @@ impl GitRepository for RealGitRepository {
.spawn(async move {
let working_directory = working_directory?;
let mut git = GitBinary::new(git_binary_path, working_directory, executor);
let git = GitBinary::new(git_binary_path, working_directory, executor);
git.run(&[
"restore",
"--source",
@ -1304,12 +1374,16 @@ impl GitRepository for RealGitRepository {
])
.await?;
git.with_temp_index(async move |git| {
git.run(&["read-tree", &checkpoint.commit_sha.to_string()])
.await?;
git.run(&["clean", "-d", "--force"]).await
})
.await?;
// TODO: We don't track binary and large files anymore,
// so the following call would delete them.
// Implement an alternative way to track files added by agent.
//
// git.with_temp_index(async move |git| {
// git.run(&["read-tree", &checkpoint.commit_sha.to_string()])
// .await?;
// git.run(&["clean", "-d", "--force"]).await
// })
// .await?;
Ok(())
})
@ -1400,6 +1474,44 @@ fn git_status_args(path_prefixes: &[RepoPath]) -> Vec<OsString> {
args
}
/// Temporarily git-ignore commonly ignored files and files over 2MB
async fn exclude_files(git: &GitBinary) -> Result<GitExcludeOverride> {
const MAX_SIZE: u64 = 2 * 1024 * 1024; // 2 MB
let mut excludes = git.with_exclude_overrides().await?;
excludes
.add_excludes(include_str!("./checkpoint.gitignore"))
.await?;
let working_directory = git.working_directory.clone();
let untracked_files = git.list_untracked_files().await?;
let excluded_paths = untracked_files.into_iter().map(|path| {
let working_directory = working_directory.clone();
smol::spawn(async move {
let full_path = working_directory.join(path.clone());
match smol::fs::metadata(&full_path).await {
Ok(metadata) if metadata.is_file() && metadata.len() >= MAX_SIZE => {
Some(PathBuf::from("/").join(path.clone()))
}
_ => None,
}
})
});
let excluded_paths = futures::future::join_all(excluded_paths).await;
let excluded_paths = excluded_paths.into_iter().flatten().collect::<Vec<_>>();
if !excluded_paths.is_empty() {
let exclude_patterns = excluded_paths
.into_iter()
.map(|path| path.to_string_lossy().to_string())
.collect::<Vec<_>>()
.join("\n");
excludes.add_excludes(&exclude_patterns).await?;
}
Ok(excludes)
}
struct GitBinary {
git_binary_path: PathBuf,
working_directory: PathBuf,
@ -1423,6 +1535,19 @@ impl GitBinary {
}
}
/// Returns the paths that `git status` reports as untracked.
///
/// Runs `git status` with NUL-separated porcelain v1 output and keeps only
/// the records that start with the `?? ` marker.
async fn list_untracked_files(&self) -> Result<Vec<PathBuf>> {
    const UNTRACKED_PREFIX: &str = "?? ";

    let porcelain = self
        .run(&["status", "--porcelain=v1", "--untracked-files=all", "-z"])
        .await?;

    let untracked = porcelain
        .split('\0')
        .filter_map(|record| record.strip_prefix(UNTRACKED_PREFIX))
        .map(PathBuf::from)
        .collect::<Vec<_>>();
    Ok(untracked)
}
fn envs(mut self, envs: HashMap<String, String>) -> Self {
self.envs = envs;
self
@ -1466,6 +1591,16 @@ impl GitBinary {
Ok(result)
}
/// Creates a `GitExcludeOverride` for `.git/info/exclude` under this
/// binary's working directory.
///
/// NOTE(review): this assumes `.git` is a directory inside the working
/// directory; confirm whether layouts where `.git` is a file need handling.
pub async fn with_exclude_overrides(&self) -> Result<GitExcludeOverride> {
    let mut exclude_path = self.working_directory.clone();
    exclude_path.extend([".git", "info", "exclude"]);
    GitExcludeOverride::new(exclude_path).await
}
fn path_for_index_id(&self, id: Uuid) -> PathBuf {
self.working_directory
.join(".git")
@ -1878,12 +2013,13 @@ mod tests {
.unwrap(),
"1"
);
assert_eq!(
smol::fs::read_to_string(repo_dir.path().join("new_file_after_checkpoint"))
.await
.ok(),
None
);
// See TODO above
// assert_eq!(
// smol::fs::read_to_string(repo_dir.path().join("new_file_after_checkpoint"))
// .await
// .ok(),
// None
// );
}
#[gpui::test]
@ -1916,12 +2052,13 @@ mod tests {
.unwrap(),
"foo"
);
assert_eq!(
smol::fs::read_to_string(repo_dir.path().join("baz"))
.await
.ok(),
None
);
// See TODOs above
// assert_eq!(
// smol::fs::read_to_string(repo_dir.path().join("baz"))
// .await
// .ok(),
// None
// );
}
#[gpui::test]
@ -1958,6 +2095,65 @@ mod tests {
);
}
/// Checks that a checkpoint restores a tracked text file but leaves a file
/// with an excluded binary extension (`.o`) untouched.
#[gpui::test]
async fn test_checkpoint_exclude_binary_files(cx: &mut TestAppContext) {
    const SOURCE_BEFORE: &str = "fn main() {}";
    const BINARY_AFTER: &str = "Modified binary file";

    cx.executor().allow_parking();

    // Fresh repository containing one source file and one object file.
    let repo_dir = tempfile::tempdir().unwrap();
    let root = repo_dir.path();
    let source_file = root.join("main.rs");
    let object_file = root.join("binary.o");

    git2::Repository::init(root).unwrap();

    smol::fs::write(&source_file, SOURCE_BEFORE).await.unwrap();
    smol::fs::write(&object_file, "some binary file here")
        .await
        .unwrap();

    let repo = RealGitRepository::new(&root.join(".git"), None, cx.executor()).unwrap();

    // Initial commit: only the source file is staged.
    let staged = vec![RepoPath::from_str("main.rs")];
    repo.stage_paths(staged, Arc::new(HashMap::default()))
        .await
        .unwrap();
    repo.commit(
        "Initial commit".into(),
        None,
        CommitOptions::default(),
        Arc::new(checkpoint_author_envs()),
    )
    .await
    .unwrap();

    let checkpoint = repo.checkpoint().await.unwrap();

    // Change both files after the checkpoint, then roll back.
    smol::fs::write(&source_file, "fn main() { println!(\"Modified\"); }")
        .await
        .unwrap();
    smol::fs::write(&object_file, BINARY_AFTER).await.unwrap();

    repo.restore_checkpoint(checkpoint).await.unwrap();

    // The text file returns to its checkpoint state; the binary keeps its
    // modification because it is not tracked by checkpoints.
    let restored_source = smol::fs::read_to_string(&source_file).await.unwrap();
    assert_eq!(restored_source, SOURCE_BEFORE);

    let binary_now = smol::fs::read_to_string(&object_file).await.unwrap();
    assert_eq!(binary_now, BINARY_AFTER);
}
#[test]
fn test_branches_parsing() {
// suppress "help: octal escapes are not supported, `\0` is always null"