agent: Don't track large and common binary files (#31352)
## Issue

The agent may run very slowly on projects that contain many or large binary files not listed in `.gitignore`.

## Solution

Temporarily rewrite `.git/info/exclude` to ignore:

- Common binary files based on the extension
- Files larger than 2 MB

## Benchmark

I measured the time between sending an agent message in the UI ("hitting Enter") and actually sending it to an LLM. Ideally, it should be instant.

Numbers for a 7.7 GB Rust project with no `.gitignore`.

Filter                            | Time
----------------------------------|-------
No filter (= before this change)  | 62 s
Exclude common file types only    | 1.46 s
Exclude files >2MB only           | 1.16 s
Exclude both                      | 0.10 s

## Planned changes:

- [x] Exclude common binary file types
- [x] Exclude large files
- [ ] Track files added by agent so we could delete them (we can't rely on git for that anymore)
- [ ] Don't block on waiting for a checkpoint to complete until we really need it
- [ ] Only `git add` files that are about to change

Closes #ISSUE

Release Notes:

- Improved agent latency on repositories containing many files or large files
This commit is contained in:
parent
134463f043
commit
9da9ef860b
2 changed files with 307 additions and 20 deletions
91
crates/git/src/checkpoint.gitignore
Normal file
91
crates/git/src/checkpoint.gitignore
Normal file
|
@ -0,0 +1,91 @@
|
||||||
|
# This lists files that we don't track in checkpoints
|
||||||
|
|
||||||
|
# Compiled source and executables
|
||||||
|
*.exe
|
||||||
|
*.dll
|
||||||
|
*.so
|
||||||
|
*.dylib
|
||||||
|
*.a
|
||||||
|
*.lib
|
||||||
|
*.o
|
||||||
|
*.obj
|
||||||
|
*.elf
|
||||||
|
*.out
|
||||||
|
*.app
|
||||||
|
*.deb
|
||||||
|
*.rpm
|
||||||
|
*.dmg
|
||||||
|
*.pkg
|
||||||
|
*.msi
|
||||||
|
|
||||||
|
# Archives and compressed files
|
||||||
|
*.7z
|
||||||
|
*.zip
|
||||||
|
*.tar
|
||||||
|
*.tar.gz
|
||||||
|
*.tgz
|
||||||
|
*.tar.bz2
|
||||||
|
*.tbz2
|
||||||
|
*.tar.xz
|
||||||
|
*.txz
|
||||||
|
*.rar
|
||||||
|
*.jar
|
||||||
|
*.war
|
||||||
|
*.ear
|
||||||
|
|
||||||
|
# Media files
|
||||||
|
*.jpg
|
||||||
|
*.jpeg
|
||||||
|
*.png
|
||||||
|
*.gif
|
||||||
|
*.ico
|
||||||
|
*.svg
|
||||||
|
*.webp
|
||||||
|
*.bmp
|
||||||
|
*.tiff
|
||||||
|
*.mp3
|
||||||
|
*.mp4
|
||||||
|
*.avi
|
||||||
|
*.mov
|
||||||
|
*.wmv
|
||||||
|
*.flv
|
||||||
|
*.mkv
|
||||||
|
*.webm
|
||||||
|
*.wav
|
||||||
|
*.flac
|
||||||
|
*.aac
|
||||||
|
|
||||||
|
# Database files
|
||||||
|
*.db
|
||||||
|
*.sqlite
|
||||||
|
*.sqlite3
|
||||||
|
*.mdb
|
||||||
|
|
||||||
|
# Documents (often binary)
|
||||||
|
*.pdf
|
||||||
|
*.doc
|
||||||
|
*.docx
|
||||||
|
*.xls
|
||||||
|
*.xlsx
|
||||||
|
*.ppt
|
||||||
|
*.pptx
|
||||||
|
|
||||||
|
# IDE and editor files
|
||||||
|
.idea/
|
||||||
|
.vscode/
|
||||||
|
*.swp
|
||||||
|
*.swo
|
||||||
|
*~
|
||||||
|
.DS_Store
|
||||||
|
Thumbs.db
|
||||||
|
|
||||||
|
# Language-specific files
|
||||||
|
*.rlib
|
||||||
|
*.rmeta
|
||||||
|
*.pdb
|
||||||
|
*.class
|
||||||
|
*.egg
|
||||||
|
*.egg-info/
|
||||||
|
*.pyc
|
||||||
|
*.pto
|
||||||
|
__pycache__
|
|
@ -193,6 +193,72 @@ pub enum ResetMode {
|
||||||
Mixed,
|
Mixed,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Temporarily modifies a repository's `.git/info/exclude` file.
///
/// The value remembers what the exclude file contained when it was created so
/// that the temporary additions can be removed again afterwards.
#[derive(Debug)]
pub struct GitExcludeOverride {
    /// Location of the exclude file being overridden.
    git_exclude_path: PathBuf,
    /// Contents of the exclude file at construction time, or `None` when the
    /// file could not be read (for example because it did not exist).
    original_excludes: Option<String>,
    /// Patterns appended so far, newline-separated; `None` while the file on
    /// disk has not been modified (or has already been restored).
    added_excludes: Option<String>,
}
|
||||||
|
|
||||||
|
impl GitExcludeOverride {
|
||||||
|
pub async fn new(git_exclude_path: PathBuf) -> Result<Self> {
|
||||||
|
let original_excludes = smol::fs::read_to_string(&git_exclude_path).await.ok();
|
||||||
|
|
||||||
|
Ok(GitExcludeOverride {
|
||||||
|
git_exclude_path,
|
||||||
|
original_excludes,
|
||||||
|
added_excludes: None,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn add_excludes(&mut self, excludes: &str) -> Result<()> {
|
||||||
|
self.added_excludes = Some(if let Some(ref already_added) = self.added_excludes {
|
||||||
|
format!("{already_added}\n{excludes}")
|
||||||
|
} else {
|
||||||
|
excludes.to_string()
|
||||||
|
});
|
||||||
|
|
||||||
|
let mut content = self.original_excludes.clone().unwrap_or_default();
|
||||||
|
content.push_str("\n\n# ====== Auto-added by Zed: =======\n");
|
||||||
|
content.push_str(self.added_excludes.as_ref().unwrap());
|
||||||
|
content.push('\n');
|
||||||
|
|
||||||
|
smol::fs::write(&self.git_exclude_path, content).await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn restore_original(&mut self) -> Result<()> {
|
||||||
|
if let Some(ref original) = self.original_excludes {
|
||||||
|
smol::fs::write(&self.git_exclude_path, original).await?;
|
||||||
|
} else {
|
||||||
|
if self.git_exclude_path.exists() {
|
||||||
|
smol::fs::remove_file(&self.git_exclude_path).await?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
self.added_excludes = None;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Drop for GitExcludeOverride {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
if self.added_excludes.is_some() {
|
||||||
|
let git_exclude_path = self.git_exclude_path.clone();
|
||||||
|
let original_excludes = self.original_excludes.clone();
|
||||||
|
smol::spawn(async move {
|
||||||
|
if let Some(original) = original_excludes {
|
||||||
|
smol::fs::write(&git_exclude_path, original).await
|
||||||
|
} else {
|
||||||
|
smol::fs::remove_file(&git_exclude_path).await
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.detach();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub trait GitRepository: Send + Sync {
|
pub trait GitRepository: Send + Sync {
|
||||||
fn reload_index(&self);
|
fn reload_index(&self);
|
||||||
|
|
||||||
|
@ -1263,10 +1329,12 @@ impl GitRepository for RealGitRepository {
|
||||||
self.executor
|
self.executor
|
||||||
.spawn(async move {
|
.spawn(async move {
|
||||||
let working_directory = working_directory?;
|
let working_directory = working_directory?;
|
||||||
let mut git = GitBinary::new(git_binary_path, working_directory, executor)
|
let mut git = GitBinary::new(git_binary_path, working_directory.clone(), executor)
|
||||||
.envs(checkpoint_author_envs());
|
.envs(checkpoint_author_envs());
|
||||||
git.with_temp_index(async |git| {
|
git.with_temp_index(async |git| {
|
||||||
let head_sha = git.run(&["rev-parse", "HEAD"]).await.ok();
|
let head_sha = git.run(&["rev-parse", "HEAD"]).await.ok();
|
||||||
|
let mut excludes = exclude_files(git).await?;
|
||||||
|
|
||||||
git.run(&["add", "--all"]).await?;
|
git.run(&["add", "--all"]).await?;
|
||||||
let tree = git.run(&["write-tree"]).await?;
|
let tree = git.run(&["write-tree"]).await?;
|
||||||
let checkpoint_sha = if let Some(head_sha) = head_sha.as_deref() {
|
let checkpoint_sha = if let Some(head_sha) = head_sha.as_deref() {
|
||||||
|
@ -1276,6 +1344,8 @@ impl GitRepository for RealGitRepository {
|
||||||
git.run(&["commit-tree", &tree, "-m", "Checkpoint"]).await?
|
git.run(&["commit-tree", &tree, "-m", "Checkpoint"]).await?
|
||||||
};
|
};
|
||||||
|
|
||||||
|
excludes.restore_original().await?;
|
||||||
|
|
||||||
Ok(GitRepositoryCheckpoint {
|
Ok(GitRepositoryCheckpoint {
|
||||||
commit_sha: checkpoint_sha.parse()?,
|
commit_sha: checkpoint_sha.parse()?,
|
||||||
})
|
})
|
||||||
|
@ -1294,7 +1364,7 @@ impl GitRepository for RealGitRepository {
|
||||||
.spawn(async move {
|
.spawn(async move {
|
||||||
let working_directory = working_directory?;
|
let working_directory = working_directory?;
|
||||||
|
|
||||||
let mut git = GitBinary::new(git_binary_path, working_directory, executor);
|
let git = GitBinary::new(git_binary_path, working_directory, executor);
|
||||||
git.run(&[
|
git.run(&[
|
||||||
"restore",
|
"restore",
|
||||||
"--source",
|
"--source",
|
||||||
|
@ -1304,12 +1374,16 @@ impl GitRepository for RealGitRepository {
|
||||||
])
|
])
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
git.with_temp_index(async move |git| {
|
// TODO: We don't track binary and large files anymore,
|
||||||
git.run(&["read-tree", &checkpoint.commit_sha.to_string()])
|
// so the following call would delete them.
|
||||||
.await?;
|
// Implement an alternative way to track files added by agent.
|
||||||
git.run(&["clean", "-d", "--force"]).await
|
//
|
||||||
})
|
// git.with_temp_index(async move |git| {
|
||||||
.await?;
|
// git.run(&["read-tree", &checkpoint.commit_sha.to_string()])
|
||||||
|
// .await?;
|
||||||
|
// git.run(&["clean", "-d", "--force"]).await
|
||||||
|
// })
|
||||||
|
// .await?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
})
|
})
|
||||||
|
@ -1400,6 +1474,44 @@ fn git_status_args(path_prefixes: &[RepoPath]) -> Vec<OsString> {
|
||||||
args
|
args
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Temporarily git-ignore commonly ignored files and files over 2MB
|
||||||
|
async fn exclude_files(git: &GitBinary) -> Result<GitExcludeOverride> {
|
||||||
|
const MAX_SIZE: u64 = 2 * 1024 * 1024; // 2 MB
|
||||||
|
let mut excludes = git.with_exclude_overrides().await?;
|
||||||
|
excludes
|
||||||
|
.add_excludes(include_str!("./checkpoint.gitignore"))
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let working_directory = git.working_directory.clone();
|
||||||
|
let untracked_files = git.list_untracked_files().await?;
|
||||||
|
let excluded_paths = untracked_files.into_iter().map(|path| {
|
||||||
|
let working_directory = working_directory.clone();
|
||||||
|
smol::spawn(async move {
|
||||||
|
let full_path = working_directory.join(path.clone());
|
||||||
|
match smol::fs::metadata(&full_path).await {
|
||||||
|
Ok(metadata) if metadata.is_file() && metadata.len() >= MAX_SIZE => {
|
||||||
|
Some(PathBuf::from("/").join(path.clone()))
|
||||||
|
}
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
})
|
||||||
|
});
|
||||||
|
|
||||||
|
let excluded_paths = futures::future::join_all(excluded_paths).await;
|
||||||
|
let excluded_paths = excluded_paths.into_iter().flatten().collect::<Vec<_>>();
|
||||||
|
|
||||||
|
if !excluded_paths.is_empty() {
|
||||||
|
let exclude_patterns = excluded_paths
|
||||||
|
.into_iter()
|
||||||
|
.map(|path| path.to_string_lossy().to_string())
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
.join("\n");
|
||||||
|
excludes.add_excludes(&exclude_patterns).await?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(excludes)
|
||||||
|
}
|
||||||
|
|
||||||
struct GitBinary {
|
struct GitBinary {
|
||||||
git_binary_path: PathBuf,
|
git_binary_path: PathBuf,
|
||||||
working_directory: PathBuf,
|
working_directory: PathBuf,
|
||||||
|
@ -1423,6 +1535,19 @@ impl GitBinary {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Runs `git status` in NUL-separated porcelain v1 format and returns the
/// path of every entry marked as untracked (`??`).
async fn list_untracked_files(&self) -> Result<Vec<PathBuf>> {
    // Status code plus separator that precedes the path of an untracked entry.
    const UNTRACKED_PREFIX: &str = "?? ";

    let porcelain = self
        .run(&["status", "--porcelain=v1", "--untracked-files=all", "-z"])
        .await?;

    let mut untracked = Vec::new();
    for record in porcelain.split('\0') {
        if let Some(path) = record.strip_prefix(UNTRACKED_PREFIX) {
            untracked.push(PathBuf::from(path));
        }
    }

    Ok(untracked)
}
|
||||||
|
|
||||||
fn envs(mut self, envs: HashMap<String, String>) -> Self {
|
fn envs(mut self, envs: HashMap<String, String>) -> Self {
|
||||||
self.envs = envs;
|
self.envs = envs;
|
||||||
self
|
self
|
||||||
|
@ -1466,6 +1591,16 @@ impl GitBinary {
|
||||||
Ok(result)
|
Ok(result)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub async fn with_exclude_overrides(&self) -> Result<GitExcludeOverride> {
|
||||||
|
let path = self
|
||||||
|
.working_directory
|
||||||
|
.join(".git")
|
||||||
|
.join("info")
|
||||||
|
.join("exclude");
|
||||||
|
|
||||||
|
GitExcludeOverride::new(path).await
|
||||||
|
}
|
||||||
|
|
||||||
fn path_for_index_id(&self, id: Uuid) -> PathBuf {
|
fn path_for_index_id(&self, id: Uuid) -> PathBuf {
|
||||||
self.working_directory
|
self.working_directory
|
||||||
.join(".git")
|
.join(".git")
|
||||||
|
@ -1878,12 +2013,13 @@ mod tests {
|
||||||
.unwrap(),
|
.unwrap(),
|
||||||
"1"
|
"1"
|
||||||
);
|
);
|
||||||
assert_eq!(
|
// See TODO above
|
||||||
smol::fs::read_to_string(repo_dir.path().join("new_file_after_checkpoint"))
|
// assert_eq!(
|
||||||
.await
|
// smol::fs::read_to_string(repo_dir.path().join("new_file_after_checkpoint"))
|
||||||
.ok(),
|
// .await
|
||||||
None
|
// .ok(),
|
||||||
);
|
// None
|
||||||
|
// );
|
||||||
}
|
}
|
||||||
|
|
||||||
#[gpui::test]
|
#[gpui::test]
|
||||||
|
@ -1916,12 +2052,13 @@ mod tests {
|
||||||
.unwrap(),
|
.unwrap(),
|
||||||
"foo"
|
"foo"
|
||||||
);
|
);
|
||||||
assert_eq!(
|
// See TODOs above
|
||||||
smol::fs::read_to_string(repo_dir.path().join("baz"))
|
// assert_eq!(
|
||||||
.await
|
// smol::fs::read_to_string(repo_dir.path().join("baz"))
|
||||||
.ok(),
|
// .await
|
||||||
None
|
// .ok(),
|
||||||
);
|
// None
|
||||||
|
// );
|
||||||
}
|
}
|
||||||
|
|
||||||
#[gpui::test]
|
#[gpui::test]
|
||||||
|
@ -1958,6 +2095,65 @@ mod tests {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[gpui::test]
|
||||||
|
async fn test_checkpoint_exclude_binary_files(cx: &mut TestAppContext) {
|
||||||
|
cx.executor().allow_parking();
|
||||||
|
|
||||||
|
let repo_dir = tempfile::tempdir().unwrap();
|
||||||
|
let text_path = repo_dir.path().join("main.rs");
|
||||||
|
let bin_path = repo_dir.path().join("binary.o");
|
||||||
|
|
||||||
|
git2::Repository::init(repo_dir.path()).unwrap();
|
||||||
|
|
||||||
|
smol::fs::write(&text_path, "fn main() {}").await.unwrap();
|
||||||
|
|
||||||
|
smol::fs::write(&bin_path, "some binary file here")
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let repo =
|
||||||
|
RealGitRepository::new(&repo_dir.path().join(".git"), None, cx.executor()).unwrap();
|
||||||
|
|
||||||
|
// initial commit
|
||||||
|
repo.stage_paths(
|
||||||
|
vec![RepoPath::from_str("main.rs")],
|
||||||
|
Arc::new(HashMap::default()),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
repo.commit(
|
||||||
|
"Initial commit".into(),
|
||||||
|
None,
|
||||||
|
CommitOptions::default(),
|
||||||
|
Arc::new(checkpoint_author_envs()),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let checkpoint = repo.checkpoint().await.unwrap();
|
||||||
|
|
||||||
|
smol::fs::write(&text_path, "fn main() { println!(\"Modified\"); }")
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
smol::fs::write(&bin_path, "Modified binary file")
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
repo.restore_checkpoint(checkpoint).await.unwrap();
|
||||||
|
|
||||||
|
// Text files should be restored to checkpoint state,
|
||||||
|
// but binaries should not (they aren't tracked)
|
||||||
|
assert_eq!(
|
||||||
|
smol::fs::read_to_string(&text_path).await.unwrap(),
|
||||||
|
"fn main() {}"
|
||||||
|
);
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
smol::fs::read_to_string(&bin_path).await.unwrap(),
|
||||||
|
"Modified binary file"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_branches_parsing() {
|
fn test_branches_parsing() {
|
||||||
// suppress "help: octal escapes are not supported, `\0` is always null"
|
// suppress "help: octal escapes are not supported, `\0` is always null"
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue