From dda614091a99cb46433e8692bf328badf95e7dab Mon Sep 17 00:00:00 2001
From: Ben Brandt <benjamin.j.brandt@gmail.com>
Date: Thu, 5 Jun 2025 15:16:27 +0200
Subject: [PATCH] eval: Add eval unit tests as a CI job (#32152)

We run the unit evals once a day in the middle of the night, and trigger
a Slack post if the run fails.


Release Notes:

- N/A

---------

Co-authored-by: Oleksiy Syvokon <oleksiy.syvokon@gmail.com>
---
 .github/workflows/unit_evals.yml              | 85 +++++++++++++++++++
 .../assistant_tools/src/edit_agent/evals.rs   |  2 +-
 2 files changed, 86 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/unit_evals.yml

diff --git a/.github/workflows/unit_evals.yml b/.github/workflows/unit_evals.yml
new file mode 100644
index 0000000000..e8514a6edb
--- /dev/null
+++ b/.github/workflows/unit_evals.yml
@@ -0,0 +1,85 @@
+name: Run Unit Evals
+
+on:
+  schedule:
+    # GitHub might drop scheduled jobs at busy times, so we pick an arbitrary off-peak time in the middle of the night (01:47 UTC).
+    - cron: "47 1 * * *"
+  workflow_dispatch:
+
+concurrency:
+  # Allow only one in-progress run of this workflow per non-`main` branch; runs on `main` are grouped per commit SHA.
+  group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
+  cancel-in-progress: true
+
+env:
+  CARGO_TERM_COLOR: always
+  CARGO_INCREMENTAL: 0
+  RUST_BACKTRACE: 1
+  ZED_CLIENT_CHECKSUM_SEED: ${{ secrets.ZED_CLIENT_CHECKSUM_SEED }}
+
+jobs:
+  unit_evals:
+    timeout-minutes: 60
+    name: Run unit evals
+    runs-on:
+      - buildjet-16vcpu-ubuntu-2204
+    steps:
+      - name: Add Rust to the PATH
+        run: echo "$HOME/.cargo/bin" >> $GITHUB_PATH
+
+      - name: Checkout repo
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4
+        with:
+          clean: false
+
+      - name: Cache dependencies
+        uses: swatinem/rust-cache@9d47c6ad4b02e050fd481d890b2ea34778fd09d6 # v2
+        with:
+          save-if: ${{ github.ref == 'refs/heads/main' }}
+          cache-provider: "buildjet"
+
+      - name: Install Linux dependencies
+        run: ./script/linux
+
+      - name: Configure CI
+        run: |
+          mkdir -p ./../.cargo
+          cp ./.cargo/ci-config.toml ./../.cargo/config.toml
+
+      - name: Install cargo-nextest
+        shell: bash -euxo pipefail {0}
+        run: |
+          cargo install cargo-nextest --locked
+
+      - name: Install Node
+        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4
+        with:
+          node-version: "18"
+
+      - name: Limit target directory size
+        shell: bash -euxo pipefail {0}
+        run: script/clear-target-dir-if-larger-than 100
+
+      - name: Run unit evals
+        shell: bash -euxo pipefail {0}
+        run: cargo nextest run --workspace --no-fail-fast --features eval --no-capture -E 'test(::eval_)' --test-threads 1
+        env:
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+
+      - name: Send the failed run link into the Slack channel
+        if: ${{ failure() }}
+        uses: slackapi/slack-github-action@b0fa283ad8fea605de13dc3f449259339835fc52
+        with:
+          method: chat.postMessage
+          token: ${{ secrets.SLACK_APP_ZED_UNIT_EVALS_BOT_TOKEN }}
+          payload: |
+            channel: C04UDRNNJFQ
+            text: "Unit Evals Failed: https://github.com/zed-industries/zed/actions/runs/${{ github.run_id }}"
+
+      # The Linux runner is not stateful, so in theory there is no need to do this cleanup.
+      # But, to avoid potential issues in the future if we choose to use a stateful Linux runner and forget to add code
+      # to clean up the config file, the cleanup code is included here as a precaution.
+      # While it’s not strictly necessary at this moment, it is better to err on the side of caution.
+      - name: Clean CI config file
+        if: always()
+        run: rm -rf ./../.cargo
diff --git a/crates/assistant_tools/src/edit_agent/evals.rs b/crates/assistant_tools/src/edit_agent/evals.rs
index 5856dd83db..1ea3a4dbc8 100644
--- a/crates/assistant_tools/src/edit_agent/evals.rs
+++ b/crates/assistant_tools/src/edit_agent/evals.rs
@@ -1351,7 +1351,7 @@ fn eval(iterations: usize, expected_pass_ratio: f32, mut eval: EvalInput) {
 
     let mismatched_tag_ratio =
         cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
-    if mismatched_tag_ratio > 0.05 {
+    if mismatched_tag_ratio > 0.10 {
         for eval_output in eval_outputs {
             println!("{}", eval_output);
         }