From dda614091a99cb46433e8692bf328badf95e7dab Mon Sep 17 00:00:00 2001 From: Ben Brandt Date: Thu, 5 Jun 2025 15:16:27 +0200 Subject: [PATCH] eval: Add eval unit tests as a CI job (#32152) We run the unit evals once a day in the middle of the night, and trigger a Slack post if it fails. Release Notes: - N/A --------- Co-authored-by: Oleksiy Syvokon --- .github/workflows/unit_evals.yml | 85 +++++++++++++++++++ .../assistant_tools/src/edit_agent/evals.rs | 2 +- 2 files changed, 86 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/unit_evals.yml diff --git a/.github/workflows/unit_evals.yml b/.github/workflows/unit_evals.yml new file mode 100644 index 0000000000..e8514a6edb --- /dev/null +++ b/.github/workflows/unit_evals.yml @@ -0,0 +1,85 @@ +name: Run Unit Evals + +on: + schedule: + # GitHub might drop jobs at busy times, so we choose a random time in the middle of the night. + - cron: "47 1 * * *" + workflow_dispatch: + +concurrency: + # Allow only one workflow per any non-`main` branch. + group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} + cancel-in-progress: true + +env: + CARGO_TERM_COLOR: always + CARGO_INCREMENTAL: 0 + RUST_BACKTRACE: 1 + ZED_CLIENT_CHECKSUM_SEED: ${{ secrets.ZED_CLIENT_CHECKSUM_SEED }} + +jobs: + unit_evals: + timeout-minutes: 60 + name: Run unit evals + runs-on: + - buildjet-16vcpu-ubuntu-2204 + steps: + - name: Add Rust to the PATH + run: echo "$HOME/.cargo/bin" >> $GITHUB_PATH + + - name: Checkout repo + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 + with: + clean: false + + - name: Cache dependencies + uses: swatinem/rust-cache@9d47c6ad4b02e050fd481d890b2ea34778fd09d6 # v2 + with: + save-if: ${{ github.ref == 'refs/heads/main' }} + cache-provider: "buildjet" + + - name: Install Linux dependencies + run: ./script/linux + + - name: Configure CI + run: | + mkdir -p ./../.cargo + cp ./.cargo/ci-config.toml ./../.cargo/config.toml + + - name: Install Rust + shell: bash -euxo pipefail {0} + run: | + cargo install cargo-nextest --locked + + - name: Install Node + uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4 + with: + node-version: "18" + + - name: Limit target directory size + shell: bash -euxo pipefail {0} + run: script/clear-target-dir-if-larger-than 100 + + - name: Run unit evals + shell: bash -euxo pipefail {0} + run: cargo nextest run --workspace --no-fail-fast --features eval --no-capture -E 'test(::eval_)' --test-threads 1 + env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + + - name: Send the pull request link into the Slack channel + if: ${{ failure() }} + uses: slackapi/slack-github-action@b0fa283ad8fea605de13dc3f449259339835fc52 + with: + method: chat.postMessage + token: ${{ secrets.SLACK_APP_ZED_UNIT_EVALS_BOT_TOKEN }} + payload: | + channel: C04UDRNNJFQ + text: "Unit Evals Failed: https://github.com/zed-industries/zed/actions/runs/${{ github.run_id }}" + + # Even the Linux runner is not stateful, in theory there is no need to do this cleanup. + # But, to avoid potential issues in the future if we choose to use a stateful Linux runner and forget to add code + # to clean up the config file, I’ve included the cleanup code here as a precaution. + # While it’s not strictly necessary at this moment, I believe it’s better to err on the side of caution. + - name: Clean CI config file + if: always() + run: rm -rf ./../.cargo diff --git a/crates/assistant_tools/src/edit_agent/evals.rs b/crates/assistant_tools/src/edit_agent/evals.rs index 5856dd83db..1ea3a4dbc8 100644 --- a/crates/assistant_tools/src/edit_agent/evals.rs +++ b/crates/assistant_tools/src/edit_agent/evals.rs @@ -1351,7 +1351,7 @@ fn eval(iterations: usize, expected_pass_ratio: f32, mut eval: EvalInput) { let mismatched_tag_ratio = cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32; - if mismatched_tag_ratio > 0.05 { + if mismatched_tag_ratio > 0.10 { for eval_output in eval_outputs { println!("{}", eval_output); }