diff --git a/.github/workflows/unit_evals.yml b/.github/workflows/unit_evals.yml index e8514a6edb..e033ba40ce 100644 --- a/.github/workflows/unit_evals.yml +++ b/.github/workflows/unit_evals.yml @@ -66,7 +66,7 @@ jobs: env: ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} - - name: Send the pull request link into the Slack channel + - name: Send failure message to Slack channel if needed if: ${{ failure() }} uses: slackapi/slack-github-action@b0fa283ad8fea605de13dc3f449259339835fc52 with: diff --git a/crates/assistant_tools/src/edit_agent/evals.rs b/crates/assistant_tools/src/edit_agent/evals.rs index 63d0c7eace..f07edff09e 100644 --- a/crates/assistant_tools/src/edit_agent/evals.rs +++ b/crates/assistant_tools/src/edit_agent/evals.rs @@ -58,6 +58,7 @@ fn eval_extract_handle_command_output() { eval( 100, 0.7, // Taking the lower bar for Gemini + 0.05, EvalInput::from_conversation( vec![ message( @@ -116,6 +117,7 @@ fn eval_delete_run_git_blame() { eval( 100, 0.95, + 0.05, EvalInput::from_conversation( vec![ message( @@ -178,6 +180,7 @@ fn eval_translate_doc_comments() { eval( 200, 1., + 0.05, EvalInput::from_conversation( vec![ message( @@ -241,6 +244,7 @@ fn eval_use_wasi_sdk_in_compile_parser_to_wasm() { eval( 100, 0.95, + 0.05, EvalInput::from_conversation( vec![ message( @@ -365,6 +369,7 @@ fn eval_disable_cursor_blinking() { eval( 100, 0.95, + 0.05, EvalInput::from_conversation( vec![ message(User, [text("Let's research how to cursor blinking works.")]), @@ -448,6 +453,9 @@ fn eval_from_pixels_constructor() { eval( 100, 0.95, + // For whatever reason, this eval produces more mismatched tags. + // Increasing for now, let's see if we can bring this down. + 0.2, EvalInput::from_conversation( vec![ message( @@ -648,6 +656,7 @@ fn eval_zode() { eval( 50, 1., + 0.05, EvalInput::from_conversation( vec![ message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]), @@ -754,6 +763,7 @@ fn eval_add_overwrite_test() { eval( 200, 0.5, // TODO: make this eval better + 0.05, EvalInput::from_conversation( vec![ message( @@ -993,6 +1003,7 @@ fn eval_create_empty_file() { eval( 100, 0.99, + 0.05, EvalInput::from_conversation( vec![ message(User, [text("Create a second empty todo file ")]), @@ -1279,7 +1290,12 @@ impl EvalAssertion { } } -fn eval(iterations: usize, expected_pass_ratio: f32, mut eval: EvalInput) { +fn eval( + iterations: usize, + expected_pass_ratio: f32, + mismatched_tag_threshold: f32, + mut eval: EvalInput, +) { let mut evaluated_count = 0; let mut failed_count = 0; report_progress(evaluated_count, failed_count, iterations); @@ -1351,7 +1367,7 @@ fn eval(iterations: usize, expected_pass_ratio: f32, mut eval: EvalInput) { let mismatched_tag_ratio = cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32; - if mismatched_tag_ratio > 0.10 { + if mismatched_tag_ratio > mismatched_tag_threshold { for eval_output in eval_outputs { println!("{}", eval_output); }