Add mismatched tag threshold parameter to eval function (#32190)
Replace the hardcoded 0.10 mismatched-tag threshold with a configurable parameter. The threshold is set to 0.05 for most evals, and to 0.2 for the from_pixels_constructor eval, which produces more mismatched tags.

Release Notes:

- N/A
This commit is contained in:
parent
8bd8435887
commit
ddf70b3bb8
2 changed files with 19 additions and 3 deletions
2
.github/workflows/unit_evals.yml
vendored
2
.github/workflows/unit_evals.yml
vendored
|
@ -66,7 +66,7 @@ jobs:
|
||||||
env:
|
env:
|
||||||
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
||||||
|
|
||||||
- name: Send the pull request link into the Slack channel
|
- name: Send failure message to Slack channel if needed
|
||||||
if: ${{ failure() }}
|
if: ${{ failure() }}
|
||||||
uses: slackapi/slack-github-action@b0fa283ad8fea605de13dc3f449259339835fc52
|
uses: slackapi/slack-github-action@b0fa283ad8fea605de13dc3f449259339835fc52
|
||||||
with:
|
with:
|
||||||
|
|
|
@ -58,6 +58,7 @@ fn eval_extract_handle_command_output() {
|
||||||
eval(
|
eval(
|
||||||
100,
|
100,
|
||||||
0.7, // Taking the lower bar for Gemini
|
0.7, // Taking the lower bar for Gemini
|
||||||
|
0.05,
|
||||||
EvalInput::from_conversation(
|
EvalInput::from_conversation(
|
||||||
vec![
|
vec![
|
||||||
message(
|
message(
|
||||||
|
@ -116,6 +117,7 @@ fn eval_delete_run_git_blame() {
|
||||||
eval(
|
eval(
|
||||||
100,
|
100,
|
||||||
0.95,
|
0.95,
|
||||||
|
0.05,
|
||||||
EvalInput::from_conversation(
|
EvalInput::from_conversation(
|
||||||
vec![
|
vec![
|
||||||
message(
|
message(
|
||||||
|
@ -178,6 +180,7 @@ fn eval_translate_doc_comments() {
|
||||||
eval(
|
eval(
|
||||||
200,
|
200,
|
||||||
1.,
|
1.,
|
||||||
|
0.05,
|
||||||
EvalInput::from_conversation(
|
EvalInput::from_conversation(
|
||||||
vec![
|
vec![
|
||||||
message(
|
message(
|
||||||
|
@ -241,6 +244,7 @@ fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
|
||||||
eval(
|
eval(
|
||||||
100,
|
100,
|
||||||
0.95,
|
0.95,
|
||||||
|
0.05,
|
||||||
EvalInput::from_conversation(
|
EvalInput::from_conversation(
|
||||||
vec![
|
vec![
|
||||||
message(
|
message(
|
||||||
|
@ -365,6 +369,7 @@ fn eval_disable_cursor_blinking() {
|
||||||
eval(
|
eval(
|
||||||
100,
|
100,
|
||||||
0.95,
|
0.95,
|
||||||
|
0.05,
|
||||||
EvalInput::from_conversation(
|
EvalInput::from_conversation(
|
||||||
vec![
|
vec![
|
||||||
message(User, [text("Let's research how to cursor blinking works.")]),
|
message(User, [text("Let's research how to cursor blinking works.")]),
|
||||||
|
@ -448,6 +453,9 @@ fn eval_from_pixels_constructor() {
|
||||||
eval(
|
eval(
|
||||||
100,
|
100,
|
||||||
0.95,
|
0.95,
|
||||||
|
// For whatever reason, this eval produces more mismatched tags.
|
||||||
|
// Increasing for now, let's see if we can bring this down.
|
||||||
|
0.2,
|
||||||
EvalInput::from_conversation(
|
EvalInput::from_conversation(
|
||||||
vec![
|
vec![
|
||||||
message(
|
message(
|
||||||
|
@ -648,6 +656,7 @@ fn eval_zode() {
|
||||||
eval(
|
eval(
|
||||||
50,
|
50,
|
||||||
1.,
|
1.,
|
||||||
|
0.05,
|
||||||
EvalInput::from_conversation(
|
EvalInput::from_conversation(
|
||||||
vec![
|
vec![
|
||||||
message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
|
message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
|
||||||
|
@ -754,6 +763,7 @@ fn eval_add_overwrite_test() {
|
||||||
eval(
|
eval(
|
||||||
200,
|
200,
|
||||||
0.5, // TODO: make this eval better
|
0.5, // TODO: make this eval better
|
||||||
|
0.05,
|
||||||
EvalInput::from_conversation(
|
EvalInput::from_conversation(
|
||||||
vec![
|
vec![
|
||||||
message(
|
message(
|
||||||
|
@ -993,6 +1003,7 @@ fn eval_create_empty_file() {
|
||||||
eval(
|
eval(
|
||||||
100,
|
100,
|
||||||
0.99,
|
0.99,
|
||||||
|
0.05,
|
||||||
EvalInput::from_conversation(
|
EvalInput::from_conversation(
|
||||||
vec![
|
vec![
|
||||||
message(User, [text("Create a second empty todo file ")]),
|
message(User, [text("Create a second empty todo file ")]),
|
||||||
|
@ -1279,7 +1290,12 @@ impl EvalAssertion {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn eval(iterations: usize, expected_pass_ratio: f32, mut eval: EvalInput) {
|
fn eval(
|
||||||
|
iterations: usize,
|
||||||
|
expected_pass_ratio: f32,
|
||||||
|
mismatched_tag_threshold: f32,
|
||||||
|
mut eval: EvalInput,
|
||||||
|
) {
|
||||||
let mut evaluated_count = 0;
|
let mut evaluated_count = 0;
|
||||||
let mut failed_count = 0;
|
let mut failed_count = 0;
|
||||||
report_progress(evaluated_count, failed_count, iterations);
|
report_progress(evaluated_count, failed_count, iterations);
|
||||||
|
@ -1351,7 +1367,7 @@ fn eval(iterations: usize, expected_pass_ratio: f32, mut eval: EvalInput) {
|
||||||
|
|
||||||
let mismatched_tag_ratio =
|
let mismatched_tag_ratio =
|
||||||
cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
|
cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
|
||||||
if mismatched_tag_ratio > 0.10 {
|
if mismatched_tag_ratio > mismatched_tag_threshold {
|
||||||
for eval_output in eval_outputs {
|
for eval_output in eval_outputs {
|
||||||
println!("{}", eval_output);
|
println!("{}", eval_output);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue