diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml new file mode 100644 index 0000000000..65d70cf856 --- /dev/null +++ b/.github/workflows/eval.yml @@ -0,0 +1,77 @@ +name: Run Agent Eval + +on: + schedule: + - cron: "0 * * * *" + push: + branches: + - main + - "v[0-9]+.[0-9]+.x" + tags: + - "v*" + + pull_request: + branches: + - "**" + types: [opened, synchronize, reopened, labeled] + + workflow_dispatch: + +concurrency: + # Allow only one workflow per any non-`main` branch. + group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} + cancel-in-progress: true + +env: + CARGO_TERM_COLOR: always + CARGO_INCREMENTAL: 0 + RUST_BACKTRACE: 1 + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + ZED_CLIENT_CHECKSUM_SEED: ${{ secrets.ZED_CLIENT_CHECKSUM_SEED }} + ZED_EVAL_TELEMETRY: 1 + +jobs: + run_eval: + timeout-minutes: 60 + name: Run Agent Eval + if: > + github.repository_owner == 'zed-industries' && + (github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-eval')) + runs-on: + - buildjet-16vcpu-ubuntu-2204 + steps: + - name: Add Rust to the PATH + run: echo "$HOME/.cargo/bin" >> $GITHUB_PATH + + - name: Checkout repo + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 + with: + clean: false + + - name: Cache dependencies + uses: swatinem/rust-cache@9d47c6ad4b02e050fd481d890b2ea34778fd09d6 # v2 + with: + save-if: ${{ github.ref == 'refs/heads/main' }} + cache-provider: "buildjet" + + - name: Install Linux dependencies + run: ./script/linux + + - name: Configure CI + run: | + mkdir -p ./../.cargo + cp ./.cargo/ci-config.toml ./../.cargo/config.toml + + - name: Compile eval + run: cargo build --package=eval + + - name: Run eval + run: cargo run --package=eval + + # Since the Linux runner is not stateful, in theory there is no need to do this cleanup. 
+ # But, to avoid potential issues in the future if we choose to use a stateful Linux runner and forget to add code + # to clean up the config file, I’ve included the cleanup code here as a precaution. + # While it’s not strictly necessary at this moment, I believe it’s better to err on the side of caution. + - name: Clean CI config file + if: always() + run: rm -rf ./../.cargo diff --git a/.github/workflows/run_agent_eval_daily.yml b/.github/workflows/run_agent_eval_daily.yml deleted file mode 100644 index 6f55ca569d..0000000000 --- a/.github/workflows/run_agent_eval_daily.yml +++ /dev/null @@ -1,28 +0,0 @@ -name: Run Eval Daily - -on: - schedule: - - cron: "0 2 * * *" - workflow_dispatch: - -env: - CARGO_TERM_COLOR: always - CARGO_INCREMENTAL: 0 - RUST_BACKTRACE: 1 - -jobs: - run_eval: - name: Run Eval - if: github.repository_owner == 'zed-industries' - runs-on: ubuntu-latest - steps: - - name: Checkout repo - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 - with: - clean: false - - - name: Setup Rust - uses: dtolnay/rust-toolchain@stable - - - name: Run cargo eval - run: cargo run -p eval diff --git a/crates/agent/src/active_thread.rs b/crates/agent/src/active_thread.rs index 49f04a4887..f0063722e3 100644 --- a/crates/agent/src/active_thread.rs +++ b/crates/agent/src/active_thread.rs @@ -1010,6 +1010,7 @@ impl ActiveThread { } } ThreadEvent::CheckpointChanged => cx.notify(), + ThreadEvent::ReceivedTextChunk => {} } } diff --git a/crates/agent/src/thread.rs b/crates/agent/src/thread.rs index 99dab5f939..608cbe9711 100644 --- a/crates/agent/src/thread.rs +++ b/crates/agent/src/thread.rs @@ -1231,6 +1231,7 @@ impl Thread { current_token_usage = token_usage; } LanguageModelCompletionEvent::Text(chunk) => { + cx.emit(ThreadEvent::ReceivedTextChunk); if let Some(last_message) = thread.messages.last_mut() { if last_message.role == Role::Assistant { last_message.push_text(&chunk); @@ -1780,7 +1781,7 @@ impl Thread { thread_data, 
final_project_snapshot ); - client.telemetry().flush_events(); + client.telemetry().flush_events().await; Ok(()) }) @@ -1825,7 +1826,7 @@ impl Thread { thread_data, final_project_snapshot ); - client.telemetry().flush_events(); + client.telemetry().flush_events().await; Ok(()) }) @@ -2081,7 +2082,7 @@ impl Thread { github_login = github_login ); - client.telemetry().flush_events(); + client.telemetry().flush_events().await; } } }) @@ -2199,6 +2200,7 @@ pub enum ThreadEvent { ShowError(ThreadError), UsageUpdated(RequestUsage), StreamedCompletion, + ReceivedTextChunk, StreamedAssistantText(MessageId, String), StreamedAssistantThinking(MessageId, String), StreamedToolUse { diff --git a/crates/client/src/telemetry.rs b/crates/client/src/telemetry.rs index 47df2a0239..fa7690a6a3 100644 --- a/crates/client/src/telemetry.rs +++ b/crates/client/src/telemetry.rs @@ -4,7 +4,7 @@ use crate::TelemetrySettings; use anyhow::Result; use clock::SystemClock; use futures::channel::mpsc; -use futures::{Future, StreamExt}; +use futures::{Future, FutureExt, StreamExt}; use gpui::{App, AppContext as _, BackgroundExecutor, Task}; use http_client::{self, AsyncBody, HttpClient, HttpClientWithUrl, Method, Request}; use parking_lot::Mutex; @@ -290,6 +290,10 @@ impl Telemetry { paths::logs_dir().join("telemetry.log") } + pub fn has_checksum_seed(&self) -> bool { + ZED_CLIENT_CHECKSUM_SEED.is_some() + } + pub fn start( self: &Arc, system_id: Option, @@ -430,7 +434,7 @@ impl Telemetry { let executor = self.executor.clone(); state.flush_events_task = Some(self.executor.spawn(async move { executor.timer(FLUSH_INTERVAL).await; - this.flush_events(); + this.flush_events().detach(); })); } @@ -456,7 +460,7 @@ impl Telemetry { if state.installation_id.is_some() && state.events_queue.len() >= state.max_queue_size { drop(state); - self.flush_events(); + self.flush_events().detach(); } } @@ -499,60 +503,59 @@ impl Telemetry { .body(json_bytes.into())?) 
} - pub fn flush_events(self: &Arc) { + pub fn flush_events(self: &Arc) -> Task<()> { let mut state = self.state.lock(); state.first_event_date_time = None; let mut events = mem::take(&mut state.events_queue); state.flush_events_task.take(); drop(state); if events.is_empty() { - return; + return Task::ready(()); } let this = self.clone(); - self.executor - .spawn( - async move { - let mut json_bytes = Vec::new(); + self.executor.spawn( + async move { + let mut json_bytes = Vec::new(); - if let Some(file) = &mut this.state.lock().log_file { - for event in &mut events { - json_bytes.clear(); - serde_json::to_writer(&mut json_bytes, event)?; - file.write_all(&json_bytes)?; - file.write_all(b"\n")?; - } + if let Some(file) = &mut this.state.lock().log_file { + for event in &mut events { + json_bytes.clear(); + serde_json::to_writer(&mut json_bytes, event)?; + file.write_all(&json_bytes)?; + file.write_all(b"\n")?; } - - let request_body = { - let state = this.state.lock(); - - EventRequestBody { - system_id: state.system_id.as_deref().map(Into::into), - installation_id: state.installation_id.as_deref().map(Into::into), - session_id: state.session_id.clone(), - metrics_id: state.metrics_id.as_deref().map(Into::into), - is_staff: state.is_staff, - app_version: state.app_version.clone(), - os_name: state.os_name.clone(), - os_version: state.os_version.clone(), - architecture: state.architecture.to_string(), - - release_channel: state.release_channel.map(Into::into), - events, - } - }; - - let request = this.build_request(json_bytes, request_body)?; - let response = this.http_client.send(request).await?; - if response.status() != 200 { - log::error!("Failed to send events: HTTP {:?}", response.status()); - } - anyhow::Ok(()) } - .log_err(), - ) - .detach(); + + let request_body = { + let state = this.state.lock(); + + EventRequestBody { + system_id: state.system_id.as_deref().map(Into::into), + installation_id: state.installation_id.as_deref().map(Into::into), + 
session_id: state.session_id.clone(), + metrics_id: state.metrics_id.as_deref().map(Into::into), + is_staff: state.is_staff, + app_version: state.app_version.clone(), + os_name: state.os_name.clone(), + os_version: state.os_version.clone(), + architecture: state.architecture.to_string(), + + release_channel: state.release_channel.map(Into::into), + events, + } + }; + + let request = this.build_request(json_bytes, request_body)?; + let response = this.http_client.send(request).await?; + if response.status() != 200 { + log::error!("Failed to send events: HTTP {:?}", response.status()); + } + anyhow::Ok(()) + } + .log_err() + .map(|_| ()), + ) } } diff --git a/crates/collab/src/api/events.rs b/crates/collab/src/api/events.rs index 56db4e5e3e..6ccc86c520 100644 --- a/crates/collab/src/api/events.rs +++ b/crates/collab/src/api/events.rs @@ -516,6 +516,7 @@ pub async fn post_events( if let Some(kinesis_client) = app.kinesis_client.clone() { if let Some(stream) = app.config.kinesis_stream.clone() { let mut request = kinesis_client.put_records().stream_name(stream); + let mut has_records = false; for row in for_snowflake( request_body.clone(), first_event_at, @@ -530,9 +531,12 @@ pub async fn post_events( .build() .unwrap(), ); + has_records = true; } } - request.send().await.log_err(); + if has_records { + request.send().await.log_err(); + } } }; @@ -555,7 +559,7 @@ fn for_snowflake( country_code: Option, checksum_matched: bool, ) -> impl Iterator { - body.events.into_iter().flat_map(move |event| { + body.events.into_iter().filter_map(move |event| { let timestamp = first_event_at + Duration::milliseconds(event.milliseconds_since_first_event); // We will need to double check, but I believe all of the events that @@ -744,9 +748,11 @@ fn for_snowflake( // NOTE: most amplitude user properties are read out of our event_properties // dictionary. See https://app.amplitude.com/data/zed/Zed/sources/detail/production/falcon%3A159998 // for how that is configured. 
- let user_properties = Some(serde_json::json!({ - "is_staff": body.is_staff, - })); + let user_properties = body.is_staff.map(|is_staff| { + serde_json::json!({ + "is_staff": is_staff, + }) + }); Some(SnowflakeRow { time: timestamp, diff --git a/crates/eval/examples/add_arp_protocol_support/base.toml b/crates/eval/examples/add_arp_protocol_support/base.toml deleted file mode 100644 index 70e022e4ed..0000000000 --- a/crates/eval/examples/add_arp_protocol_support/base.toml +++ /dev/null @@ -1,3 +0,0 @@ -url = "https://github.com/GyulyVGC/sniffnet.git" -revision = "cfb5b6519bd7838f279e5be9d360445aaffaa647" -language_extension = "rs" diff --git a/crates/eval/examples/add_arp_protocol_support/diff_criteria.md b/crates/eval/examples/add_arp_protocol_support/diff_criteria.md deleted file mode 100644 index 143c1ad3a7..0000000000 --- a/crates/eval/examples/add_arp_protocol_support/diff_criteria.md +++ /dev/null @@ -1,16 +0,0 @@ -1. **Protocol Enumeration:** Ensure the `Protocol` enum includes the `ARP` variant and is integrated in `Protocol::ALL`. -2. **Packet Analysis Logic:** - - Properly detect ARP packets within `analyze_headers` and `analyze_network_header`. - - Appropriately extract ARP sender/target IPs based on the protocol (IPv4 or IPv6). - - Track and store ARP operations (Request, Reply) using the `ArpType` enum. -3. **Display & User Interface:** - - Accurately represent ARP packet types in the UI (`connection_details_page.rs`) alongside ICMP types. - - Skip displaying service information for ARP packets in line with ICMP behavior. -4. **Data Struct Enhancements:** - - Update `InfoAddressPortPair` to store and count ARP operation types. - - Ensure filtering and presentation logic uses ARP data correctly. -5. **Default Behaviors:** - - Set default `protocol` in `PacketFiltersFields` to `ARP` for consistency. -6. **Testing:** - - Update unit tests for `Protocol::ALL` and `get_service` to account for ARP behavior. 
- - Confirm that ARP protocol toggling works properly in the GUI protocol filter handling. diff --git a/crates/eval/examples/add_arp_protocol_support/prompt.md b/crates/eval/examples/add_arp_protocol_support/prompt.md deleted file mode 100644 index a194ca681b..0000000000 --- a/crates/eval/examples/add_arp_protocol_support/prompt.md +++ /dev/null @@ -1 +0,0 @@ -Add full support for the Address Resolution Protocol (ARP) in the packet sniffer. This includes recognizing ARP packets during packet analysis, displaying ARP operation types in the UI, and updating data structures to track ARP-specific metadata. Integrate ARP into the protocol filtering system, update all relevant UI logic to ensure it handles ARP packets similarly to ICMP, and ensure proper test coverage for all new functionality. Update `Protocol::ALL` to include ARP and skip service detection for ARP packets, as they don’t use ports. Finally, ensure the `connection_details_page` displays the ARP operation types with counts, using a `pretty_print_types` method similar to ICMP types. diff --git a/crates/eval/examples/buffer_string_input_support/base.toml b/crates/eval/examples/buffer_string_input_support/base.toml deleted file mode 100644 index 0d58678dae..0000000000 --- a/crates/eval/examples/buffer_string_input_support/base.toml +++ /dev/null @@ -1,3 +0,0 @@ -url = "https://github.com/swc-project/swc.git" -revision = "787d5fabf410fafe6595ec00c197181b27578cb1" -language_extension = "rs" diff --git a/crates/eval/examples/buffer_string_input_support/diff_criteria.md b/crates/eval/examples/buffer_string_input_support/diff_criteria.md deleted file mode 100644 index 15a1e4bdf1..0000000000 --- a/crates/eval/examples/buffer_string_input_support/diff_criteria.md +++ /dev/null @@ -1,6 +0,0 @@ -1. The `parse` and `parse_sync` functions must support both `Buffer` and `String` inputs for the `src` parameter, using the `Either` type from `napi` to avoid breaking existing string-based usage while adding buffer support. 
-2. A helper function `stringify` must handle conversion of `Either` to a unified `String` representation internally, ensuring consistent UTF-8 decoding for buffers and direct string passthrough. -3. The TypeScript binding declarations (`binding.d.ts`) must reflect the updated parameter types for `parse` and `parse_sync` to accept `Buffer | string`, ensuring compatibility with JavaScript/TypeScript callers. -4. Unit tests must validate both buffer and string input paths for asynchronous (`parse`) and synchronous (`parse_sync`) APIs, ensuring parity in functionality and output correctness. -5. The `filename` parameter must remain optional but use `FileName::Real` when provided and fall back to `FileName::Anon` if omitted, preserving existing file resolution logic. -6. No regressions in error handling, abort signal support, or serialization/deserialization of `ParseOptions` during the refactor. diff --git a/crates/eval/examples/buffer_string_input_support/prompt.md b/crates/eval/examples/buffer_string_input_support/prompt.md deleted file mode 100644 index abe48e9520..0000000000 --- a/crates/eval/examples/buffer_string_input_support/prompt.md +++ /dev/null @@ -1 +0,0 @@ -I need to extend the SWC parsing APIs to support both `Buffer` and `string` inputs for the source code. Please update the `parse` and `parse_sync` functions to accept `Either` instead of just `String`. Add a helper function to convert the `Either` type into a UTF-8 string, using `String::from_utf8_lossy` for buffers to handle invalid characters gracefully. Ensure the TypeScript definitions in `binding.d.ts` reflect the new parameter types. Include unit tests for both buffer and string inputs in `api_test.js`, verifying that asynchronous and synchronous parsing produce identical results regardless of input type. Maintain backward compatibility with existing string-based calls and ensure the `filename` fallback logic remains unchanged. 
Simplify the `src` handling to avoid code duplication between async/sync paths. diff --git a/crates/eval/examples/email_verification_refactor/base.toml b/crates/eval/examples/email_verification_refactor/base.toml deleted file mode 100644 index 04c26ca6b9..0000000000 --- a/crates/eval/examples/email_verification_refactor/base.toml +++ /dev/null @@ -1,4 +0,0 @@ -url = "https://github.com/dani-garcia/vaultwarden.git" -revision = "3a1f1bae002bebf26ce3a38b879c1ba26529af1e" -language_extension = "rs" -allow_preexisting_diagnostics = true diff --git a/crates/eval/examples/email_verification_refactor/diff_criteria.md b/crates/eval/examples/email_verification_refactor/diff_criteria.md deleted file mode 100644 index 6574bd7406..0000000000 --- a/crates/eval/examples/email_verification_refactor/diff_criteria.md +++ /dev/null @@ -1,6 +0,0 @@ -1. Refactors the `register_verification_email` logic to generate the JWT verification token earlier in the control flow, reducing duplication and improving readability. -2. Improves conditional logic for sending verification emails by only querying the database when mail should be sent, reducing unnecessary operations. -3. Refines the user existence check to specifically filter for users that have a `private_key`, adding stricter criteria before skipping email sending. -4. Preserves existing timing attack mitigation by retaining randomized sleep behavior when user exists but an email is not sent. -5. Ensures the email is sent only if appropriate, preserving previous behavior while streamlining logic and improving maintainability. -6. Removes redundant code paths and unnecessary reassignments, improving clarity without affecting functionality. 
diff --git a/crates/eval/examples/email_verification_refactor/prompt.md b/crates/eval/examples/email_verification_refactor/prompt.md deleted file mode 100644 index eb2fabc8c3..0000000000 --- a/crates/eval/examples/email_verification_refactor/prompt.md +++ /dev/null @@ -1 +0,0 @@ -I want to refactor the `register_verification_email` function to streamline how verification emails are handled. Currently, the code checks if a user exists and then sends an email or returns early. I’d like to move the JWT token generation to the top of the function to avoid duplication. Then, if mail sending is enabled, the code should check for the user, but only send the verification email if the user exists and has a `private_key` (otherwise it should send the email). Keep the random sleep logic for timing mitigation in the branch where no email is sent. Remove the old duplicated token generation logic and any redundant conditionals, while ensuring the core behavior and response flow stays the same. diff --git a/crates/eval/examples/exif_rotation_support/base.toml b/crates/eval/examples/exif_rotation_support/base.toml deleted file mode 100644 index cda05c7565..0000000000 --- a/crates/eval/examples/exif_rotation_support/base.toml +++ /dev/null @@ -1,3 +0,0 @@ -url = "https://github.com/qarmin/czkawka.git" -revision = "db164d3698198dd46653b1c3bb0384f8a9e38fab" -language_extension = "rs" diff --git a/crates/eval/examples/exif_rotation_support/diff_criteria.md b/crates/eval/examples/exif_rotation_support/diff_criteria.md deleted file mode 100644 index 9a7bb530d5..0000000000 --- a/crates/eval/examples/exif_rotation_support/diff_criteria.md +++ /dev/null @@ -1,7 +0,0 @@ -1. **EXIF-based Rotation Handling**: Introduces image orientation correction using EXIF metadata by parsing orientation tags and applying corresponding image transformations (e.g., flip, rotate). This improves accuracy for displaying and analyzing images with embedded rotation data. -2. 
**New Dependencies and Parsing Logic**: Adds `nom-exif`, `iso6709parse`, and related dependencies for reading EXIF metadata, and implements robust parsing logic using `MediaParser`, `ExifIter`, and orientation matching for clean integration. -3. **Expanded `common_image.rs` Logic**: Refactors image loading in `get_dynamic_image_from_path` to automatically apply EXIF-based orientation corrections, adding new helper methods (`get_rotation_from_exif`) and an `ExifOrientation` enum to encapsulate the rotation logic clearly and maintainably. -4. **Versioning and Compatibility Updates**: Updates minimum Rust version to 1.80.0 across all packages and workflows, ensuring compatibility with newly introduced crates and language features. -5. **Internal Versioning Sync**: Increments `CACHE_IMAGE_VERSION` to ensure cache invalidation reflects new image processing logic, preventing mismatches due to transformed image data. -6. **Dependency Management and Cargo.toml Additions**: Adds new crate dependencies to `Cargo.toml` files where necessary (`czkawka_core`, `Cargo.lock`) and aligns versions to reflect new EXIF parsing functionality. -7. **GUI State Initialization Adjustment**: Modifies GUI default tab state from `SimilarImages` to `DuplicateFiles`—likely for improved UX or alignment with application focus. diff --git a/crates/eval/examples/exif_rotation_support/prompt.md b/crates/eval/examples/exif_rotation_support/prompt.md deleted file mode 100644 index d85e421d6f..0000000000 --- a/crates/eval/examples/exif_rotation_support/prompt.md +++ /dev/null @@ -1 +0,0 @@ -I'd like to implement support for automatic image orientation correction based on EXIF metadata in our Rust project. Specifically, I want to use the `nom-exif` crate to read EXIF orientation tags and adjust the image accordingly (e.g., flip horizontally, rotate 90° CW, etc.) when loading it in `get_dynamic_image_from_path`. 
Please integrate the EXIF parsing flow using `MediaParser`, `ExifIter`, and match the orientation codes 1–8 to a custom `ExifOrientation` enum. Ensure that these transformations are applied directly to the `DynamicImage` output when applicable. Also, bump the `CACHE_IMAGE_VERSION` to invalidate any outdated cached formats and update the Rust version across the codebase to `1.80.0` to support the latest dependencies. Make any required changes to Cargo.toml and lockfiles, and default the GUI to open the Duplicate Files tab instead of Similar Images for consistency. diff --git a/crates/eval/examples/lhs_join_update_callbacks/base.toml b/crates/eval/examples/lhs_join_update_callbacks/base.toml deleted file mode 100644 index b8e6a7500b..0000000000 --- a/crates/eval/examples/lhs_join_update_callbacks/base.toml +++ /dev/null @@ -1,3 +0,0 @@ -url = "https://github.com/clockworklabs/SpacetimeDB.git" -revision = "68d23d4c25548fd74f1bde28a57d8858022b9671" -language_extension = "rs" diff --git a/crates/eval/examples/lhs_join_update_callbacks/diff_criteria.md b/crates/eval/examples/lhs_join_update_callbacks/diff_criteria.md deleted file mode 100644 index 86a9fc49e0..0000000000 --- a/crates/eval/examples/lhs_join_update_callbacks/diff_criteria.md +++ /dev/null @@ -1,6 +0,0 @@ -1. A `JOIN` query with conditions on both sides (LHS and RHS) correctly triggers subscription updates when only the LHS table is updated. -2. Callback functions (`on_insert`, `on_update`) are invoked exactly once and in the expected order. -3. Queries with logically equivalent WHERE conditions (e.g., `x > 0 and x < 5` vs. `0 < x and x < 5`) yield consistent subscription behavior. -4. Complex disjoint queries that restrict the RHS via additional constraints (e.g., `u.n != 1`) still properly identify matching LHS updates. -5. Type inference and expression normalization correctly handle literals on the left-hand side of binary operations in WHERE clauses. -6. 
Physical execution plans normalize expressions like `3 < l.x` into `l.x > 3` with appropriate operator inversion (`Lt ↔ Gt`, `Lte ↔ Gte`), maintaining logical correctness. diff --git a/crates/eval/examples/lhs_join_update_callbacks/prompt.md b/crates/eval/examples/lhs_join_update_callbacks/prompt.md deleted file mode 100644 index 4242d7de65..0000000000 --- a/crates/eval/examples/lhs_join_update_callbacks/prompt.md +++ /dev/null @@ -1,12 +0,0 @@ -Add a new test case to validate join subscription updates when the **LHS table is updated**, and ensure correct invocation of reducer callbacks. The test should: - -- Subscribe to a join query with a filter involving fields from both tables (e.g., `SELECT p.* FROM pk_u32 p JOIN unique_u32 u ON p.n = u.n WHERE u.data > 0 AND u.data < 5`). -- Insert rows into both LHS (`pk_u32`) and RHS (`unique_u32`) that satisfy the join condition. -- Verify the initial subscription callback is triggered via `on_insert`. -- Update the LHS (`pk_u32`) such that the row remains part of the join result. -- Validate that: - - `on_update` is invoked correctly. - - An immediate follow-up update back to the original value also triggers `on_update`. -- Repeat the above with disjoint filters (e.g., `u.n != 1`) and confirm behavior remains correct. - -Also, ensure that literal-first SQL expressions like `3 < x` are correctly interpreted and inverted in the physical execution plan (converted to `x > 3`) and behave identically during query evaluation and execution. 
diff --git a/crates/eval/examples/libdevice_symbol_reexport/base.toml b/crates/eval/examples/libdevice_symbol_reexport/base.toml deleted file mode 100644 index bd800aef81..0000000000 --- a/crates/eval/examples/libdevice_symbol_reexport/base.toml +++ /dev/null @@ -1,3 +0,0 @@ -url = "https://github.com/Rust-GPU/Rust-CUDA.git" -revision = "728013419b6c4c80e099a42413574c36a9aff9c7" -language_extension = "rs" diff --git a/crates/eval/examples/libdevice_symbol_reexport/diff_criteria.md b/crates/eval/examples/libdevice_symbol_reexport/diff_criteria.md deleted file mode 100644 index eb7ba1f920..0000000000 --- a/crates/eval/examples/libdevice_symbol_reexport/diff_criteria.md +++ /dev/null @@ -1,4 +0,0 @@ -1. **Reexports `LIBDEVICE_BITCODE` for cleaner dependency usage:** The `LIBDEVICE_BITCODE` symbol from the `cust_raw::nvvm_sys` crate is now reexported via the `nvvm` crate. This allows consuming crates to access the symbol directly from `nvvm`, abstracting away the internal structure and reducing tight coupling to `cust_raw`. -2. **Simplifies dependency graph and usage of NVVM internals:** By removing the direct dependency on `cust_raw` from `rustc_codegen_nvvm`, the changes streamline the crate's external interface, reducing maintenance overhead and improving modularity. Consumers now only need to rely on the higher-level `nvvm` crate. -3. **Improves code readability and encapsulation:** The change makes the source cleaner by reducing low-level, verbose paths like `nvvm_sys::LIBDEVICE_BITCODE` to a concise `LIBDEVICE_BITCODE`, enhancing readability and reinforcing a layered architecture. -4. **Maintains existing functionality:** The code logic remains unchanged in behavior—`LIBDEVICE_BITCODE` is still used in the same way, ensuring that the refactor is safe, non-breaking, and purely organizational. 
diff --git a/crates/eval/examples/libdevice_symbol_reexport/prompt.md b/crates/eval/examples/libdevice_symbol_reexport/prompt.md deleted file mode 100644 index ae715bc3cb..0000000000 --- a/crates/eval/examples/libdevice_symbol_reexport/prompt.md +++ /dev/null @@ -1 +0,0 @@ -I'd like to improve the modularity and encapsulation of the NVVM codegen setup. Please refactor the code to reexport `LIBDEVICE_BITCODE` from the `nvvm` crate instead of accessing it directly from `cust_raw::nvvm_sys`. This involves updating the `nvvm` crate to publicly reexport the symbol, and then modifying `rustc_codegen_nvvm` to use the reexported path. Additionally, remove the direct dependency on `cust_raw` from `rustc_codegen_nvvm/Cargo.toml` and clean up any redundant `use` statements that reference `cust_raw` directly. The goal is to simplify usage of `nvvm_sys` internals by encapsulating them within `nvvm`, making the codebase more maintainable without changing its behavior. diff --git a/crates/eval/examples/license_management/base.toml b/crates/eval/examples/license_management/base.toml deleted file mode 100644 index 0691dd519a..0000000000 --- a/crates/eval/examples/license_management/base.toml +++ /dev/null @@ -1,3 +0,0 @@ -url = "https://github.com/SAP-samples/abap-cheat-sheets.git" -revision = "262c0472eeb03e05ff8235767356a328d97850e6" -language_extension = "rs" diff --git a/crates/eval/examples/license_management/diff_criteria.md b/crates/eval/examples/license_management/diff_criteria.md deleted file mode 100644 index ad270f4ccf..0000000000 --- a/crates/eval/examples/license_management/diff_criteria.md +++ /dev/null @@ -1,3 +0,0 @@ -1. The file `.reuse/dep5` has been deleted. This file previously contained copyright and licensing information in Debian's copyright format, including details about API usage with SAP products, copyright notice (2022 SAP SE or affiliates), and Apache-2.0 license information. -2. 
A new file `REUSE.toml` has been created with similar copyright and licensing information but in a different format. It includes the package name, supplier information, download location, and the same detailed disclaimer about API usage with SAP products that was in the deleted file. -3. The new `REUSE.toml` file also contains annotations specifying that the copyright text and Apache-2.0 license apply to all files (`path = "**"`) with aggregate precedence, effectively maintaining the same licensing terms but in a different configuration format. diff --git a/crates/eval/examples/license_management/prompt.md b/crates/eval/examples/license_management/prompt.md deleted file mode 100644 index df6901fc16..0000000000 --- a/crates/eval/examples/license_management/prompt.md +++ /dev/null @@ -1,17 +0,0 @@ -I need to switch our license stuff from the old .reuse/dep5 file to the new REUSE.toml format. basically same info, just different format. here's what's in the old file: - -project name: abap-cheat-sheets -contact: daniel reger's email -repo link -that long SAP API disclaimer -copyright: SAP + contributors, 2022 -license: Apache-2.0 -need to: - -delete the old .reuse/dep5 file -make a new REUSE.toml with: -same project info (name, contact, repo) -same exact API disclaimer text -SPDX-style copyright & license fields -apply to all files (** glob) with aggregate precedence -not changing any actual license terms, just updating the format. can you give me the exact REUSE.toml file we need? 
diff --git a/crates/eval/examples/metal_i64_support/base.toml b/crates/eval/examples/metal_i64_support/base.toml deleted file mode 100644 index 4648f148b8..0000000000 --- a/crates/eval/examples/metal_i64_support/base.toml +++ /dev/null @@ -1,4 +0,0 @@ -url = "https://github.com/huggingface/candle.git" -revision = "3164a19a5dc18f5e0f7a063ae85a0cfd289e98f1" -language_extension = "rs" -allow_preexisting_diagnostics = true diff --git a/crates/eval/examples/metal_i64_support/diff_criteria.md b/crates/eval/examples/metal_i64_support/diff_criteria.md deleted file mode 100644 index 35741151c9..0000000000 --- a/crates/eval/examples/metal_i64_support/diff_criteria.md +++ /dev/null @@ -1,4 +0,0 @@ -1. The changes improve the configurability of the `TextGeneration` struct and its initialization by refactoring generation parameters (`temperature`, `top_p`) to use non-optional types with default values, simplifying their use throughout the codebase. -2. The argument parser is updated to enhance usability: `verbose_prompt` is renamed to a more general `verbose` flag, several arguments are given default values (e.g., `temperature`, `top_p`, `sample_len`), and optional arguments like `cache_path` and `weight_path` are now properly handled with conditional logic and fallbacks. -3. The code loading the model configuration is updated to support deserializing from a JSON config file using Serde, and the `Config` struct is extended with a new `rope_ratio` field with a default value via a helper function, improving flexibility for different model setups. -4. Import statements and general code layout are cleaned up for clarity and consistency, including reorganizing imports and removing unnecessary unwraps or panics, while maintaining the same core functionality of the text generation pipeline. 
diff --git a/crates/eval/examples/metal_i64_support/prompt.md b/crates/eval/examples/metal_i64_support/prompt.md deleted file mode 100644 index bdc365b1cd..0000000000 --- a/crates/eval/examples/metal_i64_support/prompt.md +++ /dev/null @@ -1 +0,0 @@ -I'd like to improve the configurability and usability of the text generation script for the CodeGeeX4-9B model. Please refactor the argument parsing to set more user-friendly defaults where possible, especially for generation parameters like temperature and top-p, and change fields like verbose_prompt to a more general verbose flag. Simplify the handling of optional paths like cache or weight paths, making them truly optional with fallbacks. I also want the model config to support deserialization from a JSON file instead of relying on hardcoded defaults, including support for a rope_ratio parameter with a sensible default. Lastly, please clean up the code for consistency—such as import ordering—and ensure everything aligns with these improvements without changing the overall functionality. diff --git a/crates/eval/examples/metrics_data_size_updates/base.toml b/crates/eval/examples/metrics_data_size_updates/base.toml deleted file mode 100644 index ccc096a79f..0000000000 --- a/crates/eval/examples/metrics_data_size_updates/base.toml +++ /dev/null @@ -1,3 +0,0 @@ -url = "https://github.com/clockworklabs/SpacetimeDB.git" -revision = "13dfb031351c3adf308c74b2a085ca15aa797db1" -language_extension = "rs" diff --git a/crates/eval/examples/metrics_data_size_updates/diff_criteria.md b/crates/eval/examples/metrics_data_size_updates/diff_criteria.md deleted file mode 100644 index 9642987671..0000000000 --- a/crates/eval/examples/metrics_data_size_updates/diff_criteria.md +++ /dev/null @@ -1,6 +0,0 @@ -1. The function `report_data_size` has been refactored to have a more accessible visibility by changing from `pub(super)` to `pub` in the `CommittedState` struct, making it usable outside of its previous scope. -2. 
The `record_tx_metrics` function has been modified to remove the previously commented-out code that invoked `report_data_size` from `committed_state`. The intention to possibly inline this functionality or refactor the metrics updates is noted. -3. A new function `update_data_size_metrics` has been introduced in the `RelationalDB` struct to simplify calling the `report_data_size` method, enhancing clarity and direct usage within the database context. -4. The `storage_monitor` function has been renamed and refactored to `metric_reporter`, which is tasked with collecting disk usage statistics and invoking `update_data_size_metrics` for database state updates. -5. Various asynchronous operations involving time intervals for disk usage measurement and reporting have been restructured for improved metric collection, reducing unnecessary operations and improving clarity. -6. Comments and TODOs are placed where further improvements, such as adding heap usage metrics, are possible, guiding future enhancements. diff --git a/crates/eval/examples/metrics_data_size_updates/prompt.md b/crates/eval/examples/metrics_data_size_updates/prompt.md deleted file mode 100644 index d5824abcce..0000000000 --- a/crates/eval/examples/metrics_data_size_updates/prompt.md +++ /dev/null @@ -1 +0,0 @@ -I'd like to refactor and improve the metric collection system in the database layer. Specifically, please refactor the `report_data_size` function to make it publicly accessible by changing its visibility from `pub(super)` to `pub`. Then, remove the commented-out `report_data_size` invocation from `record_tx_metrics` in the `datastore.rs` file and ensure that metric collection is more streamlined. Add a new function in the `RelationalDB` struct named `update_data_size_metrics` to simplify invoking `report_data_size` and enhance its usage across the code. 
Finally, refactor the `storage_monitor` function by renaming it to `metric_reporter`, and ensure that it periodically collects disk usage statistics and updates data size metrics. Additionally, leave a TODO in the code for adding heap usage metrics in the future. Please ensure that these changes maintain the core functionality while improving the overall organization and clarity of the code. diff --git a/crates/eval/examples/never_type_workaround/base.toml b/crates/eval/examples/never_type_workaround/base.toml deleted file mode 100644 index 8ae48cb92a..0000000000 --- a/crates/eval/examples/never_type_workaround/base.toml +++ /dev/null @@ -1,3 +0,0 @@ -url = "https://github.com/bevyengine/bevy.git" -revision = "ac52cca033b351cc966cd3d40eb99ffbefbdb104" -language_extension = "rs" diff --git a/crates/eval/examples/never_type_workaround/diff_criteria.md b/crates/eval/examples/never_type_workaround/diff_criteria.md deleted file mode 100644 index 00d0108445..0000000000 --- a/crates/eval/examples/never_type_workaround/diff_criteria.md +++ /dev/null @@ -1,5 +0,0 @@ -1. Introduces a stable-Rust-compatible workaround for the unstable `!` (never) type by implementing a custom `Never` alias based on a trait (`FnRet`) and function signature (`fn() -> !`), mimicking the behavior of the `never_say_never` crate without an external dependency. -2. Adds trait impls that enable Bevy systems and commands to accept `Never` as an output type, ensuring compatibility with panicking closures or intentionally non-returning functions like `todo!()` or `panic!()`. -3. Updates internal wrappers (`InfallibleSystemWrapper`, `InfallibleObserverWrapper`) and trait bounds across observer and schedule systems to support this workaround by allowing `Never` as a valid output type while maintaining existing fallible/infallible behavior. -4. 
Adds robust regression test coverage to ensure these `Never`-based trait implementations compile and function as expected, specifically targeting closures and functions that use `todo!()` or diverge without returning. -5. Ensures this workaround does not compromise stability guarantees by isolating `Never` usage to internal APIs and clearly documenting the risks and rationale in the new `never.rs` module. diff --git a/crates/eval/examples/never_type_workaround/prompt.md b/crates/eval/examples/never_type_workaround/prompt.md deleted file mode 100644 index 16a207173f..0000000000 --- a/crates/eval/examples/never_type_workaround/prompt.md +++ /dev/null @@ -1 +0,0 @@ -I'd like to add stable Rust support for handling the `!` (never) type in Bevy's ECS systems, in light of changes introduced in the Rust 2024 edition around never type fallback inference. Please create a new internal module (e.g., `never.rs`) that provides a type alias `Never` using a workaround based on a trait and `fn() -> !` to simulate the behavior of the unstable `!` type. Update the necessary traits and system wrappers (such as `HandleError`, `IntoScheduleConfigs`, and `IntoObserverSystem`) to accept `Never` as a valid output type, ensuring that closures or systems using `todo!()` or panics can still compile and behave correctly. Add a set of regression tests that exercise this compatibility by queuing and scheduling systems and commands with `todo!()` as their body, ensuring trait impls are resolved properly. Make sure to document this hack in the new module with a clear explanation of why it's being used and the risks involved. 
diff --git a/crates/eval/examples/replace_hold_with_drain_on_exit/base.toml b/crates/eval/examples/replace_hold_with_drain_on_exit/base.toml deleted file mode 100644 index 08ac11e5c8..0000000000 --- a/crates/eval/examples/replace_hold_with_drain_on_exit/base.toml +++ /dev/null @@ -1,3 +0,0 @@ -url = "https://github.com/alacritty/alacritty.git" -revision = "c9c41e637ac49f3cd67cf0362c596ae9d947f896" -language_extension = "rs" diff --git a/crates/eval/examples/replace_hold_with_drain_on_exit/diff_criteria.md b/crates/eval/examples/replace_hold_with_drain_on_exit/diff_criteria.md deleted file mode 100644 index 2c10a15a41..0000000000 --- a/crates/eval/examples/replace_hold_with_drain_on_exit/diff_criteria.md +++ /dev/null @@ -1,6 +0,0 @@ -1. **Field Renaming and Semantic Clarification**: The `hold` field in `Options`, `TerminalOptions`, and `PtyOptions` has been renamed to `drain_on_exit` across the codebase. This improves semantic clarity by distinguishing between two distinct behaviors: draining output before exit versus holding the terminal open after exit. -2. **Behavioral Shift in Exit Logic**: The logic previously controlled by `hold` now uses `drain_on_exit`, ensuring the child process’s output is drained upon termination, but the window may still close unless explicitly held open via external means. Exit handling in `event_loop.rs` has been updated to reflect this behavioral distinction. -3. **Config and Struct Updates**: All related structs (`UiConfig`, `Window`, `WindowContext`, `EventLoop`) have been updated to reflect the new `drain_on_exit` naming. This ensures consistent naming and avoids legacy confusion. -4. **UI Window Behavior**: A new `hold` field has been added to the `Window` struct to manage whether the terminal window should remain open on exit, separating UI behavior from terminal process behavior. -5. 
**Exit Control Improvements**: When a user closes a window manually (`CloseRequested` event), `hold` is explicitly set to `false` to allow proper shutdown, ensuring manual control supersedes configuration-based persistence. -6. **Documentation and Changelog Updates**: The CHANGELOG entry for version `0.25.0-dev` documents the replacement of `hold` with `drain_on_exit`, providing visibility into this breaking change and its rationale (terminal holding should now be handled externally). diff --git a/crates/eval/examples/replace_hold_with_drain_on_exit/prompt.md b/crates/eval/examples/replace_hold_with_drain_on_exit/prompt.md deleted file mode 100644 index fe6f2bb44e..0000000000 --- a/crates/eval/examples/replace_hold_with_drain_on_exit/prompt.md +++ /dev/null @@ -1 +0,0 @@ -I'd like to rename and refactor the `hold` behavior in the Alacritty terminal codebase to better reflect its actual use and separate terminal process handling from window behavior. Please rename the `hold` field in all relevant config structs (`Options`, `TerminalOptions`, `PtyOptions`) to `drain_on_exit` to make it clear that the terminal should drain its output before exiting, not necessarily hold the window open. Update all associated logic and struct initializations accordingly. Additionally, add a new `hold` field specifically to the `Window` struct to control whether the terminal window should remain open after the terminal process exits. Ensure that when a user explicitly closes the window (e.g., via `WindowEvent::CloseRequested`), this `hold` flag is set to false to allow normal shutdown. Update any logic that previously depended on `hold` to use the appropriate new field, and include a changelog entry explaining this semantic split and why the change was made. 
diff --git a/crates/eval/examples/restore_version_api_support/base.toml b/crates/eval/examples/restore_version_api_support/base.toml deleted file mode 100644 index be62c1af5b..0000000000 --- a/crates/eval/examples/restore_version_api_support/base.toml +++ /dev/null @@ -1,3 +0,0 @@ -url = "https://github.com/lancedb/lancedb.git" -revision = "698f329598bcfa8a5bf0feedfdd4344a4cdc7e4d" -language_extension = "rs" diff --git a/crates/eval/examples/restore_version_api_support/diff_criteria.md b/crates/eval/examples/restore_version_api_support/diff_criteria.md deleted file mode 100644 index 6de75ddff1..0000000000 --- a/crates/eval/examples/restore_version_api_support/diff_criteria.md +++ /dev/null @@ -1,6 +0,0 @@ -1. The `restore` method is updated across Python and Rust components of LanceDB to accept an optional `version` argument, enabling more flexible restoration of historical table versions. -2. Python async bindings in `_lancedb.pyi` and `table.py` are updated to reflect the new method signature `restore(version: Optional[int] = None)`, aligning type hints and implementations. -3. The remote table interface in `remote/table.py` includes a corresponding `restore` method, bridging the sync API to the async backend with version support. -4. The Rust FFI layer (`table.rs`) is modified to accept the optional `version` argument, with logic that performs a `checkout(version)` if specified, before proceeding to `restore()`, improving control over the restore flow. -5. The `RemoteTable` implementation in `remote/table.rs` now constructs and sends a versioned restore request via HTTP, enabling client-side version-specific restoration even in cloud deployments. -6. Docstrings and comments are added or expanded to explain the behavior of the `restore` function, particularly the no-op case when restoring the latest version, enhancing code maintainability and developer understanding. 
diff --git a/crates/eval/examples/restore_version_api_support/prompt.md b/crates/eval/examples/restore_version_api_support/prompt.md deleted file mode 100644 index 7a7b2b268e..0000000000 --- a/crates/eval/examples/restore_version_api_support/prompt.md +++ /dev/null @@ -1 +0,0 @@ -I'd like to update the `restore` method in LanceDB to support restoring to a specific historical version of a table. Please modify all relevant files to add an optional `version` parameter to `restore`, defaulting to `None`. When `version` is provided, the implementation should perform a checkout to that version before executing the restore. If `version` is not specified, it should restore the currently checked-out version. Update the Python async bindings (`_lancedb.pyi`, `table.py`, and `remote/table.py`) to reflect the new method signature and behavior. In the Rust FFI layer (`python/src/table.rs`), modify the `restore` function to accept and correctly handle the optional version argument. For the cloud-backed `RemoteTable` in Rust (`rust/lancedb/src/remote/table.rs`), ensure that the version is included in the HTTP request body during a restore operation. Add or update docstrings and comments as needed to clarify how restore behaves with and without the `version` argument. diff --git a/crates/eval/examples/time_detail_merge_update/base.toml b/crates/eval/examples/time_detail_merge_update/base.toml deleted file mode 100644 index a9353d19e1..0000000000 --- a/crates/eval/examples/time_detail_merge_update/base.toml +++ /dev/null @@ -1,3 +0,0 @@ -url = "https://github.com/tikv/tikv.git" -revision = "be74cadcdd6608e5788d0c2a6784c456b4ce84e6" -language_extension = "rs" diff --git a/crates/eval/examples/time_detail_merge_update/diff_criteria.md b/crates/eval/examples/time_detail_merge_update/diff_criteria.md deleted file mode 100644 index 6a14a6507a..0000000000 --- a/crates/eval/examples/time_detail_merge_update/diff_criteria.md +++ /dev/null @@ -1,5 +0,0 @@ -1. 
**Function Modification**: The `write_time_detail` function has been refactored into `merge_time_detail` to modify the behavior of merging time details instead of overwriting them. The `merge_time_detail` function now adds new values to the existing ones, preserving the data and allowing for cumulative updates, which ensures more accurate tracking of time metrics. -2. **Usage of New Function**: All instances where `write_time_detail` was called have been updated to use `merge_time_detail`, including in the `src/coprocessor/endpoint.rs`, `src/server/service/kv.rs`, `src/storage/txn/tracker.rs`, and test files. The modification ensures consistency across the codebase by merging time details rather than replacing them. -3. **Test Coverage**: A new test, `test_select_time_details`, has been added in `tests/integrations/coprocessor/test_select.rs` to validate the proper functioning of time detail merging. The test checks that the `process_wall_time_ns` field is not zero, ensuring the correct time metrics are being tracked and merged. -4. **Backward Compatibility**: The changes do not affect any external functionality or break compatibility. The merging of time details is backward-compatible, as it preserves existing values and adds new ones, which makes the system more flexible for future extensions. -5. **Code Consistency**: The naming convention and function signature have been aligned with existing code practices, making the codebase more consistent and easier to maintain. diff --git a/crates/eval/examples/time_detail_merge_update/prompt.md b/crates/eval/examples/time_detail_merge_update/prompt.md deleted file mode 100644 index 28ac29c5d8..0000000000 --- a/crates/eval/examples/time_detail_merge_update/prompt.md +++ /dev/null @@ -1 +0,0 @@ -I want to refactor the existing time detail handling in the codebase. 
Specifically, I'd like to replace the `write_time_detail` function with a new `merge_time_detail` function, which will add new time details to the existing ones rather than overwriting them. This change should be applied across the codebase wherever `write_time_detail` is used, including in `src/coprocessor/endpoint.rs`, `src/server/service/kv.rs`, `src/storage/txn/tracker.rs`, and any related test cases. Please ensure that all occurrences of the old function are updated to use the new one. Additionally, add a test to validate that the `process_wall_time_ns` is correctly merged and is not zero, which will ensure the merging is functioning as intended. Make sure these changes preserve backward compatibility and do not introduce any regressions in functionality. diff --git a/crates/eval/examples/tool_response_handling/base.toml b/crates/eval/examples/tool_response_handling/base.toml deleted file mode 100644 index cd499cefb3..0000000000 --- a/crates/eval/examples/tool_response_handling/base.toml +++ /dev/null @@ -1,3 +0,0 @@ -url = "https://github.com/block/goose.git" -revision = "d7308457fe3f1b9c7253de45b2f81ddc4f005fe5" -language_extension = "rs" diff --git a/crates/eval/examples/tool_response_handling/diff_criteria.md b/crates/eval/examples/tool_response_handling/diff_criteria.md deleted file mode 100644 index 9aaaa83b43..0000000000 --- a/crates/eval/examples/tool_response_handling/diff_criteria.md +++ /dev/null @@ -1,3 +0,0 @@ -1. All Goose packages (`goose`, `goose-bench`, `goose-cli`, `goose-mcp`, `goose-server`) were updated from version `1.0.17` to `1.0.18` in `Cargo.lock`. These updates ensure compatibility and consistency across related packages. -2. The `goose-app` version in `ui/desktop/package-lock.json` was also updated to `1.0.18`, maintaining alignment with the backend and shared libraries. -3. In `App.tsx`, the `useConfig` hook was destructured to directly use `addExtension` instead of the older `addExtensionToConfig` function. 
All occurrences of the old function name were updated, including inside effects and async calls, to use the new unified method. This change simplifies extension handling logic while preserving current behavior. diff --git a/crates/eval/examples/tool_response_handling/prompt.md b/crates/eval/examples/tool_response_handling/prompt.md deleted file mode 100644 index 3358ad6eec..0000000000 --- a/crates/eval/examples/tool_response_handling/prompt.md +++ /dev/null @@ -1 +0,0 @@ -Upgrade all Goose-related packages and apps from version 1.0.17 to 1.0.18 throughout the codebase. This includes updating version references in Cargo.lock, package-lock.json, and source files where applicable. In addition, streamline the addExtension logic in App.tsx by removing the outdated addExtensionToConfig references and replacing them with the new unified addExtension function. Ensure that all function dependencies and hooks reflect this updated usage. The goal is to improve maintainability and consistency across the codebase without introducing any functional changes. diff --git a/crates/eval/examples/virtio_block_request_refactor/base.toml b/crates/eval/examples/virtio_block_request_refactor/base.toml deleted file mode 100644 index 58fdc0a963..0000000000 --- a/crates/eval/examples/virtio_block_request_refactor/base.toml +++ /dev/null @@ -1,4 +0,0 @@ -url = "https://github.com/firecracker-microvm/firecracker.git" -revision = "5eaa6e08e350cd38c8102848913a096312e59097" -language_extension = "rs" -allow_preexisting_diagnostics = true diff --git a/crates/eval/examples/virtio_block_request_refactor/diff_criteria.md b/crates/eval/examples/virtio_block_request_refactor/diff_criteria.md deleted file mode 100644 index 579910887c..0000000000 --- a/crates/eval/examples/virtio_block_request_refactor/diff_criteria.md +++ /dev/null @@ -1,5 +0,0 @@ -1. 
The changes remove unnecessary generic type parameters from the `FileEngine`, `AsyncFileEngine`, and related structures by directly using the `PendingRequest` type, simplifying type signatures and improving code clarity. -2. Error handling is unified through the replacement of `UserDataError` with `RequestError` that specifically carries `PendingRequest` information, ensuring consistent error propagation with request context. -3. The `WrappedUserData` struct is renamed to `WrappedRequest` and directly embeds `PendingRequest`, aligning terminology with the virtio block device’s request lifecycle and improving traceability. -4. Test code is updated to use `PendingRequest::default()` instead of placeholder `()` types, ensuring type consistency and proper request initialization in all scenarios. -5. Code organization is improved by consolidating imports (e.g., merging `IO_URING_NUM_ENTRIES` and `PendingRequest` imports) and removing redundant type parameters across async/sync I/O implementations. diff --git a/crates/eval/examples/virtio_block_request_refactor/prompt.md b/crates/eval/examples/virtio_block_request_refactor/prompt.md deleted file mode 100644 index 523fa9c7cf..0000000000 --- a/crates/eval/examples/virtio_block_request_refactor/prompt.md +++ /dev/null @@ -1 +0,0 @@ -Refactor the virtio block device’s I/O handling to eliminate generic type parameters from file engine structures, replacing them with the concrete `PendingRequest` type. Update the `AsyncFileEngine` and `FileEngine` implementations to directly handle `PendingRequest` in all operations, ensuring error types like `RequestError` propagate this request context. Rename `UserDataError`/`UserDataOk` to `RequestError`/`RequestOk` and adjust their internals to store `PendingRequest` instead of a generic `user_data`. Simplify imports (e.g., consolidate `io_uring` imports) and modify test code to initialize `PendingRequest` properly with default values where needed. 
Maintain all existing async/sync I/O functionality, including dirty memory tracking and request completion logic. diff --git a/crates/eval/src/eval.rs b/crates/eval/src/eval.rs index c5be29f1c1..986cc76354 100644 --- a/crates/eval/src/eval.rs +++ b/crates/eval/src/eval.rs @@ -24,13 +24,11 @@ use prompt_store::PromptBuilder; use release_channel::AppVersion; use reqwest_client::ReqwestClient; use settings::{Settings, SettingsStore}; +use std::env; use std::path::{Path, PathBuf}; use std::sync::Arc; -use std::usize; use util::ResultExt as _; -pub const RUNS_DIR: &str = "./crates/eval/runs"; - #[derive(Parser, Debug)] #[command(name = "eval", disable_version_flag = true)] struct Args { @@ -57,8 +55,36 @@ struct Args { fn main() { env_logger::init(); + let system_id = ids::get_or_create_id(&ids::eval_system_id_path()).ok(); + let installation_id = ids::get_or_create_id(&ids::eval_installation_id_path()).ok(); + let session_id = uuid::Uuid::new_v4().to_string(); + let run_timestamp = chrono::Local::now().format("%Y-%m-%d_%H-%M-%S"); + let run_id = match env::var("GITHUB_RUN_ID") { + Ok(run_id) => format!("github/{}", run_id), + Err(_) => format!("local/{}", run_timestamp), + }; + + let root_dir = Path::new(std::env!("CARGO_MANIFEST_DIR")) + .parent() + .unwrap() + .parent() + .unwrap(); + let eval_crate_dir = root_dir.join("crates/eval"); + let repos_dir = eval_crate_dir.join("repos"); + let worktrees_dir = eval_crate_dir.join("worktrees"); + let examples_dir = eval_crate_dir.join("examples"); + let runs_dir = eval_crate_dir.join("runs"); + let run_dir = runs_dir.join(format!("{}", run_timestamp)); + std::fs::create_dir_all(&run_dir).unwrap(); + std::fs::create_dir_all(&repos_dir).unwrap(); + std::fs::create_dir_all(&worktrees_dir).unwrap(); + std::fs::create_dir_all(&examples_dir).unwrap(); + std::fs::create_dir_all(&paths::config_dir()).unwrap(); + + let zed_commit_sha = commit_sha_for_path(root_dir); + let zed_branch_name = git_branch_for_path(root_dir); let args = 
Args::parse(); - let all_available_examples = list_all_examples().unwrap(); + let all_available_examples = list_all_examples(&examples_dir).unwrap(); let example_paths = all_available_examples .iter() @@ -83,14 +109,20 @@ fn main() { app.run(move |cx| { let app_state = init(cx); - let system_id = ids::get_or_create_id(&ids::eval_system_id_path()).ok(); - let installation_id = ids::get_or_create_id(&ids::eval_installation_id_path()).ok(); - let session_id = uuid::Uuid::new_v4().to_string(); + let telemetry = app_state.client.telemetry(); + telemetry.start(system_id, installation_id, session_id, cx); - app_state - .client - .telemetry() - .start(system_id, installation_id, session_id, cx); + let enable_telemetry = env::var("ZED_EVAL_TELEMETRY").map_or(false, |value| value == "1") + && telemetry.has_checksum_seed(); + if enable_telemetry { + println!("Telemetry enabled"); + telemetry::event!( + "Agent Eval Started", + zed_commit_sha = zed_commit_sha, + zed_branch_name = zed_branch_name, + run_id = run_id, + ); + } let mut cumulative_tool_metrics = ToolMetrics::default(); @@ -114,15 +146,6 @@ fn main() { cx.spawn(async move |cx| { authenticate_task.await.unwrap(); - std::fs::create_dir_all(REPOS_DIR)?; - std::fs::create_dir_all(WORKTREES_DIR)?; - - let run_dir = Path::new(RUNS_DIR).join(format!( - "{}", - chrono::Local::now().format("%Y-%m-%d_%H-%M-%S") - )); - std::fs::create_dir_all(&run_dir)?; - let mut examples = Vec::new(); const COLORS: [&str; 12] = [ @@ -144,7 +167,12 @@ fn main() { let mut skipped = Vec::new(); for example_path in &example_paths { - let example = Example::load_from_directory(example_path, &run_dir)?; + let example = Example::load_from_directory( + example_path, + &run_dir, + &worktrees_dir, + &repos_dir, + )?; if !example .base @@ -194,7 +222,7 @@ fn main() { let repo_url = example.base.url.clone(); if repo_urls.insert(repo_url.clone()) { - let repo_path = repo_path_for_url(&repo_url); + let repo_path = example.repo_path.clone(); if 
!repo_path.join(".git").is_dir() { println!( @@ -245,6 +273,9 @@ fn main() { let app_state = app_state.clone(); let model = model.clone(); let example = example.clone(); + let zed_commit_sha = zed_commit_sha.clone(); + let zed_branch_name = zed_branch_name.clone(); + let run_id = run_id.clone(); cx.spawn(async move |cx| { let result = async { let run_output = cx @@ -254,8 +285,12 @@ fn main() { run_judge_repetition( example.clone(), model.clone(), + &zed_commit_sha, + &zed_branch_name, + &run_id, &run_output, round, + enable_telemetry, cx, ) }); @@ -367,9 +402,7 @@ fn main() { print_header("CUMULATIVE TOOL METRICS"); println!("{}", cumulative_tool_metrics); - std::thread::sleep(std::time::Duration::from_secs(2)); - - app_state.client.telemetry().flush_events(); + app_state.client.telemetry().flush_events().await; cx.update(|cx| cx.quit()) }) @@ -377,8 +410,8 @@ fn main() { }); } -fn list_all_examples() -> Result> { - let path = std::fs::canonicalize(EXAMPLES_DIR).unwrap(); +fn list_all_examples(examples_dir: &Path) -> Result> { + let path = std::fs::canonicalize(examples_dir).unwrap(); let entries = std::fs::read_dir(path).unwrap(); let mut result_paths = Vec::new(); for entry in entries { @@ -532,79 +565,66 @@ pub fn find_model( Ok(model) } -pub async fn get_current_commit_id(repo_path: &Path) -> Option { - (run_git(repo_path, &["rev-parse", "HEAD"]).await).ok() +pub fn commit_sha_for_path(repo_path: &Path) -> String { + futures::executor::block_on(run_git(repo_path, &["rev-parse", "HEAD"])).unwrap() } -pub fn get_current_commit_id_sync(repo_path: &Path) -> String { - futures::executor::block_on(async { - get_current_commit_id(repo_path).await.unwrap_or_default() - }) +pub fn git_branch_for_path(repo_path: &Path) -> String { + match std::env::var("GITHUB_REF_NAME") { + Ok(branch) => branch, + Err(_) => { + futures::executor::block_on(run_git(repo_path, &["rev-parse", "--abbrev-ref", "HEAD"])) + .unwrap_or_else(|_| "unknown".to_string()) + } + } } async fn 
run_judge_repetition( example: Example, model: Arc, + zed_commit_sha: &str, + zed_branch_name: &str, + run_id: &str, run_output: &RunOutput, round: u32, + enable_telemetry: bool, cx: &AsyncApp, ) -> Result { - let judge_result = example.judge(model.clone(), &run_output, round, cx).await; + let judge_output = example.judge(model.clone(), &run_output, round, cx).await; - if let Ok(judge_output) = &judge_result { - let cohort_id = example - .run_directory_path - .file_name() - .map(|name| name.to_string_lossy().to_string()) - .unwrap_or(chrono::Local::now().format("%Y-%m-%d_%H-%M-%S").to_string()); - - let path = std::path::Path::new("."); - let commit_id = get_current_commit_id(path).await.unwrap_or_default(); - - if let Some(thread) = &judge_output.thread { - telemetry::event!( - "Agent Eval Completed", - cohort_id = cohort_id, - example_name = example.name.clone(), - round = round, - diff_score = judge_output.diff.score, - diff_analysis = judge_output.diff.analysis, - thread_score = thread.score, - thread_analysis = thread.analysis, - tool_metrics = run_output.tool_metrics, - response_count = run_output.response_count, - token_usage = run_output.token_usage, - model = model.telemetry_id(), - model_provider = model.provider_id().to_string(), - repository_url = example.base.url.clone(), - repository_revision = example.base.revision.clone(), - diagnostics_before = run_output.diagnostics_before, - diagnostics_after = run_output.diagnostics_after, - commit_id = commit_id - ); - } else { - telemetry::event!( - "Agent Eval Completed", - cohort_id = cohort_id, - example_name = example.name.clone(), - round = round, - diff_score = judge_output.diff.score, - diff_analysis = judge_output.diff.analysis, - tool_metrics = run_output.tool_metrics, - response_count = run_output.response_count, - token_usage = run_output.token_usage, - model = model.telemetry_id(), - model_provider = model.provider_id().to_string(), - repository_url = example.base.url.clone(), - repository_revision 
= example.base.revision.clone(), - diagnostics_before = run_output.diagnostics_before, - diagnostics_after = run_output.diagnostics_after, - commit_id = commit_id - ); - } + let diff_evaluation; + let thread_diff_evaluation; + if let Ok(output) = judge_output.as_ref() { + diff_evaluation = Some(output.diff.clone()); + thread_diff_evaluation = output.thread.clone(); + } else { + diff_evaluation = None; + thread_diff_evaluation = None; } - judge_result + if enable_telemetry { + telemetry::event!( + "Agent Example Evaluated", + zed_commit_sha = zed_commit_sha, + zed_branch_name = zed_branch_name, + run_id = run_id, + example_name = example.name.clone(), + round = round, + diff_evaluation = diff_evaluation, + thread_evaluation = thread_diff_evaluation, + tool_metrics = run_output.tool_metrics, + response_count = run_output.response_count, + token_usage = run_output.token_usage, + model = model.telemetry_id(), + model_provider = model.provider_id().to_string(), + repository_url = example.base.url.clone(), + repository_revision = example.base.revision.clone(), + diagnostics_before = run_output.diagnostics_before, + diagnostics_after = run_output.diagnostics_after, + ); + } + + judge_output } fn print_header(header: &str) { diff --git a/crates/eval/src/example.rs b/crates/eval/src/example.rs index ed445d3c68..56ec694f3f 100644 --- a/crates/eval/src/example.rs +++ b/crates/eval/src/example.rs @@ -31,10 +31,6 @@ use util::command::new_smol_command; use util::markdown::MarkdownString; use util::serde::default_true; -pub const EXAMPLES_DIR: &str = "./crates/eval/examples"; -pub const REPOS_DIR: &str = "./crates/eval/repos"; -pub const WORKTREES_DIR: &str = "./crates/eval/worktrees"; - const THREAD_EVENT_TIMEOUT: Duration = Duration::from_secs(60 * 2); const ZED_REPO_URL: &str = "https://github.com/zed-industries/zed.git"; @@ -77,6 +73,8 @@ pub struct Example { pub run_directory_path: PathBuf, /// Prefix used for logging that identifies this example pub log_prefix: String, + 
pub worktree_path: PathBuf, + pub repo_path: PathBuf, } #[derive(Debug, Serialize, Deserialize, Clone)] @@ -122,7 +120,12 @@ pub struct JudgeOutput { impl Example { /// Load an example from a directory containing base.toml, prompt.md, and criteria.md - pub fn load_from_directory(dir_path: &Path, run_dir: &Path) -> Result { + pub fn load_from_directory( + dir_path: &Path, + run_dir: &Path, + worktrees_dir: &Path, + repos_dir: &Path, + ) -> Result { let name = Self::name_from_path(dir_path); let base_path = dir_path.join("base.toml"); let prompt_path = dir_path.join("prompt.md"); @@ -134,13 +137,25 @@ impl Example { None }; + let base: ExampleBase = toml::from_str(&fs::read_to_string(&base_path)?)?; + + let repo_path = repo_path_for_url(repos_dir, &base.url); + + let worktree_path = worktrees_dir + .canonicalize() + .unwrap() + .join(&name) + .join(&base.repo_name()); + Ok(Example { name: name.clone(), - base: toml::from_str(&fs::read_to_string(&base_path)?)?, + base, prompt: fs::read_to_string(prompt_path.clone())?, thread_criteria, diff_criteria: fs::read_to_string(diff_criteria_path.clone())?, run_directory_path: run_dir.to_path_buf(), + worktree_path, + repo_path, log_prefix: name, }) } @@ -168,21 +183,10 @@ impl Example { path.file_name().unwrap().to_string_lossy().to_string() } - pub fn worktree_path(&self) -> PathBuf { - Path::new(WORKTREES_DIR) - .canonicalize() - .context(format!("No such directory {WORKTREES_DIR}")) - .unwrap() - .join(&self.name) - .join(self.base.repo_name()) - } - /// Set up the example by checking out the specified Git revision pub async fn setup(&mut self) -> Result<()> { - let repo_path = repo_path_for_url(&self.base.url); - let revision_exists = run_git( - &repo_path, + &self.repo_path, &["rev-parse", &format!("{}^{{commit}}", self.base.revision)], ) .await @@ -194,29 +198,27 @@ impl Example { self.log_prefix, &self.base.revision ); run_git( - &repo_path, + &self.repo_path, &["fetch", "--depth", "1", "origin", &self.base.revision], ) 
.await?; } - let worktree_path = self.worktree_path(); - - if worktree_path.is_dir() { + if self.worktree_path.is_dir() { println!("{}Resetting existing worktree", self.log_prefix); // TODO: consider including "-x" to remove ignored files. The downside of this is that // it will also remove build artifacts, and so prevent incremental reuse there. - run_git(&worktree_path, &["clean", "--force", "-d"]).await?; - run_git(&worktree_path, &["reset", "--hard", "HEAD"]).await?; - run_git(&worktree_path, &["checkout", &self.base.revision]).await?; + run_git(&self.worktree_path, &["clean", "--force", "-d"]).await?; + run_git(&self.worktree_path, &["reset", "--hard", "HEAD"]).await?; + run_git(&self.worktree_path, &["checkout", &self.base.revision]).await?; } else { println!("{}Creating worktree", self.log_prefix); - let worktree_path_string = worktree_path.to_string_lossy().to_string(); + let worktree_path_string = self.worktree_path.to_string_lossy().to_string(); run_git( - &repo_path, + &self.repo_path, &[ "worktree", "add", @@ -229,7 +231,7 @@ impl Example { } if self.base.url == ZED_REPO_URL { - std::fs::write(worktree_path.join(".rules"), std::fs::read(".rules")?)?; + std::fs::write(self.worktree_path.join(".rules"), std::fs::read(".rules")?)?; } std::fs::create_dir_all(self.example_output_directory())?; @@ -253,9 +255,8 @@ impl Example { cx, ); - let worktree_path = self.worktree_path(); let worktree = project.update(cx, |project, cx| { - project.create_worktree(&worktree_path, true, cx) + project.create_worktree(&self.worktree_path, true, cx) }); let tools = cx.new(|_| ToolWorkingSet::default()); @@ -460,6 +461,7 @@ impl Example { ThreadEvent::SummaryChanged | ThreadEvent::SummaryGenerated | ThreadEvent::CheckpointChanged | + ThreadEvent::ReceivedTextChunk | ThreadEvent::UsageUpdated(_) => { if std::env::var("ZED_EVAL_DEBUG").is_ok() { println!("{}Event: {:#?}", log_prefix, event); @@ -664,13 +666,12 @@ impl Example { } async fn repository_diff(&self) -> Result { - 
let worktree_path = self.worktree_path(); - run_git(&worktree_path, &["add", "."]).await?; + run_git(&self.worktree_path, &["add", "."]).await?; let mut diff_args = vec!["diff", "--staged"]; if self.base.url == ZED_REPO_URL { diff_args.push(":(exclude).rules"); } - run_git(&worktree_path, &diff_args).await + run_git(&self.worktree_path, &diff_args).await } } @@ -831,13 +832,13 @@ fn get_tag(name: &'static str, response: &str) -> Result { anyhow::Ok(content) } -pub fn repo_path_for_url(repo_url: &str) -> PathBuf { +pub fn repo_path_for_url(repos_dir: &Path, repo_url: &str) -> PathBuf { let repo_name = repo_url .trim_start_matches("https://") .replace(|c: char| !c.is_alphanumeric(), "-"); - Path::new(REPOS_DIR) + Path::new(repos_dir) .canonicalize() - .context(format!("No such directory {REPOS_DIR}")) + .context(format!("No such directory {}", repos_dir.display())) .unwrap() .join(repo_name) } diff --git a/crates/eval/src/ids.rs b/crates/eval/src/ids.rs index ea381d5dc7..d35feed25d 100644 --- a/crates/eval/src/ids.rs +++ b/crates/eval/src/ids.rs @@ -1,4 +1,4 @@ -use anyhow::Result; +use anyhow::{Result, anyhow}; use std::fs; use std::path::{Path, PathBuf}; use uuid::Uuid; @@ -11,6 +11,7 @@ pub fn get_or_create_id(path: &Path) -> Result { } } let new_id = Uuid::new_v4().to_string(); + fs::create_dir_all(path.parent().ok_or_else(|| anyhow!("invalid id path"))?)?; fs::write(path, &new_id)?; Ok(new_id) } diff --git a/crates/languages/src/rust.rs b/crates/languages/src/rust.rs index 7c3c651a07..2188dfc9ef 100644 --- a/crates/languages/src/rust.rs +++ b/crates/languages/src/rust.rs @@ -129,7 +129,7 @@ impl LspAdapter for RustLspAdapter { }) .await; if let Err(err) = result { - log::error!( + log::debug!( "failed to run rust-analyzer after detecting it in PATH: binary: {:?}: {}", path, err diff --git a/crates/telemetry_events/src/telemetry_events.rs b/crates/telemetry_events/src/telemetry_events.rs index 6c1b133d50..dfe167fcd4 100644 --- 
a/crates/telemetry_events/src/telemetry_events.rs +++ b/crates/telemetry_events/src/telemetry_events.rs @@ -15,6 +15,7 @@ pub struct EventRequestBody { pub session_id: Option, pub metrics_id: Option, /// True for Zed staff, otherwise false + #[serde(skip_serializing_if = "Option::is_none")] pub is_staff: Option, /// Zed version number pub app_version: String, diff --git a/crates/zed/src/main.rs b/crates/zed/src/main.rs index c418e76b01..93503d739a 100644 --- a/crates/zed/src/main.rs +++ b/crates/zed/src/main.rs @@ -604,7 +604,7 @@ fn main() { setting = "keymap", value = BaseKeymap::get_global(cx).to_string() ); - telemetry.flush_events(); + telemetry.flush_events().detach(); let fs = app_state.fs.clone(); load_user_themes_in_background(fs.clone(), cx); diff --git a/crates/zeta/src/zeta.rs b/crates/zeta/src/zeta.rs index bec8efbcff..1a95cc7ba0 100644 --- a/crates/zeta/src/zeta.rs +++ b/crates/zeta/src/zeta.rs @@ -982,7 +982,7 @@ and then another output_excerpt = completion.output_excerpt, feedback ); - self.client.telemetry().flush_events(); + self.client.telemetry().flush_events().detach(); cx.notify(); }