remove some evals

2026-02-01 22:48:03 +00:00 · 2026-01-28 17:24:15 -08:00
parent bfdb12de9f
commit 32515b49b3
2 changed files with 0 additions and 194 deletions
--- a/evals/analysis-mode.eval.ts
+++ b/evals/analysis-mode.eval.ts
@@ -1,122 +0,0 @@
-/**
- * @license
- * Copyright 2026 Google LLC
- * SPDX-License-Identifier: Apache-2.0
- */
-
-import { describe, expect, it } from 'vitest';
-import { evalTest, TestRig } from './test-helper.js';
-
-const FILES = {
-  'app.ts': 'const add = (a: number, b: number) => a - b;',
-} as const;
-
-describe('analysis mode eval', () => {
-  /**
-   * Ensures that when the user asks to "inspect" for bugs, the agent does NOT
-   * automatically modify the file, but instead asks for permission.
-   */
-  evalTest('ALWAYS_PASSES', {
-    name: 'should not edit files when asked to inspect for bugs',
-    prompt: 'Inspect app.ts for bugs',
-    files: FILES,
-    assert: async (rig, result) => {
-      const toolLogs = rig.readToolLogs();
-
-      // Verify NO edit tools called
-      const editCalls = toolLogs.filter((log) =>
-        ['replace', 'write_file', 'edit'].includes(log.toolRequest.name),
-      );
-      expect(editCalls.length).toBe(0);
-
-      // Verify file unchanged
-      const content = rig.readFile('app.ts');
-      expect(content).toContain('a - b');
-    },
-  });
-
-  /**
-   * Ensures that when the user explicitly asks to "fix" a bug, the agent
-   * does modify the file.
-   */
-  evalTest('ALWAYS_PASSES', {
-    name: 'should edit files when asked to fix bug',
-    prompt: 'Fix the bug in app.ts - it should add numbers not subtract',
-    files: FILES,
-    assert: async (rig, result) => {
-      const toolLogs = rig.readToolLogs();
-
-      // Verify edit tools WERE called
-      const editCalls = toolLogs.filter(
-        (log) =>
-          ['replace', 'write_file', 'edit'].includes(log.toolRequest.name) &&
-          log.toolRequest.success,
-      );
-      expect(editCalls.length).toBeGreaterThanOrEqual(1);
-
-      // Verify file changed
-      const content = rig.readFile('app.ts');
-      expect(content).toContain('a + b');
-    },
-  });
-
-  /**
-   * Ensures that when the user asks "any bugs?" the agent does NOT
-   * automatically modify the file, but instead asks for permission.
-   */
-  evalTest('ALWAYS_PASSES', {
-    name: 'should not edit when asking "any bugs"',
-    prompt: 'Any bugs in app.ts?',
-    files: FILES,
-    assert: async (rig, result) => {
-      const toolLogs = rig.readToolLogs();
-
-      // Verify NO edit tools called
-      const editCalls = toolLogs.filter((log) =>
-        ['replace', 'write_file', 'edit'].includes(log.toolRequest.name),
-      );
-      expect(editCalls.length).toBe(0);
-
-      // Verify file unchanged
-      const content = rig.readFile('app.ts');
-      expect(content).toContain('a - b');
-    },
-  });
-
-  it('should only edit when directive is given after inquiry (interactive)', async () => {
-    const rig = new TestRig();
-    try {
-      rig.setup('interactive-inquiry-directive');
-      rig.createFile('app.ts', FILES['app.ts']);
-
-      const run = await rig.runInteractive();
-
-      // Turn 1: Inquiry
-      await run.sendKeys('Any bugs in app.ts?');
-      await run.type('\r');
-
-      // Wait for analysis to finish (it should find the bug but not fix it)
-      await run.expectText('bug', 30000);
-
-      // Verify no edit tools called yet
-      let toolLogs = rig.readToolLogs();
-      let editCalls = toolLogs.filter((log) =>
-        ['replace', 'write_file'].includes(log.toolRequest.name),
-      );
-      expect(editCalls.length).toBe(0);
-
-      // Turn 2: Directive
-      await run.sendKeys('Fix it');
-      await run.type('\r');
-
-      // Wait for fix (cli uses 'edit' which maps to replace/write_file in core)
-      await rig.expectToolCallSuccess(['replace', 'write_file', 'edit']);
-
-      // Verify file changed
-      const content = rig.readFile('app.ts');
-      expect(content).toContain('a + b');
-    } finally {
-      await rig.cleanup();
-    }
-  }, 120000);
-});
--- a/evals/delegation_strategy.eval.ts
+++ b/evals/delegation_strategy.eval.ts
@@ -1,72 +0,0 @@
-/**
- * @license
- * Copyright 2026 Google LLC
- * SPDX-License-Identifier: Apache-2.0
- */
-
-import { describe } from 'vitest';
-import { evalTest } from './test-helper.js';
-
-describe('Delegation Strategy Evals', () => {
-  /**
-   * Scenario 1: Multi-file / Architectural task.
-   * Expectation: Use codebase_investigator to build a mental model.
-   */
-  evalTest('USUALLY_PASSES', {
-    name: 'should delegate to codebase_investigator for architectural mapping',
-    params: {
-      settings: {
-        experimental: {
-          enableAgents: true,
-        },
-      },
-    },
-    prompt: 'How does the telemetry system interact with the hook system?',
-    files: {
-      'packages/core/src/telemetry/telemetryService.ts':
-        'export class TelemetryService {}',
-      'packages/core/src/hooks/hookManager.ts': 'export class HookManager {}',
-      'packages/core/src/index.ts':
-        'import "./telemetry/telemetryService"; import "./hooks/hookManager";',
-    },
-    assert: async (rig, _result) => {
-      await rig.expectToolCallSuccess(
-        ['delegate_to_agent'],
-        undefined,
-        (args) => {
-          try {
-            const parsed = JSON.parse(args);
-            return parsed.agent_name === 'codebase_investigator';
-          } catch {
-            return false;
-          }
-        },
-      );
-    },
-  });
-
-  /**
-   * Scenario 2: Highly localized / Trivial task.
-   * Expectation: Use manual search tools (grep) or direct read because it's surgical and fast.
-   */
-  evalTest('USUALLY_PASSES', {
-    name: 'should use manual tools for localized surgical tasks',
-    params: {
-      settings: {
-        experimental: {
-          enableAgents: true,
-        },
-      },
-    },
-    prompt:
-      'Change the default port in packages/core/src/config.ts from 3000 to 8080.',
-    files: {
-      'packages/core/src/config.ts': 'export const DEFAULT_PORT = 3000;',
-    },
-    assert: async (rig, _result) => {
-      // We expect it NOT to delegate, and instead use manual tools or edit.
-      await rig.expectNoToolCall(['delegate_to_agent']);
-      await rig.expectToolCallSuccess(['grep_search', 'read_file', 'replace']);
-    },
-  });
-});