remove some evals

This commit is contained in:
Sandy Tao
2026-01-28 17:24:15 -08:00
parent bfdb12de9f
commit 32515b49b3
2 changed files with 0 additions and 194 deletions

View File

@@ -1,122 +0,0 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, expect, it } from 'vitest';
import { evalTest, TestRig } from './test-helper.js';
/**
 * Source of the single fixture file. The `add` helper deliberately subtracts
 * (`a - b`) so the evals have a bug to find and, when asked, to fix.
 */
const BUGGY_ADD_SOURCE = 'const add = (a: number, b: number) => a - b;';

/** Workspace files seeded into every eval in this file, keyed by path. */
const FILES = {
  'app.ts': BUGGY_ADD_SOURCE,
} as const;
describe('analysis mode eval', () => {
  /**
   * Tool names that count as a file edit when they appear in the tool logs.
   * Kept in one place so every check below agrees on what an "edit" is.
   */
  const EDIT_TOOL_NAMES: readonly string[] = ['replace', 'write_file', 'edit'];

  /** Returns true when a tool-log entry was produced by one of the edit tools. */
  const isEditCall = (log: { toolRequest: { name: string } }): boolean =>
    EDIT_TOOL_NAMES.includes(log.toolRequest.name);

  /**
   * When the user asks to "inspect" for bugs, the agent must NOT modify the
   * file. The eval checks that no edit tool was called and that the buggy
   * source is still on disk; it does not inspect the agent's reply.
   */
  evalTest('ALWAYS_PASSES', {
    name: 'should not edit files when asked to inspect for bugs',
    prompt: 'Inspect app.ts for bugs',
    files: FILES,
    assert: async (rig, _result) => {
      // Verify NO edit tools called
      const editCalls = rig.readToolLogs().filter(isEditCall);
      expect(editCalls.length).toBe(0);

      // Verify file unchanged
      const content = rig.readFile('app.ts');
      expect(content).toContain('a - b');
    },
  });

  /**
   * When the user explicitly asks to "fix" a bug, the agent must modify the
   * file: at least one successful edit call, and the corrected expression
   * present on disk.
   */
  evalTest('ALWAYS_PASSES', {
    name: 'should edit files when asked to fix bug',
    prompt: 'Fix the bug in app.ts - it should add numbers not subtract',
    files: FILES,
    assert: async (rig, _result) => {
      // Verify edit tools WERE called, counting only calls that succeeded
      const editCalls = rig
        .readToolLogs()
        .filter((log) => isEditCall(log) && log.toolRequest.success);
      expect(editCalls.length).toBeGreaterThanOrEqual(1);

      // Verify file changed
      const content = rig.readFile('app.ts');
      expect(content).toContain('a + b');
    },
  });

  /**
   * When the user asks "any bugs?", the agent must NOT modify the file.
   * Same checks as the "inspect" eval above, with a more casual prompt.
   */
  evalTest('ALWAYS_PASSES', {
    name: 'should not edit when asking "any bugs"',
    prompt: 'Any bugs in app.ts?',
    files: FILES,
    assert: async (rig, _result) => {
      // Verify NO edit tools called
      const editCalls = rig.readToolLogs().filter(isEditCall);
      expect(editCalls.length).toBe(0);

      // Verify file unchanged
      const content = rig.readFile('app.ts');
      expect(content).toContain('a - b');
    },
  });

  /**
   * Two-turn interactive session: an inquiry first (no edit expected), then
   * an explicit directive (edit expected). The rig is always cleaned up, even
   * when an expectation fails.
   */
  it('should only edit when directive is given after inquiry (interactive)', async () => {
    const rig = new TestRig();
    try {
      rig.setup('interactive-inquiry-directive');
      rig.createFile('app.ts', FILES['app.ts']);
      const run = await rig.runInteractive();

      // Turn 1: Inquiry
      await run.sendKeys('Any bugs in app.ts?');
      await run.type('\r');

      // Wait for analysis to finish (it should find the bug but not fix it)
      await run.expectText('bug', 30000);

      // Verify no edit tools called yet. This uses the same tool list as the
      // other evals, including 'edit', so an early edit logged under that
      // name is not missed.
      const editCalls = rig.readToolLogs().filter(isEditCall);
      expect(editCalls.length).toBe(0);

      // Turn 2: Directive
      await run.sendKeys('Fix it');
      await run.type('\r');

      // Wait for fix (cli uses 'edit' which maps to replace/write_file in core)
      await rig.expectToolCallSuccess([...EDIT_TOOL_NAMES]);

      // Verify file changed
      const content = rig.readFile('app.ts');
      expect(content).toContain('a + b');
    } finally {
      await rig.cleanup();
    }
  }, 120000);
});

View File

@@ -1,72 +0,0 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe } from 'vitest';
import { evalTest } from './test-helper.js';
describe('Delegation Strategy Evals', () => {
  /** Name of the delegation tool whose calls these evals look for. */
  const DELEGATE_TOOL = 'delegate_to_agent';

  /**
   * Argument matcher for a delegation call: parses the raw JSON arguments and
   * reports whether `agent_name` is `codebase_investigator`. Anything that
   * fails to parse (or cannot be read) counts as a non-match, not an error.
   */
  const targetsCodebaseInvestigator = (args: string): boolean => {
    try {
      const { agent_name: agentName } = JSON.parse(args);
      return agentName === 'codebase_investigator';
    } catch {
      return false;
    }
  };

  /**
   * Scenario 1: a question spanning several files / architectural concerns.
   * Expected: a successful delegation call aimed at codebase_investigator.
   */
  evalTest('USUALLY_PASSES', {
    name: 'should delegate to codebase_investigator for architectural mapping',
    params: {
      settings: {
        experimental: {
          enableAgents: true,
        },
      },
    },
    prompt: 'How does the telemetry system interact with the hook system?',
    files: {
      'packages/core/src/telemetry/telemetryService.ts':
        'export class TelemetryService {}',
      'packages/core/src/hooks/hookManager.ts': 'export class HookManager {}',
      'packages/core/src/index.ts':
        'import "./telemetry/telemetryService"; import "./hooks/hookManager";',
    },
    assert: async (rig, _result) => {
      // No explicit timeout is passed; only the argument matcher is supplied.
      await rig.expectToolCallSuccess(
        [DELEGATE_TOOL],
        undefined,
        targetsCodebaseInvestigator,
      );
    },
  });

  /**
   * Scenario 2: a small, single-file change.
   * Expected: no delegation at all; a successful call to one of the direct
   * search/read/replace tools instead.
   */
  evalTest('USUALLY_PASSES', {
    name: 'should use manual tools for localized surgical tasks',
    params: {
      settings: {
        experimental: {
          enableAgents: true,
        },
      },
    },
    prompt:
      'Change the default port in packages/core/src/config.ts from 3000 to 8080.',
    files: {
      'packages/core/src/config.ts': 'export const DEFAULT_PORT = 3000;',
    },
    assert: async (rig, _result) => {
      // Delegation must not happen for a surgical edit like this one...
      await rig.expectNoToolCall([DELEGATE_TOOL]);
      // ...and a direct tool must have succeeded instead.
      await rig.expectToolCallSuccess(['grep_search', 'read_file', 'replace']);
    },
  });
});