mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-02-01 22:48:03 +00:00
remove some evals
This commit is contained in:
@@ -1,122 +0,0 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import { evalTest, TestRig } from './test-helper.js';
|
||||
|
||||
/**
 * Workspace fixture used by the evals in this suite: a single `app.ts` whose
 * `add` helper subtracts instead of adding. The read-only evals assert that
 * `a - b` is still present afterwards; the fix evals assert that `a + b`
 * appears.
 */
const FILES = {
  'app.ts': 'const add = (a: number, b: number) => a - b;',
} as const;
|
||||
|
||||
describe('analysis mode eval', () => {
|
||||
/**
 * "Inspect" is treated as a read-only request: the run must log no call to
 * an editing tool, and app.ts must still contain the seeded subtraction.
 */
evalTest('ALWAYS_PASSES', {
  name: 'should not edit files when asked to inspect for bugs',
  prompt: 'Inspect app.ts for bugs',
  files: FILES,
  assert: async (rig, _result) => {
    const mutatingTools = ['replace', 'write_file', 'edit'];

    // Collect every logged request that targets an editing tool; there
    // must be none.
    const mutations = rig
      .readToolLogs()
      .filter((entry) => mutatingTools.includes(entry.toolRequest.name));
    expect(mutations.length).toBe(0);

    // The buggy expression must still be on disk.
    const source = rig.readFile('app.ts');
    expect(source).toContain('a - b');
  },
});
|
||||
|
||||
/**
 * An explicit "fix" request is a directive: at least one editing-tool call
 * must be logged as successful, and app.ts must end up containing `a + b`.
 */
evalTest('ALWAYS_PASSES', {
  name: 'should edit files when asked to fix bug',
  prompt: 'Fix the bug in app.ts - it should add numbers not subtract',
  files: FILES,
  assert: async (rig, _result) => {
    const mutatingTools = ['replace', 'write_file', 'edit'];

    // Keep only editing-tool requests that are flagged as successful.
    const successfulEdits = rig
      .readToolLogs()
      .filter(
        (entry) =>
          mutatingTools.includes(entry.toolRequest.name) &&
          entry.toolRequest.success,
      );
    expect(successfulEdits.length).toBeGreaterThanOrEqual(1);

    // The corrected expression must now be on disk.
    const source = rig.readFile('app.ts');
    expect(source).toContain('a + b');
  },
});
|
||||
|
||||
/**
 * A bare question ("any bugs?") is an inquiry, not a directive: the run
 * must log no editing-tool call, and app.ts must still contain the seeded
 * subtraction.
 */
evalTest('ALWAYS_PASSES', {
  name: 'should not edit when asking "any bugs"',
  prompt: 'Any bugs in app.ts?',
  files: FILES,
  assert: async (rig, _result) => {
    const mutatingTools = ['replace', 'write_file', 'edit'];
    const loggedCalls = rig.readToolLogs();

    // None of the logged requests may target an editing tool.
    const mutations = loggedCalls.filter((entry) =>
      mutatingTools.includes(entry.toolRequest.name),
    );
    expect(mutations.length).toBe(0);

    // The buggy expression must still be on disk.
    const source = rig.readFile('app.ts');
    expect(source).toContain('a - b');
  },
});
|
||||
|
||||
/**
 * Two-turn interactive scenario. Turn 1 asks whether app.ts has bugs and
 * must not trigger any editing tool; turn 2 says "Fix it" and must produce a
 * successful editing-tool call that leaves `a + b` in app.ts.
 *
 * The rig is always cleaned up, even when an assertion fails.
 */
it('should only edit when directive is given after inquiry (interactive)', async () => {
  // Same editing-tool set for both the "nothing yet" check and the final
  // success check, so an early `edit` call cannot slip past turn 1.
  const editToolNames = ['replace', 'write_file', 'edit'];

  const rig = new TestRig();
  try {
    rig.setup('interactive-inquiry-directive');
    rig.createFile('app.ts', FILES['app.ts']);

    const run = await rig.runInteractive();

    // Turn 1: Inquiry
    await run.sendKeys('Any bugs in app.ts?');
    await run.type('\r');

    // Wait for analysis to finish (it should find the bug but not fix it).
    // NOTE(review): the typed prompt itself contains "bug"; if expectText
    // also matches echoed input this may return before the analysis is
    // complete — confirm against the TestRig implementation.
    await run.expectText('bug', 30000);

    // Verify no edit tools were called yet and the file is untouched.
    const editCalls = rig
      .readToolLogs()
      .filter((log) => editToolNames.includes(log.toolRequest.name));
    expect(editCalls.length).toBe(0);
    expect(rig.readFile('app.ts')).toContain('a - b');

    // Turn 2: Directive
    await run.sendKeys('Fix it');
    await run.type('\r');

    // Wait for fix (cli uses 'edit' which maps to replace/write_file in core)
    await rig.expectToolCallSuccess(editToolNames);

    // Verify file changed
    const content = rig.readFile('app.ts');
    expect(content).toContain('a + b');
  } finally {
    await rig.cleanup();
  }
}, 120000);
|
||||
});
|
||||
@@ -1,72 +0,0 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { describe } from 'vitest';
|
||||
import { evalTest } from './test-helper.js';
|
||||
|
||||
describe('Delegation Strategy Evals', () => {
|
||||
/**
 * Scenario 1: Multi-file / Architectural task.
 * Expectation: the run contains a successful `delegate_to_agent` call whose
 * JSON-encoded arguments name `codebase_investigator` as the agent.
 */
evalTest('USUALLY_PASSES', {
  name: 'should delegate to codebase_investigator for architectural mapping',
  params: {
    settings: {
      experimental: {
        enableAgents: true,
      },
    },
  },
  prompt: 'How does the telemetry system interact with the hook system?',
  files: {
    'packages/core/src/telemetry/telemetryService.ts':
      'export class TelemetryService {}',
    'packages/core/src/hooks/hookManager.ts': 'export class HookManager {}',
    'packages/core/src/index.ts':
      'import "./telemetry/telemetryService"; import "./hooks/hookManager";',
  },
  assert: async (rig, _result) => {
    await rig.expectToolCallSuccess(
      ['delegate_to_agent'],
      undefined,
      (args) => {
        // Treat the payload as untrusted: parse to `unknown` and narrow
        // explicitly instead of reading properties off an implicit `any`.
        let parsed: unknown;
        try {
          parsed = JSON.parse(args);
        } catch {
          // Arguments that are not valid JSON cannot match.
          return false;
        }

        // JSON `null` and primitives carry no agent name.
        if (typeof parsed !== 'object' || parsed === null) {
          return false;
        }
        return (
          (parsed as { agent_name?: unknown }).agent_name ===
          'codebase_investigator'
        );
      },
    );
  },
});
|
||||
|
||||
/**
 * Scenario 2: Highly localized / Trivial task.
 * Expectation: a one-line change in a known file is handled without
 * `delegate_to_agent`, using the direct search/read/replace tools instead.
 */
evalTest('USUALLY_PASSES', {
  name: 'should use manual tools for localized surgical tasks',
  params: { settings: { experimental: { enableAgents: true } } },
  prompt:
    'Change the default port in packages/core/src/config.ts from 3000 to 8080.',
  files: { 'packages/core/src/config.ts': 'export const DEFAULT_PORT = 3000;' },
  assert: async (rig, _result) => {
    const delegationTools = ['delegate_to_agent'];
    const directTools = ['grep_search', 'read_file', 'replace'];

    // Delegation must not happen for a surgical change...
    await rig.expectNoToolCall(delegationTools);
    // ...and the work must go through the direct tools instead.
    await rig.expectToolCallSuccess(directTools);
  },
});
|
||||
});
|
||||
Reference in New Issue
Block a user