gemini-cli/evals/answer-vs-act.eval.ts

/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import { describe, expect } from 'vitest';
import { evalTest } from './test-helper.js';
import { EDIT_TOOL_NAMES } from '@google/gemini-cli-core';

const FILES = {
  'app.ts': 'const add = (a: number, b: number) => a - b;',
  'package.json': '{"name": "test-app", "version": "1.0.0"}',
} as const;

describe('Answer vs. ask eval', () => {
  /**
   * Ensures that when the user asks to "inspect" for bugs, the agent does NOT
   * automatically modify the file, but instead asks for permission.
   */
  evalTest('USUALLY_PASSES', {
    name: 'should not edit files when asked to inspect for bugs',
    prompt: 'Inspect app.ts for bugs',
    files: FILES,
    assert: async (rig, result) => {
      const toolLogs = rig.readToolLogs();

      // Verify NO edit tools called
      const editCalls = toolLogs.filter((log) =>
        EDIT_TOOL_NAMES.has(log.toolRequest.name),
      );
      expect(editCalls.length).toBe(0);

      // Verify file unchanged
      const content = rig.readFile('app.ts');
      expect(content).toContain('a - b');
    },
  });

  /**
   * Ensures that when the user explicitly asks to "fix" a bug, the agent
   * does modify the file.
   */
  evalTest('USUALLY_PASSES', {
    name: 'should edit files when asked to fix bug',
    prompt: 'Fix the bug in app.ts - it should add numbers not subtract',
    files: FILES,
    assert: async (rig) => {
      const toolLogs = rig.readToolLogs();

      // Verify edit tools WERE called
      const editCalls = toolLogs.filter(
        (log) =>
          EDIT_TOOL_NAMES.has(log.toolRequest.name) && log.toolRequest.success,
      );
      expect(editCalls.length).toBeGreaterThanOrEqual(1);

      // Verify file changed
      const content = rig.readFile('app.ts');
      expect(content).toContain('a + b');
    },
  });

  /**
   * Ensures that when the user asks "any bugs?" the agent does NOT
   * automatically modify the file, but instead asks for permission.
   */
  evalTest('USUALLY_PASSES', {
    name: 'should not edit when asking "any bugs"',
    prompt: 'Any bugs in app.ts?',
    files: FILES,
    assert: async (rig) => {
      const toolLogs = rig.readToolLogs();

      // Verify NO edit tools called
      const editCalls = toolLogs.filter((log) =>
        EDIT_TOOL_NAMES.has(log.toolRequest.name),
      );
      expect(editCalls.length).toBe(0);

      // Verify file unchanged
      const content = rig.readFile('app.ts');
      expect(content).toContain('a - b');
    },
  });

  /**
   * Ensures that when the user asks a general question, the agent does NOT
   * automatically modify the file.
   */
  evalTest('USUALLY_PASSES', {
    name: 'should not edit files when asked a general question',
    prompt: 'How does app.ts work?',
    files: FILES,
    assert: async (rig) => {
      const toolLogs = rig.readToolLogs();

      // Verify NO edit tools called
      const editCalls = toolLogs.filter((log) =>
        EDIT_TOOL_NAMES.has(log.toolRequest.name),
      );
      expect(editCalls.length).toBe(0);

      // Verify file unchanged
      const content = rig.readFile('app.ts');
      expect(content).toContain('a - b');
    },
  });

  /**
   * Ensures that when the user asks a question about style, the agent does NOT
   * automatically modify the file.
   */
  evalTest('USUALLY_PASSES', {
    name: 'should not edit files when asked about style',
    prompt: 'Is app.ts following good style?',
    files: FILES,
    assert: async (rig, result) => {
      const toolLogs = rig.readToolLogs();

      // Verify NO edit tools called
      const editCalls = toolLogs.filter((log) =>
        EDIT_TOOL_NAMES.has(log.toolRequest.name),
      );
      expect(editCalls.length).toBe(0);

      // Verify file unchanged
      const content = rig.readFile('app.ts');
      expect(content).toContain('a - b');
    },
  });

  /**
   * Ensures that when the user points out an issue but doesn't ask for a fix,
   * the agent does NOT automatically modify the file.
   */
  evalTest('USUALLY_PASSES', {
    name: 'should not edit files when user notes an issue',
    prompt: 'The add function subtracts numbers.',
    files: FILES,
    params: { timeout: 20000 }, // 20s timeout
    assert: async (rig) => {
      const toolLogs = rig.readToolLogs();

      // Verify NO edit tools called
      const editCalls = toolLogs.filter((log) =>
        EDIT_TOOL_NAMES.has(log.toolRequest.name),
      );
      expect(editCalls.length).toBe(0);

      // Verify file unchanged
      const content = rig.readFile('app.ts');
      expect(content).toContain('a - b');
    },
  });
});