mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-05-16 17:32:46 +00:00
feat(cli): improve UserSimulator robustness and prompt sensitivity
- Prioritize interactive markers (prompts, confirmation dialogs) over background timers/spinners.
- Suppress background update checks and non-fatal UI warnings when simulateUser is enabled.
- Refine simulator system prompt for more aggressive prompt detection.
- Implement reliable key submission with stabilized terminal initialization and inter-character delays.
- Add unit tests for UserSimulator vision and input submission.
This commit is contained in:
@@ -94,11 +94,14 @@ export async function startInteractiveUI(
|
||||
const version = await getVersion();
|
||||
setWindowTitle(basename(workspaceRoot), settings);
|
||||
|
||||
const simulateUser = config.getSimulateUser();
|
||||
|
||||
const consolePatcher = new ConsolePatcher({
|
||||
onNewMessage: (msg) => {
|
||||
coreEvents.emitConsoleLog(msg.type, msg.content);
|
||||
},
|
||||
debugMode: config.getDebugMode(),
|
||||
interactive: !simulateUser,
|
||||
});
|
||||
consolePatcher.patch();
|
||||
|
||||
@@ -144,7 +147,6 @@ export async function startInteractiveUI(
|
||||
await new Promise((resolve) => setTimeout(resolve, 100));
|
||||
}
|
||||
|
||||
const simulateUser = config.getSimulateUser();
|
||||
const simulatedStdin = new PassThrough({ encoding: 'utf8' });
|
||||
|
||||
let lastFrame: string | undefined;
|
||||
@@ -199,16 +201,18 @@ export async function startInteractiveUI(
|
||||
registerCleanup(cleanupLineWrapping);
|
||||
}
|
||||
|
||||
checkForUpdates(settings)
|
||||
.then((info) => {
|
||||
handleAutoUpdate(info, settings, config.getProjectRoot());
|
||||
})
|
||||
.catch((err) => {
|
||||
// Silently ignore update check errors.
|
||||
if (config.getDebugMode()) {
|
||||
debugLogger.warn('Update check failed:', err);
|
||||
}
|
||||
});
|
||||
if (!simulateUser) {
|
||||
checkForUpdates(settings)
|
||||
.then((info) => {
|
||||
handleAutoUpdate(info, settings, config.getProjectRoot());
|
||||
})
|
||||
.catch((err) => {
|
||||
// Silently ignore update check errors.
|
||||
if (config.getDebugMode()) {
|
||||
debugLogger.warn('Update check failed:', err);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if (simulateUser) {
|
||||
const simulator = new UserSimulator(
|
||||
|
||||
135
packages/cli/src/services/UserSimulator.test.ts
Normal file
135
packages/cli/src/services/UserSimulator.test.ts
Normal file
@@ -0,0 +1,135 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { describe, it, expect, vi, beforeEach } from 'vitest';
|
||||
import { UserSimulator } from './UserSimulator.js';
|
||||
import { Writable } from 'node:stream';
|
||||
import type { Config } from '@google/gemini-cli-core';
|
||||
|
||||
describe('UserSimulator', () => {
|
||||
let mockConfig: Config;
|
||||
let mockGetScreen: vi.Mock<() => string | undefined>;
|
||||
let mockStdinBuffer: Writable;
|
||||
let mockContentGenerator: {
|
||||
generateContent: vi.Mock;
|
||||
};
|
||||
|
||||
// Rebuilds every shared mock before each test so no call history or
// resolved-value override leaks from one test into the next.
beforeEach(() => {
|
||||
// Content generator stub: by default resolves to a JSON payload whose
// `action` is the keystrokes 'y' followed by carriage return.
mockContentGenerator = {
|
||||
generateContent: vi
|
||||
.fn()
|
||||
.mockResolvedValue({ text: JSON.stringify({ action: 'y\r' }) }),
|
||||
};
|
||||
|
||||
// Partial Config stub: only the getters listed here are provided; the double
// cast bypasses type checking, so any other Config member is undefined.
mockConfig = {
|
||||
getContentGenerator: () => mockContentGenerator,
|
||||
getSimulateUser: () => true,
|
||||
getQuestion: () => 'test goal',
|
||||
getKnowledgeSource: () => undefined,
|
||||
getHasAccessToPreviewModel: () => true,
|
||||
} as unknown as Config;
|
||||
|
||||
// Screen provider; each test sets its return value explicitly.
mockGetScreen = vi.fn();
|
||||
// Sink stream that discards every chunk and acknowledges it immediately.
mockStdinBuffer = new Writable({
|
||||
write(chunk, encoding, callback) {
|
||||
callback();
|
||||
},
|
||||
});
|
||||
// Spy on the public write() so tests can assert which keystrokes were sent.
vi.spyOn(mockStdinBuffer, 'write');
|
||||
});
|
||||
|
||||
// Feeds the simulator a screen containing both an elapsed-time marker and a
// confirmation prompt, then inspects the prompt text passed to
// generateContent.
// NOTE(review): vi.useRealTimers() is only reached when every assertion
// passes; a failing expect leaves fake timers installed for later tests.
it('should include interactive prompts in its vision even when timers are present', async () => {
|
||||
const simulator = new UserSimulator(
|
||||
mockConfig,
|
||||
mockGetScreen,
|
||||
mockStdinBuffer,
|
||||
);
|
||||
|
||||
// Mock a screen with a timer and a confirmation prompt
|
||||
mockGetScreen.mockReturnValue(
|
||||
'Thinking... (0s)\n\nAction Required: Allow pip execution? [Y/n]',
|
||||
);
|
||||
|
||||
// We need to trigger the private tick method. Since it's private and run on an interval,
|
||||
// we can use a hack or just test the prompt construction if we refactor,
|
||||
// but for now let's use the interval.
|
||||
|
||||
vi.useFakeTimers();
|
||||
simulator.start();
|
||||
|
||||
// Trigger the interval
// (presumably 2000 ms covers at least one tick period — the interval length
// is not visible in this file; confirm against UserSimulator.)
|
||||
await vi.advanceTimersByTimeAsync(2000);
|
||||
|
||||
expect(mockContentGenerator.generateContent).toHaveBeenCalled();
|
||||
// Despite the name, this reads the FIRST recorded call (index 0).
const lastCall = mockContentGenerator.generateContent.mock.calls[0];
|
||||
// Assumes the request shape { contents: [{ parts: [{ text }] }] }.
const prompt = lastCall[0].contents[0].parts[0].text;
|
||||
|
||||
expect(prompt).toContain(
|
||||
'STATE 2: The agent is waiting for you to authorize a tool',
|
||||
);
|
||||
// NOTE(review): the simulator's own instruction text appears to mention
// "[Y/n]" as an example, so this may pass even if the screen contents were
// not embedded in the prompt — confirm, or assert on a screen-only string
// such as 'Allow pip execution?'.
expect(prompt).toContain('[Y/n]');
|
||||
expect(prompt).toContain('RULE 1: If there is a clear confirmation prompt');
|
||||
|
||||
simulator.stop();
|
||||
vi.useRealTimers();
|
||||
});
|
||||
|
||||
// Feeds the simulator a screen with a spinner glyph above an input prompt and
// checks that generateContent was invoked with the refined RULE 1 wording.
// NOTE(review): the only prompt assertion targets fixed instruction text, and
// the mocked model response is the default 'y\r'; nothing here asserts which
// keys were written, so "should not wait" is verified only indirectly.
// NOTE(review): vi.useRealTimers() is skipped if an assertion throws.
it('should not wait if a prompt is visible even if a spinner is present', async () => {
|
||||
const simulator = new UserSimulator(
|
||||
mockConfig,
|
||||
mockGetScreen,
|
||||
mockStdinBuffer,
|
||||
);
|
||||
|
||||
// Mock a screen with a spinner and a prompt
|
||||
mockGetScreen.mockReturnValue('⠋ Working...\n> Type your message');
|
||||
|
||||
vi.useFakeTimers();
|
||||
simulator.start();
|
||||
|
||||
// Advance fake time so the simulator's periodic check can run
// (interval length not visible here — confirm 2000 ms is sufficient).
await vi.advanceTimersByTimeAsync(2000);
|
||||
|
||||
expect(mockContentGenerator.generateContent).toHaveBeenCalled();
|
||||
// Despite the name, this reads the FIRST recorded call (index 0).
const lastCall = mockContentGenerator.generateContent.mock.calls[0];
|
||||
const prompt = lastCall[0].contents[0].parts[0].text;
|
||||
|
||||
expect(prompt).toContain(
|
||||
'Only <WAIT> (Rule 1 fallback) if the agent is truly mid-process',
|
||||
);
|
||||
|
||||
simulator.stop();
|
||||
vi.useRealTimers();
|
||||
});
|
||||
|
||||
// Overrides the model response to the three-character action 'abc' and checks
// that each character reaches the stdin stream as its own write() call.
// NOTE(review): toHaveBeenCalledWith matches any call, so neither the order of
// the characters nor the delays between them are actually asserted, despite
// the test name.
// NOTE(review): vi.useRealTimers() is skipped if an assertion throws.
it('should submit keys with reliable delays', async () => {
|
||||
const simulator = new UserSimulator(
|
||||
mockConfig,
|
||||
mockGetScreen,
|
||||
mockStdinBuffer,
|
||||
);
|
||||
mockGetScreen.mockReturnValue('> Prompt');
|
||||
// Replace the default 'y\r' response with plain characters and no Enter.
mockContentGenerator.generateContent.mockResolvedValue({
|
||||
text: JSON.stringify({ action: 'abc' }),
|
||||
});
|
||||
|
||||
vi.useFakeTimers();
|
||||
simulator.start();
|
||||
|
||||
// Trigger tick
|
||||
await vi.advanceTimersByTimeAsync(2000);
|
||||
|
||||
// Wait for the async key submission loop to finish
|
||||
// Initial delay 100ms + (3 chars * 10ms) = 130ms minimum
|
||||
await vi.advanceTimersByTimeAsync(500);
|
||||
|
||||
expect(mockStdinBuffer.write).toHaveBeenCalledWith('a');
|
||||
expect(mockStdinBuffer.write).toHaveBeenCalledWith('b');
|
||||
expect(mockStdinBuffer.write).toHaveBeenCalledWith('c');
|
||||
|
||||
simulator.stop();
|
||||
vi.useRealTimers();
|
||||
});
|
||||
});
|
||||
@@ -143,11 +143,11 @@ export class UserSimulator {
|
||||
const prompt = `You are evaluating a CLI agent by simulating a user sitting at the terminal.
|
||||
Look carefully at the screen and determine the CLI's current state:
|
||||
|
||||
STATE 1: The agent is busy (e.g., streaming a response, showing a spinner, running a tool, or displaying a timer like "7s"). It is actively working and NOT waiting for text input.
|
||||
STATE 1: The agent is busy (e.g., streaming a response, executing a tool, or showing a progress message). It is actively working and NOT waiting for text input or user approval.
|
||||
- In this case, your action MUST be exactly: <WAIT>
|
||||
|
||||
STATE 2: The agent is waiting for you to authorize a tool, confirm an action, or answer a specific multi-choice question (e.g., "Action Required", "Allow execution", numbered options).
|
||||
- In this case, your action MUST be the exact raw characters to select the option and submit it (e.g., 1\\r, 2\\r, y\\r, n\\r, or just \\r if the default option is acceptable). Do NOT output <DONE> or "Thank you". You must unblock the agent and allow it to run the tool.
|
||||
STATE 2: The agent is waiting for you to authorize a tool, confirm an action, or answer a specific multi-choice question (e.g., "Action Required", "Allow execution", numbered options, "[Y/n]").
|
||||
- In this case, your action MUST be the exact raw characters to select the option and submit it (e.g., 1\\r, 2\\r, y\\r, n\\r, or just \\r if the default option is acceptable). Do NOT output <DONE> or "Thank you". You must unblock the agent and allow it to run the tool. This state takes precedence even if timers or background messages are visible.
|
||||
|
||||
STATE 3: The agent has finished its current thought process AND is idle, waiting for a NEW general text prompt (usually indicated by a "> Type your message" prompt).
|
||||
- First, verify that the ACTUAL task is fully complete based on your original goal. Do not stop at intermediate steps like planning or syntax checking.
|
||||
@@ -159,7 +159,7 @@ STATE 4: Any other situation where the agent is waiting for text input or needs
|
||||
- Your action should be the raw characters you would type, followed by \\r. For just an Enter key press, output \\r.
|
||||
|
||||
CRITICAL RULES:
|
||||
- RULE 1: If there is ANY active spinner (e.g., ⠋, ⠙, ⠹, ⠸, ⠼, ⠴, ⠧) or an elapsed time indicator (e.g., "0s", "7s") anywhere on the screen, the agent is STILL WORKING. Your action MUST be <WAIT>. Do NOT issue commands, even if a text prompt is visible below it.
|
||||
- RULE 1: If there is a clear confirmation prompt (e.g. "[Y/n]", "1) Allow Once") or an input cursor (">"), YOU MUST RESPOND (State 2 or 3). Detect these states aggressively. Only <WAIT> (Rule 1 fallback) if the agent is truly mid-process with no interactive markers visible.
|
||||
- RULE 2: If there is an "Action Required" or confirmation prompt on the screen, YOU MUST HANDLE IT (State 2). This takes precedence over everything else.
|
||||
- RULE 3: If prompted to allow execution of a command with options like 'Allow once' and 'Allow for this session', you MUST choose the option for 'Allow for this session' (typically by sending '2\\r').
|
||||
- RULE 4: You MUST output a strictly formatted JSON object with no markdown wrappers or extra text.
|
||||
@@ -319,6 +319,9 @@ ${strippedScreen}
|
||||
}
|
||||
}
|
||||
|
||||
// Wait a bit to ensure the terminal is ready for input
|
||||
await new Promise((resolve) => setTimeout(resolve, 100));
|
||||
|
||||
for (const char of keys) {
|
||||
if (char === '\r') {
|
||||
// Wait a bit to ensure the previous character is rendered before submitting
|
||||
@@ -327,7 +330,7 @@ ${strippedScreen}
|
||||
this.stdinBuffer.write(char);
|
||||
// Small delay to ensure Ink processes each keypress event individually
|
||||
// while preventing UI state collisions during long simulated inputs.
|
||||
await new Promise((resolve) => setTimeout(resolve, 5));
|
||||
await new Promise((resolve) => setTimeout(resolve, 10));
|
||||
}
|
||||
this.lastScreenContent = normalizedScreen;
|
||||
} else {
|
||||
|
||||
Reference in New Issue
Block a user