Aggregate test results. (#16581)

Christian Gunderman
2026-01-14 07:08:05 +00:00
committed by GitHub
parent 8030404b08
commit 66e7b479ae
6 changed files with 295 additions and 9 deletions

View File

@@ -13,11 +13,16 @@ on:
permissions:
contents: 'read'
checks: 'write'
actions: 'read'
jobs:
evals:
name: 'Evals (USUALLY_PASSING) nightly run'
runs-on: 'gemini-cli-ubuntu-16-core'
strategy:
fail-fast: false
matrix:
run_attempt: [1, 2, 3]
steps:
- name: 'Checkout'
uses: 'actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8' # ratchet:actions/checkout@v5
@@ -34,8 +39,38 @@ jobs:
- name: 'Build project'
run: 'npm run build'
- name: 'Create logs directory'
run: 'mkdir -p evals/logs'
- name: 'Run Evals'
env:
GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}'
RUN_EVALS: "${{ github.event.inputs.run_all != 'false' }}"
run: 'npm run test:all_evals'
- name: 'Upload Logs'
if: 'always()'
uses: 'actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02' # ratchet:actions/upload-artifact@v4
with:
name: 'eval-logs-${{ matrix.run_attempt }}'
path: 'evals/logs'
retention-days: 7
aggregate-results:
name: 'Aggregate Results'
needs: ['evals']
if: 'always()'
runs-on: 'gemini-cli-ubuntu-16-core'
steps:
- name: 'Checkout'
uses: 'actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8' # ratchet:actions/checkout@v5
- name: 'Download Logs'
uses: 'actions/download-artifact@cc203385981b70ca67e1cc392babf9cc229d5806' # ratchet:actions/download-artifact@v4
with:
path: 'artifacts'
- name: 'Generate Summary'
env:
GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
run: 'node scripts/aggregate_evals.js artifacts >> "$GITHUB_STEP_SUMMARY"'
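For orientation (an aside, not part of the workflow file): with `actions/download-artifact` given only a `path`, every artifact is unpacked into its own sub-directory, so the script should see `artifacts/eval-logs-<run_attempt>/report.json` for each matrix attempt. A minimal local sanity check of that layout, assuming the artifacts have already been downloaded into `./artifacts` (for example with `gh run download`):

```js
// Sketch: list every report.json under ./artifacts (Node 20+ for the
// recursive readdir option). Expect one path per matrix run_attempt,
// e.g. eval-logs-1/report.json, eval-logs-2/report.json, eval-logs-3/report.json.
import fs from 'node:fs';

const reports = fs
  .readdirSync('artifacts', { recursive: true })
  .filter((entry) => entry.endsWith('report.json'));
console.log(reports);
```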

View File

@@ -46,6 +46,12 @@ two arguments:
#### Policies
Policies control how strictly a test is validated. Tests should generally use
the `ALWAYS_PASSES` policy to offer the strictest guarantees;
`USUALLY_PASSES` exists for asserting less consistent or aspirational
behaviors.
- `ALWAYS_PASSES`: Tests expected to pass 100% of the time. These are typically
trivial and test basic functionality. They run on every CI run.
- `USUALLY_PASSES`: Tests expected to pass most of the time but may have some
@@ -100,3 +106,37 @@ npm run test:all_evals
This command sets the `RUN_EVALS` environment variable to `1`, which enables the
`USUALLY_PASSES` tests.
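As a rough illustration of how that environment-variable switch can be wired up with vitest (a sketch only; the helper name and the exact check are assumptions, not the repository's actual `evalTest` implementation):

```js
import { it } from 'vitest';

// Assumption for illustration: USUALLY_PASSES cases run only when RUN_EVALS
// is set (e.g. by `npm run test:all_evals`); ALWAYS_PASSES cases always run.
const runAllEvals = Boolean(process.env['RUN_EVALS']);

export function evalTestSketch(policy, evalCase) {
  const enabled = policy === 'ALWAYS_PASSES' || runAllEvals;
  it.skipIf(!enabled)(evalCase.name, async () => {
    // ... run the prompt through the test rig and assert on the result
  });
}
```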
## Reporting
Results for evaluations are available on GitHub Actions:
- **CI Evals**: Included in the
[E2E (Chained)](https://github.com/google-gemini/gemini-cli/actions/workflows/chained_e2e.yml)
workflow. These must pass 100% of the time on every PR.
- **Nightly Evals**: Run daily via the
[Evals: Nightly](https://github.com/google-gemini/gemini-cli/actions/workflows/evals-nightly.yml)
workflow. These track the long-term health and stability of model steering.
### Nightly Report Format
The nightly workflow executes the full evaluation suite multiple times
(currently 3 attempts) to account for non-determinism. These results are
aggregated into a **Nightly Summary** attached to the workflow run.
#### How to interpret the report:
- **Pass Rate (%)**: Each cell represents the percentage of successful runs for
a specific test in that workflow instance.
- **History**: The table shows the pass rates for the last 10 nightly runs,
allowing you to identify if a model's behavior is trending towards
instability.
- **Total Pass Rate**: An aggregate metric of all evaluations run in that batch.
A significant drop in the pass rate for a `USUALLY_PASSES` test, even if it
does not fall to 0%, often indicates that a recent change to a system prompt or
tool definition has made the model's behavior less reliable.
You may be able to investigate a regression with Gemini CLI: give it the links
to the runs before and after the change along with the name of the test, and
ask it to determine which changes may have impacted the test.

View File

@@ -11,7 +11,6 @@ import { validateModelOutput } from '../integration-tests/test-helper.js';
describe('save_memory', () => {
evalTest('ALWAYS_PASSES', {
name: 'should be able to save to memory',
log: true,
params: {
settings: { tools: { core: ['save_memory'] } },
},

View File

@@ -36,12 +36,10 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
const result = await rig.run({ args: evalCase.prompt });
await evalCase.assert(rig, result);
} finally {
if (evalCase.log) {
await logToFile(
evalCase.name,
JSON.stringify(rig.readToolLogs(), null, 2),
);
}
await logToFile(
evalCase.name,
JSON.stringify(rig.readToolLogs(), null, 2),
);
await rig.cleanup();
}
};
@@ -58,7 +56,6 @@ export interface EvalCase {
params?: Record<string, any>;
prompt: string;
assert: (rig: TestRig, result: string) => Promise<void>;
log?: boolean;
}
async function logToFile(name: string, content: string) {

View File

@@ -9,7 +9,10 @@ import { defineConfig } from 'vitest/config';
export default defineConfig({
test: {
testTimeout: 300000, // 5 minutes
reporters: ['default'],
reporters: ['default', 'json'],
outputFile: {
json: 'evals/logs/report.json',
},
include: ['**/*.eval.ts'],
},
});
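For context, the aggregation script introduced below reads only a small slice of that JSON report: each entry in `testResults[].assertionResults[]` with its `title` and `status`. A trimmed sketch of the shape (field names taken from how `aggregate_evals.js` reads the file; everything else the reporter emits is omitted):

```js
// Minimal shape of evals/logs/report.json as consumed by aggregate_evals.js;
// real reports contain many more fields. The second title is a hypothetical
// example.
const exampleReport = {
  testResults: [
    {
      assertionResults: [
        { title: 'should be able to save to memory', status: 'passed' },
        { title: 'hypothetical aspirational behavior', status: 'failed' },
      ],
    },
  ],
};
```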

scripts/aggregate_evals.js (new file, 212 lines)
View File

@@ -0,0 +1,212 @@
#!/usr/bin/env node
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
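// Usage: node scripts/aggregate_evals.js [artifacts-dir]
//
// Walks [artifacts-dir] (default: the current directory) for vitest
// report.json files, fetches recent completed nightly runs via the GitHub CLI
// for history, and prints a markdown summary to stdout (CI appends it to
// $GITHUB_STEP_SUMMARY).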
import fs from 'node:fs';
import path from 'node:path';
import { execSync } from 'node:child_process';
import os from 'node:os';
const artifactsDir = process.argv[2] || '.';
const MAX_HISTORY = 10;
// Find all report.json files recursively
function findReports(dir) {
const reports = [];
if (!fs.existsSync(dir)) return reports;
const files = fs.readdirSync(dir);
for (const file of files) {
const fullPath = path.join(dir, file);
const stat = fs.statSync(fullPath);
if (stat.isDirectory()) {
reports.push(...findReports(fullPath));
} else if (file === 'report.json') {
reports.push(fullPath);
}
}
return reports;
}
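// Tally per-test pass/fail/total counts across all report.json files.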
function getStats(reports) {
const testStats = {};
for (const reportPath of reports) {
try {
const content = fs.readFileSync(reportPath, 'utf-8');
const json = JSON.parse(content);
for (const testResult of json.testResults) {
for (const assertion of testResult.assertionResults) {
const name = assertion.title;
if (!testStats[name]) {
testStats[name] = { passed: 0, failed: 0, total: 0 };
}
testStats[name].total++;
if (assertion.status === 'passed') {
testStats[name].passed++;
} else {
testStats[name].failed++;
}
}
}
} catch (error) {
console.error(`Error processing report at ${reportPath}:`, error);
}
}
return testStats;
}
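// Download report.json artifacts from recent completed nightly runs (via the
// gh CLI) so the summary can show pass-rate history next to the current run.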
function fetchHistoricalData() {
const history = [];
try {
// Nightly history is only tracked on the main branch.
const branch = 'main';
// Get recent runs
const cmd = `gh run list --workflow evals-nightly.yml --branch "${branch}" --limit ${
MAX_HISTORY + 5
} --json databaseId,createdAt,url,displayTitle,status,conclusion`;
const runsJson = execSync(cmd, { encoding: 'utf-8' });
let runs = JSON.parse(runsJson);
// Filter out current run
const currentRunId = process.env.GITHUB_RUN_ID;
if (currentRunId) {
runs = runs.filter((r) => r.databaseId.toString() !== currentRunId);
}
// Filter for runs that likely have artifacts (completed) and take top N
// We accept 'failure' too because we want to see stats.
runs = runs.filter((r) => r.status === 'completed').slice(0, MAX_HISTORY);
// Fetch artifacts for each run
for (const run of runs) {
const tmpDir = fs.mkdtempSync(
path.join(os.tmpdir(), `gemini-evals-${run.databaseId}-`),
);
try {
// Download report.json files.
// The artifacts are named 'eval-logs-X'.
// We use -p to match pattern.
execSync(
`gh run download ${run.databaseId} -p "eval-logs-*" -D "${tmpDir}"`,
{ stdio: 'ignore' },
);
const runReports = findReports(tmpDir);
if (runReports.length > 0) {
history.push({
run,
stats: getStats(runReports),
});
}
} catch (error) {
console.error(
`Failed to download or process artifacts for run ${run.databaseId}:`,
error,
);
} finally {
fs.rmSync(tmpDir, { recursive: true, force: true });
}
}
} catch (error) {
console.error('Failed to fetch historical data:', error);
}
return history;
}
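// Render the summary as a markdown table: one row per test, history columns
// oldest-first, then the current run's pass rate in the final column.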
function generateMarkdown(currentStats, history) {
const totalStats = Object.values(currentStats).reduce(
(acc, stats) => {
acc.passed += stats.passed;
acc.total += stats.total;
return acc;
},
{ passed: 0, total: 0 },
);
const totalPassRate =
totalStats.total > 0
? ((totalStats.passed / totalStats.total) * 100).toFixed(1) + '%'
: 'N/A';
console.log('### Evals Nightly Summary');
console.log(`**Total Pass Rate: ${totalPassRate}**\n`);
console.log(
'See [evals/README.md](https://github.com/google-gemini/gemini-cli/tree/main/evals) for more details.\n',
);
// Reverse history to show oldest first
const reversedHistory = [...history].reverse();
// Header
let header = '| Test Name |';
let separator = '| :--- |';
for (const item of reversedHistory) {
header += ` [${item.run.databaseId}](${item.run.url}) |`;
separator += ' :---: |';
}
// Add Current column last
header += ' Current |';
separator += ' :---: |';
console.log(header);
console.log(separator);
// Collect all test names
const allTestNames = new Set(Object.keys(currentStats));
for (const item of reversedHistory) {
Object.keys(item.stats).forEach((name) => allTestNames.add(name));
}
for (const name of Array.from(allTestNames).sort()) {
const searchUrl = `https://github.com/search?q=repo%3Agoogle-gemini%2Fgemini-cli%20%22${encodeURIComponent(name)}%22&type=code`;
let row = `| [${name}](${searchUrl}) |`;
// History
for (const item of reversedHistory) {
const stat = item.stats[name];
if (stat) {
const passRate = ((stat.passed / stat.total) * 100).toFixed(0) + '%';
row += ` ${passRate} |`;
} else {
row += ' - |';
}
}
// Current
const curr = currentStats[name];
if (curr) {
const passRate = ((curr.passed / curr.total) * 100).toFixed(0) + '%';
row += ` ${passRate} |`;
} else {
row += ' - |';
}
console.log(row);
}
}
// --- Main ---
const currentReports = findReports(artifactsDir);
if (currentReports.length === 0) {
console.log('No reports found.');
// Without any current reports there is nothing to aggregate; exit cleanly
// (code 0) so the summary step itself does not fail the workflow.
process.exit(0);
}
const currentStats = getStats(currentReports);
const history = fetchHistoricalData();
generateMarkdown(currentStats, history);