diff --git a/.github/workflows/evals-nightly.yml b/.github/workflows/evals-nightly.yml index 6d44de7c12..40c42e2b07 100644 --- a/.github/workflows/evals-nightly.yml +++ b/.github/workflows/evals-nightly.yml @@ -13,11 +13,16 @@ on: permissions: contents: 'read' checks: 'write' + actions: 'read' jobs: evals: name: 'Evals (USUALLY_PASSING) nightly run' runs-on: 'gemini-cli-ubuntu-16-core' + strategy: + fail-fast: false + matrix: + run_attempt: [1, 2, 3] steps: - name: 'Checkout' uses: 'actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8' # ratchet:actions/checkout@v5 @@ -34,8 +39,38 @@ jobs: - name: 'Build project' run: 'npm run build' + - name: 'Create logs directory' + run: 'mkdir -p evals/logs' + - name: 'Run Evals' env: GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}' RUN_EVALS: "${{ github.event.inputs.run_all != 'false' }}" run: 'npm run test:all_evals' + + - name: 'Upload Logs' + if: 'always()' + uses: 'actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02' # ratchet:actions/upload-artifact@v4 + with: + name: 'eval-logs-${{ matrix.run_attempt }}' + path: 'evals/logs' + retention-days: 7 + + aggregate-results: + name: 'Aggregate Results' + needs: ['evals'] + if: 'always()' + runs-on: 'gemini-cli-ubuntu-16-core' + steps: + - name: 'Checkout' + uses: 'actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8' # ratchet:actions/checkout@v5 + + - name: 'Download Logs' + uses: 'actions/download-artifact@cc203385981b70ca67e1cc392babf9cc229d5806' # ratchet:actions/download-artifact@v4 + with: + path: 'artifacts' + + - name: 'Generate Summary' + env: + GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}' + run: 'node scripts/aggregate_evals.js artifacts >> "$GITHUB_STEP_SUMMARY"' diff --git a/evals/README.md b/evals/README.md index a339af842f..891a9549f5 100644 --- a/evals/README.md +++ b/evals/README.md @@ -46,6 +46,12 @@ two arguments: #### Policies +Policies control how strictly a test is validated. 
Tests should generally use +the ALWAYS_PASSES policy to offer the strictest guarantees. + +USUALLY_PASSES exists to enable assertion of less consistent or aspirational +behaviors. + - `ALWAYS_PASSES`: Tests expected to pass 100% of the time. These are typically trivial and test basic functionality. These run in every CI. - `USUALLY_PASSES`: Tests expected to pass most of the time but may have some @@ -100,3 +106,37 @@ npm run test:all_evals This command sets the `RUN_EVALS` environment variable to `1`, which enables the `USUALLY_PASSES` tests. + +## Reporting + +Results for evaluations are available on GitHub Actions: + +- **CI Evals**: Included in the + [E2E (Chained)](https://github.com/google-gemini/gemini-cli/actions/workflows/chained_e2e.yml) + workflow. These must pass 100% for every PR. +- **Nightly Evals**: Run daily via the + [Evals: Nightly](https://github.com/google-gemini/gemini-cli/actions/workflows/evals-nightly.yml) + workflow. These track the long-term health and stability of model steering. + +### Nightly Report Format + +The nightly workflow executes the full evaluation suite multiple times +(currently 3 attempts) to account for non-determinism. These results are +aggregated into a **Nightly Summary** attached to the workflow run. + +#### How to interpret the report: + +- **Pass Rate (%)**: Each cell represents the percentage of successful runs for + a specific test in that workflow instance. +- **History**: The table shows the pass rates for the last 10 nightly runs, + allowing you to identify if a model's behavior is trending towards + instability. +- **Total Pass Rate**: An aggregate metric of all evaluations run in that batch. + +A significant drop in the pass rate for a `USUALLY_PASSES` test—even if it +doesn't drop to 0%—often indicates that a recent change to a system prompt or +tool definition has made the model's behavior less reliable. 
+ +You may be able to investigate the regression using Gemini CLI by giving it the +link to the runs before and after the change and the name of the test and asking +it to investigate what changes may have impacted the test. diff --git a/evals/save_memory.eval.ts b/evals/save_memory.eval.ts index a64f21798a..48658113ce 100644 --- a/evals/save_memory.eval.ts +++ b/evals/save_memory.eval.ts @@ -11,7 +11,6 @@ import { validateModelOutput } from '../integration-tests/test-helper.js'; describe('save_memory', () => { evalTest('ALWAYS_PASSES', { name: 'should be able to save to memory', - log: true, params: { settings: { tools: { core: ['save_memory'] } }, }, diff --git a/evals/test-helper.ts b/evals/test-helper.ts index f394521d1e..9801d2307b 100644 --- a/evals/test-helper.ts +++ b/evals/test-helper.ts @@ -36,12 +36,10 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) { const result = await rig.run({ args: evalCase.prompt }); await evalCase.assert(rig, result); } finally { - if (evalCase.log) { - await logToFile( - evalCase.name, - JSON.stringify(rig.readToolLogs(), null, 2), - ); - } + await logToFile( + evalCase.name, + JSON.stringify(rig.readToolLogs(), null, 2), + ); await rig.cleanup(); } }; @@ -58,7 +56,6 @@ export interface EvalCase { params?: Record; prompt: string; assert: (rig: TestRig, result: string) => Promise; - log?: boolean; } async function logToFile(name: string, content: string) { diff --git a/evals/vitest.config.ts b/evals/vitest.config.ts index 8476b638ff..2c59682f16 100644 --- a/evals/vitest.config.ts +++ b/evals/vitest.config.ts @@ -9,7 +9,10 @@ import { defineConfig } from 'vitest/config'; export default defineConfig({ test: { testTimeout: 300000, // 5 minutes - reporters: ['default'], + reporters: ['default', 'json'], + outputFile: { + json: 'evals/logs/report.json', + }, include: ['**/*.eval.ts'], }, }); diff --git a/scripts/aggregate_evals.js b/scripts/aggregate_evals.js new file mode 100644 index 0000000000..4a9fba02eb --- 
/dev/null +++ b/scripts/aggregate_evals.js @@ -0,0 +1,212 @@ +#!/usr/bin/env node + +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import fs from 'node:fs'; +import path from 'node:path'; +import { execSync } from 'node:child_process'; +import os from 'node:os'; + +const artifactsDir = process.argv[2] || '.'; +const MAX_HISTORY = 10; + +// Find all report.json files recursively +function findReports(dir) { + const reports = []; + if (!fs.existsSync(dir)) return reports; + + const files = fs.readdirSync(dir); + for (const file of files) { + const fullPath = path.join(dir, file); + const stat = fs.statSync(fullPath); + if (stat.isDirectory()) { + reports.push(...findReports(fullPath)); + } else if (file === 'report.json') { + reports.push(fullPath); + } + } + return reports; +} + +function getStats(reports) { + const testStats = {}; + + for (const reportPath of reports) { + try { + const content = fs.readFileSync(reportPath, 'utf-8'); + const json = JSON.parse(content); + + for (const testResult of json.testResults) { + for (const assertion of testResult.assertionResults) { + const name = assertion.title; + if (!testStats[name]) { + testStats[name] = { passed: 0, failed: 0, total: 0 }; + } + testStats[name].total++; + if (assertion.status === 'passed') { + testStats[name].passed++; + } else { + testStats[name].failed++; + } + } + } + } catch (error) { + console.error(`Error processing report at ${reportPath}:`, error); + } + } + return testStats; +} + +function fetchHistoricalData() { + const history = []; + + try { + // Determine branch + const branch = 'main'; + + // Get recent runs + const cmd = `gh run list --workflow evals-nightly.yml --branch "${branch}" --limit ${ + MAX_HISTORY + 5 + } --json databaseId,createdAt,url,displayTitle,status,conclusion`; + const runsJson = execSync(cmd, { encoding: 'utf-8' }); + let runs = JSON.parse(runsJson); + + // Filter out current run + const currentRunId = 
process.env.GITHUB_RUN_ID; + if (currentRunId) { + runs = runs.filter((r) => r.databaseId.toString() !== currentRunId); + } + + // Filter for runs that likely have artifacts (completed) and take top N + // We accept 'failure' too because we want to see stats. + runs = runs.filter((r) => r.status === 'completed').slice(0, MAX_HISTORY); + + // Fetch artifacts for each run + for (const run of runs) { + const tmpDir = fs.mkdtempSync( + path.join(os.tmpdir(), `gemini-evals-${run.databaseId}-`), + ); + try { + // Download report.json files. + // The artifacts are named 'eval-logs-X'. + // We use -p to match pattern. + execSync( + `gh run download ${run.databaseId} -p "eval-logs-*" -D "${tmpDir}"`, + { stdio: 'ignore' }, + ); + + const runReports = findReports(tmpDir); + if (runReports.length > 0) { + history.push({ + run, + stats: getStats(runReports), + }); + } + } catch (error) { + console.error( + `Failed to download or process artifacts for run ${run.databaseId}:`, + error, + ); + } finally { + fs.rmSync(tmpDir, { recursive: true, force: true }); + } + } + } catch (error) { + console.error('Failed to fetch historical data:', error); + } + + return history; +} + +function generateMarkdown(currentStats, history) { + const totalStats = Object.values(currentStats).reduce( + (acc, stats) => { + acc.passed += stats.passed; + acc.total += stats.total; + return acc; + }, + { passed: 0, total: 0 }, + ); + + const totalPassRate = + totalStats.total > 0 + ? 
((totalStats.passed / totalStats.total) * 100).toFixed(1) + '%' + : 'N/A'; + + console.log('### Evals Nightly Summary'); + console.log(`**Total Pass Rate: ${totalPassRate}**\n`); + console.log( + 'See [evals/README.md](https://github.com/google-gemini/gemini-cli/tree/main/evals) for more details.\n', + ); + + // Reverse history to show oldest first + const reversedHistory = [...history].reverse(); + + // Header + let header = '| Test Name |'; + let separator = '| :--- |'; + + for (const item of reversedHistory) { + header += ` [${item.run.databaseId}](${item.run.url}) |`; + separator += ' :---: |'; + } + + // Add Current column last + header += ' Current |'; + separator += ' :---: |'; + + console.log(header); + console.log(separator); + + // Collect all test names + const allTestNames = new Set(Object.keys(currentStats)); + for (const item of reversedHistory) { + Object.keys(item.stats).forEach((name) => allTestNames.add(name)); + } + + for (const name of Array.from(allTestNames).sort()) { + const searchUrl = `https://github.com/search?q=repo%3Agoogle-gemini%2Fgemini-cli%20%22${encodeURIComponent(name)}%22&type=code`; + let row = `| [${name}](${searchUrl}) |`; + + // History + for (const item of reversedHistory) { + const stat = item.stats[name]; + if (stat) { + const passRate = ((stat.passed / stat.total) * 100).toFixed(0) + '%'; + row += ` ${passRate} |`; + } else { + row += ' - |'; + } + } + + // Current + const curr = currentStats[name]; + if (curr) { + const passRate = ((curr.passed / curr.total) * 100).toFixed(0) + '%'; + row += ` ${passRate} |`; + } else { + row += ' - |'; + } + + console.log(row); + } +} + +// --- Main --- + +const currentReports = findReports(artifactsDir); +if (currentReports.length === 0) { + console.log('No reports found.'); + // We don't exit here because we might still want to see history if available, + // but practically if current has no reports, something is wrong. 
+ // For now, exit successfully so the summary step does not fail the workflow; revisit if a history-only summary becomes useful. + process.exit(0); +} + +const currentStats = getStats(currentReports); +const history = fetchHistoricalData(); +generateMarkdown(currentStats, history);