Files
gemini-cli/.github/workflows/evals-nightly.yml
2026-01-22 18:23:15 +00:00

102 lines
3.0 KiB
YAML

name: 'Evals: Nightly'
on:
schedule:
- cron: '0 1 * * *' # Runs at 1 AM every day
workflow_dispatch:
inputs:
run_all:
description: 'Run all evaluations (including usually passing)'
type: 'boolean'
default: true
test_name_pattern:
description: 'Test name pattern or file name'
required: false
type: 'string'
permissions:
contents: 'read'
checks: 'write'
actions: 'read'
jobs:
evals:
name: 'Evals (USUALLY_PASSING) nightly run'
runs-on: 'gemini-cli-ubuntu-16-core'
strategy:
fail-fast: false
matrix:
model:
- 'gemini-3-pro-preview'
- 'gemini-3-flash-preview'
- 'gemini-2.5-pro'
- 'gemini-2.5-flash'
- 'gemini-2.5-flash-lite'
run_attempt: [1, 2, 3]
steps:
- name: 'Checkout'
uses: 'actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8' # ratchet:actions/checkout@v5
- name: 'Set up Node.js'
uses: 'actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020' # ratchet:actions/setup-node@v4
with:
node-version-file: '.nvmrc'
cache: 'npm'
- name: 'Install dependencies'
run: 'npm ci'
- name: 'Build project'
run: 'npm run build'
- name: 'Create logs directory'
run: 'mkdir -p evals/logs'
- name: 'Run Evals'
continue-on-error: true
env:
GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}'
GEMINI_MODEL: '${{ matrix.model }}'
RUN_EVALS: "${{ github.event.inputs.run_all != 'false' }}"
TEST_NAME_PATTERN: '${{ github.event.inputs.test_name_pattern }}'
run: |
CMD="npm run test:all_evals"
PATTERN="${{ env.TEST_NAME_PATTERN }}"
if [[ -n "$PATTERN" ]]; then
if [[ "$PATTERN" == *.ts || "$PATTERN" == *.js || "$PATTERN" == */* ]]; then
$CMD -- "$PATTERN"
else
$CMD -- -t "$PATTERN"
fi
else
$CMD
fi
- name: 'Upload Logs'
if: 'always()'
uses: 'actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02' # ratchet:actions/upload-artifact@v4
with:
name: 'eval-logs-${{ matrix.model }}-${{ matrix.run_attempt }}'
path: 'evals/logs'
retention-days: 7
aggregate-results:
name: 'Aggregate Results'
needs: ['evals']
if: 'always()'
runs-on: 'gemini-cli-ubuntu-16-core'
steps:
- name: 'Checkout'
uses: 'actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8' # ratchet:actions/checkout@v5
- name: 'Download Logs'
uses: 'actions/download-artifact@cc203385981b70ca67e1cc392babf9cc229d5806' # ratchet:actions/download-artifact@v4
with:
path: 'artifacts'
- name: 'Generate Summary'
env:
GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
run: 'node scripts/aggregate_evals.js artifacts >> "$GITHUB_STEP_SUMMARY"'