Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Quality gate: installs dependencies, typechecks, and runs the test suite
# for every pull request and for every push to main.
name: PR Quality Gate

on:
  pull_request:
  push:
    branches:
      - main

# One active run per workflow + ref; a newer run cancels the one in progress.
concurrency:
  group: ci-${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  validate:
    name: Node 24 | npm ci + typecheck + test
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          # Quoted so YAML keeps the version as a string.
          node-version: "24"
          cache: npm

      - name: Install dependencies
        run: npm ci

      - name: Typecheck
        run: npm run typecheck

      - name: Run tests
        run: npm test
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,13 @@ Evaluate `SWE`, `TB2`, `Tau`, and `SAE` through a unified `kbench` CLI plus benc
`adapter generate` is part of the current runtime path for bootstrapping dynamic `custom-adapter` integrations.
Today this generator is heuristic and repository-inspection-based; this repo does not yet ship a built-in remote LLM adapter generator.

## Environment Requirements

- Node.js `24.x` is the recommended local and CI runtime for this repository
- npm `11.x` is the expected package manager version
- `npm ci` is the canonical clean-install path and is enforced by the PR quality gate
- Benchmark workflows still install their own extra runtime dependencies such as Python/Harbor/tau-bench as needed

## Quick Start

This repository is primarily intended to run benchmarks through GitHub Actions.
Expand Down
7 changes: 7 additions & 0 deletions README.zh-CN.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,13 @@
`adapter generate` 已经是当前仓库里动态 `custom-adapter` bootstrap 的一部分。
但当前实现仍然是基于本地仓库检查和启发式推断,不是“仓库内置远程 LLM 动态生成 adapter”。

## 环境要求

- 推荐使用 Node.js `24.x` 作为本地与 CI 运行时
- 预期使用 npm `11.x`
- `npm ci` 是标准的干净安装路径,并已作为 PR 质量门禁的一部分
- benchmark workflow 仍会按需安装额外运行时依赖,例如 Python、Harbor、tau-bench

## Quick Start

这个仓库的主要用法仍然是通过 GitHub Actions 发起评测。
Expand Down
21 changes: 17 additions & 4 deletions src/benchmark/sae/runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,17 @@ function mergeSummary(summary: SummaryResult, benchmarkResult: Record<string, un
};
}

/**
 * Resolves an optional millisecond setting to a usable duration.
 *
 * @param value - Configured duration in milliseconds, if any.
 * @param label - Setting name interpolated into the error messages.
 * @param fallback - Returned as-is (it is not validated) when `value` is undefined.
 * @returns `value` when it is a positive finite number, or `fallback` when `value` is undefined.
 * @throws Error when `value` is undefined and no fallback is given, or when
 *   `value` is NaN, infinite, zero, or negative.
 */
function normalizePositiveMs(value: number | undefined, label: string, fallback?: number): number {
  if (value !== undefined) {
    const isUsable = Number.isFinite(value) && value > 0;
    if (isUsable) {
      return value;
    }
    throw new Error(`${label} must be a positive finite number of milliseconds.`);
  }

  // Nothing configured: the fallback is the only remaining source.
  if (fallback === undefined) {
    throw new Error(`${label} is required.`);
  }
  return fallback;
}

function withTemporaryBaseUrl<T>(modelName: string, baseUrl: string | undefined, fn: () => Promise<T>): Promise<T> {
if (!baseUrl) {
return fn();
Expand All @@ -254,6 +265,8 @@ function withTemporaryBaseUrl<T>(modelName: string, baseUrl: string | undefined,
}

export async function runSaeBenchmark(config: SaeBenchmarkConfig): Promise<SaeBenchmarkOutcome> {
const saeTimeoutMs = normalizePositiveMs(config.saeTimeoutMs, 'saeTimeoutMs', 30 * 60 * 1000);
const saePollIntervalMs = normalizePositiveMs(config.saePollIntervalMs, 'saePollIntervalMs');
const layout = createRunLayout(config.runDir, config.runId);
const artifactDir = path.join(layout.runDir, 'artifacts', 'sae');
const metadata: RunMetadata = {
Expand All @@ -270,8 +283,8 @@ export async function runSaeBenchmark(config: SaeBenchmarkConfig): Promise<SaeBe
registerIfMissing: config.saeRegisterIfMissing,
agentIdFile: expandHome(config.saeAgentIdFile),
apiKeyFile: expandHome(config.saeApiKeyFile),
timeoutMs: config.saeTimeoutMs,
pollIntervalMs: config.saePollIntervalMs,
timeoutMs: saeTimeoutMs,
pollIntervalMs: saePollIntervalMs,
},
harnessConfig: {
workDir: config.workDir,
Expand All @@ -292,7 +305,7 @@ export async function runSaeBenchmark(config: SaeBenchmarkConfig): Promise<SaeBe
let credentials: SaeAgentCredentials | undefined;

try {
const deadlineAt = Date.now() + (config.saeTimeoutMs ?? 30 * 60 * 1000);
const deadlineAt = Date.now() + saeTimeoutMs;
const credentialState = await loadOrRegisterCredentials(client, config, artifactDir);
credentials = credentialState.credentials;

Expand Down Expand Up @@ -330,7 +343,7 @@ export async function runSaeBenchmark(config: SaeBenchmarkConfig): Promise<SaeBe
client,
submission,
credentials.apiToken,
config.saePollIntervalMs,
saePollIntervalMs,
deadlineAt
);
await writeJson(path.join(artifactDir, 'final_submission.json'), submission);
Expand Down
19 changes: 16 additions & 3 deletions src/cli/kbench.ts
Original file line number Diff line number Diff line change
Expand Up @@ -683,6 +683,19 @@ function getDefaultTimeoutMs(benchmark: BenchmarkId, harness: string): number |
return undefined;
}

/**
 * Reads an optional CLI flag whose value must be a positive integer.
 *
 * @param values - Parsed flag map, keyed by flag name without the leading `--`.
 * @param name - Flag name to look up; also used in the error message.
 * @returns The numeric value, or `undefined` when the flag is absent.
 * @throws Error when the flag is present but `Number(raw)` is not an integer greater than zero.
 */
function parsePositiveIntegerFlag(values: Map<string, string>, name: string): number | undefined {
  const text = values.get(name);
  if (text === undefined) {
    return undefined;
  }

  const candidate = Number(text);
  // Number.isInteger is false for NaN and ±Infinity, so it also covers finiteness.
  const isPositiveInteger = Number.isInteger(candidate) && candidate > 0;
  if (!isPositiveInteger) {
    throw new Error(`Invalid --${name}. Expected a positive integer.`);
  }
  return candidate;
}

function parseRunArgs(argv: string[]): RunCliArgs {
const values = parseFlags(argv);
const benchmark = values.get('benchmark') as BenchmarkId | undefined;
Expand All @@ -702,7 +715,7 @@ function parseRunArgs(argv: string[]): RunCliArgs {
const runId = values.get('run-id') || nowId('run');
const instanceId = values.get('instance-id') || `${benchmark}-instance`;
const runDir = path.resolve(values.get('run-dir') || path.join(process.cwd(), '.kbench', 'runs', runId));
const explicitTimeoutMs = values.get('timeout-ms') ? Number(values.get('timeout-ms')) : undefined;
const explicitTimeoutMs = parsePositiveIntegerFlag(values, 'timeout-ms');
const configModeValue = values.get('config-mode');
if (configModeValue && configModeValue !== 'inherit' && configModeValue !== 'isolated') {
throw new Error('Invalid --config-mode. Expected one of: inherit, isolated.');
Expand Down Expand Up @@ -770,7 +783,7 @@ function parseBenchmarkRunArgs(argv: string[]): BenchmarkRunCliArgs {
storeDirProvided: values.has('store-dir'),
workDir: values.get('workdir') ? path.resolve(values.get('workdir') as string) : undefined,
storeDir: values.get('store-dir') ? path.resolve(values.get('store-dir') as string) : undefined,
timeoutMs: values.get('sae-timeout-ms') ? Number(values.get('sae-timeout-ms')) : undefined,
timeoutMs: parsePositiveIntegerFlag(values, 'sae-timeout-ms'),
saeApiBase: values.get('sae-api-base') || 'https://www.kaggle.com/api/v1',
saeAgentIdFile: values.get('sae-agent-id-file') || '~/.kaggle-agent-id',
saeApiKeyFile: values.get('sae-api-key-file') || '~/.kaggle-agent-api-key',
Expand All @@ -779,7 +792,7 @@ function parseBenchmarkRunArgs(argv: string[]): BenchmarkRunCliArgs {
saeAgentDescription: values.get('sae-agent-description'),
saeAgentVersion: values.get('sae-agent-version') || '1.0',
saeAgentType: values.get('sae-agent-type') || harness,
saePollIntervalMs: values.get('sae-poll-interval-ms') ? Number(values.get('sae-poll-interval-ms')) : 2000,
saePollIntervalMs: parsePositiveIntegerFlag(values, 'sae-poll-interval-ms') ?? 2000,
};
}

Expand Down
3 changes: 0 additions & 3 deletions src/harness/drivers/cli/runtime.ts
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,6 @@ export function extractPatchSinceBaseline(baseline: GitPatchBaseline): string |
if ((baseline.beforeDiff || '') === afterDiff) {
return undefined;
}
if (baseline.beforeDiff && baseline.beforeDiff.trim()) {
return undefined;
}
return afterDiff;
}

Expand Down
2 changes: 1 addition & 1 deletion src/harness/sdk/validate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ export async function validateAdapter(adapterPath: string): Promise<AdapterValid
}

return {
ok: loaded.schema.ok && entryValidation.ok && executionChecks.every((check) => check.ok || check.warnings.length > 0),
ok: loaded.schema.ok && entryValidation.ok && executionChecks.every((check) => check.ok),
adapterPath: loaded.adapterPath,
manifestPath: loaded.manifestPath,
entryPath: loaded.manifest ? loaded.entryPath : undefined,
Expand Down
21 changes: 20 additions & 1 deletion test/benchmark/sae-runner.test.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import os from 'node:os';
import path from 'node:path';

import { describe, expect, it } from 'vitest';

import { parseBoolean } from '../../src/benchmark/sae/runner.js';
import { parseBoolean, runSaeBenchmark } from '../../src/benchmark/sae/runner.js';

describe('SAE runner helpers', () => {
it('parses common truthy values', () => {
Expand All @@ -21,4 +24,20 @@ describe('SAE runner helpers', () => {
expect(parseBoolean(undefined, true)).toBe(true);
expect(parseBoolean('maybe', false)).toBe(false);
});

it('rejects invalid polling configuration before making network requests', async () => {
await expect(runSaeBenchmark({
runId: 'sae-invalid-config',
runDir: path.join(os.tmpdir(), 'kbench-sae-invalid-config'),
harness: 'kode-agent-sdk',
modelName: 'openai/gpt-4.1-mini',
saeApiBase: 'https://www.kaggle.com/api/v1',
saeAgentIdFile: '~/.kaggle-agent-id',
saeApiKeyFile: '~/.kaggle-agent-api-key',
saeRegisterIfMissing: false,
saeAgentVersion: '1.0',
saeAgentType: 'kode-agent-sdk',
saePollIntervalMs: Number.NaN,
})).rejects.toThrow('saePollIntervalMs must be a positive finite number of milliseconds.');
});
});
55 changes: 55 additions & 0 deletions test/cli/kbench-cli.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -99,4 +99,59 @@ describe('kbench CLI', () => {
expect(payload.benchmarkError.message).toContain(missingIdFile);
expect(payload.benchmarkError.message).toContain(missingKeyFile);
});

it('rejects invalid sae-timeout-ms values before running the benchmark', async () => {
const result = await runKbench([
'benchmark',
'run',
'--benchmark',
'sae',
'--harness',
'kode-agent-sdk',
'--model-name',
'openai/gpt-4.1-mini',
'--sae-timeout-ms',
'NaN',
]);

expect(result.status).toBe(1);
expect(result.stderr).toContain('Invalid --sae-timeout-ms. Expected a positive integer.');
});

it('rejects invalid sae-poll-interval-ms values before running the benchmark', async () => {
const result = await runKbench([
'benchmark',
'run',
'--benchmark',
'sae',
'--harness',
'kode-agent-sdk',
'--model-name',
'openai/gpt-4.1-mini',
'--sae-poll-interval-ms',
'0',
]);

expect(result.status).toBe(1);
expect(result.stderr).toContain('Invalid --sae-poll-interval-ms. Expected a positive integer.');
});

it('rejects invalid timeout-ms values before running a single instance', async () => {
const result = await runKbench([
'run',
'--benchmark',
'swe',
'--harness',
'kode-agent-sdk',
'--model-name',
'openai/gpt-4.1-mini',
'--instruction',
'Fix the bug',
'--timeout-ms',
'NaN',
]);

expect(result.status).toBe(1);
expect(result.stderr).toContain('Invalid --timeout-ms. Expected a positive integer.');
});
});
45 changes: 45 additions & 0 deletions test/harness/cli-runtime.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import { execFileSync } from 'node:child_process';
import fs from 'node:fs/promises';
import os from 'node:os';
import path from 'node:path';

import { afterEach, describe, expect, it } from 'vitest';

import { captureGitPatchBaseline, extractPatchSinceBaseline } from '../../src/harness/drivers/cli/runtime.js';

// Directories registered here are deleted after every test by the hook below.
const tempDirs: string[] = [];

afterEach(async () => {
  // splice(0) empties the shared list and hands back what it held.
  const pending = tempDirs.splice(0);
  const removals = pending.map((dir) => fs.rm(dir, { recursive: true, force: true }));
  await Promise.all(removals);
});

/**
 * Creates a temporary git repository containing one committed file,
 * `tracked.txt`, with the content `base\n`.
 *
 * The directory is pushed onto `tempDirs` before any git command runs.
 *
 * @returns Path of the new repository directory.
 */
async function makeGitRepo(): Promise<string> {
  const repoRoot = await fs.mkdtemp(path.join(os.tmpdir(), 'kbench-cli-runtime-'));
  tempDirs.push(repoRoot);

  // Runs one git command inside the new repository with its output discarded.
  const git = (...args: string[]): void => {
    execFileSync('git', args, { cwd: repoRoot, stdio: 'ignore' });
  };

  git('init');
  // A local identity is set before the commit below.
  git('config', 'user.email', 'kbench@example.com');
  git('config', 'user.name', 'kbench');

  await fs.writeFile(path.join(repoRoot, 'tracked.txt'), 'base\n', 'utf-8');
  git('add', 'tracked.txt');
  git('commit', '-m', 'init');

  return repoRoot;
}

describe('CLI runtime patch capture', () => {
  it('captures the current diff even when the worktree was already dirty before execution', async () => {
    const workTree = await makeGitRepo();
    const trackedFile = path.join(workTree, 'tracked.txt');

    // Dirty the worktree before the baseline is taken.
    await fs.writeFile(trackedFile, 'base\nbefore\n', 'utf-8');
    const dirtyBaseline = captureGitPatchBaseline(workTree);

    // A further edit made after the baseline was captured.
    await fs.writeFile(trackedFile, 'base\nbefore\nafter\n', 'utf-8');
    const capturedPatch = extractPatchSinceBaseline(dirtyBaseline);

    expect(capturedPatch).toBeDefined();
    expect(capturedPatch).toContain('tracked.txt');
    expect(capturedPatch).toContain('after');
  });
});
50 changes: 50 additions & 0 deletions test/sdk/adapter.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -60,4 +60,54 @@ describe('adapter scaffolding', () => {
expect(report.executionChecks[0]?.mode).toBe('task');
expect(report.executionChecks[0]?.output?.status).toBe('ok');
});

it('does not mark adapter validation as ok when execution has both errors and warnings', async () => {
const root = await makeTempRoot();
const adapterDir = path.join(root, 'warning-error-adapter');

await fs.mkdir(adapterDir, { recursive: true });
await fs.writeFile(
path.join(adapterDir, 'adapter.manifest.json'),
`${JSON.stringify({
schemaVersion: 'kbench.adapter/v1',
id: 'warning-error-adapter',
kind: 'node',
entry: './runner.mjs',
version: '0.1.0',
supportedBenchmarks: ['swe'],
capabilities: {
runModes: ['task'],
machineReadableStdout: true,
supportsPatchOutput: false,
supportsTrajectory: false,
supportsToolCallTrace: false,
supportsResume: false,
supportsImages: false,
supportsSandboxBridge: false,
supportsPromptTemplate: false,
},
}, null, 2)}\n`,
'utf-8'
);
await fs.writeFile(
path.join(adapterDir, 'runner.mjs'),
`#!/usr/bin/env node
process.stdout.write(JSON.stringify({
ok: true,
status: 'bad-status',
elapsedMs: 1,
patch: 'diff --git a/a b/a\\n',
}));
`,
{ encoding: 'utf-8', mode: 0o755 }
);

const report = await validateAdapter(adapterDir);

expect(report.ok).toBe(false);
expect(report.executionChecks).toHaveLength(1);
expect(report.executionChecks[0]?.ok).toBe(false);
expect(report.executionChecks[0]?.errors.some((error) => error.includes('valid "status"'))).toBe(true);
expect(report.executionChecks[0]?.warnings.some((warning) => warning.includes('returned a patch'))).toBe(true);
});
});
Loading