shareAI-lab · Gui-Yue · Apr 2, 2026 · Apr 2, 2026 · Apr 2, 2026 · Apr 2, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -0,0 +1,41 @@
+# Quality gate: installs dependencies with `npm ci`, then runs the typecheck and test scripts.
+name: PR Quality Gate
+
+on:
+  pull_request:
+  push:
+    branches:
+      - main
+
+# The job only reads the repository, so limit the GITHUB_TOKEN to read-only contents access.
+permissions:
+  contents: read
+
+# A newer run for the same workflow and ref cancels the one still in progress.
+concurrency:
+  group: ci-${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  validate:
+    name: Node 24 | npm ci + typecheck + test
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: "24"
+          cache: npm
+
+      - name: Install dependencies
+        run: npm ci
+
+      - name: Typecheck
+        run: npm run typecheck
+
+      - name: Run tests
+        run: npm test
diff --git a/README.md b/README.md
@@ -55,6 +55,13 @@ Evaluate `SWE`, `TB2`, `Tau`, and `SAE` through a unified `kbench` CLI plus benc
 `adapter generate` is part of the current runtime path for bootstrapping dynamic `custom-adapter` integrations.
 Today this generator is heuristic and repository-inspection-based; this repo does not yet ship a built-in remote LLM adapter generator.
 
+## Environment Requirements
+
+- Node.js `24.x` is the recommended local and CI runtime for this repository
+- npm `11.x` is the expected package manager version
+- `npm ci` is the canonical clean-install path and is enforced by the PR quality gate
+- Benchmark workflows still install their own extra runtime dependencies such as Python/Harbor/tau-bench as needed
+
 ## Quick Start
 
 This repository is primarily intended to run benchmarks through GitHub Actions.

diff --git a/README.zh-CN.md b/README.zh-CN.md
@@ -55,6 +55,13 @@
 `adapter generate` 已经是当前仓库里动态 `custom-adapter` bootstrap 的一部分。
 但当前实现仍然是基于本地仓库检查和启发式推断，不是“仓库内置远程 LLM 动态生成 adapter”。
 
+## 环境要求
+
+- 推荐使用 Node.js `24.x` 作为本地与 CI 运行时
+- 预期使用 npm `11.x`
+- `npm ci` 是标准的干净安装路径，并已作为 PR 质量门禁的一部分
+- benchmark workflow 仍会按需安装额外运行时依赖，例如 Python、Harbor、tau-bench
+
 ## Quick Start
 
 这个仓库的主要用法仍然是通过 GitHub Actions 发起评测。

diff --git a/src/benchmark/sae/runner.ts b/src/benchmark/sae/runner.ts
@@ -234,6 +234,17 @@ function mergeSummary(summary: SummaryResult, benchmarkResult: Record<string, un
   };
 }
 
+/** Returns `value` if it is a finite number > 0; an undefined `value` yields `fallback`, or throws when there is none. */
+function normalizePositiveMs(value: number | undefined, label: string, fallback?: number): number {
+  if (value !== undefined) {
+    // Number.isFinite is false for NaN and ±Infinity, so those fail here along with zero and negatives.
+    if (Number.isFinite(value) && value > 0) return value;
+    throw new Error(`${label} must be a positive finite number of milliseconds.`);
+  }
+  if (fallback === undefined) throw new Error(`${label} is required.`);
+  return fallback;
+}
+
 function withTemporaryBaseUrl<T>(modelName: string, baseUrl: string | undefined, fn: () => Promise<T>): Promise<T> {
   if (!baseUrl) {
     return fn();
@@ -254,6 +265,8 @@ function withTemporaryBaseUrl<T>(modelName: string, baseUrl: string | undefined,
 }
 
 export async function runSaeBenchmark(config: SaeBenchmarkConfig): Promise<SaeBenchmarkOutcome> {
+  const saeTimeoutMs = normalizePositiveMs(config.saeTimeoutMs, 'saeTimeoutMs', 30 * 60 * 1000);
+  const saePollIntervalMs = normalizePositiveMs(config.saePollIntervalMs, 'saePollIntervalMs');
   const layout = createRunLayout(config.runDir, config.runId);
   const artifactDir = path.join(layout.runDir, 'artifacts', 'sae');
   const metadata: RunMetadata = {
@@ -270,8 +283,8 @@ export async function runSaeBenchmark(config: SaeBenchmarkConfig): Promise<SaeBe
       registerIfMissing: config.saeRegisterIfMissing,
       agentIdFile: expandHome(config.saeAgentIdFile),
       apiKeyFile: expandHome(config.saeApiKeyFile),
-      timeoutMs: config.saeTimeoutMs,
-      pollIntervalMs: config.saePollIntervalMs,
+      timeoutMs: saeTimeoutMs,
+      pollIntervalMs: saePollIntervalMs,
     },
     harnessConfig: {
       workDir: config.workDir,
@@ -292,7 +305,7 @@ export async function runSaeBenchmark(config: SaeBenchmarkConfig): Promise<SaeBe
   let credentials: SaeAgentCredentials | undefined;
 
   try {
-    const deadlineAt = Date.now() + (config.saeTimeoutMs ?? 30 * 60 * 1000);
+    const deadlineAt = Date.now() + saeTimeoutMs;
     const credentialState = await loadOrRegisterCredentials(client, config, artifactDir);
     credentials = credentialState.credentials;
 
@@ -330,7 +343,7 @@ export async function runSaeBenchmark(config: SaeBenchmarkConfig): Promise<SaeBe
       client,
       submission,
       credentials.apiToken,
-      config.saePollIntervalMs,
+      saePollIntervalMs,
       deadlineAt
     );
     await writeJson(path.join(artifactDir, 'final_submission.json'), submission);

diff --git a/src/cli/kbench.ts b/src/cli/kbench.ts
@@ -683,6 +683,19 @@ function getDefaultTimeoutMs(benchmark: BenchmarkId, harness: string): number |
   return undefined;
 }
 
+/** Reads flag `name` from `values` as a positive integer; returns undefined when the flag is absent, throws when it is invalid. */
+function parsePositiveIntegerFlag(values: Map<string, string>, name: string): number | undefined {
+  const raw = values.get(name);
+  if (raw === undefined) return undefined;
+
+  // isSafeInteger rejects NaN, ±Infinity and fractions, and also integers too large to be represented exactly.
+  const parsed = Number(raw);
+  if (!Number.isSafeInteger(parsed) || parsed <= 0) {
+    throw new Error(`Invalid --${name}. Expected a positive integer.`);
+  }
+  return parsed;
+}
+
 function parseRunArgs(argv: string[]): RunCliArgs {
   const values = parseFlags(argv);
   const benchmark = values.get('benchmark') as BenchmarkId | undefined;
@@ -702,7 +715,7 @@ function parseRunArgs(argv: string[]): RunCliArgs {
   const runId = values.get('run-id') || nowId('run');
   const instanceId = values.get('instance-id') || `${benchmark}-instance`;
   const runDir = path.resolve(values.get('run-dir') || path.join(process.cwd(), '.kbench', 'runs', runId));
-  const explicitTimeoutMs = values.get('timeout-ms') ? Number(values.get('timeout-ms')) : undefined;
+  const explicitTimeoutMs = parsePositiveIntegerFlag(values, 'timeout-ms');
   const configModeValue = values.get('config-mode');
   if (configModeValue && configModeValue !== 'inherit' && configModeValue !== 'isolated') {
     throw new Error('Invalid --config-mode. Expected one of: inherit, isolated.');
@@ -770,7 +783,7 @@ function parseBenchmarkRunArgs(argv: string[]): BenchmarkRunCliArgs {
     storeDirProvided: values.has('store-dir'),
     workDir: values.get('workdir') ? path.resolve(values.get('workdir') as string) : undefined,
     storeDir: values.get('store-dir') ? path.resolve(values.get('store-dir') as string) : undefined,
-    timeoutMs: values.get('sae-timeout-ms') ? Number(values.get('sae-timeout-ms')) : undefined,
+    timeoutMs: parsePositiveIntegerFlag(values, 'sae-timeout-ms'),
     saeApiBase: values.get('sae-api-base') || 'https://www.kaggle.com/api/v1',
     saeAgentIdFile: values.get('sae-agent-id-file') || '~/.kaggle-agent-id',
     saeApiKeyFile: values.get('sae-api-key-file') || '~/.kaggle-agent-api-key',
@@ -779,7 +792,7 @@ function parseBenchmarkRunArgs(argv: string[]): BenchmarkRunCliArgs {
     saeAgentDescription: values.get('sae-agent-description'),
     saeAgentVersion: values.get('sae-agent-version') || '1.0',
     saeAgentType: values.get('sae-agent-type') || harness,
-    saePollIntervalMs: values.get('sae-poll-interval-ms') ? Number(values.get('sae-poll-interval-ms')) : 2000,
+    saePollIntervalMs: parsePositiveIntegerFlag(values, 'sae-poll-interval-ms') ?? 2000,
   };
 }
 

diff --git a/src/harness/drivers/cli/runtime.ts b/src/harness/drivers/cli/runtime.ts
@@ -93,9 +93,6 @@ export function extractPatchSinceBaseline(baseline: GitPatchBaseline): string |
   if ((baseline.beforeDiff || '') === afterDiff) {
     return undefined;
   }
-  if (baseline.beforeDiff && baseline.beforeDiff.trim()) {
-    return undefined;
-  }
   return afterDiff;
 }
 

diff --git a/src/harness/sdk/validate.ts b/src/harness/sdk/validate.ts
@@ -284,7 +284,7 @@ export async function validateAdapter(adapterPath: string): Promise<AdapterValid
   }
 
   return {
-    ok: loaded.schema.ok && entryValidation.ok && executionChecks.every((check) => check.ok || check.warnings.length > 0),
+    ok: loaded.schema.ok && entryValidation.ok && executionChecks.every((check) => check.ok),
     adapterPath: loaded.adapterPath,
     manifestPath: loaded.manifestPath,
     entryPath: loaded.manifest ? loaded.entryPath : undefined,

diff --git a/test/benchmark/sae-runner.test.ts b/test/benchmark/sae-runner.test.ts
@@ -1,6 +1,9 @@
+import os from 'node:os';
+import path from 'node:path';
+
 import { describe, expect, it } from 'vitest';
 
-import { parseBoolean } from '../../src/benchmark/sae/runner.js';
+import { parseBoolean, runSaeBenchmark } from '../../src/benchmark/sae/runner.js';
 
 describe('SAE runner helpers', () => {
   it('parses common truthy values', () => {
@@ -21,4 +24,20 @@ describe('SAE runner helpers', () => {
     expect(parseBoolean(undefined, true)).toBe(true);
     expect(parseBoolean('maybe', false)).toBe(false);
   });
+
+  it('rejects invalid polling configuration before making network requests', async () => {
+    await expect(runSaeBenchmark({
+      runId: 'sae-invalid-config',
+      runDir: path.join(os.tmpdir(), 'kbench-sae-invalid-config'),
+      harness: 'kode-agent-sdk',
+      modelName: 'openai/gpt-4.1-mini',
+      saeApiBase: 'https://www.kaggle.com/api/v1',
+      saeAgentIdFile: '~/.kaggle-agent-id',
+      saeApiKeyFile: '~/.kaggle-agent-api-key',
+      saeRegisterIfMissing: false,
+      saeAgentVersion: '1.0',
+      saeAgentType: 'kode-agent-sdk',
+      saePollIntervalMs: Number.NaN,
+    })).rejects.toThrow('saePollIntervalMs must be a positive finite number of milliseconds.');
+  });
 });
diff --git a/test/cli/kbench-cli.test.ts b/test/cli/kbench-cli.test.ts
@@ -99,4 +99,59 @@ describe('kbench CLI', () => {
     expect(payload.benchmarkError.message).toContain(missingIdFile);
     expect(payload.benchmarkError.message).toContain(missingKeyFile);
   });
+
+  it('rejects invalid sae-timeout-ms values before running the benchmark', async () => {
+    const result = await runKbench([
+      'benchmark',
+      'run',
+      '--benchmark',
+      'sae',
+      '--harness',
+      'kode-agent-sdk',
+      '--model-name',
+      'openai/gpt-4.1-mini',
+      '--sae-timeout-ms',
+      'NaN',
+    ]);
+
+    expect(result.status).toBe(1);
+    expect(result.stderr).toContain('Invalid --sae-timeout-ms. Expected a positive integer.');
+  });
+
+  it('rejects invalid sae-poll-interval-ms values before running the benchmark', async () => {
+    const result = await runKbench([
+      'benchmark',
+      'run',
+      '--benchmark',
+      'sae',
+      '--harness',
+      'kode-agent-sdk',
+      '--model-name',
+      'openai/gpt-4.1-mini',
+      '--sae-poll-interval-ms',
+      '0',
+    ]);
+
+    expect(result.status).toBe(1);
+    expect(result.stderr).toContain('Invalid --sae-poll-interval-ms. Expected a positive integer.');
+  });
+
+  it('rejects invalid timeout-ms values before running a single instance', async () => {
+    const result = await runKbench([
+      'run',
+      '--benchmark',
+      'swe',
+      '--harness',
+      'kode-agent-sdk',
+      '--model-name',
+      'openai/gpt-4.1-mini',
+      '--instruction',
+      'Fix the bug',
+      '--timeout-ms',
+      'NaN',
+    ]);
+
+    expect(result.status).toBe(1);
+    expect(result.stderr).toContain('Invalid --timeout-ms. Expected a positive integer.');
+  });
 });
diff --git a/test/harness/cli-runtime.test.ts b/test/harness/cli-runtime.test.ts
@@ -0,0 +1,45 @@
+import { execFileSync } from 'node:child_process';
+import fs from 'node:fs/promises';
+import os from 'node:os';
+import path from 'node:path';
+
+import { afterEach, describe, expect, it } from 'vitest';
+
+import { captureGitPatchBaseline, extractPatchSinceBaseline } from '../../src/harness/drivers/cli/runtime.js';
+
+const tempDirs: string[] = [];
+// Delete every directory recorded in tempDirs after each test; splice(0) empties the list as it is consumed.
+afterEach(async () => {
+  await Promise.all(tempDirs.splice(0).map((dir) => fs.rm(dir, { recursive: true, force: true })));
+});
+
+/** Creates a temporary git repository with a single committed `tracked.txt` and returns its directory. */
+async function makeGitRepo(): Promise<string> {
+  const repoRoot = await fs.mkdtemp(path.join(os.tmpdir(), 'kbench-cli-runtime-'));
+  tempDirs.push(repoRoot); // recorded so the afterEach hook deletes it
+  const git = (...args: string[]) => execFileSync('git', args, { cwd: repoRoot, stdio: 'ignore' });
+
+  git('init');
+  git('config', 'user.email', 'kbench@example.com');
+  git('config', 'user.name', 'kbench');
+  await fs.writeFile(path.join(repoRoot, 'tracked.txt'), 'base\n', 'utf-8');
+  git('add', 'tracked.txt');
+  git('commit', '-m', 'init');
+  return repoRoot;
+}
+
+describe('CLI runtime patch capture', () => {
+  it('captures the current diff even when the worktree was already dirty before execution', async () => {
+    const repoDir = await makeGitRepo();
+    // Modify the tracked file before the baseline is captured, so the worktree is already dirty at that point.
+    await fs.writeFile(path.join(repoDir, 'tracked.txt'), 'base\nbefore\n', 'utf-8');
+    const baseline = captureGitPatchBaseline(repoDir);
+    // Change the file again after the baseline; the extracted patch is expected to mention this second edit.
+    await fs.writeFile(path.join(repoDir, 'tracked.txt'), 'base\nbefore\nafter\n', 'utf-8');
+    const patch = extractPatchSinceBaseline(baseline);
+
+    expect(patch).toBeDefined();
+    expect(patch).toContain('tracked.txt');
+    expect(patch).toContain('after');
+  });
+});
diff --git a/test/sdk/adapter.test.ts b/test/sdk/adapter.test.ts
@@ -60,4 +60,54 @@ describe('adapter scaffolding', () => {
     expect(report.executionChecks[0]?.mode).toBe('task');
     expect(report.executionChecks[0]?.output?.status).toBe('ok');
   });
+
+  it('does not mark adapter validation as ok when execution has both errors and warnings', async () => {
+    const root = await makeTempRoot();
+    const adapterDir = path.join(root, 'warning-error-adapter');
+
+    await fs.mkdir(adapterDir, { recursive: true });
+    await fs.writeFile(
+      path.join(adapterDir, 'adapter.manifest.json'),
+      `${JSON.stringify({
+        schemaVersion: 'kbench.adapter/v1',
+        id: 'warning-error-adapter',
+        kind: 'node',
+        entry: './runner.mjs',
+        version: '0.1.0',
+        supportedBenchmarks: ['swe'],
+        capabilities: {
+          runModes: ['task'],
+          machineReadableStdout: true,
+          supportsPatchOutput: false,
+          supportsTrajectory: false,
+          supportsToolCallTrace: false,
+          supportsResume: false,
+          supportsImages: false,
+          supportsSandboxBridge: false,
+          supportsPromptTemplate: false,
+        },
+      }, null, 2)}\n`,
+      'utf-8'
+    );
+    await fs.writeFile(
+      path.join(adapterDir, 'runner.mjs'),
+      `#!/usr/bin/env node
+process.stdout.write(JSON.stringify({
+  ok: true,
+  status: 'bad-status',
+  elapsedMs: 1,
+  patch: 'diff --git a/a b/a\\n',
+}));
+`,
+      { encoding: 'utf-8', mode: 0o755 }
+    );
+
+    const report = await validateAdapter(adapterDir);
+
+    expect(report.ok).toBe(false);
+    expect(report.executionChecks).toHaveLength(1);
+    expect(report.executionChecks[0]?.ok).toBe(false);
+    expect(report.executionChecks[0]?.errors.some((error) => error.includes('valid "status"'))).toBe(true);
+    expect(report.executionChecks[0]?.warnings.some((warning) => warning.includes('returned a patch'))).toBe(true);
+  });
 });