diff --git a/EVAL_WORKFLOW.md b/EVAL_WORKFLOW.md index 8984acd..2ef66b1 100644 --- a/EVAL_WORKFLOW.md +++ b/EVAL_WORKFLOW.md @@ -227,3 +227,26 @@ Your solution should: - Group related functionality together - Include type imports from generated files - Add helpful comments for complex logic + +## AI grading + +The evals include a lightweight AI grader that reviews the generated project for each eval and provides concise reasoning on pass/fail. + +- The grader builds a prompt from `TASK.txt` plus a manifest of files from the generated output directory and asks a model to decide pass/fail with reasoning. +- The helper logs reasoning on every run and, on failure, throws an error with that reasoning so it appears directly in the test output and in `run.log`. + +### How to use in a grader test + +Add a single standardized test using the helper: + +```ts +import { createAIGraderTest } from "../../../grader/aiGrader"; + +// Basic usage (default name and 60s timeout) +createAIGraderTest(import.meta.url); + +// Optional: custom test name and timeout (in milliseconds) +createAIGraderTest(import.meta.url, "AI grader assessment", 60000); +``` + +That’s it. On failure, the thrown error message will include the AI reasoning. The same reasoning is logged to the console and captured in the eval’s `run.log` by the default Vitest reporter. diff --git a/README.md b/README.md index cae7710..c051629 100644 --- a/README.md +++ b/README.md @@ -82,6 +82,10 @@ Output: Optional Convex summary posting (still local mode): set both `CONVEX_EVAL_ENDPOINT` and `CONVEX_AUTH_TOKEN`. +## AI grading helper + +Grader tests can include an AI-based assessment that logs concise pass/fail reasoning on every run and includes that reasoning in the error on failure. See the "AI grading" section in `EVAL_WORKFLOW.md` for details and usage with `createAIGraderTest(import.meta.url)`. + ## Rerunning grading After running the evals, you may want to dig into a particular test failure. 
You can use the `run_grader.py` script to grade the evaluations again without regenerating them: diff --git a/bun.lock b/bun.lock index 46e20ce..4a74360 100644 --- a/bun.lock +++ b/bun.lock @@ -4,13 +4,16 @@ "": { "name": "evals-convex", "dependencies": { + "@ai-sdk/openai": "^2.0.19", "@types/bun": "^1.2.20", "@types/node": "^22.12.0", + "ai": "^5.0.22", "convex": "^1.18.2", "dotenv": "^17.2.1", "prettier": "^3.4.2", "typescript-eslint": "^8.23.0", "vitest": "^3.0.2", + "zod": "^3.23.8", }, "devDependencies": { "@eslint/eslintrc": "^3.2.0", @@ -27,6 +30,14 @@ }, }, "packages": { + "@ai-sdk/gateway": ["@ai-sdk/gateway@1.0.11", "", { "dependencies": { "@ai-sdk/provider": "2.0.0", "@ai-sdk/provider-utils": "3.0.5" }, "peerDependencies": { "zod": "^3.25.76 || ^4" } }, "sha512-ErwWS3sPOuWy42eE3AVxlKkTa1XjjKBEtNCOylVKMO5KNyz5qie8QVlLYbULOG56dtxX4zTKX3rQNJudplhcmQ=="], + + "@ai-sdk/openai": ["@ai-sdk/openai@2.0.19", "", { "dependencies": { "@ai-sdk/provider": "2.0.0", "@ai-sdk/provider-utils": "3.0.5" }, "peerDependencies": { "zod": "^3.25.76 || ^4" } }, "sha512-sG3/IVaPvV7Vn6513I1bcJILHpLCXbVif2ht6CyROcB9FzXCJe2K5uRbAg30HWsdCEe7xu4OAWtMK6yWTOcsSA=="], + + "@ai-sdk/provider": ["@ai-sdk/provider@2.0.0", "", { "dependencies": { "json-schema": "^0.4.0" } }, "sha512-6o7Y2SeO9vFKB8lArHXehNuusnpddKPk7xqL7T2/b+OvXMRIXUO1rR4wcv1hAFUAT9avGZshty3Wlua/XA7TvA=="], + + "@ai-sdk/provider-utils": ["@ai-sdk/provider-utils@3.0.5", "", { "dependencies": { "@ai-sdk/provider": "2.0.0", "@standard-schema/spec": "^1.0.0", "eventsource-parser": "^3.0.3", "zod-to-json-schema": "^3.24.1" }, "peerDependencies": { "zod": "^3.25.76 || ^4" } }, "sha512-HliwB/yzufw3iwczbFVE2Fiwf1XqROB/I6ng8EKUsPM5+2wnIa8f4VbljZcDx+grhFrPV+PnRZH7zBqi8WZM7Q=="], + "@ampproject/remapping": ["@ampproject/remapping@2.3.0", "", { "dependencies": { "@jridgewell/gen-mapping": "^0.3.5", "@jridgewell/trace-mapping": "^0.3.24" } }, 
"sha512-30iZtAPgz+LTIYoeivqYo853f02jBYSd5uGnGpkFV0M3xOt9aN73erkgYAmZU43x4VfqcnLxW9Kpg3R5LC4YYw=="], "@babel/code-frame": ["@babel/code-frame@7.26.2", "", { "dependencies": { "@babel/helper-validator-identifier": "^7.25.9", "js-tokens": "^4.0.0", "picocolors": "^1.0.0" } }, "sha512-RJlIHRueQgwWitWgF8OdFYGZX328Ax5BCemNGlqHfplnRT9ESi8JkFlvaVYbS+UubVY6dpv87Fs2u5M29iNFVQ=="], @@ -157,6 +168,8 @@ "@nodelib/fs.walk": ["@nodelib/fs.walk@1.2.8", "", { "dependencies": { "@nodelib/fs.scandir": "2.1.5", "fastq": "^1.6.0" } }, "sha512-oGB+UxlgWcgQkgwo8GcEGwemoTFt3FIO9ababBmaGwXIoBKZ+GTy0pP185beGg7Llih/NSHSV2XAs1lnznocSg=="], + "@opentelemetry/api": ["@opentelemetry/api@1.9.0", "", {}, "sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg=="], + "@rollup/rollup-android-arm-eabi": ["@rollup/rollup-android-arm-eabi@4.31.0", "", { "os": "android", "cpu": "arm" }, "sha512-9NrR4033uCbUBRgvLcBrJofa2KY9DzxL2UKZ1/4xA/mnTNyhZCWBuD8X3tPm1n4KxcgaraOYgrFKSgwjASfmlA=="], "@rollup/rollup-android-arm64": ["@rollup/rollup-android-arm64@4.31.0", "", { "os": "android", "cpu": "arm64" }, "sha512-iBbODqT86YBFHajxxF8ebj2hwKm1k8PTBQSojSt3d1FFt1gN+xf4CowE47iN0vOSdnd+5ierMHBbu/rHc7nq5g=="], @@ -195,6 +208,8 @@ "@rollup/rollup-win32-x64-msvc": ["@rollup/rollup-win32-x64-msvc@4.31.0", "", { "os": "win32", "cpu": "x64" }, "sha512-ul8rnCsUumNln5YWwz0ted2ZHFhzhRRnkpBZ+YRuHoRAlUji9KChpOUOndY7uykrPEPXVbHLlsdo6v5yXo/TXw=="], + "@standard-schema/spec": ["@standard-schema/spec@1.0.0", "", {}, "sha512-m2bOd0f2RT9k8QJx1JN85cZYyH1RqFBdlwtkSlf4tBDYLCiiZnv1fIIwacK6cqwXavOydf0NPToMQgpKq+dVlA=="], + "@types/babel__core": ["@types/babel__core@7.20.5", "", { "dependencies": { "@babel/parser": "^7.20.7", "@babel/types": "^7.20.7", "@types/babel__generator": "*", "@types/babel__template": "*", "@types/babel__traverse": "*" } }, "sha512-qoQprZvz5wQFJwMDqeseRXWv3rqMvhgpbXFfVyWhbx9X47POIA6i/+dXefEmZKoAgOaTdaIgNSMqMIU61yRyzA=="], "@types/babel__generator": 
["@types/babel__generator@7.6.8", "", { "dependencies": { "@babel/types": "^7.0.0" } }, "sha512-ASsj+tpEDsEiFr1arWrlN6V3mdfjRMZt6LtK/Vp/kreFLnr5QH5+DhvD5nINYZXzwJvXeGq+05iUXcAzVrqWtw=="], @@ -249,6 +264,8 @@ "acorn-jsx": ["acorn-jsx@5.3.2", "", { "peerDependencies": { "acorn": "^6.0.0 || ^7.0.0 || ^8.0.0" } }, "sha512-rq9s+JNhf0IChjtDXxllJ7g41oZk5SlXtp0LHwyA5cejwn7vKmKp4pPri6YEePv2PU65sAsegbXtIinmDFDXgQ=="], + "ai": ["ai@5.0.22", "", { "dependencies": { "@ai-sdk/gateway": "1.0.11", "@ai-sdk/provider": "2.0.0", "@ai-sdk/provider-utils": "3.0.5", "@opentelemetry/api": "1.9.0" }, "peerDependencies": { "zod": "^3.25.76 || ^4" } }, "sha512-RZiYhj7Ux7hrLtXkHPcxzdiSZt4NOiC69O5AkNfMCsz3twwz/KRkl9ASptosoOsg833s5yRcTSdIu5z53Sl6Pw=="], + "ajv": ["ajv@6.12.6", "", { "dependencies": { "fast-deep-equal": "^3.1.1", "fast-json-stable-stringify": "^2.0.0", "json-schema-traverse": "^0.4.1", "uri-js": "^4.2.2" } }, "sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g=="], "ansi-styles": ["ansi-styles@4.3.0", "", { "dependencies": { "color-convert": "^2.0.1" } }, "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg=="], @@ -335,6 +352,8 @@ "esutils": ["esutils@2.0.3", "", {}, "sha512-kVscqXk4OCp68SZ0dkgEKVi6/8ij300KBWTJq32P/dYeWTSwK41WyTxalN1eRmA5Z9UU/LX9D7FWSmV9SAYx6g=="], + "eventsource-parser": ["eventsource-parser@3.0.5", "", {}, "sha512-bSRG85ZrMdmWtm7qkF9He9TNRzc/Bm99gEJMaQoHJ9E6Kv9QBbsldh2oMj7iXmYNEAVvNgvv5vPorG6W+XtBhQ=="], + "expect-type": ["expect-type@1.1.0", "", {}, "sha512-bFi65yM+xZgk+u/KRIpekdSYkTB5W1pEf0Lt8Q8Msh7b+eQ7LXVtIB1Bkm4fvclDEL1b2CZkMhv2mOeF8tMdkA=="], "fast-deep-equal": ["fast-deep-equal@3.1.3", "", {}, "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q=="], @@ -391,6 +410,8 @@ "json-buffer": ["json-buffer@3.0.1", "", {}, "sha512-4bV5BfR2mqfQTJm+V5tPPdf+ZpuhiIvTuAB5g8kcrXOZpTT/QwwVRWBywX1ozr6lEuPdbHxwaJlm9G6mI2sfSQ=="], + 
"json-schema": ["json-schema@0.4.0", "", {}, "sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA=="], + "json-schema-traverse": ["json-schema-traverse@0.4.1", "", {}, "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg=="], "json-stable-stringify-without-jsonify": ["json-stable-stringify-without-jsonify@1.0.1", "", {}, "sha512-Bdboy+l7tA3OGW6FjyFHWkP5LuByj1Tk33Ljyq0axyzdk9//JSi2u3fP1QSmd1KNwq6VOKYGlAu87CisVir6Pw=="], @@ -527,6 +548,10 @@ "yocto-queue": ["yocto-queue@0.1.0", "", {}, "sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q=="], + "zod": ["zod@3.25.76", "", {}, "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ=="], + + "zod-to-json-schema": ["zod-to-json-schema@3.24.6", "", { "peerDependencies": { "zod": "^3.24.1" } }, "sha512-h/z3PKvcTcTetyjl1fkj79MHNEjm+HpD6NXheWjzOekY7kV+lwDYnHw+ivHkijnCSMz1yJaWBD9vu/Fcmk+vEg=="], + "@babel/core/semver": ["semver@6.3.1", "", { "bin": { "semver": "bin/semver.js" } }, "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA=="], "@babel/helper-compilation-targets/semver": ["semver@6.3.1", "", { "bin": { "semver": "bin/semver.js" } }, "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA=="], diff --git a/evals/000-fundamentals/008-helper_fns/grader.test.ts b/evals/000-fundamentals/008-helper_fns/grader.test.ts index 0c4df6d..05d9b2c 100644 --- a/evals/000-fundamentals/008-helper_fns/grader.test.ts +++ b/evals/000-fundamentals/008-helper_fns/grader.test.ts @@ -7,6 +7,9 @@ import { } from "../../../grader"; import { api } from "./answer/convex/_generated/api"; import { Doc, Id } from "./answer/convex/_generated/dataModel"; +import { createAIGraderTest } from "../../../grader/aiGrader"; + +createAIGraderTest(import.meta.url); test("getItem and updateItem handle non-existent 
items", async () => { // Try to get a non-existent item diff --git a/evals/001-data_modeling/007-schema_evolution/grader.test.ts b/evals/001-data_modeling/007-schema_evolution/grader.test.ts index 12a58e7..7db0db0 100644 --- a/evals/001-data_modeling/007-schema_evolution/grader.test.ts +++ b/evals/001-data_modeling/007-schema_evolution/grader.test.ts @@ -8,6 +8,10 @@ import { import { api, internal } from "./answer/convex/_generated/api"; import { Doc } from "./answer/convex/_generated/dataModel"; +import { createAIGraderTest } from "../../../grader/aiGrader"; + +createAIGraderTest(import.meta.url); + test("migration helper transforms data correctly", async () => { // Insert a product with old schema format await addDocuments(responseAdminClient, "products", [ diff --git a/evals/001-data_modeling/009-normalize_json/grader.test.ts b/evals/001-data_modeling/009-normalize_json/grader.test.ts index 42a9e44..dbef2f7 100644 --- a/evals/001-data_modeling/009-normalize_json/grader.test.ts +++ b/evals/001-data_modeling/009-normalize_json/grader.test.ts @@ -7,6 +7,9 @@ import { hasIndexWithPrefix, getSchema, } from "../../../grader"; +import { createAIGraderTest } from "../../../grader/aiGrader"; + +createAIGraderTest(import.meta.url); test("organization data model works correctly", async () => { const schema = (await getSchema( diff --git a/evals/001-data_modeling/010-discriminated_union/grader.test.ts b/evals/001-data_modeling/010-discriminated_union/grader.test.ts index 4cdc1cb..86323a3 100644 --- a/evals/001-data_modeling/010-discriminated_union/grader.test.ts +++ b/evals/001-data_modeling/010-discriminated_union/grader.test.ts @@ -1,5 +1,8 @@ import { expect, test } from "vitest"; import { responseAdminClient, addDocuments } from "../../../grader"; +import { createAIGraderTest } from "../../../grader/aiGrader"; + +createAIGraderTest(import.meta.url); test("schema validates different notification types correctly", async () => { // Valid notifications diff --git 
a/evals/001-data_modeling/011-deconstruct_validators/grader.test.ts b/evals/001-data_modeling/011-deconstruct_validators/grader.test.ts index a9a41b2..3b4de22 100644 --- a/evals/001-data_modeling/011-deconstruct_validators/grader.test.ts +++ b/evals/001-data_modeling/011-deconstruct_validators/grader.test.ts @@ -8,6 +8,10 @@ import { import { resultValidator } from "./answer/convex/schema"; import { VLiteral, VObject, VString } from "convex/values"; +import { createAIGraderTest } from "../../../grader/aiGrader"; + +createAIGraderTest(import.meta.url); + afterAll(async () => { await deleteAllDocuments(responseAdminClient, ["llm_calls", "api_calls"]); }); @@ -26,11 +30,16 @@ test("resultValidator is exported as the correct type", async () => { expect(resultValidator.members[0].fields.success.kind).toBe("literal"); expect(resultValidator.members[1].fields).toHaveProperty("success"); expect(resultValidator.members[1].fields.success.kind).toBe("literal"); - let [success, error] = resultValidator.members as VObject<{ success: false; error?: string; value?: string }, { - success: VLiteral; - error?: VString; - value?: VString; - }, "required", "success" | "error" | "value">[]; + let [success, error] = resultValidator.members as VObject< + { success: false; error?: string; value?: string }, + { + success: VLiteral; + error?: VString; + value?: VString; + }, + "required", + "success" | "error" | "value" + >[]; if (success.fields.success.value !== true) { [success, error] = [error, success]; } @@ -40,82 +49,95 @@ test("resultValidator is exported as the correct type", async () => { expect(error.fields.error!.kind).toBe("string"); }); - test("schema validates successful results correctly", async () => { - await expect(addDocuments(responseAdminClient, "llm_calls", [ - { - prompt: "What is the capital of France?", - result: { - success: true, - value: "Paris" - } - } - ])).resolves.toBeUndefined(); + await expect( + addDocuments(responseAdminClient, "llm_calls", [ + { + 
prompt: "What is the capital of France?", + result: { + success: true, + value: "Paris", + }, + }, + ]), + ).resolves.toBeUndefined(); - await expect(addDocuments(responseAdminClient, "api_calls", [ - { - url: "https://api.example.com/data", - result: { - success: true, - value: "response data" - } - } - ])).resolves.toBeUndefined(); + await expect( + addDocuments(responseAdminClient, "api_calls", [ + { + url: "https://api.example.com/data", + result: { + success: true, + value: "response data", + }, + }, + ]), + ).resolves.toBeUndefined(); }); test("schema validates error results correctly", async () => { - await expect(addDocuments(responseAdminClient, "llm_calls", [ - { - prompt: "Invalid prompt", - result: { - success: false, - error: "Failed to process prompt" - } - } - ])).resolves.toBeUndefined(); + await expect( + addDocuments(responseAdminClient, "llm_calls", [ + { + prompt: "Invalid prompt", + result: { + success: false, + error: "Failed to process prompt", + }, + }, + ]), + ).resolves.toBeUndefined(); - await expect(addDocuments(responseAdminClient, "api_calls", [ - { - url: "https://api.example.com/invalid", - result: { - success: false, - error: "404 Not Found" - } - } - ])).resolves.toBeUndefined(); + await expect( + addDocuments(responseAdminClient, "api_calls", [ + { + url: "https://api.example.com/invalid", + result: { + success: false, + error: "404 Not Found", + }, + }, + ]), + ).resolves.toBeUndefined(); }); test("schema rejects invalid result formats", async () => { // Missing required fields - await expect(addDocuments(responseAdminClient, "llm_calls", [ - { - prompt: "test", - result: { - success: true - // missing value field - } - } - ])).rejects.toThrow(); + await expect( + addDocuments(responseAdminClient, "llm_calls", [ + { + prompt: "test", + result: { + success: true, + // missing value field + }, + }, + ]), + ).rejects.toThrow(); // Wrong field types - await expect(addDocuments(responseAdminClient, "api_calls", [ - { - url: 
"https://example.com", - result: { - success: false, - error: 123 // should be string - } - } - ])).rejects.toThrow(); + await expect( + addDocuments(responseAdminClient, "api_calls", [ + { + url: "https://example.com", + result: { + success: false, + error: 123, // should be string + }, + }, + ]), + ).rejects.toThrow(); // Invalid success value - await expect(addDocuments(responseAdminClient, "llm_calls", [ - { - prompt: "test", - result: { - success: "yes", // should be boolean literal - value: "test" - } - } - ])).rejects.toThrow(); -}); \ No newline at end of file + await expect( + addDocuments(responseAdminClient, "llm_calls", [ + { + prompt: "test", + result: { + success: "yes", // should be boolean literal + value: "test", + }, + }, + ]), + ).rejects.toThrow(); +}); diff --git a/evals/001-data_modeling/012-denormalize_for_index/grader.test.ts b/evals/001-data_modeling/012-denormalize_for_index/grader.test.ts index 8b930a4..4b783f5 100644 --- a/evals/001-data_modeling/012-denormalize_for_index/grader.test.ts +++ b/evals/001-data_modeling/012-denormalize_for_index/grader.test.ts @@ -8,6 +8,9 @@ import { } from "../../../grader"; import { api } from "./answer/convex/_generated/api"; import { beforeEach } from "node:test"; +import { createAIGraderTest } from "../../../grader/aiGrader"; + +createAIGraderTest(import.meta.url); type IdOwners = string & { __tableName: "owners" }; type DogRow = { diff --git a/evals/002-queries/006-three_level_join/grader.test.ts b/evals/002-queries/006-three_level_join/grader.test.ts index 03b8130..debf143 100644 --- a/evals/002-queries/006-three_level_join/grader.test.ts +++ b/evals/002-queries/006-three_level_join/grader.test.ts @@ -7,6 +7,9 @@ import { listTable, } from "../../../grader"; import { api } from "./answer/convex/_generated/api"; +import { createAIGraderTest } from "../../../grader/aiGrader"; + +createAIGraderTest(import.meta.url); test("compare schema", async ({ skip }) => { await compareSchema(skip); diff --git 
a/evals/002-queries/007-aggregation/grader.test.ts b/evals/002-queries/007-aggregation/grader.test.ts index 1e17a3e..93d9f71 100644 --- a/evals/002-queries/007-aggregation/grader.test.ts +++ b/evals/002-queries/007-aggregation/grader.test.ts @@ -6,6 +6,9 @@ import { addDocuments, } from "../../../grader"; import { anyApi } from "convex/server"; +import { createAIGraderTest } from "../../../grader/aiGrader"; + +createAIGraderTest(import.meta.url); test("compare schema", async ({ skip }) => { await compareSchema(skip); diff --git a/evals/002-queries/008-group_by/grader.test.ts b/evals/002-queries/008-group_by/grader.test.ts index 6a78125..dc5feb9 100644 --- a/evals/002-queries/008-group_by/grader.test.ts +++ b/evals/002-queries/008-group_by/grader.test.ts @@ -6,6 +6,9 @@ import { addDocuments, } from "../../../grader"; import { anyApi } from "convex/server"; +import { createAIGraderTest } from "../../../grader/aiGrader"; + +createAIGraderTest(import.meta.url); test("compare schema", async ({ skip }) => { await compareSchema(skip); diff --git a/evals/002-queries/010-parallel_fetch/grader.test.ts b/evals/002-queries/010-parallel_fetch/grader.test.ts index 7288f12..eb74f6a 100644 --- a/evals/002-queries/010-parallel_fetch/grader.test.ts +++ b/evals/002-queries/010-parallel_fetch/grader.test.ts @@ -7,6 +7,9 @@ import { listTable, } from "../../../grader"; import { anyApi } from "convex/server"; +import { createAIGraderTest } from "../../../grader/aiGrader"; + +createAIGraderTest(import.meta.url); test("compare schema", async ({ skip }) => { await compareSchema(skip); diff --git a/evals/002-queries/011-denormalize_pagination/grader.test.ts b/evals/002-queries/011-denormalize_pagination/grader.test.ts index a75d600..310f720 100644 --- a/evals/002-queries/011-denormalize_pagination/grader.test.ts +++ b/evals/002-queries/011-denormalize_pagination/grader.test.ts @@ -10,6 +10,9 @@ import { import { api } from "./answer/convex/_generated/api"; import { Doc } from 
"./answer/convex/_generated/dataModel"; import { beforeEach } from "vitest"; +import { createAIGraderTest } from "../../../grader/aiGrader"; + +createAIGraderTest(import.meta.url); beforeEach(async () => { await deleteAllDocuments(responseAdminClient, ["dogs", "owners"]); diff --git a/evals/002-queries/012-index_and_filter/grader.test.ts b/evals/002-queries/012-index_and_filter/grader.test.ts index 3766028..a50dccb 100644 --- a/evals/002-queries/012-index_and_filter/grader.test.ts +++ b/evals/002-queries/012-index_and_filter/grader.test.ts @@ -7,6 +7,9 @@ import { deleteAllDocuments, } from "../../../grader"; import { api } from "./answer/convex/_generated/api"; +import { createAIGraderTest } from "../../../grader/aiGrader"; + +createAIGraderTest(import.meta.url); afterEach(async () => { await deleteAllDocuments(responseAdminClient, ["users"]); diff --git a/evals/002-queries/013-async_iterator_filter/grader.test.ts b/evals/002-queries/013-async_iterator_filter/grader.test.ts index 157a4ff..1178678 100644 --- a/evals/002-queries/013-async_iterator_filter/grader.test.ts +++ b/evals/002-queries/013-async_iterator_filter/grader.test.ts @@ -10,6 +10,9 @@ import { import { api } from "./answer/convex/_generated/api"; import { Doc } from "./answer/convex/_generated/dataModel"; import { beforeEach } from "vitest"; +import { createAIGraderTest } from "../../../grader/aiGrader"; + +createAIGraderTest(import.meta.url); beforeEach(async () => { await deleteAllDocuments(responseAdminClient, ["teams", "users"]); diff --git a/evals/002-queries/014-select_distinct/grader.test.ts b/evals/002-queries/014-select_distinct/grader.test.ts index 5a36a98..0f1605b 100644 --- a/evals/002-queries/014-select_distinct/grader.test.ts +++ b/evals/002-queries/014-select_distinct/grader.test.ts @@ -8,6 +8,9 @@ import { } from "../../../grader"; import { api } from "./answer/convex/_generated/api"; import { beforeEach } from "vitest"; +import { createAIGraderTest } from "../../../grader/aiGrader"; 
+ +createAIGraderTest(import.meta.url); beforeEach(async () => { await deleteAllDocuments(responseAdminClient, ["users"]); diff --git a/evals/002-queries/017-pagination_join/grader.test.ts b/evals/002-queries/017-pagination_join/grader.test.ts index 8f0701d..870aeac 100644 --- a/evals/002-queries/017-pagination_join/grader.test.ts +++ b/evals/002-queries/017-pagination_join/grader.test.ts @@ -8,6 +8,9 @@ import { listTable, } from "../../../grader"; import { api } from "./answer/convex/_generated/api"; +import { createAIGraderTest } from "../../../grader/aiGrader"; + +createAIGraderTest(import.meta.url); import { Doc } from "./answer/convex/_generated/dataModel"; import { beforeEach } from "vitest"; diff --git a/evals/002-queries/018-pagination_returns_validator/grader.test.ts b/evals/002-queries/018-pagination_returns_validator/grader.test.ts index 1963f88..a996d0b 100644 --- a/evals/002-queries/018-pagination_returns_validator/grader.test.ts +++ b/evals/002-queries/018-pagination_returns_validator/grader.test.ts @@ -10,6 +10,9 @@ import { api } from "./answer/convex/_generated/api"; import { beforeEach } from "vitest"; import { PaginationResult } from "convex/server"; import { Doc } from "./answer/convex/_generated/dataModel"; +import { createAIGraderTest } from "../../../grader/aiGrader"; + +createAIGraderTest(import.meta.url); beforeEach(async () => { await deleteAllDocuments(responseAdminClient, ["posts"]); diff --git a/evals/002-queries/019-no_scheduler/grader.test.ts b/evals/002-queries/019-no_scheduler/grader.test.ts index f2f89de..14321da 100644 --- a/evals/002-queries/019-no_scheduler/grader.test.ts +++ b/evals/002-queries/019-no_scheduler/grader.test.ts @@ -9,6 +9,9 @@ import { } from "../../../grader"; import { api } from "./answer/convex/_generated/api"; import { beforeEach } from "vitest"; +import { createAIGraderTest } from "../../../grader/aiGrader"; + +createAIGraderTest(import.meta.url); import { Doc } from "./answer/convex/_generated/dataModel"; 
beforeEach(async () => { diff --git a/evals/002-queries/020-text_search_join/grader.test.ts b/evals/002-queries/020-text_search_join/grader.test.ts index 1bff46d..30f92d5 100644 --- a/evals/002-queries/020-text_search_join/grader.test.ts +++ b/evals/002-queries/020-text_search_join/grader.test.ts @@ -8,6 +8,9 @@ import { listTable, } from "../../../grader"; import { api } from "./answer/convex/_generated/api"; +import { createAIGraderTest } from "../../../grader/aiGrader"; + +createAIGraderTest(import.meta.url); import { beforeEach } from "vitest"; import { Doc } from "./answer/convex/_generated/dataModel"; diff --git a/evals/002-queries/021-intersection/grader.test.ts b/evals/002-queries/021-intersection/grader.test.ts index 1c677d5..7ed1483 100644 --- a/evals/002-queries/021-intersection/grader.test.ts +++ b/evals/002-queries/021-intersection/grader.test.ts @@ -8,6 +8,9 @@ import { listTable, } from "../../../grader"; import { api } from "./answer/convex/_generated/api"; +import { createAIGraderTest } from "../../../grader/aiGrader"; + +createAIGraderTest(import.meta.url); import { beforeEach } from "vitest"; import { Doc } from "./answer/convex/_generated/dataModel"; diff --git a/evals/003-mutations/003-patch_nested/grader.test.ts b/evals/003-mutations/003-patch_nested/grader.test.ts index 7073f27..ddda0f3 100644 --- a/evals/003-mutations/003-patch_nested/grader.test.ts +++ b/evals/003-mutations/003-patch_nested/grader.test.ts @@ -7,6 +7,9 @@ import { } from "../../../grader"; import { api } from "./answer/convex/_generated/api"; import { beforeEach } from "vitest"; +import { createAIGraderTest } from "../../../grader/aiGrader"; + +createAIGraderTest(import.meta.url); beforeEach(async () => { await deleteAllDocuments(responseAdminClient, ["documents"]); diff --git a/evals/003-mutations/004-cascade_delete/grader.test.ts b/evals/003-mutations/004-cascade_delete/grader.test.ts index afffdf7..a8efe44 100644 --- a/evals/003-mutations/004-cascade_delete/grader.test.ts 
+++ b/evals/003-mutations/004-cascade_delete/grader.test.ts @@ -8,6 +8,9 @@ import { listTable, } from "../../../grader"; import { api } from "./answer/convex/_generated/api"; +import { createAIGraderTest } from "../../../grader/aiGrader"; + +createAIGraderTest(import.meta.url); import { beforeEach } from "vitest"; import { Doc } from "./answer/convex/_generated/dataModel"; diff --git a/evals/003-mutations/005-cascade_delete_nested/grader.test.ts b/evals/003-mutations/005-cascade_delete_nested/grader.test.ts index 511e0bb..5c7fac5 100644 --- a/evals/003-mutations/005-cascade_delete_nested/grader.test.ts +++ b/evals/003-mutations/005-cascade_delete_nested/grader.test.ts @@ -9,8 +9,11 @@ import { } from "../../../grader"; import { api } from "./answer/convex/_generated/api"; import { beforeEach } from "vitest"; +import { createAIGraderTest } from "../../../grader/aiGrader"; import { Doc, Id } from "./answer/convex/_generated/dataModel"; +createAIGraderTest(import.meta.url); + beforeEach(async () => { await deleteAllDocuments(responseAdminClient, [ "users", diff --git a/evals/003-mutations/006-no_storage/grader.test.ts b/evals/003-mutations/006-no_storage/grader.test.ts index 4c3376b..e6bf28b 100644 --- a/evals/003-mutations/006-no_storage/grader.test.ts +++ b/evals/003-mutations/006-no_storage/grader.test.ts @@ -8,6 +8,9 @@ import { } from "../../../grader"; import { api } from "./answer/convex/_generated/api"; import { beforeEach } from "vitest"; +import { createAIGraderTest } from "../../../grader/aiGrader"; + +createAIGraderTest(import.meta.url); import { Doc } from "./answer/convex/_generated/dataModel"; beforeEach(async () => { diff --git a/evals/004-actions/001-run_mutation/grader.test.ts b/evals/004-actions/001-run_mutation/grader.test.ts index e67bf3f..f684ad4 100644 --- a/evals/004-actions/001-run_mutation/grader.test.ts +++ b/evals/004-actions/001-run_mutation/grader.test.ts @@ -9,6 +9,9 @@ import { import { api } from "./answer/convex/_generated/api"; 
import { beforeEach } from "vitest"; import { Doc } from "./answer/convex/_generated/dataModel"; +import { createAIGraderTest } from "../../../grader/aiGrader"; + +createAIGraderTest(import.meta.url); beforeEach(async () => { await deleteAllDocuments(responseAdminClient, ["fetchResults"]); diff --git a/evals/004-actions/002-run_query_mutation/grader.test.ts b/evals/004-actions/002-run_query_mutation/grader.test.ts index 433f592..1348ce7 100644 --- a/evals/004-actions/002-run_query_mutation/grader.test.ts +++ b/evals/004-actions/002-run_query_mutation/grader.test.ts @@ -7,6 +7,9 @@ import { listTable, } from "../../../grader"; import { api } from "./answer/convex/_generated/api"; +import { createAIGraderTest } from "../../../grader/aiGrader"; + +createAIGraderTest(import.meta.url); import { beforeEach } from "vitest"; import { Doc } from "./answer/convex/_generated/dataModel"; diff --git a/evals/004-actions/003-mutation_schedule_action/grader.test.ts b/evals/004-actions/003-mutation_schedule_action/grader.test.ts index 88f883d..6ae9662 100644 --- a/evals/004-actions/003-mutation_schedule_action/grader.test.ts +++ b/evals/004-actions/003-mutation_schedule_action/grader.test.ts @@ -8,6 +8,9 @@ import { } from "../../../grader"; import { api } from "./answer/convex/_generated/api"; import { beforeEach } from "vitest"; +import { createAIGraderTest } from "../../../grader/aiGrader"; + +createAIGraderTest(import.meta.url); import { Doc } from "./answer/convex/_generated/dataModel"; beforeEach(async () => { diff --git a/evals/004-actions/004-storage/grader.test.ts b/evals/004-actions/004-storage/grader.test.ts index 9e65164..5e351f6 100644 --- a/evals/004-actions/004-storage/grader.test.ts +++ b/evals/004-actions/004-storage/grader.test.ts @@ -1,8 +1,11 @@ import { expect, test } from "vitest"; import { responseClient } from "../../../grader"; import { api } from "./answer/convex/_generated/api"; +import { createAIGraderTest } from "../../../grader/aiGrader"; import { Id } 
from "./answer/convex/_generated/dataModel"; +createAIGraderTest(import.meta.url); + test("writes and reads text content", async () => { const testText = "Hello, world!"; diff --git a/evals/004-actions/005-storage_http_action/grader.test.ts b/evals/004-actions/005-storage_http_action/grader.test.ts index 53d70a4..7b7a670 100644 --- a/evals/004-actions/005-storage_http_action/grader.test.ts +++ b/evals/004-actions/005-storage_http_action/grader.test.ts @@ -1,6 +1,9 @@ import { expect, test } from "vitest"; import { responseAdminClient } from "../../../grader"; import { api } from "./answer/convex/_generated/api"; +import { createAIGraderTest } from "../../../grader/aiGrader"; + +createAIGraderTest(import.meta.url); import { getSiteURL } from "./answer/convex/http"; async function getStoreURL(): Promise { diff --git a/evals/004-actions/006-node/grader.test.ts b/evals/004-actions/006-node/grader.test.ts index dbed3e0..51ac3b5 100644 --- a/evals/004-actions/006-node/grader.test.ts +++ b/evals/004-actions/006-node/grader.test.ts @@ -1,6 +1,9 @@ import { expect, test } from "vitest"; import { responseClient } from "../../../grader"; import { api } from "./answer/convex/_generated/api"; +import { createAIGraderTest } from "../../../grader/aiGrader"; + +createAIGraderTest(import.meta.url); test("processes string input correctly", async () => { const result = await responseClient.action(api.index.processWithNode, { diff --git a/evals/004-actions/007-http_action_routing/grader.test.ts b/evals/004-actions/007-http_action_routing/grader.test.ts index 6302c28..163fcf0 100644 --- a/evals/004-actions/007-http_action_routing/grader.test.ts +++ b/evals/004-actions/007-http_action_routing/grader.test.ts @@ -1,6 +1,9 @@ import { expect, test } from "vitest"; import { responseAdminClient } from "../../../grader"; import { api } from "./answer/convex/_generated/api"; +import { createAIGraderTest } from "../../../grader/aiGrader"; + +createAIGraderTest(import.meta.url); async function 
getBaseURL(): Promise { return await responseAdminClient.query(api.http.getSiteURL, {}); diff --git a/evals/005-idioms/001-file_organization/grader.test.ts b/evals/005-idioms/001-file_organization/grader.test.ts index 0fc40d7..87ef715 100644 --- a/evals/005-idioms/001-file_organization/grader.test.ts +++ b/evals/005-idioms/001-file_organization/grader.test.ts @@ -1,6 +1,9 @@ import { expect, test } from "vitest"; import { responseClient } from "../../../grader"; import { api } from "./answer/convex/_generated/api"; +import { createAIGraderTest } from "../../../grader/aiGrader"; + +createAIGraderTest(import.meta.url); test("can create and get user", async () => { const userData = { diff --git a/evals/005-idioms/002-batch_queries/grader.test.ts b/evals/005-idioms/002-batch_queries/grader.test.ts index c7677c2..dd644c8 100644 --- a/evals/005-idioms/002-batch_queries/grader.test.ts +++ b/evals/005-idioms/002-batch_queries/grader.test.ts @@ -10,6 +10,9 @@ import { import { api, internal } from "./answer/convex/_generated/api"; import { Doc, Id } from "./answer/convex/_generated/dataModel"; import { beforeEach } from "vitest"; +import { createAIGraderTest } from "../../../grader/aiGrader"; + +createAIGraderTest(import.meta.url); beforeEach(async () => { await deleteAllDocuments(responseAdminClient, ["users", "posts"]); @@ -19,12 +22,18 @@ test("compare schema", async ({ skip }) => { await compareSchema(skip); }); -async function setupTestData(): Promise<{ userId: Id<"users">, postIds: Id<"posts">[] }> { +async function setupTestData(): Promise<{ + userId: Id<"users">; + postIds: Id<"posts">[]; +}> { // Create a test user await addDocuments(responseAdminClient, "users", [ { name: "Test User", email: "test@example.com" }, ]); - const users = await listTable(responseAdminClient, "users") as Doc<"users">[]; + const users = (await listTable( + responseAdminClient, + "users", + )) as Doc<"users">[]; const userId = users[0]._id; // Create some test posts @@ -32,8 +41,11 @@ 
async function setupTestData(): Promise<{ userId: Id<"users">, postIds: Id<"post { userId, content: "Post 1" }, { userId, content: "Post 2" }, ]); - const posts = await listTable(responseAdminClient, "posts") as Doc<"posts">[]; - const [post1Id, post2Id] = posts.map(p => p._id); + const posts = (await listTable( + responseAdminClient, + "posts", + )) as Doc<"posts">[]; + const [post1Id, post2Id] = posts.map((p) => p._id); return { userId, postIds: [post1Id, post2Id] }; } @@ -42,9 +54,12 @@ test("getUserByEmail returns correct user", async () => { const { userId } = await setupTestData(); /* eslint-disable */ - const user = await responseAdminClient.query(internal.users.getUserByEmail as any, { - email: "test@example.com", - }); + const user = await responseAdminClient.query( + internal.users.getUserByEmail as any, + { + email: "test@example.com", + }, + ); /* eslint-enable */ expect(user).toBeDefined(); @@ -56,9 +71,12 @@ test("getUserByEmail returns correct user", async () => { test("getUserByEmail returns null for non-existent user", async () => { /* eslint-disable */ - const user = await responseAdminClient.query(internal.users.getUserByEmail as any, { - email: "nonexistent@example.com", - }); + const user = await responseAdminClient.query( + internal.users.getUserByEmail as any, + { + email: "nonexistent@example.com", + }, + ); /* eslint-enable */ expect(user).toBeNull(); }); diff --git a/evals/006-clients/000-use_query/grader.test.ts b/evals/006-clients/000-use_query/grader.test.ts index 8b0acbc..d044af3 100644 --- a/evals/006-clients/000-use_query/grader.test.ts +++ b/evals/006-clients/000-use_query/grader.test.ts @@ -9,6 +9,9 @@ import { } from "../../../grader"; import { api } from "./answer/convex/_generated/api"; import { beforeEach } from "vitest"; +import { createAIGraderTest } from "../../../grader/aiGrader"; + +createAIGraderTest(import.meta.url); beforeEach(async () => { await deleteAllDocuments(responseAdminClient, ["messages"]); diff --git 
a/evals/006-clients/001-use_mutation/grader.test.ts b/evals/006-clients/001-use_mutation/grader.test.ts index f09ab5b..333b790 100644 --- a/evals/006-clients/001-use_mutation/grader.test.ts +++ b/evals/006-clients/001-use_mutation/grader.test.ts @@ -8,6 +8,9 @@ import { listTable, } from "../../../grader"; import { api } from "./answer/convex/_generated/api"; +import { createAIGraderTest } from "../../../grader/aiGrader"; + +createAIGraderTest(import.meta.url); beforeEach(async () => { // Clear the messages table before each test diff --git a/evals/006-clients/002-use_paginated_query/grader.test.ts b/evals/006-clients/002-use_paginated_query/grader.test.ts index 0228a97..5c71867 100644 --- a/evals/006-clients/002-use_paginated_query/grader.test.ts +++ b/evals/006-clients/002-use_paginated_query/grader.test.ts @@ -9,6 +9,9 @@ import { listTable, } from "../../../grader"; import { api } from "./answer/convex/_generated/api"; +import { createAIGraderTest } from "../../../grader/aiGrader"; + +createAIGraderTest(import.meta.url); test("compare function spec", async ({ skip }) => { await compareFunctionSpec(skip); diff --git a/grader/aiGrader.ts b/grader/aiGrader.ts new file mode 100644 index 0000000..fcf840d --- /dev/null +++ b/grader/aiGrader.ts @@ -0,0 +1,298 @@ +/* eslint-disable */ + +import { readdirSync, readFileSync, statSync } from "node:fs"; +import { join, relative } from "node:path"; +import { tmpdir } from "node:os"; +import { fileURLToPath } from "node:url"; +import { generateObject } from "ai"; +import { createOpenAI } from "@ai-sdk/openai"; +import { z } from "zod"; +import { test } from "vitest"; + +type EvalInfo = { + category: string; + name: string; + testFilePath: string; +}; + +type GradeResult = { + result: "pass" | "fail"; + reasoning: string; +}; + +/** + * Extract eval category and name from the test file URL and load the task content. 
+ *
+ * The grader file is expected at `<...>/evals/<category>/<name>/grader.test.ts`
+ * (or `.tsx`), with `TASK.txt` sitting next to it.
+ *
+ * @param testFileUrl - `import.meta.url` of the calling grader test file.
+ * @returns The derived eval info and the raw contents of `TASK.txt`.
+ * @throws Error if the path does not have the expected shape, the file is not
+ *   a `grader.test.ts(x)` file, or `TASK.txt` cannot be read.
+ */
+function getTask(testFileUrl: string): {
+  evalInfo: EvalInfo;
+  taskContent: string;
+} {
+  const testFilePath = fileURLToPath(testFileUrl);
+  // Normalize Windows separators so the segment lookup works on every platform.
+  const parts = testFilePath.replace(/\\/g, "/").split("/");
+  const evalsIdx = parts.lastIndexOf("evals");
+  // Require evals/<category>/<name>/<file>: three more segments after "evals".
+  // (Checking for only two would accept evals/<category>/<file> and treat the
+  // file name as the eval name.)
+  if (evalsIdx < 0 || parts.length < evalsIdx + 4)
+    throw new Error(
+      `Could not derive eval category/name from path: ${testFilePath}`,
+    );
+
+  const evalInfo: EvalInfo = {
+    category: parts[evalsIdx + 1],
+    name: parts[evalsIdx + 2],
+    testFilePath,
+  };
+
+  // Load the task assignment alongside the grader. Refuse any other file name:
+  // otherwise the replace below is a no-op and the grader file itself would be
+  // read as the task.
+  const graderFilePattern = /grader\.test\.tsx?$/;
+  if (!graderFilePattern.test(testFilePath))
+    throw new Error(
+      `Expected a grader.test.ts(x) file, got: ${testFilePath}`,
+    );
+  const taskPath = testFilePath.replace(graderFilePattern, "TASK.txt");
+
+  let taskContent: string;
+  try {
+    taskContent = readFileSync(taskPath, { encoding: "utf-8" });
+  } catch (err) {
+    // Only report "not found" when that is what happened; surface anything
+    // else (permissions, EISDIR, ...) with its own message.
+    if ((err as NodeJS.ErrnoException)?.code === "ENOENT")
+      throw new Error(`TASK.txt not found at expected path: ${taskPath}`);
+    const reason = err instanceof Error ? err.message : String(err);
+    throw new Error(`Could not read TASK.txt at ${taskPath}: ${reason}`);
+  }
+
+  return { evalInfo, taskContent };
+}
+
+/**
+ * Find and return paths to all answer files in the generated output directory.
+ *
+ * Looks for `<tmpdir>/<entry>/output/<model>/<category>/<name>` directories and
+ * uses the most recently modified match as the project to grade.
+ *
+ * NOTE(review): if several runs or models have output under the temp
+ * directory, "newest mtime wins" may pick a project other than the one under
+ * test — confirm the runner guarantees a single candidate.
+ *
+ * @param evalInfo - Category and name of the eval being graded.
+ * @returns Sorted file paths plus the project directory they were found in.
+ * @throws Error if no matching output directory exists.
+ */
+function gatherAnswerFiles(evalInfo: EvalInfo): {
+  filePaths: string[];
+  outputProjectDir: string;
+} {
+  const { category, name } = evalInfo;
+
+  // Locate the generated output project directory under the system tempdir
+  const candidateRoots: { dir: string; mtime: number }[] = [];
+  const tdir = tmpdir();
+
+  for (const entry of readdirSync(tdir, { withFileTypes: true })) {
+    if (!entry.isDirectory()) continue;
+    const root = join(tdir, entry.name, "output");
+    try {
+      const models = readdirSync(root, { withFileTypes: true }).filter((d) =>
+        d.isDirectory(),
+      );
+      for (const modelDir of models) {
+        const projectDir = join(root, modelDir.name, category, name);
+        try {
+          const st = statSync(projectDir);
+          if (st.isDirectory())
+            candidateRoots.push({ dir: projectDir, mtime: st.mtimeMs });
+        } catch {
+          // ignore missing
+        }
+      }
+    } catch {
+      // no output here
+    }
+  }
+
+  if (candidateRoots.length === 0)
+    throw new Error(
+      `Could not find output directory for ${category}/${name} under ${tdir}`,
+    );
+
+  // Newest first
+  candidateRoots.sort((a, b) => b.mtime - a.mtime);
+  const outputProjectDir = candidateRoots[0].dir;
+
+  // Collect file paths with exclusions
+  const excludedFileNames = new Set([
+    "run.log",
+    "tsconfig.json",
+    "bun.lock",
+    ".env.local",
+  ]);
+  const shouldInclude = (fullPath: string): boolean => {
+    const rel = relative(outputProjectDir, fullPath).replace(/\\/g, "/");
+    if (rel.startsWith("node_modules/")) return false;
+    if (rel.startsWith("convex/_generated/")) return false;
+    if (rel === "convex/README.md") return false;
+    if (rel === "convex/tsconfig.json") return false;
+    const base = rel.split("/").pop() ?? "";
+    if (excludedFileNames.has(base)) return false;
+    return true;
+  };
+
+  // Iterative depth-first walk; dependency and generated directories are
+  // pruned before descending into them.
+  const stack: string[] = [outputProjectDir];
+  const filePaths: string[] = [];
+
+  while (stack.length > 0) {
+    const dir = stack.pop() as string;
+    for (const de of readdirSync(dir, { withFileTypes: true })) {
+      const full = join(dir, de.name);
+      if (de.isDirectory()) {
+        if (de.name === "node_modules") continue;
+        if (
+          de.name === "_generated" &&
+          dir.replace(/\\/g, "/").endsWith("/convex")
+        )
+          continue;
+        stack.push(full);
+      } else {
+        if (shouldInclude(full)) filePaths.push(full);
+      }
+    }
+  }
+
+  filePaths.sort();
+  return { filePaths, outputProjectDir };
+}
+
+/**
+ * Read the answer files and render them as a JSON manifest followed by one
+ * delimited, fenced block per file.
+ *
+ * Each block is wrapped in BEGIN/END markers that carry the project-relative
+ * path, so a block can be matched to its manifest entry. Blocks that would
+ * push the result past the length limit are left out and named in a trailing
+ * note. Files that cannot be read are skipped.
+ *
+ * @param filePaths - Files to include.
+ * @param outputProjectDir - Directory that manifest paths are relative to.
+ * @returns The concatenated text, or "" when `filePaths` is empty.
+ */
+function concatenateAnswerFiles(
+  filePaths: string[],
+  outputProjectDir: string,
+): string {
+  if (filePaths.length === 0) return "";
+
+  // Fence language by file extension; anything unknown is fenced as "text".
+  const languageByExtension: [string, string][] = [
+    [".ts", "ts"],
+    [".tsx", "tsx"],
+    [".js", "js"],
+    [".jsx", "jsx"],
+    [".json", "json"],
+    [".md", "md"],
+    [".sql", "sql"],
+    [".py", "python"],
+    [".txt", "text"],
+  ];
+  const getLang = (path: string): string => {
+    const lower = path.toLowerCase();
+    const hit = languageByExtension.find(([ext]) => lower.endsWith(ext));
+    return hit ? hit[1] : "text";
+  };
+
+  type ManifestEntry = {
+    path: string;
+    language: string;
+    bytes: number;
+    lines: number;
+  };
+  const manifest: ManifestEntry[] = [];
+  const blocks: { path: string; text: string }[] = [];
+
+  for (const fp of filePaths) {
+    try {
+      const rel = relative(outputProjectDir, fp).replace(/\\/g, "/");
+      const content = readFileSync(fp, { encoding: "utf-8" });
+      const lang = getLang(rel);
+      manifest.push({
+        path: rel,
+        language: lang,
+        bytes: Buffer.byteLength(content, "utf-8"),
+        lines: content.split(/\r?\n/).length,
+      });
+      // \u0060 is a backtick; escaped so the fence can live in a template literal.
+      const text =
+        `\n<<<BEGIN FILE: ${rel}>>>\n` +
+        `\n\u0060\u0060\u0060${lang}\n` +
+        `${content}\n` +
+        `\u0060\u0060\u0060\n` +
+        `<<<END FILE: ${rel}>>>\n`;
+      blocks.push({ path: rel, text });
+    } catch {
+      // skip unreadable files
+    }
+  }
+
+  const manifestJson = JSON.stringify(manifest, null, 2);
+  const header = `FILES MANIFEST (JSON)\n\u0060\u0060\u0060json\n${manifestJson}\n\u0060\u0060\u0060\n`;
+
+  // Character budget for the whole payload. An oversized block is skipped but
+  // later, smaller blocks may still fit.
+  const LIMIT = 300_000;
+  let result = header;
+  const omittedPaths: string[] = [];
+  for (const block of blocks) {
+    if (result.length + block.text.length > LIMIT) {
+      omittedPaths.push(block.path);
+      continue;
+    }
+    result += block.text;
+  }
+  if (omittedPaths.length > 0)
+    result += `\n[omitted ${omittedPaths.length} file block(s) due to length limit: ${omittedPaths.join(", ")}]\n`;
+  return result;
+}
+
+/**
+ * Use AI to generate a grade based on the task and concatenated files.
+ *
+ * @param taskContent - Verbatim contents of `TASK.txt`.
+ * @param concatenated - Manifest and file blocks from `concatenateAnswerFiles`.
+ * @returns The pass/fail verdict and the model's reasoning.
+ * @throws Error if `OPENAI_API_KEY` is not set; errors from the model call or
+ *   from schema validation propagate to the caller.
+ */
+async function generateGrade(
+  taskContent: string,
+  concatenated: string,
+): Promise<GradeResult> {
+  const apiKey = process.env.OPENAI_API_KEY;
+  if (!apiKey) throw new Error("OPENAI_API_KEY is not set");
+
+  const prompt = `You are grading an autogenerated Convex backend submission.
+
+Task assignment (verbatim from TASK.txt):
+---
+${taskContent}
+---
+
+You are given a manifest and a sequence of file blocks with explicit boundaries.
+Instructions:
+- Treat each file block independently; do not merge or infer code across files.
+- Only use the content inside a given block when assessing that file.
+- If files are omitted due to length limits, grade only on what is present.
+- When referencing code, cite the file path from the manifest.
+
+File data:
+---
+${concatenated}
+---
+
+Decide if the output fully satisfies the task requirements. Provide a short reasoning and a final grade.`;
+
+  const openai = createOpenAI({ apiKey });
+
+  // `reasoning` is declared before `grade` in the schema.
+  const schema = z.object({
+    reasoning: z.string(),
+    grade: z.enum(["pass", "fail"]),
+  });
+
+  const { object } = await generateObject({
+    model: openai("gpt-5-mini"),
+    schema,
+    prompt,
+  });
+  const resultObj = schema.parse(object);
+
+  return { result: resultObj.grade, reasoning: resultObj.reasoning };
+}
+
+/**
+ * Helper for tests: grades the generated project for the calling eval and
+ * throws with the AI reasoning when the grade is not "pass", so the failure
+ * message includes the specific cause. Logs the reasoning either way.
+ *
+ * @param testFileUrl - `import.meta.url` of the calling grader test file.
+ * @throws Error if there is nothing to grade or the AI grade is "fail".
+ */
+export async function expectAIGraderPass(testFileUrl: string): Promise<void> {
+  const { evalInfo, taskContent } = getTask(testFileUrl);
+  const { filePaths, outputProjectDir } = gatherAnswerFiles(evalInfo);
+  // Fail fast with a precise message instead of asking the model to grade an
+  // empty submission.
+  if (filePaths.length === 0)
+    throw new Error(
+      `AI grading failed: no gradable files found in ${outputProjectDir}`,
+    );
+  const concatenated = concatenateAnswerFiles(filePaths, outputProjectDir);
+  const { result, reasoning } = await generateGrade(taskContent, concatenated);
+
+  console.log(
+    `[AI Grader ${evalInfo.category}/${evalInfo.name}] ${result === "pass" ? "PASS" : "FAIL"}: ${reasoning}`,
+  );
+
+  if (result !== "pass") throw new Error(`AI grading failed: ${reasoning}`);
+}
+
+/**
+ * Create a standardized Vitest for AI grading with optional name and timeout.
+ */ +export function createAIGraderTest( + testFileUrl: string, + name: string = "AI grader assessment", + timeoutMs: number = 60000, +): void { + test(name, { timeout: timeoutMs }, async () => { + await expectAIGraderPass(testFileUrl); + }); +} diff --git a/grader/index.ts b/grader/index.ts index 17c9410..6c8db24 100644 --- a/grader/index.ts +++ b/grader/index.ts @@ -1,5 +1,6 @@ import { ConvexClient } from "convex/browser"; import { expect } from "vitest"; + /* eslint-disable @typescript-eslint/no-explicit-any */ /* eslint-disable @typescript-eslint/no-unsafe-call */ /* eslint-disable @typescript-eslint/no-unsafe-assignment */ @@ -193,7 +194,7 @@ export function hasIndexForPrefix( fieldNames?: string[]; }[]; return indexes.some((idx) => { - const idxFields = (idx.fields ?? idx.fieldNames ?? []) as string[]; + const idxFields = idx.fields ?? idx.fieldNames ?? []; if (!Array.isArray(idxFields)) return false; if (idxFields.length < fieldsPrefix.length) return false; for (let i = 0; i < fieldsPrefix.length; i++) { diff --git a/package.json b/package.json index c704098..b4f7f8e 100644 --- a/package.json +++ b/package.json @@ -18,8 +18,11 @@ "typescript": "^5.7.3" }, "dependencies": { + "@ai-sdk/openai": "^2.0.19", + "zod": "^3.23.8", "@types/bun": "^1.2.20", "@types/node": "^22.12.0", + "ai": "^5.0.22", "convex": "^1.18.2", "dotenv": "^17.2.1", "prettier": "^3.4.2", diff --git a/runner/convex_backend.py b/runner/convex_backend.py index 0862cc8..05fe645 100644 --- a/runner/convex_backend.py +++ b/runner/convex_backend.py @@ -134,10 +134,10 @@ def download_convex_binary(): if os.path.exists(binary_path): return binary_path - log_info("Latest release:", version) + print(f"Latest release: {version}", flush=True) url = matching_asset["browser_download_url"] - log_info("Downloading:", url) + print(f"Downloading: {url}", flush=True) response = requests.get(url, stream=True) response.raise_for_status() @@ -145,7 +145,7 @@ def download_convex_binary(): with open(zip_path, 
"wb") as f: for chunk in response.iter_content(chunk_size=8192): f.write(chunk) - log_info("Downloaded:", matching_asset["name"]) + print(f"Downloaded: {matching_asset['name']}", flush=True) # Unzip the file with zipfile.ZipFile(zip_path, "r") as zip_ref: @@ -163,6 +163,6 @@ def download_convex_binary(): # Clean up zip file os.remove(zip_path) - log_info("Extracted binary to:", binary_path) + print(f"Extracted binary to: {binary_path}", flush=True) return binary_path diff --git a/runner/reporting.py b/runner/reporting.py index cc57953..727c0ea 100644 --- a/runner/reporting.py +++ b/runner/reporting.py @@ -13,6 +13,9 @@ def post_scores_to_convex(model_name: str, category_scores: dict, total_score: float) -> None: + # When Braintrust is disabled, also disable reporting to the Convex endpoint + if os.getenv("DISABLE_BRAINTRUST") == "1": + return payload = {"model": model_name, "scores": category_scores, "totalScore": total_score} if CONVEX_EVAL_ENDPOINT is not None and CONVEX_AUTH_TOKEN is not None: try: @@ -77,20 +80,20 @@ def report_eval(evaluator, result: EvalResultWithSummary, verbose, jsonl): # Pretty console summary overall_rate = (total_score / total_num_tests) if total_num_tests > 0 else 0 - log_info("") - log_info("=== Eval Summary ===") - log_info(f"Model: {results[0].metadata.get('model_name', 'unknown') if results and results[0].metadata else 'unknown'}") - log_info(f"Overall: {overall_rate:.2%} ({total_passed} pass, {total_num_tests - total_passed} fail)") + print("", flush=True) + print("=== Eval Summary ===", flush=True) + print(f"Model: {results[0].metadata.get('model_name', 'unknown') if results and results[0].metadata else 'unknown'}", flush=True) + print(f"Overall: {overall_rate:.2%} ({total_passed} pass, {total_num_tests - total_passed} fail)", flush=True) for category in sorted(num_tests.keys()): rate = scores[category] / num_tests[category] cat_pass = passed_counts.get(category, 0) - log_info(f"- {category}: {rate:.2%} ({cat_pass} pass, 
{num_tests[category] - cat_pass} fail)") + print(f"- {category}: {rate:.2%} ({cat_pass} pass, {num_tests[category] - cat_pass} fail)", flush=True) # Always write local results; print the path - log_info(f"Results written to: {OUTPUT_RESULTS_FILE}") + print(f"Results written to: {OUTPUT_RESULTS_FILE}", flush=True) if jsonl: - log_info(json.dumps(summary.as_dict())) + print(json.dumps(summary.as_dict()), flush=True) return len(failing_results) == 0 diff --git a/runner/run_grader.py b/runner/run_grader.py index 56c94d8..de0b718 100644 --- a/runner/run_grader.py +++ b/runner/run_grader.py @@ -74,7 +74,7 @@ def run_grader(category: str, name: str, project_dir: str): except Exception as e: message.append(f" - Tests fail: {e}") - log_info("\n".join(message)) + print("\n".join(message), flush=True) return success diff --git a/runner/scorer.py b/runner/scorer.py index 77dccce..949ca59 100644 --- a/runner/scorer.py +++ b/runner/scorer.py @@ -3,6 +3,7 @@ import subprocess import json import re +import tempfile from braintrust import traced, Score from runner.convex_backend import convex_backend, admin_key from runner.logging import append_log, append_log_block, log_cmd_results, log_info, run_command_step @@ -132,6 +133,9 @@ def convex_scorer(model, tempdir, *, input, expected, metadata, output): if isinstance(e, TestsFailedException): scores.append(Score("Tests pass", e.ratio)) tests_ratio = e.ratio + # Even on failure, capture and log Vitest stdout (includes console logs) + if getattr(e, "stdout", None) is not None and getattr(e, "cmd", None) is not None: + log_cmd_results(run_log_path, [(e.cmd, e.stdout)], "vitest") else: scores.append(Score("Tests pass", 0)) tests_ratio = 0.0 @@ -168,9 +172,11 @@ def convex_scorer(model, tempdir, *, input, expected, metadata, output): class TestsFailedException(Exception): - def __init__(self, message, ratio): + def __init__(self, message, ratio, stdout=None, cmd=None): super().__init__(message) self.ratio = ratio + self.stdout = stdout + 
self.cmd = cmd @traced @@ -344,12 +350,20 @@ def run_tests(backend, answer_backend, test_file): ) if answer_backend is not None: env["CONVEX_ANSWER_PORT"] = str(answer_backend["port"]) + # Write JSON reporter output to a temp file so stdout can include human output + console logs + tmp_json = tempfile.NamedTemporaryFile(delete=False, suffix=".json") + tmp_json_path = tmp_json.name + tmp_json.close() + # Vitest supports multiple reporters; keep JSON (to parse) and default (to include logs on stdout) cmd = [ "bunx", "vitest", "run", test_file, "--reporter=json", + "--outputFile", + tmp_json_path, + "--reporter=default", "--no-color", ] done = subprocess.run( @@ -361,9 +375,15 @@ def run_tests(backend, answer_backend, test_file): ) try: - # Removes all characters before the first `{` and after the last `}` - cleaned_stdout = re.sub(r"^.*?(\{.*\}).*$", r"\1", done.stdout, flags=re.DOTALL) - results = json.loads(cleaned_stdout) + # Prefer reading structured results from the JSON reporter file + results = None + try: + with open(tmp_json_path, "r", encoding="utf-8") as f: + results = json.load(f) + except Exception: + # Fallback: extract JSON blob from stdout if file missing or invalid + cleaned_stdout = re.sub(r"^.*?(\{.*\}).*$", r"\1", done.stdout, flags=re.DOTALL) + results = json.loads(cleaned_stdout) total = results["numTotalTests"] passed = results["numPassedTests"] @@ -374,13 +394,18 @@ def run_tests(backend, answer_backend, test_file): raise Exception(f"Failed to run tests:\n{done.stdout}") else: raise Exception(f"Failed to parse tests results: {e}") + finally: + try: + os.remove(tmp_json_path) + except Exception: + pass if ratio != 1: error_message = "" for test in results["testResults"][0]["assertionResults"]: if test["status"] == "failed": error_message += f"{test['title']}: {test['failureMessages']}\n" - raise TestsFailedException(f"Tests failed:\n{error_message}", ratio) + raise TestsFailedException(f"Tests failed:\n{error_message}", ratio, done.stdout, cmd) 
return ratio, done.stdout, cmd