diff --git a/EVAL_WORKFLOW.md b/EVAL_WORKFLOW.md
index 8984acd..2ef66b1 100644
--- a/EVAL_WORKFLOW.md
+++ b/EVAL_WORKFLOW.md
@@ -227,3 +227,26 @@ Your solution should:
    - Group related functionality together
    - Include type imports from generated files
    - Add helpful comments for complex logic
+
+## AI grading
+
+The evals include a lightweight AI grader that reviews the generated project for each eval and provides concise reasoning on pass/fail.
+
+- The grader builds a prompt from `TASK.txt` plus a manifest of files from the generated output directory and asks a model to decide pass/fail with reasoning.
+- The helper logs reasoning on every run and, on failure, throws an error with that reasoning so it appears directly in the test output and in `run.log`.
+
+### How to use in a grader test
+
+Add a single standardized test using the helper:
+
+```ts
+import { createAIGraderTest } from "../../../grader/aiGrader";
+
+// Basic usage (default name and 60s timeout)
+createAIGraderTest(import.meta.url);
+
+// Optional: custom test name and timeout in milliseconds
+createAIGraderTest(import.meta.url, "AI grader assessment", 60000);
+```
+
+That’s it. On failure, the thrown error message will include the AI reasoning. The same reasoning is logged to the console and captured in the eval’s `run.log` by the default Vitest reporter.
diff --git a/README.md b/README.md
index cae7710..c051629 100644
--- a/README.md
+++ b/README.md
@@ -82,6 +82,10 @@ Output:
 
 Optional Convex summary posting (still local mode): set both `CONVEX_EVAL_ENDPOINT` and `CONVEX_AUTH_TOKEN`.
 
+## AI grading helper
+
+Grader tests can include an AI-based assessment that logs concise reasoning on every run and includes it in the error message on failure. See the "AI grading" section in `EVAL_WORKFLOW.md` for details and usage with `createAIGraderTest(import.meta.url)`.
+
 ## Rerunning grading
 
 After running the evals, you may want to dig into a particular test failure. You can use the `run_grader.py` script to grade the evaluations again without regenerating them:
diff --git a/bun.lock b/bun.lock
index 46e20ce..4a74360 100644
--- a/bun.lock
+++ b/bun.lock
@@ -4,13 +4,16 @@
     "": {
       "name": "evals-convex",
       "dependencies": {
+        "@ai-sdk/openai": "^2.0.19",
         "@types/bun": "^1.2.20",
         "@types/node": "^22.12.0",
+        "ai": "^5.0.22",
         "convex": "^1.18.2",
         "dotenv": "^17.2.1",
         "prettier": "^3.4.2",
         "typescript-eslint": "^8.23.0",
         "vitest": "^3.0.2",
+        "zod": "^3.23.8",
       },
       "devDependencies": {
         "@eslint/eslintrc": "^3.2.0",
@@ -27,6 +30,14 @@
     },
   },
   "packages": {
+    "@ai-sdk/gateway": ["@ai-sdk/gateway@1.0.11", "", { "dependencies": { "@ai-sdk/provider": "2.0.0", "@ai-sdk/provider-utils": "3.0.5" }, "peerDependencies": { "zod": "^3.25.76 || ^4" } }, "sha512-ErwWS3sPOuWy42eE3AVxlKkTa1XjjKBEtNCOylVKMO5KNyz5qie8QVlLYbULOG56dtxX4zTKX3rQNJudplhcmQ=="],
+
+    "@ai-sdk/openai": ["@ai-sdk/openai@2.0.19", "", { "dependencies": { "@ai-sdk/provider": "2.0.0", "@ai-sdk/provider-utils": "3.0.5" }, "peerDependencies": { "zod": "^3.25.76 || ^4" } }, "sha512-sG3/IVaPvV7Vn6513I1bcJILHpLCXbVif2ht6CyROcB9FzXCJe2K5uRbAg30HWsdCEe7xu4OAWtMK6yWTOcsSA=="],
+
+    "@ai-sdk/provider": ["@ai-sdk/provider@2.0.0", "", { "dependencies": { "json-schema": "^0.4.0" } }, "sha512-6o7Y2SeO9vFKB8lArHXehNuusnpddKPk7xqL7T2/b+OvXMRIXUO1rR4wcv1hAFUAT9avGZshty3Wlua/XA7TvA=="],
+
+    "@ai-sdk/provider-utils": ["@ai-sdk/provider-utils@3.0.5", "", { "dependencies": { "@ai-sdk/provider": "2.0.0", "@standard-schema/spec": "^1.0.0", "eventsource-parser": "^3.0.3", "zod-to-json-schema": "^3.24.1" }, "peerDependencies": { "zod": "^3.25.76 || ^4" } }, "sha512-HliwB/yzufw3iwczbFVE2Fiwf1XqROB/I6ng8EKUsPM5+2wnIa8f4VbljZcDx+grhFrPV+PnRZH7zBqi8WZM7Q=="],
+
     "@ampproject/remapping": ["@ampproject/remapping@2.3.0", "", { "dependencies": { "@jridgewell/gen-mapping": "^0.3.5", "@jridgewell/trace-mapping": "^0.3.24" } }, "sha512-30iZtAPgz+LTIYoeivqYo853f02jBYSd5uGnGpkFV0M3xOt9aN73erkgYAmZU43x4VfqcnLxW9Kpg3R5LC4YYw=="],
 
     "@babel/code-frame": ["@babel/code-frame@7.26.2", "", { "dependencies": { "@babel/helper-validator-identifier": "^7.25.9", "js-tokens": "^4.0.0", "picocolors": "^1.0.0" } }, "sha512-RJlIHRueQgwWitWgF8OdFYGZX328Ax5BCemNGlqHfplnRT9ESi8JkFlvaVYbS+UubVY6dpv87Fs2u5M29iNFVQ=="],
@@ -157,6 +168,8 @@
 
     "@nodelib/fs.walk": ["@nodelib/fs.walk@1.2.8", "", { "dependencies": { "@nodelib/fs.scandir": "2.1.5", "fastq": "^1.6.0" } }, "sha512-oGB+UxlgWcgQkgwo8GcEGwemoTFt3FIO9ababBmaGwXIoBKZ+GTy0pP185beGg7Llih/NSHSV2XAs1lnznocSg=="],
 
+    "@opentelemetry/api": ["@opentelemetry/api@1.9.0", "", {}, "sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg=="],
+
     "@rollup/rollup-android-arm-eabi": ["@rollup/rollup-android-arm-eabi@4.31.0", "", { "os": "android", "cpu": "arm" }, "sha512-9NrR4033uCbUBRgvLcBrJofa2KY9DzxL2UKZ1/4xA/mnTNyhZCWBuD8X3tPm1n4KxcgaraOYgrFKSgwjASfmlA=="],
 
     "@rollup/rollup-android-arm64": ["@rollup/rollup-android-arm64@4.31.0", "", { "os": "android", "cpu": "arm64" }, "sha512-iBbODqT86YBFHajxxF8ebj2hwKm1k8PTBQSojSt3d1FFt1gN+xf4CowE47iN0vOSdnd+5ierMHBbu/rHc7nq5g=="],
@@ -195,6 +208,8 @@
 
     "@rollup/rollup-win32-x64-msvc": ["@rollup/rollup-win32-x64-msvc@4.31.0", "", { "os": "win32", "cpu": "x64" }, "sha512-ul8rnCsUumNln5YWwz0ted2ZHFhzhRRnkpBZ+YRuHoRAlUji9KChpOUOndY7uykrPEPXVbHLlsdo6v5yXo/TXw=="],
 
+    "@standard-schema/spec": ["@standard-schema/spec@1.0.0", "", {}, "sha512-m2bOd0f2RT9k8QJx1JN85cZYyH1RqFBdlwtkSlf4tBDYLCiiZnv1fIIwacK6cqwXavOydf0NPToMQgpKq+dVlA=="],
+
     "@types/babel__core": ["@types/babel__core@7.20.5", "", { "dependencies": { "@babel/parser": "^7.20.7", "@babel/types": "^7.20.7", "@types/babel__generator": "*", "@types/babel__template": "*", "@types/babel__traverse": "*" } }, "sha512-qoQprZvz5wQFJwMDqeseRXWv3rqMvhgpbXFfVyWhbx9X47POIA6i/+dXefEmZKoAgOaTdaIgNSMqMIU61yRyzA=="],
 
     "@types/babel__generator": ["@types/babel__generator@7.6.8", "", { "dependencies": { "@babel/types": "^7.0.0" } }, "sha512-ASsj+tpEDsEiFr1arWrlN6V3mdfjRMZt6LtK/Vp/kreFLnr5QH5+DhvD5nINYZXzwJvXeGq+05iUXcAzVrqWtw=="],
@@ -249,6 +264,8 @@
 
     "acorn-jsx": ["acorn-jsx@5.3.2", "", { "peerDependencies": { "acorn": "^6.0.0 || ^7.0.0 || ^8.0.0" } }, "sha512-rq9s+JNhf0IChjtDXxllJ7g41oZk5SlXtp0LHwyA5cejwn7vKmKp4pPri6YEePv2PU65sAsegbXtIinmDFDXgQ=="],
 
+    "ai": ["ai@5.0.22", "", { "dependencies": { "@ai-sdk/gateway": "1.0.11", "@ai-sdk/provider": "2.0.0", "@ai-sdk/provider-utils": "3.0.5", "@opentelemetry/api": "1.9.0" }, "peerDependencies": { "zod": "^3.25.76 || ^4" } }, "sha512-RZiYhj7Ux7hrLtXkHPcxzdiSZt4NOiC69O5AkNfMCsz3twwz/KRkl9ASptosoOsg833s5yRcTSdIu5z53Sl6Pw=="],
+
     "ajv": ["ajv@6.12.6", "", { "dependencies": { "fast-deep-equal": "^3.1.1", "fast-json-stable-stringify": "^2.0.0", "json-schema-traverse": "^0.4.1", "uri-js": "^4.2.2" } }, "sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g=="],
 
     "ansi-styles": ["ansi-styles@4.3.0", "", { "dependencies": { "color-convert": "^2.0.1" } }, "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg=="],
@@ -335,6 +352,8 @@
 
     "esutils": ["esutils@2.0.3", "", {}, "sha512-kVscqXk4OCp68SZ0dkgEKVi6/8ij300KBWTJq32P/dYeWTSwK41WyTxalN1eRmA5Z9UU/LX9D7FWSmV9SAYx6g=="],
 
+    "eventsource-parser": ["eventsource-parser@3.0.5", "", {}, "sha512-bSRG85ZrMdmWtm7qkF9He9TNRzc/Bm99gEJMaQoHJ9E6Kv9QBbsldh2oMj7iXmYNEAVvNgvv5vPorG6W+XtBhQ=="],
+
     "expect-type": ["expect-type@1.1.0", "", {}, "sha512-bFi65yM+xZgk+u/KRIpekdSYkTB5W1pEf0Lt8Q8Msh7b+eQ7LXVtIB1Bkm4fvclDEL1b2CZkMhv2mOeF8tMdkA=="],
 
     "fast-deep-equal": ["fast-deep-equal@3.1.3", "", {}, "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q=="],
@@ -391,6 +410,8 @@
 
     "json-buffer": ["json-buffer@3.0.1", "", {}, "sha512-4bV5BfR2mqfQTJm+V5tPPdf+ZpuhiIvTuAB5g8kcrXOZpTT/QwwVRWBywX1ozr6lEuPdbHxwaJlm9G6mI2sfSQ=="],
 
+    "json-schema": ["json-schema@0.4.0", "", {}, "sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA=="],
+
     "json-schema-traverse": ["json-schema-traverse@0.4.1", "", {}, "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg=="],
 
     "json-stable-stringify-without-jsonify": ["json-stable-stringify-without-jsonify@1.0.1", "", {}, "sha512-Bdboy+l7tA3OGW6FjyFHWkP5LuByj1Tk33Ljyq0axyzdk9//JSi2u3fP1QSmd1KNwq6VOKYGlAu87CisVir6Pw=="],
@@ -527,6 +548,10 @@
 
     "yocto-queue": ["yocto-queue@0.1.0", "", {}, "sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q=="],
 
+    "zod": ["zod@3.25.76", "", {}, "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ=="],
+
+    "zod-to-json-schema": ["zod-to-json-schema@3.24.6", "", { "peerDependencies": { "zod": "^3.24.1" } }, "sha512-h/z3PKvcTcTetyjl1fkj79MHNEjm+HpD6NXheWjzOekY7kV+lwDYnHw+ivHkijnCSMz1yJaWBD9vu/Fcmk+vEg=="],
+
     "@babel/core/semver": ["semver@6.3.1", "", { "bin": { "semver": "bin/semver.js" } }, "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA=="],
 
     "@babel/helper-compilation-targets/semver": ["semver@6.3.1", "", { "bin": { "semver": "bin/semver.js" } }, "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA=="],
diff --git a/evals/000-fundamentals/008-helper_fns/grader.test.ts b/evals/000-fundamentals/008-helper_fns/grader.test.ts
index 0c4df6d..05d9b2c 100644
--- a/evals/000-fundamentals/008-helper_fns/grader.test.ts
+++ b/evals/000-fundamentals/008-helper_fns/grader.test.ts
@@ -7,6 +7,9 @@ import {
 } from "../../../grader";
 import { api } from "./answer/convex/_generated/api";
 import { Doc, Id } from "./answer/convex/_generated/dataModel";
+import { createAIGraderTest } from "../../../grader/aiGrader";
+
+createAIGraderTest(import.meta.url);
 
 test("getItem and updateItem handle non-existent items", async () => {
   // Try to get a non-existent item
diff --git a/evals/001-data_modeling/007-schema_evolution/grader.test.ts b/evals/001-data_modeling/007-schema_evolution/grader.test.ts
index 12a58e7..7db0db0 100644
--- a/evals/001-data_modeling/007-schema_evolution/grader.test.ts
+++ b/evals/001-data_modeling/007-schema_evolution/grader.test.ts
@@ -8,6 +8,10 @@ import {
 import { api, internal } from "./answer/convex/_generated/api";
 import { Doc } from "./answer/convex/_generated/dataModel";
 
+import { createAIGraderTest } from "../../../grader/aiGrader";
+
+createAIGraderTest(import.meta.url);
+
 test("migration helper transforms data correctly", async () => {
   // Insert a product with old schema format
   await addDocuments(responseAdminClient, "products", [
diff --git a/evals/001-data_modeling/009-normalize_json/grader.test.ts b/evals/001-data_modeling/009-normalize_json/grader.test.ts
index 42a9e44..dbef2f7 100644
--- a/evals/001-data_modeling/009-normalize_json/grader.test.ts
+++ b/evals/001-data_modeling/009-normalize_json/grader.test.ts
@@ -7,6 +7,9 @@ import {
   hasIndexWithPrefix,
   getSchema,
 } from "../../../grader";
+import { createAIGraderTest } from "../../../grader/aiGrader";
+
+createAIGraderTest(import.meta.url);
 
 test("organization data model works correctly", async () => {
   const schema = (await getSchema(
diff --git a/evals/001-data_modeling/010-discriminated_union/grader.test.ts b/evals/001-data_modeling/010-discriminated_union/grader.test.ts
index 4cdc1cb..86323a3 100644
--- a/evals/001-data_modeling/010-discriminated_union/grader.test.ts
+++ b/evals/001-data_modeling/010-discriminated_union/grader.test.ts
@@ -1,5 +1,8 @@
 import { expect, test } from "vitest";
 import { responseAdminClient, addDocuments } from "../../../grader";
+import { createAIGraderTest } from "../../../grader/aiGrader";
+
+createAIGraderTest(import.meta.url);
 
 test("schema validates different notification types correctly", async () => {
   // Valid notifications
diff --git a/evals/001-data_modeling/011-deconstruct_validators/grader.test.ts b/evals/001-data_modeling/011-deconstruct_validators/grader.test.ts
index a9a41b2..3b4de22 100644
--- a/evals/001-data_modeling/011-deconstruct_validators/grader.test.ts
+++ b/evals/001-data_modeling/011-deconstruct_validators/grader.test.ts
@@ -8,6 +8,10 @@ import {
 import { resultValidator } from "./answer/convex/schema";
 import { VLiteral, VObject, VString } from "convex/values";
 
+import { createAIGraderTest } from "../../../grader/aiGrader";
+
+createAIGraderTest(import.meta.url);
+
 afterAll(async () => {
   await deleteAllDocuments(responseAdminClient, ["llm_calls", "api_calls"]);
 });
@@ -26,11 +30,16 @@ test("resultValidator is exported as the correct type", async () => {
   expect(resultValidator.members[0].fields.success.kind).toBe("literal");
   expect(resultValidator.members[1].fields).toHaveProperty("success");
   expect(resultValidator.members[1].fields.success.kind).toBe("literal");
-  let [success, error] = resultValidator.members as VObject<{ success: false; error?: string; value?: string }, {
-    success: VLiteral<false | true, "required">;
-    error?: VString<string, "required">;
-    value?: VString<string, "optional">;
-   }, "required", "success" | "error" | "value">[];
+  let [success, error] = resultValidator.members as VObject<
+    { success: false; error?: string; value?: string },
+    {
+      success: VLiteral<false | true, "required">;
+      error?: VString<string, "required">;
+      value?: VString<string, "optional">;
+    },
+    "required",
+    "success" | "error" | "value"
+  >[];
   if (success.fields.success.value !== true) {
     [success, error] = [error, success];
   }
@@ -40,82 +49,95 @@ test("resultValidator is exported as the correct type", async () => {
   expect(error.fields.error!.kind).toBe("string");
 });
 
-
 test("schema validates successful results correctly", async () => {
-  await expect(addDocuments(responseAdminClient, "llm_calls", [
-    {
-      prompt: "What is the capital of France?",
-      result: {
-        success: true,
-        value: "Paris"
-      }
-    }
-  ])).resolves.toBeUndefined();
+  await expect(
+    addDocuments(responseAdminClient, "llm_calls", [
+      {
+        prompt: "What is the capital of France?",
+        result: {
+          success: true,
+          value: "Paris",
+        },
+      },
+    ]),
+  ).resolves.toBeUndefined();
 
-  await expect(addDocuments(responseAdminClient, "api_calls", [
-    {
-      url: "https://api.example.com/data",
-      result: {
-        success: true,
-        value: "response data"
-      }
-    }
-  ])).resolves.toBeUndefined();
+  await expect(
+    addDocuments(responseAdminClient, "api_calls", [
+      {
+        url: "https://api.example.com/data",
+        result: {
+          success: true,
+          value: "response data",
+        },
+      },
+    ]),
+  ).resolves.toBeUndefined();
 });
 
 test("schema validates error results correctly", async () => {
-  await expect(addDocuments(responseAdminClient, "llm_calls", [
-    {
-      prompt: "Invalid prompt",
-      result: {
-        success: false,
-        error: "Failed to process prompt"
-      }
-    }
-  ])).resolves.toBeUndefined();
+  await expect(
+    addDocuments(responseAdminClient, "llm_calls", [
+      {
+        prompt: "Invalid prompt",
+        result: {
+          success: false,
+          error: "Failed to process prompt",
+        },
+      },
+    ]),
+  ).resolves.toBeUndefined();
 
-  await expect(addDocuments(responseAdminClient, "api_calls", [
-    {
-      url: "https://api.example.com/invalid",
-      result: {
-        success: false,
-        error: "404 Not Found"
-      }
-    }
-  ])).resolves.toBeUndefined();
+  await expect(
+    addDocuments(responseAdminClient, "api_calls", [
+      {
+        url: "https://api.example.com/invalid",
+        result: {
+          success: false,
+          error: "404 Not Found",
+        },
+      },
+    ]),
+  ).resolves.toBeUndefined();
 });
 
 test("schema rejects invalid result formats", async () => {
   // Missing required fields
-  await expect(addDocuments(responseAdminClient, "llm_calls", [
-    {
-      prompt: "test",
-      result: {
-        success: true
-        // missing value field
-      }
-    }
-  ])).rejects.toThrow();
+  await expect(
+    addDocuments(responseAdminClient, "llm_calls", [
+      {
+        prompt: "test",
+        result: {
+          success: true,
+          // missing value field
+        },
+      },
+    ]),
+  ).rejects.toThrow();
 
   // Wrong field types
-  await expect(addDocuments(responseAdminClient, "api_calls", [
-    {
-      url: "https://example.com",
-      result: {
-        success: false,
-        error: 123 // should be string
-      }
-    }
-  ])).rejects.toThrow();
+  await expect(
+    addDocuments(responseAdminClient, "api_calls", [
+      {
+        url: "https://example.com",
+        result: {
+          success: false,
+          error: 123, // should be string
+        },
+      },
+    ]),
+  ).rejects.toThrow();
 
   // Invalid success value
-  await expect(addDocuments(responseAdminClient, "llm_calls", [
-    {
-      prompt: "test",
-      result: {
-        success: "yes", // should be boolean literal
-        value: "test"
-      }
-    }
-  ])).rejects.toThrow();
-});
\ No newline at end of file
+  await expect(
+    addDocuments(responseAdminClient, "llm_calls", [
+      {
+        prompt: "test",
+        result: {
+          success: "yes", // should be boolean literal
+          value: "test",
+        },
+      },
+    ]),
+  ).rejects.toThrow();
+});
diff --git a/evals/001-data_modeling/012-denormalize_for_index/grader.test.ts b/evals/001-data_modeling/012-denormalize_for_index/grader.test.ts
index 8b930a4..4b783f5 100644
--- a/evals/001-data_modeling/012-denormalize_for_index/grader.test.ts
+++ b/evals/001-data_modeling/012-denormalize_for_index/grader.test.ts
@@ -8,6 +8,9 @@ import {
 } from "../../../grader";
 import { api } from "./answer/convex/_generated/api";
-import { beforeEach } from "node:test";
+import { beforeEach } from "vitest";
+import { createAIGraderTest } from "../../../grader/aiGrader";
+
+createAIGraderTest(import.meta.url);
 
 type IdOwners = string & { __tableName: "owners" };
 type DogRow = {
diff --git a/evals/002-queries/006-three_level_join/grader.test.ts b/evals/002-queries/006-three_level_join/grader.test.ts
index 03b8130..debf143 100644
--- a/evals/002-queries/006-three_level_join/grader.test.ts
+++ b/evals/002-queries/006-three_level_join/grader.test.ts
@@ -7,6 +7,9 @@ import {
   listTable,
 } from "../../../grader";
 import { api } from "./answer/convex/_generated/api";
+import { createAIGraderTest } from "../../../grader/aiGrader";
+
+createAIGraderTest(import.meta.url);
 
 test("compare schema", async ({ skip }) => {
   await compareSchema(skip);
diff --git a/evals/002-queries/007-aggregation/grader.test.ts b/evals/002-queries/007-aggregation/grader.test.ts
index 1e17a3e..93d9f71 100644
--- a/evals/002-queries/007-aggregation/grader.test.ts
+++ b/evals/002-queries/007-aggregation/grader.test.ts
@@ -6,6 +6,9 @@ import {
   addDocuments,
 } from "../../../grader";
 import { anyApi } from "convex/server";
+import { createAIGraderTest } from "../../../grader/aiGrader";
+
+createAIGraderTest(import.meta.url);
 
 test("compare schema", async ({ skip }) => {
   await compareSchema(skip);
diff --git a/evals/002-queries/008-group_by/grader.test.ts b/evals/002-queries/008-group_by/grader.test.ts
index 6a78125..dc5feb9 100644
--- a/evals/002-queries/008-group_by/grader.test.ts
+++ b/evals/002-queries/008-group_by/grader.test.ts
@@ -6,6 +6,9 @@ import {
   addDocuments,
 } from "../../../grader";
 import { anyApi } from "convex/server";
+import { createAIGraderTest } from "../../../grader/aiGrader";
+
+createAIGraderTest(import.meta.url);
 
 test("compare schema", async ({ skip }) => {
   await compareSchema(skip);
diff --git a/evals/002-queries/010-parallel_fetch/grader.test.ts b/evals/002-queries/010-parallel_fetch/grader.test.ts
index 7288f12..eb74f6a 100644
--- a/evals/002-queries/010-parallel_fetch/grader.test.ts
+++ b/evals/002-queries/010-parallel_fetch/grader.test.ts
@@ -7,6 +7,9 @@ import {
   listTable,
 } from "../../../grader";
 import { anyApi } from "convex/server";
+import { createAIGraderTest } from "../../../grader/aiGrader";
+
+createAIGraderTest(import.meta.url);
 
 test("compare schema", async ({ skip }) => {
   await compareSchema(skip);
diff --git a/evals/002-queries/011-denormalize_pagination/grader.test.ts b/evals/002-queries/011-denormalize_pagination/grader.test.ts
index a75d600..310f720 100644
--- a/evals/002-queries/011-denormalize_pagination/grader.test.ts
+++ b/evals/002-queries/011-denormalize_pagination/grader.test.ts
@@ -10,6 +10,9 @@ import {
 import { api } from "./answer/convex/_generated/api";
 import { Doc } from "./answer/convex/_generated/dataModel";
 import { beforeEach } from "vitest";
+import { createAIGraderTest } from "../../../grader/aiGrader";
+
+createAIGraderTest(import.meta.url);
 
 beforeEach(async () => {
   await deleteAllDocuments(responseAdminClient, ["dogs", "owners"]);
diff --git a/evals/002-queries/012-index_and_filter/grader.test.ts b/evals/002-queries/012-index_and_filter/grader.test.ts
index 3766028..a50dccb 100644
--- a/evals/002-queries/012-index_and_filter/grader.test.ts
+++ b/evals/002-queries/012-index_and_filter/grader.test.ts
@@ -7,6 +7,9 @@ import {
   deleteAllDocuments,
 } from "../../../grader";
 import { api } from "./answer/convex/_generated/api";
+import { createAIGraderTest } from "../../../grader/aiGrader";
+
+createAIGraderTest(import.meta.url);
 
 afterEach(async () => {
   await deleteAllDocuments(responseAdminClient, ["users"]);
diff --git a/evals/002-queries/013-async_iterator_filter/grader.test.ts b/evals/002-queries/013-async_iterator_filter/grader.test.ts
index 157a4ff..1178678 100644
--- a/evals/002-queries/013-async_iterator_filter/grader.test.ts
+++ b/evals/002-queries/013-async_iterator_filter/grader.test.ts
@@ -10,6 +10,9 @@ import {
 import { api } from "./answer/convex/_generated/api";
 import { Doc } from "./answer/convex/_generated/dataModel";
 import { beforeEach } from "vitest";
+import { createAIGraderTest } from "../../../grader/aiGrader";
+
+createAIGraderTest(import.meta.url);
 
 beforeEach(async () => {
   await deleteAllDocuments(responseAdminClient, ["teams", "users"]);
diff --git a/evals/002-queries/014-select_distinct/grader.test.ts b/evals/002-queries/014-select_distinct/grader.test.ts
index 5a36a98..0f1605b 100644
--- a/evals/002-queries/014-select_distinct/grader.test.ts
+++ b/evals/002-queries/014-select_distinct/grader.test.ts
@@ -8,6 +8,9 @@ import {
 } from "../../../grader";
 import { api } from "./answer/convex/_generated/api";
 import { beforeEach } from "vitest";
+import { createAIGraderTest } from "../../../grader/aiGrader";
+
+createAIGraderTest(import.meta.url);
 
 beforeEach(async () => {
   await deleteAllDocuments(responseAdminClient, ["users"]);
diff --git a/evals/002-queries/017-pagination_join/grader.test.ts b/evals/002-queries/017-pagination_join/grader.test.ts
index 8f0701d..870aeac 100644
--- a/evals/002-queries/017-pagination_join/grader.test.ts
+++ b/evals/002-queries/017-pagination_join/grader.test.ts
@@ -8,6 +8,9 @@ import {
   listTable,
 } from "../../../grader";
 import { api } from "./answer/convex/_generated/api";
 import { Doc } from "./answer/convex/_generated/dataModel";
 import { beforeEach } from "vitest";
+import { createAIGraderTest } from "../../../grader/aiGrader";
+
+createAIGraderTest(import.meta.url);
 
diff --git a/evals/002-queries/018-pagination_returns_validator/grader.test.ts b/evals/002-queries/018-pagination_returns_validator/grader.test.ts
index 1963f88..a996d0b 100644
--- a/evals/002-queries/018-pagination_returns_validator/grader.test.ts
+++ b/evals/002-queries/018-pagination_returns_validator/grader.test.ts
@@ -10,6 +10,9 @@ import { api } from "./answer/convex/_generated/api";
 import { beforeEach } from "vitest";
 import { PaginationResult } from "convex/server";
 import { Doc } from "./answer/convex/_generated/dataModel";
+import { createAIGraderTest } from "../../../grader/aiGrader";
+
+createAIGraderTest(import.meta.url);
 
 beforeEach(async () => {
   await deleteAllDocuments(responseAdminClient, ["posts"]);
diff --git a/evals/002-queries/019-no_scheduler/grader.test.ts b/evals/002-queries/019-no_scheduler/grader.test.ts
index f2f89de..14321da 100644
--- a/evals/002-queries/019-no_scheduler/grader.test.ts
+++ b/evals/002-queries/019-no_scheduler/grader.test.ts
@@ -9,6 +9,9 @@ import {
 } from "../../../grader";
 import { api } from "./answer/convex/_generated/api";
 import { beforeEach } from "vitest";
 import { Doc } from "./answer/convex/_generated/dataModel";
+import { createAIGraderTest } from "../../../grader/aiGrader";
+
+createAIGraderTest(import.meta.url);
 
 beforeEach(async () => {
diff --git a/evals/002-queries/020-text_search_join/grader.test.ts b/evals/002-queries/020-text_search_join/grader.test.ts
index 1bff46d..30f92d5 100644
--- a/evals/002-queries/020-text_search_join/grader.test.ts
+++ b/evals/002-queries/020-text_search_join/grader.test.ts
@@ -8,6 +8,9 @@ import {
   listTable,
 } from "../../../grader";
 import { api } from "./answer/convex/_generated/api";
 import { beforeEach } from "vitest";
 import { Doc } from "./answer/convex/_generated/dataModel";
+import { createAIGraderTest } from "../../../grader/aiGrader";
+
+createAIGraderTest(import.meta.url);
 
diff --git a/evals/002-queries/021-intersection/grader.test.ts b/evals/002-queries/021-intersection/grader.test.ts
index 1c677d5..7ed1483 100644
--- a/evals/002-queries/021-intersection/grader.test.ts
+++ b/evals/002-queries/021-intersection/grader.test.ts
@@ -8,6 +8,9 @@ import {
   listTable,
 } from "../../../grader";
 import { api } from "./answer/convex/_generated/api";
 import { beforeEach } from "vitest";
 import { Doc } from "./answer/convex/_generated/dataModel";
+import { createAIGraderTest } from "../../../grader/aiGrader";
+
+createAIGraderTest(import.meta.url);
 
diff --git a/evals/003-mutations/003-patch_nested/grader.test.ts b/evals/003-mutations/003-patch_nested/grader.test.ts
index 7073f27..ddda0f3 100644
--- a/evals/003-mutations/003-patch_nested/grader.test.ts
+++ b/evals/003-mutations/003-patch_nested/grader.test.ts
@@ -7,6 +7,9 @@ import {
 } from "../../../grader";
 import { api } from "./answer/convex/_generated/api";
 import { beforeEach } from "vitest";
+import { createAIGraderTest } from "../../../grader/aiGrader";
+
+createAIGraderTest(import.meta.url);
 
 beforeEach(async () => {
   await deleteAllDocuments(responseAdminClient, ["documents"]);
diff --git a/evals/003-mutations/004-cascade_delete/grader.test.ts b/evals/003-mutations/004-cascade_delete/grader.test.ts
index afffdf7..a8efe44 100644
--- a/evals/003-mutations/004-cascade_delete/grader.test.ts
+++ b/evals/003-mutations/004-cascade_delete/grader.test.ts
@@ -8,6 +8,9 @@ import {
   listTable,
 } from "../../../grader";
 import { api } from "./answer/convex/_generated/api";
 import { beforeEach } from "vitest";
 import { Doc } from "./answer/convex/_generated/dataModel";
+import { createAIGraderTest } from "../../../grader/aiGrader";
+
+createAIGraderTest(import.meta.url);
 
diff --git a/evals/003-mutations/005-cascade_delete_nested/grader.test.ts b/evals/003-mutations/005-cascade_delete_nested/grader.test.ts
index 511e0bb..5c7fac5 100644
--- a/evals/003-mutations/005-cascade_delete_nested/grader.test.ts
+++ b/evals/003-mutations/005-cascade_delete_nested/grader.test.ts
@@ -9,8 +9,11 @@ import {
 } from "../../../grader";
 import { api } from "./answer/convex/_generated/api";
 import { beforeEach } from "vitest";
+import { createAIGraderTest } from "../../../grader/aiGrader";
 import { Doc, Id } from "./answer/convex/_generated/dataModel";
 
+createAIGraderTest(import.meta.url);
+
 beforeEach(async () => {
   await deleteAllDocuments(responseAdminClient, [
     "users",
diff --git a/evals/003-mutations/006-no_storage/grader.test.ts b/evals/003-mutations/006-no_storage/grader.test.ts
index 4c3376b..e6bf28b 100644
--- a/evals/003-mutations/006-no_storage/grader.test.ts
+++ b/evals/003-mutations/006-no_storage/grader.test.ts
@@ -8,6 +8,9 @@ import {
 } from "../../../grader";
 import { api } from "./answer/convex/_generated/api";
 import { beforeEach } from "vitest";
 import { Doc } from "./answer/convex/_generated/dataModel";
+import { createAIGraderTest } from "../../../grader/aiGrader";
+
+createAIGraderTest(import.meta.url);
 
 beforeEach(async () => {
diff --git a/evals/004-actions/001-run_mutation/grader.test.ts b/evals/004-actions/001-run_mutation/grader.test.ts
index e67bf3f..f684ad4 100644
--- a/evals/004-actions/001-run_mutation/grader.test.ts
+++ b/evals/004-actions/001-run_mutation/grader.test.ts
@@ -9,6 +9,9 @@ import {
 import { api } from "./answer/convex/_generated/api";
 import { beforeEach } from "vitest";
 import { Doc } from "./answer/convex/_generated/dataModel";
+import { createAIGraderTest } from "../../../grader/aiGrader";
+
+createAIGraderTest(import.meta.url);
 
 beforeEach(async () => {
   await deleteAllDocuments(responseAdminClient, ["fetchResults"]);
diff --git a/evals/004-actions/002-run_query_mutation/grader.test.ts b/evals/004-actions/002-run_query_mutation/grader.test.ts
index 433f592..1348ce7 100644
--- a/evals/004-actions/002-run_query_mutation/grader.test.ts
+++ b/evals/004-actions/002-run_query_mutation/grader.test.ts
@@ -7,6 +7,9 @@ import {
   listTable,
 } from "../../../grader";
 import { api } from "./answer/convex/_generated/api";
 import { beforeEach } from "vitest";
 import { Doc } from "./answer/convex/_generated/dataModel";
+import { createAIGraderTest } from "../../../grader/aiGrader";
+
+createAIGraderTest(import.meta.url);
 
diff --git a/evals/004-actions/003-mutation_schedule_action/grader.test.ts b/evals/004-actions/003-mutation_schedule_action/grader.test.ts
index 88f883d..6ae9662 100644
--- a/evals/004-actions/003-mutation_schedule_action/grader.test.ts
+++ b/evals/004-actions/003-mutation_schedule_action/grader.test.ts
@@ -8,6 +8,9 @@ import {
 } from "../../../grader";
 import { api } from "./answer/convex/_generated/api";
 import { beforeEach } from "vitest";
 import { Doc } from "./answer/convex/_generated/dataModel";
+import { createAIGraderTest } from "../../../grader/aiGrader";
+
+createAIGraderTest(import.meta.url);
 
 beforeEach(async () => {
diff --git a/evals/004-actions/004-storage/grader.test.ts b/evals/004-actions/004-storage/grader.test.ts
index 9e65164..5e351f6 100644
--- a/evals/004-actions/004-storage/grader.test.ts
+++ b/evals/004-actions/004-storage/grader.test.ts
@@ -1,8 +1,11 @@
 import { expect, test } from "vitest";
 import { responseClient } from "../../../grader";
 import { api } from "./answer/convex/_generated/api";
+import { createAIGraderTest } from "../../../grader/aiGrader";
 import { Id } from "./answer/convex/_generated/dataModel";
 
+createAIGraderTest(import.meta.url);
+
 test("writes and reads text content", async () => {
   const testText = "Hello, world!";
 
diff --git a/evals/004-actions/005-storage_http_action/grader.test.ts b/evals/004-actions/005-storage_http_action/grader.test.ts
index 53d70a4..7b7a670 100644
--- a/evals/004-actions/005-storage_http_action/grader.test.ts
+++ b/evals/004-actions/005-storage_http_action/grader.test.ts
@@ -1,6 +1,9 @@
 import { expect, test } from "vitest";
 import { responseAdminClient } from "../../../grader";
 import { api } from "./answer/convex/_generated/api";
 import { getSiteURL } from "./answer/convex/http";
+import { createAIGraderTest } from "../../../grader/aiGrader";
+
+createAIGraderTest(import.meta.url);
 
 async function getStoreURL(): Promise<string> {
diff --git a/evals/004-actions/006-node/grader.test.ts b/evals/004-actions/006-node/grader.test.ts
index dbed3e0..51ac3b5 100644
--- a/evals/004-actions/006-node/grader.test.ts
+++ b/evals/004-actions/006-node/grader.test.ts
@@ -1,6 +1,9 @@
 import { expect, test } from "vitest";
 import { responseClient } from "../../../grader";
 import { api } from "./answer/convex/_generated/api";
+import { createAIGraderTest } from "../../../grader/aiGrader";
+
+createAIGraderTest(import.meta.url);
 
 test("processes string input correctly", async () => {
   const result = await responseClient.action(api.index.processWithNode, {
diff --git a/evals/004-actions/007-http_action_routing/grader.test.ts b/evals/004-actions/007-http_action_routing/grader.test.ts
index 6302c28..163fcf0 100644
--- a/evals/004-actions/007-http_action_routing/grader.test.ts
+++ b/evals/004-actions/007-http_action_routing/grader.test.ts
@@ -1,6 +1,9 @@
 import { expect, test } from "vitest";
 import { responseAdminClient } from "../../../grader";
 import { api } from "./answer/convex/_generated/api";
+import { createAIGraderTest } from "../../../grader/aiGrader";
+
+createAIGraderTest(import.meta.url);
 
 async function getBaseURL(): Promise<string> {
   return await responseAdminClient.query(api.http.getSiteURL, {});
diff --git a/evals/005-idioms/001-file_organization/grader.test.ts b/evals/005-idioms/001-file_organization/grader.test.ts
index 0fc40d7..87ef715 100644
--- a/evals/005-idioms/001-file_organization/grader.test.ts
+++ b/evals/005-idioms/001-file_organization/grader.test.ts
@@ -1,6 +1,9 @@
 import { expect, test } from "vitest";
 import { responseClient } from "../../../grader";
 import { api } from "./answer/convex/_generated/api";
+import { createAIGraderTest } from "../../../grader/aiGrader";
+
+createAIGraderTest(import.meta.url);
 
 test("can create and get user", async () => {
   const userData = {
diff --git a/evals/005-idioms/002-batch_queries/grader.test.ts b/evals/005-idioms/002-batch_queries/grader.test.ts
index c7677c2..dd644c8 100644
--- a/evals/005-idioms/002-batch_queries/grader.test.ts
+++ b/evals/005-idioms/002-batch_queries/grader.test.ts
@@ -10,6 +10,9 @@ import {
 import { api, internal } from "./answer/convex/_generated/api";
 import { Doc, Id } from "./answer/convex/_generated/dataModel";
 import { beforeEach } from "vitest";
+import { createAIGraderTest } from "../../../grader/aiGrader";
+
+createAIGraderTest(import.meta.url);
 
 beforeEach(async () => {
   await deleteAllDocuments(responseAdminClient, ["users", "posts"]);
@@ -19,12 +22,18 @@ test("compare schema", async ({ skip }) => {
   await compareSchema(skip);
 });
 
-async function setupTestData(): Promise<{ userId: Id<"users">, postIds: Id<"posts">[] }> {
+async function setupTestData(): Promise<{
+  userId: Id<"users">;
+  postIds: Id<"posts">[];
+}> {
   // Create a test user
   await addDocuments(responseAdminClient, "users", [
     { name: "Test User", email: "test@example.com" },
   ]);
-  const users = await listTable(responseAdminClient, "users") as Doc<"users">[];
+  const users = (await listTable(
+    responseAdminClient,
+    "users",
+  )) as Doc<"users">[];
   const userId = users[0]._id;
 
   // Create some test posts
@@ -32,8 +41,11 @@ async function setupTestData(): Promise<{ userId: Id<"users">, postIds: Id<"post
     { userId, content: "Post 1" },
     { userId, content: "Post 2" },
   ]);
-  const posts = await listTable(responseAdminClient, "posts") as Doc<"posts">[];
-  const [post1Id, post2Id] = posts.map(p => p._id);
+  const posts = (await listTable(
+    responseAdminClient,
+    "posts",
+  )) as Doc<"posts">[];
+  const [post1Id, post2Id] = posts.map((p) => p._id);
 
   return { userId, postIds: [post1Id, post2Id] };
 }
@@ -42,9 +54,12 @@ test("getUserByEmail returns correct user", async () => {
   const { userId } = await setupTestData();
 
   /* eslint-disable */
-  const user = await responseAdminClient.query(internal.users.getUserByEmail as any, {
-    email: "test@example.com",
-  });
+  const user = await responseAdminClient.query(
+    internal.users.getUserByEmail as any,
+    {
+      email: "test@example.com",
+    },
+  );
   /* eslint-enable */
 
   expect(user).toBeDefined();
@@ -56,9 +71,12 @@ test("getUserByEmail returns correct user", async () => {
 
 test("getUserByEmail returns null for non-existent user", async () => {
   /* eslint-disable */
-  const user = await responseAdminClient.query(internal.users.getUserByEmail as any, {
-    email: "nonexistent@example.com",
-  });
+  const user = await responseAdminClient.query(
+    internal.users.getUserByEmail as any,
+    {
+      email: "nonexistent@example.com",
+    },
+  );
   /* eslint-enable */
   expect(user).toBeNull();
 });
diff --git a/evals/006-clients/000-use_query/grader.test.ts b/evals/006-clients/000-use_query/grader.test.ts
index 8b0acbc..d044af3 100644
--- a/evals/006-clients/000-use_query/grader.test.ts
+++ b/evals/006-clients/000-use_query/grader.test.ts
@@ -9,6 +9,9 @@ import {
 } from "../../../grader";
 import { api } from "./answer/convex/_generated/api";
 import { beforeEach } from "vitest";
+import { createAIGraderTest } from "../../../grader/aiGrader";
+
+createAIGraderTest(import.meta.url);
 
 beforeEach(async () => {
   await deleteAllDocuments(responseAdminClient, ["messages"]);
diff --git a/evals/006-clients/001-use_mutation/grader.test.ts b/evals/006-clients/001-use_mutation/grader.test.ts
index f09ab5b..333b790 100644
--- a/evals/006-clients/001-use_mutation/grader.test.ts
+++ b/evals/006-clients/001-use_mutation/grader.test.ts
@@ -8,6 +8,9 @@ import {
   listTable,
 } from "../../../grader";
 import { api } from "./answer/convex/_generated/api";
+import { createAIGraderTest } from "../../../grader/aiGrader";
+
+createAIGraderTest(import.meta.url);
 
 beforeEach(async () => {
   // Clear the messages table before each test
diff --git a/evals/006-clients/002-use_paginated_query/grader.test.ts b/evals/006-clients/002-use_paginated_query/grader.test.ts
index 0228a97..5c71867 100644
--- a/evals/006-clients/002-use_paginated_query/grader.test.ts
+++ b/evals/006-clients/002-use_paginated_query/grader.test.ts
@@ -9,6 +9,9 @@ import {
   listTable,
 } from "../../../grader";
 import { api } from "./answer/convex/_generated/api";
+import { createAIGraderTest } from "../../../grader/aiGrader";
+
+createAIGraderTest(import.meta.url);
 
 test("compare function spec", async ({ skip }) => {
   await compareFunctionSpec(skip);
diff --git a/grader/aiGrader.ts b/grader/aiGrader.ts
new file mode 100644
index 0000000..fcf840d
--- /dev/null
+++ b/grader/aiGrader.ts
@@ -0,0 +1,298 @@
+/* eslint-disable */
+
+import { readdirSync, readFileSync, statSync } from "node:fs";
+import { join, relative } from "node:path";
+import { tmpdir } from "node:os";
+import { fileURLToPath } from "node:url";
+import { generateObject } from "ai";
+import { createOpenAI } from "@ai-sdk/openai";
+import { z } from "zod";
+import { test } from "vitest";
+
+type EvalInfo = {
+  category: string; // path segment directly after "evals"
+  name: string; // path segment after the category
+  testFilePath: string; // filesystem path of the grader test file
+};
+
+type GradeResult = {
+  result: "pass" | "fail"; // verdict returned by the model
+  reasoning: string; // model's explanation for the verdict
+};
+
+/**
+ * Derive the eval category/name from a grader test file URL and load TASK.txt.
+ * Expects .../evals/<category>/<name>/grader.test.ts(x); throws otherwise.
+ */
+function getTask(testFileUrl: string): {
+  evalInfo: EvalInfo;
+  taskContent: string;
+} {
+  const testFilePath = fileURLToPath(testFileUrl);
+  const parts = testFilePath.replace(/\\/g, "/").split("/");
+  const evalsIdx = parts.lastIndexOf("evals");
+  // "evals" must be followed by <category>/<name>/<file>: three more segments.
+  if (evalsIdx < 0 || parts.length < evalsIdx + 4)
+    throw new Error(
+      `Could not derive eval category/name from path: ${testFilePath}`,
+    );
+  const evalInfo: EvalInfo = {
+    category: parts[evalsIdx + 1],
+    name: parts[evalsIdx + 2],
+    testFilePath,
+  };
+
+  // TASK.txt lives next to the grader. If the file name is unexpected the
+  // replace is a no-op, so bail out rather than read the test file as the task.
+  const taskPath = testFilePath.replace(/grader\.test\.tsx?$/, "TASK.txt");
+  if (taskPath === testFilePath)
+    throw new Error(`Not a grader test file: ${testFilePath}`);
+  let taskContent: string;
+  try {
+    taskContent = readFileSync(taskPath, { encoding: "utf-8" });
+  } catch {
+    throw new Error(`TASK.txt not found at expected path: ${taskPath}`);
+  }
+  return { evalInfo, taskContent };
+}
+
+/**
+ * Locate the newest generated project dir for this eval under the OS temp dir and list its answer files (sorted).
+ */
+function gatherAnswerFiles(evalInfo: EvalInfo): {
+  filePaths: string[];
+  outputProjectDir: string;
+} {
+  const { category, name } = evalInfo;
+
+  // Candidates are <tmpdir>/<any dir>/output/<subdir>/<category>/<name> (subdirs are called model dirs here)
+  const candidateRoots = [] as { dir: string; mtime: number }[];
+  const tdir = tmpdir();
+
+  for (const entry of readdirSync(tdir, { withFileTypes: true })) {
+    if (!entry.isDirectory()) continue;
+    const root = join(tdir, entry.name, "output");
+    try {
+      const models = readdirSync(root, { withFileTypes: true }).filter((d) =>
+        d.isDirectory(),
+      );
+      for (const modelDir of models) {
+        const projectDir = join(root, modelDir.name, category, name);
+        try {
+          const st = statSync(projectDir);
+          if (st.isDirectory())
+            candidateRoots.push({ dir: projectDir, mtime: st.mtimeMs });
+        } catch {
+          // statSync failed: this model dir has no project for this eval
+        }
+      }
+    } catch {
+      // readdirSync failed: this temp entry has no readable "output" dir
+    }
+  }
+
+  if (candidateRoots.length === 0)
+    throw new Error(
+      `Could not find output directory for ${category}/${name} under ${tdir}`,
+    );
+
+  candidateRoots.sort((a, b) => b.mtime - a.mtime); // newest first
+  const outputProjectDir = candidateRoots[0].dir; // NOTE(review): newest match wins across all runs/model dirs — confirm intended
+
+  // Walk the project tree, leaving out dependencies, generated code, and the log/config/lock/env files below
+  const excludedFileNames = new Set([
+    "run.log",
+    "tsconfig.json",
+    "bun.lock",
+    ".env.local",
+  ]);
+  const shouldInclude = (fullPath: string): boolean => {
+    const rel = relative(outputProjectDir, fullPath).replace(/\\/g, "/");
+    if (rel.startsWith("node_modules/")) return false;
+    if (rel.startsWith("convex/_generated/")) return false;
+    if (rel === "convex/README.md") return false;
+    if (rel === "convex/tsconfig.json") return false;
+    const base = rel.split("/").pop() ?? "";
+    if (excludedFileNames.has(base)) return false;
+    return true;
+  };
+
+  const stack: string[] = [outputProjectDir];
+  const filePaths: string[] = [];
+
+  while (stack.length > 0) { // iterative walk: directories are pushed and popped from the same end
+    const dir = stack.pop() as string;
+    for (const de of readdirSync(dir, { withFileTypes: true })) {
+      const full = join(dir, de.name);
+      if (de.isDirectory()) {
+        if (de.name === "node_modules") continue; // skipped at any depth
+        if (
+          de.name === "_generated" &&
+          dir.replace(/\\/g, "/").endsWith("/convex")
+        )
+          continue;
+        stack.push(full);
+      } else {
+        if (shouldInclude(full)) filePaths.push(full);
+      }
+    }
+  }
+
+  filePaths.sort(); // deterministic order regardless of traversal order
+  return { filePaths, outputProjectDir };
+}
+
+/**
+ * Render the answer files as a JSON manifest followed by delimited file blocks.
+ */
+function concatenateAnswerFiles(
+  filePaths: string[],
+  outputProjectDir: string,
+): string {
+  if (filePaths.length === 0) return "";
+
+  // Suffix -> fence language, checked in order; unknown suffixes fall back to "text".
+  const langBySuffix: [string, string][] = [
+    [".ts", "ts"],
+    [".tsx", "tsx"],
+    [".js", "js"],
+    [".jsx", "jsx"],
+    [".json", "json"],
+    [".md", "md"],
+    [".sql", "sql"],
+    [".py", "python"],
+    [".txt", "text"],
+  ];
+  const getLang = (path: string): string => {
+    const lower = path.toLowerCase();
+    const hit = langBySuffix.find(([suffix]) => lower.endsWith(suffix));
+    return hit ? hit[1] : "text";
+  };
+
+  type ManifestEntry = {
+    path: string;
+    language: string;
+    bytes: number;
+    lines: number;
+  };
+  const manifest: ManifestEntry[] = [];
+  const blocks: string[] = [];
+
+  for (const filePath of filePaths) {
+    try {
+      const path = relative(outputProjectDir, filePath).replace(/\\/g, "/");
+      const content = readFileSync(filePath, { encoding: "utf-8" });
+      const lines = content.split(/\r?\n/).length;
+      const bytes = Buffer.byteLength(content, "utf-8");
+      const language = getLang(path);
+      manifest.push({ path, language, bytes, lines });
+      blocks.push(
+        [
+          `\n<<<FILE_START path="${path}" lang="${language}" lines=${lines} bytes=${bytes}>>>\n`,
+          `\n\u0060\u0060\u0060${language}\n`,
+          `${content}\n`,
+          `\u0060\u0060\u0060\n`,
+          `<<<FILE_END>>>\n`,
+        ].join(""),
+      );
+    } catch {
+      // skip unreadable files
+    }
+  }
+
+  const manifestJson = JSON.stringify(manifest, null, 2);
+  const header = `FILES MANIFEST (JSON)\n\u0060\u0060\u0060json\n${manifestJson}\n\u0060\u0060\u0060\n`;
+
+  // Append blocks while they fit; a block that does not fit is counted, never truncated.
+  const LIMIT = 300_000;
+  let result = header;
+  let omitted = 0;
+  for (const block of blocks) {
+    if (result.length + block.length <= LIMIT) result += block;
+    else omitted += 1;
+  }
+  if (omitted > 0)
+    result += `\n[omitted ${omitted} file block(s) due to length limit]\n`;
+  return result;
+}
+
+/**
+ * Ask the model for a structured pass/fail verdict on the submission.
+ * Throws if OPENAI_API_KEY is unset; model and validation errors propagate.
+ */
+async function generateGrade(
+  taskContent: string,
+  concatenated: string,
+): Promise<GradeResult> {
+  const apiKey = process.env.OPENAI_API_KEY;
+  if (!apiKey) throw new Error("OPENAI_API_KEY is not set");
+
+  const prompt = `You are grading an autogenerated Convex backend submission.
+
+Task assignment (verbatim from TASK.txt):
+---
+${taskContent}
+---
+
+You are given a manifest and a sequence of file blocks with explicit boundaries.
+Instructions:
+- Treat each file block independently; do not merge or infer code across files.
+- Only use the content inside a given block when assessing that file.
+- If files are omitted due to length limits, grade only on what is present.
+- When referencing code, cite the file path from the manifest.
+
+File data:
+---
+${concatenated}
+---
+
+Decide if the output fully satisfies the task requirements. Provide a short reasoning and a final grade.`;
+
+  const openai = createOpenAI({ apiKey });
+
+  // Structured output the model is asked to produce.
+  const schema = z.object({
+    reasoning: z.string(),
+    grade: z.enum(["pass", "fail"]),
+  });
+
+  const { object } = await generateObject({
+    model: openai("gpt-5-mini"),
+    schema,
+    prompt,
+  });
+  // Validate the returned object against the same schema before using it.
+  const { grade, reasoning } = schema.parse(object);
+
+  return { result: grade, reasoning };
+}
+
+/**
+ * Grade the eval that owns `testFileUrl` with the AI grader. The verdict and
+ * reasoning are always logged; a non-passing verdict is thrown as an Error.
+ */
+export async function expectAIGraderPass(testFileUrl: string): Promise<void> {
+  const { evalInfo, taskContent } = getTask(testFileUrl);
+  const answer = gatherAnswerFiles(evalInfo);
+  const fileData = concatenateAnswerFiles(answer.filePaths, answer.outputProjectDir);
+  const grade = await generateGrade(taskContent, fileData);
+  const passed = grade.result === "pass";
+  const label = `${evalInfo.category}/${evalInfo.name}`;
+
+  // Log before throwing so the reasoning is printed on both outcomes.
+  console.log(`[AI Grader ${label}] ${passed ? "PASS" : "FAIL"}: ${grade.reasoning}`);
+  if (!passed) throw new Error(`AI grading failed: ${grade.reasoning}`);
+}
+
+/**
+ * Register a Vitest test that runs the AI grader for the given test file URL.
+ * `name` and `timeoutMs` default to "AI grader assessment" and 60000 ms.
+ */
+export function createAIGraderTest(
+  testFileUrl: string,
+  name: string = "AI grader assessment",
+  timeoutMs: number = 60000,
+): void {
+  const runGrader = (): Promise<void> => expectAIGraderPass(testFileUrl);
+  test(name, { timeout: timeoutMs }, runGrader);
+}
diff --git a/grader/index.ts b/grader/index.ts
index 17c9410..6c8db24 100644
--- a/grader/index.ts
+++ b/grader/index.ts
@@ -1,5 +1,6 @@
 import { ConvexClient } from "convex/browser";
 import { expect } from "vitest";
+
 /* eslint-disable @typescript-eslint/no-explicit-any */
 /* eslint-disable @typescript-eslint/no-unsafe-call */
 /* eslint-disable @typescript-eslint/no-unsafe-assignment */
@@ -193,7 +194,7 @@ export function hasIndexForPrefix(
     fieldNames?: string[];
   }[];
   return indexes.some((idx) => {
-    const idxFields = (idx.fields ?? idx.fieldNames ?? []) as string[];
+    const idxFields = idx.fields ?? idx.fieldNames ?? [];
     if (!Array.isArray(idxFields)) return false;
     if (idxFields.length < fieldsPrefix.length) return false;
     for (let i = 0; i < fieldsPrefix.length; i++) {
diff --git a/package.json b/package.json
index c704098..b4f7f8e 100644
--- a/package.json
+++ b/package.json
@@ -18,8 +18,11 @@
     "typescript": "^5.7.3"
   },
   "dependencies": {
+    "@ai-sdk/openai": "^2.0.19",
+    "zod": "^3.23.8",
     "@types/bun": "^1.2.20",
     "@types/node": "^22.12.0",
+    "ai": "^5.0.22",
     "convex": "^1.18.2",
     "dotenv": "^17.2.1",
     "prettier": "^3.4.2",
diff --git a/runner/convex_backend.py b/runner/convex_backend.py
index 0862cc8..05fe645 100644
--- a/runner/convex_backend.py
+++ b/runner/convex_backend.py
@@ -134,10 +134,10 @@ def download_convex_binary():
         if os.path.exists(binary_path):
             return binary_path
 
-        log_info("Latest release:", version)
+        print(f"Latest release: {version}", flush=True)
 
         url = matching_asset["browser_download_url"]
-        log_info("Downloading:", url)
+        print(f"Downloading: {url}", flush=True)
         response = requests.get(url, stream=True)
         response.raise_for_status()
 
@@ -145,7 +145,7 @@ def download_convex_binary():
         with open(zip_path, "wb") as f:
             for chunk in response.iter_content(chunk_size=8192):
                 f.write(chunk)
-        log_info("Downloaded:", matching_asset["name"])
+        print(f"Downloaded: {matching_asset['name']}", flush=True)
 
         # Unzip the file
         with zipfile.ZipFile(zip_path, "r") as zip_ref:
@@ -163,6 +163,6 @@ def download_convex_binary():
 
         # Clean up zip file
         os.remove(zip_path)
-        log_info("Extracted binary to:", binary_path)
+        print(f"Extracted binary to: {binary_path}", flush=True)
 
     return binary_path
diff --git a/runner/reporting.py b/runner/reporting.py
index cc57953..727c0ea 100644
--- a/runner/reporting.py
+++ b/runner/reporting.py
@@ -13,6 +13,9 @@
 
 
 def post_scores_to_convex(model_name: str, category_scores: dict, total_score: float) -> None:
+    # When Braintrust is disabled, also disable reporting to the Convex endpoint
+    if os.getenv("DISABLE_BRAINTRUST") == "1":
+        return
     payload = {"model": model_name, "scores": category_scores, "totalScore": total_score}
     if CONVEX_EVAL_ENDPOINT is not None and CONVEX_AUTH_TOKEN is not None:
         try:
@@ -77,20 +80,20 @@ def report_eval(evaluator, result: EvalResultWithSummary, verbose, jsonl):
 
         # Pretty console summary
         overall_rate = (total_score / total_num_tests) if total_num_tests > 0 else 0
-        log_info("")
-        log_info("=== Eval Summary ===")
-        log_info(f"Model: {results[0].metadata.get('model_name', 'unknown') if results and results[0].metadata else 'unknown'}")
-        log_info(f"Overall: {overall_rate:.2%} ({total_passed} pass, {total_num_tests - total_passed} fail)")
+        print("", flush=True)
+        print("=== Eval Summary ===", flush=True)
+        print(f"Model: {results[0].metadata.get('model_name', 'unknown') if results and results[0].metadata else 'unknown'}", flush=True)
+        print(f"Overall: {overall_rate:.2%} ({total_passed} pass, {total_num_tests - total_passed} fail)", flush=True)
         for category in sorted(num_tests.keys()):
             rate = scores[category] / num_tests[category]
             cat_pass = passed_counts.get(category, 0)
-            log_info(f"- {category}: {rate:.2%} ({cat_pass} pass, {num_tests[category] - cat_pass} fail)")
+            print(f"- {category}: {rate:.2%} ({cat_pass} pass, {num_tests[category] - cat_pass} fail)", flush=True)
 
         # Always write local results; print the path
-        log_info(f"Results written to: {OUTPUT_RESULTS_FILE}")
+        print(f"Results written to: {OUTPUT_RESULTS_FILE}", flush=True)
 
         if jsonl:
-            log_info(json.dumps(summary.as_dict()))
+            print(json.dumps(summary.as_dict()), flush=True)
 
     return len(failing_results) == 0
 
diff --git a/runner/run_grader.py b/runner/run_grader.py
index 56c94d8..de0b718 100644
--- a/runner/run_grader.py
+++ b/runner/run_grader.py
@@ -74,7 +74,7 @@ def run_grader(category: str, name: str, project_dir: str):
         except Exception as e:
             message.append(f"  - Tests fail: {e}")
 
-    log_info("\n".join(message))
+    print("\n".join(message), flush=True)
     return success
 
 
diff --git a/runner/scorer.py b/runner/scorer.py
index 77dccce..949ca59 100644
--- a/runner/scorer.py
+++ b/runner/scorer.py
@@ -3,6 +3,7 @@
 import subprocess
 import json
 import re
+import tempfile
 from braintrust import traced, Score
 from runner.convex_backend import convex_backend, admin_key
 from runner.logging import append_log, append_log_block, log_cmd_results, log_info, run_command_step
@@ -132,6 +133,9 @@ def convex_scorer(model, tempdir, *, input, expected, metadata, output):
                 if isinstance(e, TestsFailedException):
                     scores.append(Score("Tests pass", e.ratio))
                     tests_ratio = e.ratio
+                    # Even on failure, capture and log Vitest stdout (includes console logs)
+                    if getattr(e, "stdout", None) is not None and getattr(e, "cmd", None) is not None:
+                        log_cmd_results(run_log_path, [(e.cmd, e.stdout)], "vitest")
                 else:
                     scores.append(Score("Tests pass", 0))
                     tests_ratio = 0.0
@@ -168,9 +172,11 @@ def convex_scorer(model, tempdir, *, input, expected, metadata, output):
 
 
 class TestsFailedException(Exception):
-    def __init__(self, message, ratio):
+    def __init__(self, message, ratio, stdout=None, cmd=None):
         super().__init__(message)
         self.ratio = ratio
+        self.stdout = stdout  # optional: captured stdout of the failed test run
+        self.cmd = cmd  # optional: the command that was run
 
 
 @traced
@@ -344,12 +350,20 @@ def run_tests(backend, answer_backend, test_file):
     )
     if answer_backend is not None:
         env["CONVEX_ANSWER_PORT"] = str(answer_backend["port"])
+    # Write JSON reporter output to a temp file so stdout can include human output + console logs
+    tmp_json = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
+    tmp_json_path = tmp_json.name
+    tmp_json.close()
+    # Vitest supports multiple reporters; keep JSON (to parse) and default (to include logs on stdout)
     cmd = [
         "bunx",
         "vitest",
         "run",
         test_file,
         "--reporter=json",
+        "--outputFile",
+        tmp_json_path,
+        "--reporter=default",
         "--no-color",
     ]
     done = subprocess.run(
@@ -361,9 +375,15 @@ def run_tests(backend, answer_backend, test_file):
     )
 
     try:
-        # Removes all characters before the first `{` and after the last `}`
-        cleaned_stdout = re.sub(r"^.*?(\{.*\}).*$", r"\1", done.stdout, flags=re.DOTALL)
-        results = json.loads(cleaned_stdout)
+        # Prefer reading structured results from the JSON reporter file
+        results = None
+        try:
+            with open(tmp_json_path, "r", encoding="utf-8") as f:
+                results = json.load(f)
+        except Exception:
+            # Fallback: extract JSON blob from stdout if file missing or invalid
+            cleaned_stdout = re.sub(r"^.*?(\{.*\}).*$", r"\1", done.stdout, flags=re.DOTALL)
+            results = json.loads(cleaned_stdout)
 
         total = results["numTotalTests"]
         passed = results["numPassedTests"]
@@ -374,13 +394,18 @@ def run_tests(backend, answer_backend, test_file):
             raise Exception(f"Failed to run tests:\n{done.stdout}")
         else:
             raise Exception(f"Failed to parse tests results: {e}")
+    finally:
+        try:
+            os.remove(tmp_json_path)
+        except Exception:
+            pass
 
     if ratio != 1:
         error_message = ""
         for test in results["testResults"][0]["assertionResults"]:
             if test["status"] == "failed":
                 error_message += f"{test['title']}: {test['failureMessages']}\n"
-        raise TestsFailedException(f"Tests failed:\n{error_message}", ratio)
+        raise TestsFailedException(f"Tests failed:\n{error_message}", ratio, done.stdout, cmd)
     return ratio, done.stdout, cmd