diff --git a/AGENTS.md b/AGENTS.md
index dd752931..8bbb6afa 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -325,10 +325,14 @@ When working on a GitHub issue, **ALWAYS** follow this workflow:
 # Claim it — label + project roadmap status
 gh issue edit <issue-number> --add-label "in-progress"
 
-# Update project roadmap: set status to "In Progress" and stamp Agent ID
-ITEM_ID=$(gh project item-list 1 --owner EntityProcess --format json | jq -r '.items[] | select(.content.number == <issue-number> and .content.repository == "agentv") | .id')
+# Update project roadmap: ensure the issue is on the AgentV OSS board,
+# then set status to "In progress" and stamp Agent ID
+ITEM_ID=$(gh project item-list 1 --owner EntityProcess --format json | jq -r '.items[] | select(.content.number == <issue-number> and .content.repository == "EntityProcess/agentv") | .id')
+if [ -z "$ITEM_ID" ] || [ "$ITEM_ID" = "null" ]; then
+  ITEM_ID=$(gh project item-add 1 --owner EntityProcess --url "https://github.com/EntityProcess/agentv/issues/<issue-number>" --format json | jq -r '.id')
+fi
 if [ -n "$ITEM_ID" ]; then
-  gh project item-edit --project-id PVT_kwDOAIbbRc4BSmjF --id "$ITEM_ID" --field-id PVTSSF_lADOAIbbRc4BSmjFzhAFomw --single-select-option-id 47fc9ee4
+  gh project item-edit --project-id PVT_kwDOAIbbRc4BSmjF --id "$ITEM_ID" --field-id PVTSSF_lADOAIbbRc4BSmjFzhAFomw --single-select-option-id c3991b20
   gh project item-edit --project-id PVT_kwDOAIbbRc4BSmjF --id "$ITEM_ID" --field-id PVTF_lADOAIbbRc4BSmjFzhAHSnk --text "$AGENT_ID"
 fi
 ```
diff --git a/packages/core/src/evaluation/evaluators/llm-grader.ts b/packages/core/src/evaluation/evaluators/llm-grader.ts
index 05ae50ad..55f906ff 100644
--- a/packages/core/src/evaluation/evaluators/llm-grader.ts
+++ b/packages/core/src/evaluation/evaluators/llm-grader.ts
@@ -145,6 +145,12 @@ const scoreRangeEvaluationSchema = z.object({
 
 export { freeformEvaluationSchema, rubricEvaluationSchema };
 
+interface StructuredGenerationResult {
+  readonly text: string;
+  readonly providerResponse?: ProviderResponse;
+  readonly tokenUsage?: TokenUsage;
+}
+
 export class LlmGraderEvaluator implements Evaluator {
   readonly kind = 'llm-grader';
 
@@ -955,67 +961,126 @@ export class LlmGraderEvaluator implements Evaluator {
     const { context, graderProvider, systemPrompt, userPrompt, schema, images } = options;
 
     let lastError: Error | undefined;
+    let lastInvalidResponse: StructuredGenerationResult | undefined;
+    let shouldAttemptStructureFix = false;
     for (let attempt = 1; attempt <= 3; attempt++) {
       try {
-        // Prefer Vercel AI SDK language model if available.
-        const model = graderProvider.asLanguageModel?.();
-        if (model) {
-          const modelOptions = {
-            ...(this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {}),
-            ...(typeof this.temperature === 'number' ? { temperature: this.temperature } : {}),
-          };
-
-          // When images are present, use multi-part messages instead of plain prompt
-          const hasImages = images && images.length > 0;
-          const result = hasImages
-            ? await generateText({
-                model,
-                system: systemPrompt,
-                messages: [
-                  {
-                    role: 'user' as const,
-                    content: [
-                      { type: 'text' as const, text: userPrompt },
-                      ...toAiSdkImageParts(images),
-                    ],
-                  },
-                ],
-                ...modelOptions,
-              })
-            : await generateText({
-                model,
-                system: systemPrompt,
-                prompt: userPrompt,
-                ...modelOptions,
-              });
-
-          const data = schema.parse(parseJsonFromText(result.text));
-          const rawUsage = result.usage;
-          const tokenUsage =
-            rawUsage?.inputTokens != null && rawUsage?.outputTokens != null
-              ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens }
-              : undefined;
-          return { data, tokenUsage };
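+        // Generation and schema validation are now separate steps: a response
+        // that fails validation but still contains text is kept so the
+        // structure-fix pass below can try to repair it.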
+        const result = await this.generateStructuredResponse({
+          context,
+          graderProvider,
+          systemPrompt,
+          userPrompt,
+          images,
+        });
+        const canRepairResponse = result.text.trim().length > 0;
+        lastInvalidResponse = canRepairResponse ? result : undefined;
+        let data: T;
+        try {
+          data = schema.parse(parseJsonFromText(result.text));
+        } catch (e: unknown) {
+          lastError = e instanceof Error ? e : new Error(String(e));
+          shouldAttemptStructureFix = canRepairResponse;
+          continue;
         }
+        return {
+          data,
+          providerResponse: result.providerResponse,
+          tokenUsage: result.tokenUsage,
+        };
+      } catch (e: unknown) {
+        lastError = e instanceof Error ? e : new Error(String(e));
+      }
+    }
 
-      const response = await graderProvider.invoke({
-        question: userPrompt,
+    if (shouldAttemptStructureFix && lastInvalidResponse) {
+      try {
+        const repaired = await this.generateStructuredResponse({
+          context,
+          graderProvider,
           systemPrompt,
-        evalCaseId: context.evalCase.id,
-        attempt: context.attempt,
-        maxOutputTokens: this.maxOutputTokens,
-        temperature: this.temperature,
+          userPrompt: buildStructureRepairPrompt({
+            validationError: lastError?.message ?? 'Schema validation failed',
+            invalidResponse: lastInvalidResponse.text,
+          }),
         });
-
-      const data = schema.parse(parseJsonFromText(extractLastAssistantContent(response.output)));
-      return { data, providerResponse: response, tokenUsage: response.tokenUsage };
+        const data = schema.parse(parseJsonFromText(repaired.text));
+        return {
+          data,
+          providerResponse: repaired.providerResponse,
+          tokenUsage: sumTokenUsage(lastInvalidResponse.tokenUsage, repaired.tokenUsage),
+        };
       } catch (e: unknown) {
         lastError = e instanceof Error ? e : new Error(String(e));
       }
     }
 
-    throw new Error(`Failed to parse evaluator response after 3 attempts: ${lastError?.message}`);
+    throw new Error(
+      `Failed to parse evaluator response after 3 attempts and 1 structure-fix attempt: ${lastError?.message}`,
+    );
   }
+
+  private async generateStructuredResponse(options: {
+    readonly context: EvaluationContext;
+    readonly graderProvider: Provider;
+    readonly systemPrompt: string;
+    readonly userPrompt: string;
+    readonly images?: readonly ContentImage[];
+  }): Promise<StructuredGenerationResult> {
+    const { context, graderProvider, systemPrompt, userPrompt, images } = options;
+
+    const model = graderProvider.asLanguageModel?.();
+    if (model) {
+      const modelOptions = {
+        ...(this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {}),
+        ...(typeof this.temperature === 'number' ? { temperature: this.temperature } : {}),
+      };
+
+      const hasImages = images && images.length > 0;
+      const result = hasImages
+        ? await generateText({
+            model,
+            system: systemPrompt,
+            messages: [
+              {
+                role: 'user' as const,
+                content: [
+                  { type: 'text' as const, text: userPrompt },
+                  ...toAiSdkImageParts(images),
+                ],
+              },
+            ],
+            ...modelOptions,
+          })
+        : await generateText({
+            model,
+            system: systemPrompt,
+            prompt: userPrompt,
+            ...modelOptions,
+          });
+
+      const rawUsage = result.usage;
+      const tokenUsage =
+        rawUsage?.inputTokens != null && rawUsage?.outputTokens != null
+          ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens }
+          : undefined;
+      return { text: result.text, tokenUsage };
+    }
+
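+    // No AI SDK language model is exposed by this provider, so fall back to its
+    // native invoke() and read the grader output from the last assistant message.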
+    const response = await graderProvider.invoke({
+      question: userPrompt,
+      systemPrompt,
+      evalCaseId: context.evalCase.id,
+      attempt: context.attempt,
+      maxOutputTokens: this.maxOutputTokens,
+      temperature: this.temperature,
+    });
+
+    return {
+      text: extractLastAssistantContent(response.output),
+      providerResponse: response,
+      tokenUsage: response.tokenUsage,
+    };
+  }
 }
 
@@ -1045,6 +1110,37 @@ export function buildOutputSchema(): string {
   ].join('\n');
 }
 
+function buildStructureRepairPrompt(options: {
+  readonly validationError: string;
+  readonly invalidResponse: string;
+}): string {
+  const { validationError, invalidResponse } = options;
+  return [
+    'The following evaluation response has useful grading content but invalid JSON structure.',
+    'Repair it to satisfy the schema in the system prompt.',
+    'Preserve the evaluation meaning, do not re-grade the answer, and return only a single JSON object.',
+    '',
+    'Validation error:',
+    validationError,
+    '',
+    'Invalid response:',
+    invalidResponse,
+  ].join('\n');
+}
+
+function sumTokenUsage(
+  first: TokenUsage | undefined,
+  second: TokenUsage | undefined,
+): TokenUsage | undefined {
+  if (!first && !second) {
+    return undefined;
+  }
+  return {
+    input: (first?.input ?? 0) + (second?.input ?? 0),
+    output: (first?.output ?? 0) + (second?.output ?? 0),
+  };
+}
+
 export function buildRubricOutputSchema(): string {
   return `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
 
 You must return a valid JSON object matching this schema:
diff --git a/packages/core/test/evaluation/evaluators.test.ts b/packages/core/test/evaluation/evaluators.test.ts
index 00efcd61..38ba1c82 100644
--- a/packages/core/test/evaluation/evaluators.test.ts
+++ b/packages/core/test/evaluation/evaluators.test.ts
@@ -57,6 +57,26 @@ class CapturingProvider implements Provider {
   }
 }
 
+class SequenceCapturingProvider implements Provider {
+  readonly id = 'sequence-capturing';
+  readonly kind = 'mock' as const;
+  readonly targetName = 'sequence-capturing';
+  readonly requests: ProviderRequest[] = [];
+  private index = 0;
+
+  constructor(private readonly responses: readonly ProviderResponse[]) {}
+
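+  // Replays the configured responses in order; once the sequence is exhausted,
+  // the last response is returned again so retries see a deterministic reply.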
+  async invoke(request: ProviderRequest): Promise<ProviderResponse> {
+    this.requests.push(request);
+    const response = this.responses[this.index] ?? this.responses[this.responses.length - 1];
+    this.index += 1;
+    if (!response) {
+      throw new Error('No mock response configured');
+    }
+    return response;
+  }
+}
+
 const baseTestCase: EvalTest = {
   id: 'case-1',
   dataset: 'test-dataset',
@@ -647,6 +667,81 @@ describe('LlmGraderEvaluator (llm-grader)', () => {
     warnSpy.mockRestore();
   });
 
+  it('repairs malformed freeform grader output after standard retries are exhausted', async () => {
+    const malformedResponse = textResponse(
+      JSON.stringify({
+        score: '0.8',
+        assertions: [{ text: 'Captured logging requirement', passed: true }],
+      }),
+    );
+    const repairedResponse = textResponse(
+      JSON.stringify({
+        score: 0.8,
+        assertions: [{ text: 'Captured logging requirement', passed: true }],
+      }),
+    );
+
+    const graderProvider = new SequenceCapturingProvider([
+      malformedResponse,
+      malformedResponse,
+      malformedResponse,
+      repairedResponse,
+    ]);
+
+    const evaluator = new LlmGraderEvaluator({
+      resolveGraderProvider: async () => graderProvider,
+    });
+
+    const result = await evaluator.evaluate({
+      evalCase: { ...baseTestCase, evaluator: 'llm-grader' },
+      candidate: 'Answer',
+      target: baseTarget,
+      provider: graderProvider,
+      attempt: 0,
+      promptInputs: { question: '' },
+      now: new Date(),
+    });
+
+    expect(result.score).toBeCloseTo(0.8);
+    expect(result.verdict).toBe('pass');
+    expect(graderProvider.requests).toHaveLength(4);
+    expect(graderProvider.requests[3]?.question).toContain(
+      'The following evaluation response has useful grading content but invalid JSON structure.',
+    );
+    expect(graderProvider.requests[3]?.question).toContain('"score":"0.8"');
+  });
+
+  it('keeps skipping when the structure-fix attempt is also malformed', async () => {
+    const malformedResponse = textResponse(
+      '{"score":"0.8","assertions":[{"text":"Bad","passed":true}]}',
+    );
+    const graderProvider = new SequenceCapturingProvider([
+      malformedResponse,
+      malformedResponse,
+      malformedResponse,
+      textResponse('{"score":'),
+    ]);
+
+    const evaluator = new LlmGraderEvaluator({
+      resolveGraderProvider: async () => graderProvider,
+    });
+
+    const result = await evaluator.evaluate({
+      evalCase: { ...baseTestCase, evaluator: 'llm-grader' },
+      candidate: 'Answer',
+      target: baseTarget,
+      provider: graderProvider,
+      attempt: 0,
+      promptInputs: { question: '' },
+      now: new Date(),
+    });
+
+    expect(result.score).toBe(0);
+    expect(result.verdict).toBe('skip');
+    expect(result.assertions[0]?.text).toContain('structure-fix attempt');
+    expect(graderProvider.requests).toHaveLength(4);
+  });
+
   it('keeps skipping on unrecoverable malformed JSON', async () => {
     const graderProvider = new StubProvider(textResponse('{"score":'));