Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -325,10 +325,14 @@ When working on a GitHub issue, **ALWAYS** follow this workflow:
# Claim it — label + project roadmap status
gh issue edit <number> --add-label "in-progress"

# Update project roadmap: set status to "In Progress" and stamp Agent ID
ITEM_ID=$(gh project item-list 1 --owner EntityProcess --format json | jq -r '.items[] | select(.content.number == <number> and .content.repository == "agentv") | .id')
# Update project roadmap: ensure the issue is on the AgentV OSS board,
# then set status to "In progress" and stamp Agent ID
ITEM_ID=$(gh project item-list 1 --owner EntityProcess --format json | jq -r '.items[] | select(.content.number == <number> and .content.repository == "EntityProcess/agentv") | .id')
if [ -z "$ITEM_ID" ] || [ "$ITEM_ID" = "null" ]; then
ITEM_ID=$(gh project item-add 1 --owner EntityProcess --url "https://github.com/EntityProcess/agentv/issues/<number>" --format json | jq -r '.id')
fi
if [ -n "$ITEM_ID" ]; then
gh project item-edit --project-id PVT_kwDOAIbbRc4BSmjF --id "$ITEM_ID" --field-id PVTSSF_lADOAIbbRc4BSmjFzhAFomw --single-select-option-id 47fc9ee4
gh project item-edit --project-id PVT_kwDOAIbbRc4BSmjF --id "$ITEM_ID" --field-id PVTSSF_lADOAIbbRc4BSmjFzhAFomw --single-select-option-id c3991b20
gh project item-edit --project-id PVT_kwDOAIbbRc4BSmjF --id "$ITEM_ID" --field-id PVTF_lADOAIbbRc4BSmjFzhAHSnk --text "$AGENT_ID"
fi
```
Expand Down
194 changes: 145 additions & 49 deletions packages/core/src/evaluation/evaluators/llm-grader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,12 @@ const scoreRangeEvaluationSchema = z.object({

export { freeformEvaluationSchema, rubricEvaluationSchema };

/**
 * Raw outcome of one grader generation call, captured before the text is
 * parsed as JSON and validated against the evaluation schema.
 */
interface StructuredGenerationResult {
/** Text produced by the grader; callers feed it to `parseJsonFromText` and then the schema. */
readonly text: string;
/** Full provider response; set only when the call went through `graderProvider.invoke()`. */
readonly providerResponse?: ProviderResponse;
/** Token counts for this call; undefined when the backend did not report them. */
readonly tokenUsage?: TokenUsage;
}

export class LlmGraderEvaluator implements Evaluator {
readonly kind = 'llm-grader';

Expand Down Expand Up @@ -955,67 +961,126 @@ export class LlmGraderEvaluator implements Evaluator {
const { context, graderProvider, systemPrompt, userPrompt, schema, images } = options;

let lastError: Error | undefined;
let lastInvalidResponse: StructuredGenerationResult | undefined;
let shouldAttemptStructureFix = false;

for (let attempt = 1; attempt <= 3; attempt++) {
try {
// Prefer Vercel AI SDK language model if available.
const model = graderProvider.asLanguageModel?.();
if (model) {
const modelOptions = {
...(this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {}),
...(typeof this.temperature === 'number' ? { temperature: this.temperature } : {}),
};

// When images are present, use multi-part messages instead of plain prompt
const hasImages = images && images.length > 0;
const result = hasImages
? await generateText({
model,
system: systemPrompt,
messages: [
{
role: 'user' as const,
content: [
{ type: 'text' as const, text: userPrompt },
...toAiSdkImageParts(images),
],
},
],
...modelOptions,
})
: await generateText({
model,
system: systemPrompt,
prompt: userPrompt,
...modelOptions,
});

const data = schema.parse(parseJsonFromText(result.text));
const rawUsage = result.usage;
const tokenUsage =
rawUsage?.inputTokens != null && rawUsage?.outputTokens != null
? { input: rawUsage.inputTokens, output: rawUsage.outputTokens }
: undefined;
return { data, tokenUsage };
const result = await this.generateStructuredResponse({
context,
graderProvider,
systemPrompt,
userPrompt,
images,
});
const canRepairResponse = result.text.trim().length > 0;
lastInvalidResponse = canRepairResponse ? result : undefined;
let data: T;
try {
data = schema.parse(parseJsonFromText(result.text));
} catch (e: unknown) {
lastError = e instanceof Error ? e : new Error(String(e));
shouldAttemptStructureFix = canRepairResponse;
continue;
}
return {
data,
providerResponse: result.providerResponse,
tokenUsage: result.tokenUsage,
};
} catch (e: unknown) {
lastError = e instanceof Error ? e : new Error(String(e));
}
}

const response = await graderProvider.invoke({
question: userPrompt,
if (shouldAttemptStructureFix && lastInvalidResponse) {
try {
const repaired = await this.generateStructuredResponse({
context,
graderProvider,
systemPrompt,
evalCaseId: context.evalCase.id,
attempt: context.attempt,
maxOutputTokens: this.maxOutputTokens,
temperature: this.temperature,
userPrompt: buildStructureRepairPrompt({
validationError: lastError?.message ?? 'Schema validation failed',
invalidResponse: lastInvalidResponse.text,
}),
});

const data = schema.parse(parseJsonFromText(extractLastAssistantContent(response.output)));
return { data, providerResponse: response, tokenUsage: response.tokenUsage };
const data = schema.parse(parseJsonFromText(repaired.text));
return {
data,
providerResponse: repaired.providerResponse,
tokenUsage: sumTokenUsage(lastInvalidResponse.tokenUsage, repaired.tokenUsage),
};
} catch (e: unknown) {
lastError = e instanceof Error ? e : new Error(String(e));
}
}

throw new Error(`Failed to parse evaluator response after 3 attempts: ${lastError?.message}`);
throw new Error(
`Failed to parse evaluator response after 3 attempts and 1 structure-fix attempt: ${lastError?.message}`,
);
}

/**
 * Performs a single grader generation and returns the raw text together with
 * whatever usage metadata the backend exposes. No JSON parsing or schema
 * validation happens here; that is left to the caller.
 *
 * Two backends are supported:
 * - If the provider exposes a language model via `asLanguageModel()`, the call
 *   goes through `generateText`. Images, when present, are sent as multi-part
 *   user content. No `providerResponse` is returned on this path.
 * - Otherwise the call falls back to `graderProvider.invoke()`, and the last
 *   assistant message of the response is used as the text. `images` are not
 *   forwarded on this path.
 *
 * @param options.context        Evaluation context; supplies `evalCase.id` and `attempt` for the invoke path.
 * @param options.graderProvider Provider used to grade.
 * @param options.systemPrompt   System prompt sent with the request.
 * @param options.userPrompt     User prompt / question sent with the request.
 * @param options.images         Optional images attached to the user message (language-model path only).
 * @returns The generated text plus optional provider response and token usage.
 * @throws Propagates any error raised by `generateText` or `graderProvider.invoke`.
 */
private async generateStructuredResponse(options: {
  readonly context: EvaluationContext;
  readonly graderProvider: Provider;
  readonly systemPrompt: string;
  readonly userPrompt: string;
  readonly images?: readonly ContentImage[];
}): Promise<StructuredGenerationResult> {
  const { context, graderProvider, systemPrompt, userPrompt, images } = options;

  // Prefer the Vercel AI SDK language model when the provider offers one.
  const model = graderProvider.asLanguageModel?.();
  if (model) {
    const modelOptions = {
      // The usage fields read below (`inputTokens` / `outputTokens`) are the AI SDK 5
      // names, and in that version the output cap is `maxOutputTokens`. The previous
      // `maxTokens` key is not recognised there and, because it arrives via a spread,
      // is dropped without a type error — so the configured limit was never applied.
      ...(this.maxOutputTokens ? { maxOutputTokens: this.maxOutputTokens } : {}),
      ...(typeof this.temperature === 'number' ? { temperature: this.temperature } : {}),
    };

    // When images are present, use multi-part messages instead of a plain prompt.
    const hasImages = images && images.length > 0;
    const result = hasImages
      ? await generateText({
          model,
          system: systemPrompt,
          messages: [
            {
              role: 'user' as const,
              content: [
                { type: 'text' as const, text: userPrompt },
                ...toAiSdkImageParts(images),
              ],
            },
          ],
          ...modelOptions,
        })
      : await generateText({
          model,
          system: systemPrompt,
          prompt: userPrompt,
          ...modelOptions,
        });

    // Report usage only when both counters are present, so callers never see partial numbers.
    const rawUsage = result.usage;
    const tokenUsage =
      rawUsage?.inputTokens != null && rawUsage?.outputTokens != null
        ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens }
        : undefined;
    return { text: result.text, tokenUsage };
  }

  // Fallback: providers without a language model are driven through invoke().
  const response = await graderProvider.invoke({
    question: userPrompt,
    systemPrompt,
    evalCaseId: context.evalCase.id,
    attempt: context.attempt,
    maxOutputTokens: this.maxOutputTokens,
    temperature: this.temperature,
  });

  return {
    text: extractLastAssistantContent(response.output),
    providerResponse: response,
    tokenUsage: response.tokenUsage,
  };
}
}

Expand Down Expand Up @@ -1045,6 +1110,37 @@ export function buildOutputSchema(): string {
].join('\n');
}

/**
 * Builds the user prompt for the structure-fix attempt: fixed instructions,
 * then the validation error, then the invalid response text, with a blank
 * line between the three sections.
 *
 * @param options.validationError Message describing why the earlier response was rejected.
 * @param options.invalidResponse The rejected response text, embedded verbatim.
 * @returns The assembled prompt as a newline-separated string.
 */
function buildStructureRepairPrompt(options: {
  readonly validationError: string;
  readonly invalidResponse: string;
}): string {
  const instructions = [
    'The following evaluation response has useful grading content but invalid JSON structure.',
    'Repair it to satisfy the schema in the system prompt.',
    'Preserve the evaluation meaning, do not re-grade the answer, and return only a single JSON object.',
  ].join('\n');
  const errorSection = ['Validation error:', options.validationError].join('\n');
  const responseSection = ['Invalid response:', options.invalidResponse].join('\n');

  // Sections are separated by exactly one empty line.
  return [instructions, errorSection, responseSection].join('\n\n');
}

/**
 * Adds two optional token-usage records field by field.
 *
 * @param first  Usage of the earlier call, if any.
 * @param second Usage of the later call, if any.
 * @returns The combined `input` / `output` totals, treating a missing record
 *          as zero; `undefined` when neither record is present.
 */
function sumTokenUsage(
  first: TokenUsage | undefined,
  second: TokenUsage | undefined,
): TokenUsage | undefined {
  if (first || second) {
    const input = (first?.input ?? 0) + (second?.input ?? 0);
    const output = (first?.output ?? 0) + (second?.output ?? 0);
    return { input, output };
  }
  return undefined;
}

export function buildRubricOutputSchema(): string {
return `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
You must return a valid JSON object matching this schema:
Expand Down
95 changes: 95 additions & 0 deletions packages/core/test/evaluation/evaluators.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,26 @@ class CapturingProvider implements Provider {
}
}

/**
 * Mock provider that records every request and replies with the configured
 * responses in order. Once the list is exhausted the final response is
 * repeated; with an empty list every call rejects.
 */
class SequenceCapturingProvider implements Provider {
  readonly id = 'sequence-capturing';
  readonly kind = 'mock' as const;
  readonly targetName = 'sequence-capturing';
  /** Requests received so far, in call order. */
  readonly requests: ProviderRequest[] = [];
  /** Position of the response to hand out on the next call. */
  private cursor = 0;

  constructor(private readonly responses: readonly ProviderResponse[]) {}

  /**
   * Records the request and returns the next scripted response.
   *
   * @throws Error when no response is configured at all.
   */
  async invoke(request: ProviderRequest): Promise<ProviderResponse> {
    this.requests.push(request);

    const slot = this.cursor;
    this.cursor = slot + 1;

    // Past the end of the script, keep replaying the last entry.
    const fallback = this.responses[this.responses.length - 1];
    const reply = this.responses[slot] ?? fallback;
    if (reply) {
      return reply;
    }
    throw new Error('No mock response configured');
  }
}

const baseTestCase: EvalTest = {
id: 'case-1',
dataset: 'test-dataset',
Expand Down Expand Up @@ -647,6 +667,81 @@ describe('LlmGraderEvaluator (llm-grader)', () => {
warnSpy.mockRestore();
});

it('repairs malformed freeform grader output after standard retries are exhausted', async () => {
  // Both payloads carry the same assertions; only the type of `score` differs
  // (string in the invalid reply, number in the valid one).
  const assertions = [{ text: 'Captured logging requirement', passed: true }];
  const invalidReply = textResponse(JSON.stringify({ score: '0.8', assertions }));
  const validReply = textResponse(JSON.stringify({ score: 0.8, assertions }));

  // Three invalid replies, followed by the valid one for the fourth request.
  const graderProvider = new SequenceCapturingProvider([
    ...Array.from({ length: 3 }, () => invalidReply),
    validReply,
  ]);
  const evaluator = new LlmGraderEvaluator({
    resolveGraderProvider: async () => graderProvider,
  });

  const result = await evaluator.evaluate({
    evalCase: { ...baseTestCase, evaluator: 'llm-grader' },
    candidate: 'Answer',
    target: baseTarget,
    provider: graderProvider,
    attempt: 0,
    promptInputs: { question: '' },
    now: new Date(),
  });

  expect(result.score).toBeCloseTo(0.8);
  expect(result.verdict).toBe('pass');
  expect(graderProvider.requests).toHaveLength(4);

  // The fourth request must be the repair prompt and must embed the rejected payload.
  const repairQuestion = graderProvider.requests[3]?.question;
  expect(repairQuestion).toContain(
    'The following evaluation response has useful grading content but invalid JSON structure.',
  );
  expect(repairQuestion).toContain('"score":"0.8"');
});

it('keeps skipping when the structure-fix attempt is also malformed', async () => {
  // Reply with a string-typed score, plus a truncated JSON reply for the fourth request.
  const schemaInvalidReply = textResponse(
    '{"score":"0.8","assertions":[{"text":"Bad","passed":true}]}',
  );
  const truncatedReply = textResponse('{"score":');

  const graderProvider = new SequenceCapturingProvider([
    ...Array.from({ length: 3 }, () => schemaInvalidReply),
    truncatedReply,
  ]);
  const evaluator = new LlmGraderEvaluator({
    resolveGraderProvider: async () => graderProvider,
  });

  const result = await evaluator.evaluate({
    evalCase: { ...baseTestCase, evaluator: 'llm-grader' },
    candidate: 'Answer',
    target: baseTarget,
    provider: graderProvider,
    attempt: 0,
    promptInputs: { question: '' },
    now: new Date(),
  });

  // The case is skipped, and exactly four requests were issued.
  expect(result.score).toBe(0);
  expect(result.verdict).toBe('skip');
  expect(result.assertions[0]?.text).toContain('structure-fix attempt');
  expect(graderProvider.requests).toHaveLength(4);
});

it('keeps skipping on unrecoverable malformed JSON', async () => {
const graderProvider = new StubProvider(textResponse('{"score":'));

Expand Down
Loading