diff --git a/packages/ai/src/evals/eval.ts b/packages/ai/src/evals/eval.ts index 89394af6..cde2acf9 100644 --- a/packages/ai/src/evals/eval.ts +++ b/packages/ai/src/evals/eval.ts @@ -454,101 +454,203 @@ async function registerEval< async (caseSpan) => { const caseContext = trace.setSpan(context.active(), caseSpan); - try { - const result = await runTask( - caseContext, - { - id: evalId, - version: evalVersion, - name: evalName, - }, - { - index: data.index, - input: data.input, - expected: data.expected, - scorers: opts.scorers, - task: opts.task, - metadata: opts.metadata, - configFlags: opts.configFlags, - capability: opts.capability, - step: opts.step, - }, - ); - const { output, duration } = result; - outOfScopeFlags = result.outOfScopeFlags; - - finalConfigSnapshot = { - flags: result.finalFlags || {}, - pickedFlags: opts.configFlags, - overrides: result.overrides, - }; + const numTrials = opts.trials ?? 1; + const trialResults: { + output: TOutput; + scores: Record; + duration: number; + }[] = []; + let lastError: Error | null = null; - const scoreList: ScoreWithName[] = await Promise.all( - opts.scorers.map(async (scorer) => { - const scorerName = getScorerName(scorer); - return startActiveSpan( - `score ${scorerName}`, - { - attributes: { - [Attr.GenAI.Operation.Name]: 'eval.score', - [Attr.Eval.ID]: evalId, - [Attr.Eval.Name]: evalName, - [Attr.Eval.Version]: evalVersion, - }, + try { + for (let trialIndex = 0; trialIndex < numTrials; trialIndex++) { + await startActiveSpan( + `trial ${trialIndex}`, + { + attributes: { + [Attr.GenAI.Operation.Name]: 'eval.trial', + [Attr.Eval.ID]: evalId, + [Attr.Eval.Name]: evalName, + [Attr.Eval.Version]: evalVersion, + [Attr.Eval.Case.Index]: data.index, + [Attr.Eval.Trial.Index]: trialIndex, }, - async (scorerSpan) => { - const start = performance.now(); - const result = await scorer({ - input: data.input, - output: output, - expected: data.expected, + }, + async (trialSpan) => { + const trialContext = trace.setSpan(context.active(), trialSpan); + + try { + const result = await runTask( + trialContext, + { + id: evalId, + version: evalVersion, + name: evalName, + }, + { + index: data.index, + input: data.input, + expected: data.expected, + scorers: opts.scorers, + task: opts.task, + metadata: opts.metadata, + configFlags: opts.configFlags, + capability: opts.capability, + step: opts.step, + }, + ); + const { output, duration } = result; + outOfScopeFlags = result.outOfScopeFlags; + + finalConfigSnapshot = { + flags: result.finalFlags || {}, + pickedFlags: opts.configFlags, + overrides: result.overrides, + }; + + const scoreList: ScoreWithName[] = await Promise.all( + opts.scorers.map(async (scorer) => { + const scorerName = getScorerName(scorer); + return startActiveSpan( + `score ${scorerName}`, + { + attributes: { + [Attr.GenAI.Operation.Name]: 'eval.score', + [Attr.Eval.ID]: evalId, + [Attr.Eval.Name]: evalName, + [Attr.Eval.Version]: evalVersion, + }, + }, + async (scorerSpan) => { + const start = performance.now(); + const result = await scorer({ + input: data.input, + output: output, + expected: data.expected, + }); + + const duration = Math.round(performance.now() - start); + const scoreValue = result.score as number; + + scorerSpan.setAttributes({ + [Attr.Eval.Score.Name]: scorerName, + [Attr.Eval.Score.Value]: scoreValue, + }); + + return { + name: scorerName, + ...result, + metadata: { duration, startedAt: start, error: null }, + }; + }, + trialContext, + ); + }), + ); + + const scores = Object.fromEntries(scoreList.map((s) => [s.name, s])); + + trialSpan.setAttributes({ + [Attr.Eval.Case.Output]: + typeof output === 'string' ? output : JSON.stringify(output), + [Attr.Eval.Case.Scores]: JSON.stringify(scores ? scores : {}), }); - const duration = Math.round(performance.now() - start); - const scoreValue = result.score as number; + trialResults.push({ output, scores, duration }); - scorerSpan.setAttributes({ - [Attr.Eval.Score.Name]: scorerName, - [Attr.Eval.Score.Value]: scoreValue, - }); + allOutOfScopeFlags.push(...outOfScopeFlags); + } catch (e) { + console.log(e); + const error = e as Error; - return { - name: scorerName, - ...result, - metadata: { duration, startedAt: start, error: null }, - }; - }, - caseContext, - ); - }), - ); + const ctx = getEvalContext(); + outOfScopeFlags = ctx.outOfScopeFlags || ([] as OutOfScopeFlagAccess[]); - const scores = Object.fromEntries(scoreList.map((s) => [s.name, s])); + lastError = error; + } + }, + caseContext, + ); + } - caseSpan.setAttributes({ - [Attr.Eval.Case.Output]: - typeof output === 'string' ? output : JSON.stringify(output), - [Attr.Eval.Case.Scores]: JSON.stringify(scores ? scores : {}), - }); + if (trialResults.length > 0) { + const scorerNames = Object.keys(trialResults[0].scores); + const averagedScores: Record = {}; - // set task meta for showing result in vitest report - task.meta.case = { - index: data.index, - name: evalName, - expected: data.expected, - input: data.input, - output: output, - scores, - status: 'success', - errors: [], - duration, - startedAt: start, - outOfScopeFlags, - pickedFlags: opts.configFlags, - }; + for (const scorerName of scorerNames) { + const scorerValues = trialResults + .map((trial) => trial.scores[scorerName]) + .filter((s) => s !== undefined); - // Collect out-of-scope flags for evaluation-level aggregation - allOutOfScopeFlags.push(...outOfScopeFlags); + if (scorerValues.length > 0) { + const avgScore = + scorerValues.reduce((sum, s) => sum + (s.score as number), 0) / + scorerValues.length; + + averagedScores[scorerName] = { + ...scorerValues[0], + score: avgScore, + }; + } + } + + const avgDuration = + trialResults.reduce((sum, t) => sum + t.duration, 0) / trialResults.length; + + const lastTrial = trialResults[trialResults.length - 1]; + + caseSpan.setAttributes({ + [Attr.Eval.Case.Output]: + typeof lastTrial.output === 'string' + ? lastTrial.output + : JSON.stringify(lastTrial.output), + [Attr.Eval.Case.Scores]: JSON.stringify(averagedScores ? averagedScores : {}), + }); + + task.meta.case = { + index: data.index, + name: evalName, + expected: data.expected, + input: data.input, + output: lastTrial.output, + scores: averagedScores, + status: 'success', + errors: [], + duration: Math.round(avgDuration), + startedAt: start, + outOfScopeFlags, + pickedFlags: opts.configFlags, + }; + } else if (lastError) { + const error: Error = lastError; + const failedScores: Record = {}; + for (const scorer of opts.scorers) { + failedScores[scorer.name] = { + name: scorer.name, + score: 0, + metadata: { + duration: 0, + startedAt: start, + error: error.message, + }, + }; + } + + task.meta.case = { + name: evalName, + index: data.index, + expected: data.expected, + input: data.input, + output: String(error), + scores: failedScores, + status: 'fail', + errors: [error], + startedAt: start, + duration: Math.round(performance.now() - start), + outOfScopeFlags, + pickedFlags: opts.configFlags, + }; + } } catch (e) { console.log(e); const error = e as Error; @@ -556,7 +658,6 @@ async function registerEval< const ctx = getEvalContext(); outOfScopeFlags = ctx.outOfScopeFlags || ([] as OutOfScopeFlagAccess[]); - // Populate scores with error metadata for all scorers that didn't run const failedScores: Record = {}; for (const scorer of opts.scorers) { failedScores[scorer.name] = { diff --git a/packages/ai/src/evals/eval.types.ts b/packages/ai/src/evals/eval.types.ts index 965b8d75..0e4496e2 100644 --- a/packages/ai/src/evals/eval.types.ts +++ b/packages/ai/src/evals/eval.types.ts @@ -98,6 +98,8 @@ export type EvalParams< timeout?: number; /** Optional reduction of flag namespace */ configFlags?: string[]; + /** Number of times to run each test case (defaults to 1) */ + trials?: number; }; // Discriminated-union type for per-case runtime flags (console/meta only) diff --git a/packages/ai/src/otel/semconv/attributes.ts b/packages/ai/src/otel/semconv/attributes.ts index 1e8e0e0d..1bb15bc4 100644 --- a/packages/ai/src/otel/semconv/attributes.ts +++ b/packages/ai/src/otel/semconv/attributes.ts @@ -36,6 +36,7 @@ import { ATTR_EVAL_CONFIG_FLAGS, ATTR_EVAL_CAPABILITY_NAME, ATTR_EVAL_STEP_NAME, + ATTR_EVAL_TRIAL_INDEX, } from './eval_proposal'; import { @@ -331,6 +332,9 @@ export const Attr = { Scores: ATTR_EVAL_CASE_SCORES, Metadata: ATTR_EVAL_CASE_METADATA, }, + Trial: { + Index: ATTR_EVAL_TRIAL_INDEX, + }, Task: { Output: ATTR_EVAL_TASK_OUTPUT, Name: ATTR_EVAL_TASK_NAME, diff --git a/packages/ai/src/otel/semconv/eval_proposal.ts b/packages/ai/src/otel/semconv/eval_proposal.ts index 2d88dd56..abe08696 100644 --- a/packages/ai/src/otel/semconv/eval_proposal.ts +++ b/packages/ai/src/otel/semconv/eval_proposal.ts @@ -24,6 +24,8 @@ export const ATTR_EVAL_CASE_OUTPUT = 'eval.case.output' as const; export const ATTR_EVAL_CASE_EXPECTED = 'eval.case.expected' as const; export const ATTR_EVAL_CASE_SCORES = 'eval.case.scores' as const; export const ATTR_EVAL_CASE_METADATA = 'eval.case.metadata' as const; +// trial +export const ATTR_EVAL_TRIAL_INDEX = 'eval.trial.index' as const; // task export const ATTR_EVAL_TASK_OUTPUT = 'eval.task.output' as const; export const ATTR_EVAL_TASK_NAME = 'eval.task.name' as const;