Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
273 changes: 187 additions & 86 deletions packages/ai/src/evals/eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -454,109 +454,210 @@ async function registerEval<
async (caseSpan) => {
const caseContext = trace.setSpan(context.active(), caseSpan);

try {
const result = await runTask(
caseContext,
{
id: evalId,
version: evalVersion,
name: evalName,
},
{
index: data.index,
input: data.input,
expected: data.expected,
scorers: opts.scorers,
task: opts.task,
metadata: opts.metadata,
configFlags: opts.configFlags,
capability: opts.capability,
step: opts.step,
},
);
const { output, duration } = result;
outOfScopeFlags = result.outOfScopeFlags;

finalConfigSnapshot = {
flags: result.finalFlags || {},
pickedFlags: opts.configFlags,
overrides: result.overrides,
};
const numTrials = opts.trials ?? 1;
const trialResults: {
output: TOutput;
scores: Record<string, ScoreWithName>;
duration: number;
}[] = [];
let lastError: Error | null = null;

const scoreList: ScoreWithName[] = await Promise.all(
opts.scorers.map(async (scorer) => {
const scorerName = getScorerName(scorer);
return startActiveSpan(
`score ${scorerName}`,
{
attributes: {
[Attr.GenAI.Operation.Name]: 'eval.score',
[Attr.Eval.ID]: evalId,
[Attr.Eval.Name]: evalName,
[Attr.Eval.Version]: evalVersion,
},
try {
for (let trialIndex = 0; trialIndex < numTrials; trialIndex++) {
await startActiveSpan(
`trial ${trialIndex}`,
{
attributes: {
[Attr.GenAI.Operation.Name]: 'eval.trial',
[Attr.Eval.ID]: evalId,
[Attr.Eval.Name]: evalName,
[Attr.Eval.Version]: evalVersion,
[Attr.Eval.Case.Index]: data.index,
[Attr.Eval.Trial.Index]: trialIndex,
},
async (scorerSpan) => {
const start = performance.now();
const result = await scorer({
input: data.input,
output: output,
expected: data.expected,
},
async (trialSpan) => {
const trialContext = trace.setSpan(context.active(), trialSpan);

try {
const result = await runTask(
trialContext,
{
id: evalId,
version: evalVersion,
name: evalName,
},
{
index: data.index,
input: data.input,
expected: data.expected,
scorers: opts.scorers,
task: opts.task,
metadata: opts.metadata,
configFlags: opts.configFlags,
capability: opts.capability,
step: opts.step,
},
);
const { output, duration } = result;
outOfScopeFlags = result.outOfScopeFlags;

finalConfigSnapshot = {
flags: result.finalFlags || {},
pickedFlags: opts.configFlags,
overrides: result.overrides,
};

const scoreList: ScoreWithName[] = await Promise.all(
opts.scorers.map(async (scorer) => {
const scorerName = getScorerName(scorer);
return startActiveSpan(
`score ${scorerName}`,
{
attributes: {
[Attr.GenAI.Operation.Name]: 'eval.score',
[Attr.Eval.ID]: evalId,
[Attr.Eval.Name]: evalName,
[Attr.Eval.Version]: evalVersion,
},
},
async (scorerSpan) => {
const start = performance.now();
const result = await scorer({
input: data.input,
output: output,
expected: data.expected,
});

const duration = Math.round(performance.now() - start);
const scoreValue = result.score as number;

scorerSpan.setAttributes({
[Attr.Eval.Score.Name]: scorerName,
[Attr.Eval.Score.Value]: scoreValue,
});

return {
name: scorerName,
...result,
metadata: { duration, startedAt: start, error: null },
};
},
trialContext,
);
}),
);

const scores = Object.fromEntries(scoreList.map((s) => [s.name, s]));

trialSpan.setAttributes({
[Attr.Eval.Case.Output]:
typeof output === 'string' ? output : JSON.stringify(output),
[Attr.Eval.Case.Scores]: JSON.stringify(scores ? scores : {}),
Copy link

Copilot AI Nov 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The ternary check scores ? scores : {} is redundant. The scores object is always defined (created from the scoreList at line 551), so this condition will always be truthy. Consider simplifying to JSON.stringify(scores).

Suggested change
[Attr.Eval.Case.Scores]: JSON.stringify(scores ? scores : {}),
[Attr.Eval.Case.Scores]: JSON.stringify(scores),

Copilot uses AI. Check for mistakes.
});

const duration = Math.round(performance.now() - start);
const scoreValue = result.score as number;
trialResults.push({ output, scores, duration });

scorerSpan.setAttributes({
[Attr.Eval.Score.Name]: scorerName,
[Attr.Eval.Score.Value]: scoreValue,
});
allOutOfScopeFlags.push(...outOfScopeFlags);
} catch (e) {
console.log(e);
Copy link

Copilot AI Nov 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using console.log for error logging is not ideal for production code. Consider using a proper logger or at minimum console.error for error cases. Additionally, the error is logged but then silently swallowed, which may make debugging difficult.

Suggested change
console.log(e);
console.error(e);

Copilot uses AI. Check for mistakes.
const error = e as Error;

return {
name: scorerName,
...result,
metadata: { duration, startedAt: start, error: null },
};
},
caseContext,
);
}),
);
const ctx = getEvalContext();
outOfScopeFlags = ctx.outOfScopeFlags || ([] as OutOfScopeFlagAccess[]);

const scores = Object.fromEntries(scoreList.map((s) => [s.name, s]));
lastError = error;
}
Comment on lines +562 to +570
Copy link

Copilot AI Nov 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The error handling in the trial loop silently catches errors and continues to the next trial. If a trial fails, only lastError is recorded, but the loop continues. This means:

  1. If some trials succeed and some fail, only successful trials are averaged (which may be intended)
  2. If all trials fail, the error is handled in the else if (lastError) block

However, there's a potential issue: when a trial throws, `allOutOfScopeFlags.push(...outOfScopeFlags)` at line 561 is skipped for that trial, so out-of-scope flags accessed by failed trials are missing from the evaluation-level aggregate. This inconsistency should be addressed — either move the push into a `finally` block, or also push the flags in the `catch` handler after they have been re-read from `getEvalContext()`.

Copilot uses AI. Check for mistakes.
},
caseContext,
);
}

caseSpan.setAttributes({
[Attr.Eval.Case.Output]:
typeof output === 'string' ? output : JSON.stringify(output),
[Attr.Eval.Case.Scores]: JSON.stringify(scores ? scores : {}),
});
if (trialResults.length > 0) {
const scorerNames = Object.keys(trialResults[0].scores);
const averagedScores: Record<string, ScoreWithName> = {};

// set task meta for showing result in vitest report
task.meta.case = {
index: data.index,
name: evalName,
expected: data.expected,
input: data.input,
output: output,
scores,
status: 'success',
errors: [],
duration,
startedAt: start,
outOfScopeFlags,
pickedFlags: opts.configFlags,
};
for (const scorerName of scorerNames) {
const scorerValues = trialResults
.map((trial) => trial.scores[scorerName])
.filter((s) => s !== undefined);

// Collect out-of-scope flags for evaluation-level aggregation
allOutOfScopeFlags.push(...outOfScopeFlags);
if (scorerValues.length > 0) {
const avgScore =
scorerValues.reduce((sum, s) => sum + (s.score as number), 0) /
scorerValues.length;

averagedScores[scorerName] = {
...scorerValues[0],
Copy link

Copilot AI Nov 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When constructing the averaged score, spreading scorerValues[0] copies all properties including the metadata field. However, the metadata (containing duration, startedAt, error) is specific to the first trial and doesn't represent the averaged trials. Consider either:

  1. Omitting metadata from averaged scores
  2. Creating new metadata that reflects all trials (e.g., total duration, list of durations)
  3. Explicitly selecting which fields to copy: { name: scorerValues[0].name, score: avgScore, metadata: scorerValues[0].metadata }

The current implementation may be misleading as the metadata doesn't correspond to the averaged score.

Suggested change
...scorerValues[0],
name: scorerValues[0].name,

Copilot uses AI. Check for mistakes.
score: avgScore,
};
}
}

const avgDuration =
trialResults.reduce((sum, t) => sum + t.duration, 0) / trialResults.length;

const lastTrial = trialResults[trialResults.length - 1];

caseSpan.setAttributes({
[Attr.Eval.Case.Output]:
typeof lastTrial.output === 'string'
? lastTrial.output
: JSON.stringify(lastTrial.output),
[Attr.Eval.Case.Scores]: JSON.stringify(averagedScores ? averagedScores : {}),
Copy link

Copilot AI Nov 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The ternary check averagedScores ? averagedScores : {} is redundant. The averagedScores object is always defined (initialized as an empty object at line 578), so this condition will always be truthy. Consider simplifying to JSON.stringify(averagedScores).

Suggested change
[Attr.Eval.Case.Scores]: JSON.stringify(averagedScores ? averagedScores : {}),
[Attr.Eval.Case.Scores]: JSON.stringify(averagedScores),

Copilot uses AI. Check for mistakes.
});

task.meta.case = {
index: data.index,
name: evalName,
expected: data.expected,
input: data.input,
output: lastTrial.output,
scores: averagedScores,
status: 'success',
errors: [],
duration: Math.round(avgDuration),
startedAt: start,
outOfScopeFlags,
pickedFlags: opts.configFlags,
};
Comment on lines +576 to +623
Copy link

Copilot AI Nov 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When no trials complete successfully (`trialResults.length === 0`) but `lastError` is also `null`, no `task.meta.case` is set. This can happen if an unexpected error bypasses the trial error catching, or if `opts.trials` is `0` or negative, in which case the trial loop never runs and neither branch is taken. Consider adding an `else` clause to handle this edge case or asserting that at least one of the conditions must be true.

Copilot uses AI. Check for mistakes.
} else if (lastError) {
const error: Error = lastError;
Copy link

Copilot AI Nov 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

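The assignment `const error: Error = lastError;` looks redundant, since `lastError` is already typed as `Error | null` and checked to be truthy at line 624. Consider using `lastError` directly instead of creating a new variable. (Note: `lastError` is only assigned inside the trial callback, so TypeScript's control-flow analysis may narrow it to `never` in this branch; confirm that `lastError.message` still type-checks before removing the alias.)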

Copilot uses AI. Check for mistakes.
const failedScores: Record<string, ScoreWithName> = {};
for (const scorer of opts.scorers) {
failedScores[scorer.name] = {
name: scorer.name,
Comment on lines +628 to +629
Copy link

Copilot AI Nov 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Direct access to scorer.name may fail if the scorer doesn't have a name property. The code elsewhere uses getScorerName(scorer) helper function (defined at line 171) which provides a fallback. Consider using the same helper here for consistency and to avoid potential undefined values:

const scorerName = getScorerName(scorer);
failedScores[scorerName] = { name: scorerName, ... };
Suggested change
failedScores[scorer.name] = {
name: scorer.name,
const scorerName = getScorerName(scorer);
failedScores[scorerName] = {
name: scorerName,

Copilot uses AI. Check for mistakes.
score: 0,
metadata: {
duration: 0,
startedAt: start,
error: error.message,
},
};
}

task.meta.case = {
name: evalName,
index: data.index,
expected: data.expected,
input: data.input,
output: String(error),
scores: failedScores,
status: 'fail',
errors: [error],
startedAt: start,
duration: Math.round(performance.now() - start),
outOfScopeFlags,
pickedFlags: opts.configFlags,
};
Comment on lines +626 to +652
Copy link

Copilot AI Nov 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This error handling block is nearly identical to the outer catch block at lines 661-687. The logic for creating failedScores and populating task.meta.case is duplicated. Consider extracting this into a helper function to reduce duplication and improve maintainability.

Copilot uses AI. Check for mistakes.
}
} catch (e) {
console.log(e);
const error = e as Error;

const ctx = getEvalContext();
outOfScopeFlags = ctx.outOfScopeFlags || ([] as OutOfScopeFlagAccess[]);

// Populate scores with error metadata for all scorers that didn't run
const failedScores: Record<string, ScoreWithName> = {};
for (const scorer of opts.scorers) {
failedScores[scorer.name] = {
Copy link

Copilot AI Nov 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Direct access to scorer.name may fail if the scorer doesn't have a name property. The code elsewhere uses getScorerName(scorer) helper function (defined at line 171) which provides a fallback. Consider using the same helper here for consistency and to avoid potential undefined values:

const scorerName = getScorerName(scorer);
failedScores[scorerName] = { name: scorerName, ... };

Copilot uses AI. Check for mistakes.
Expand Down
2 changes: 2 additions & 0 deletions packages/ai/src/evals/eval.types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ export type EvalParams<
timeout?: number;
/** Optional reduction of flag namespace */
configFlags?: string[];
/** Number of times to run each test case (defaults to 1) */
trials?: number;
};

// Discriminated-union type for per-case runtime flags (console/meta only)
Expand Down
4 changes: 4 additions & 0 deletions packages/ai/src/otel/semconv/attributes.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ import {
ATTR_EVAL_CONFIG_FLAGS,
ATTR_EVAL_CAPABILITY_NAME,
ATTR_EVAL_STEP_NAME,
ATTR_EVAL_TRIAL_INDEX,
} from './eval_proposal';

import {
Expand Down Expand Up @@ -331,6 +332,9 @@ export const Attr = {
Scores: ATTR_EVAL_CASE_SCORES,
Metadata: ATTR_EVAL_CASE_METADATA,
},
Trial: {
Index: ATTR_EVAL_TRIAL_INDEX,
},
Task: {
Output: ATTR_EVAL_TASK_OUTPUT,
Name: ATTR_EVAL_TASK_NAME,
Expand Down
2 changes: 2 additions & 0 deletions packages/ai/src/otel/semconv/eval_proposal.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ export const ATTR_EVAL_CASE_OUTPUT = 'eval.case.output' as const;
export const ATTR_EVAL_CASE_EXPECTED = 'eval.case.expected' as const;
export const ATTR_EVAL_CASE_SCORES = 'eval.case.scores' as const;
export const ATTR_EVAL_CASE_METADATA = 'eval.case.metadata' as const;
// trial
export const ATTR_EVAL_TRIAL_INDEX = 'eval.trial.index' as const;
// task
export const ATTR_EVAL_TASK_OUTPUT = 'eval.task.output' as const;
export const ATTR_EVAL_TASK_NAME = 'eval.task.name' as const;
Expand Down
Loading