-
Notifications
You must be signed in to change notification settings - Fork 2
feat: trials #140
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
feat: trials #140
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -454,109 +454,210 @@ async function registerEval< | |||||||||||
| async (caseSpan) => { | ||||||||||||
| const caseContext = trace.setSpan(context.active(), caseSpan); | ||||||||||||
|
|
||||||||||||
| try { | ||||||||||||
| const result = await runTask( | ||||||||||||
| caseContext, | ||||||||||||
| { | ||||||||||||
| id: evalId, | ||||||||||||
| version: evalVersion, | ||||||||||||
| name: evalName, | ||||||||||||
| }, | ||||||||||||
| { | ||||||||||||
| index: data.index, | ||||||||||||
| input: data.input, | ||||||||||||
| expected: data.expected, | ||||||||||||
| scorers: opts.scorers, | ||||||||||||
| task: opts.task, | ||||||||||||
| metadata: opts.metadata, | ||||||||||||
| configFlags: opts.configFlags, | ||||||||||||
| capability: opts.capability, | ||||||||||||
| step: opts.step, | ||||||||||||
| }, | ||||||||||||
| ); | ||||||||||||
| const { output, duration } = result; | ||||||||||||
| outOfScopeFlags = result.outOfScopeFlags; | ||||||||||||
|
|
||||||||||||
| finalConfigSnapshot = { | ||||||||||||
| flags: result.finalFlags || {}, | ||||||||||||
| pickedFlags: opts.configFlags, | ||||||||||||
| overrides: result.overrides, | ||||||||||||
| }; | ||||||||||||
| const numTrials = opts.trials ?? 1; | ||||||||||||
| const trialResults: { | ||||||||||||
| output: TOutput; | ||||||||||||
| scores: Record<string, ScoreWithName>; | ||||||||||||
| duration: number; | ||||||||||||
| }[] = []; | ||||||||||||
| let lastError: Error | null = null; | ||||||||||||
|
|
||||||||||||
| const scoreList: ScoreWithName[] = await Promise.all( | ||||||||||||
| opts.scorers.map(async (scorer) => { | ||||||||||||
| const scorerName = getScorerName(scorer); | ||||||||||||
| return startActiveSpan( | ||||||||||||
| `score ${scorerName}`, | ||||||||||||
| { | ||||||||||||
| attributes: { | ||||||||||||
| [Attr.GenAI.Operation.Name]: 'eval.score', | ||||||||||||
| [Attr.Eval.ID]: evalId, | ||||||||||||
| [Attr.Eval.Name]: evalName, | ||||||||||||
| [Attr.Eval.Version]: evalVersion, | ||||||||||||
| }, | ||||||||||||
| try { | ||||||||||||
| for (let trialIndex = 0; trialIndex < numTrials; trialIndex++) { | ||||||||||||
| await startActiveSpan( | ||||||||||||
| `trial ${trialIndex}`, | ||||||||||||
| { | ||||||||||||
| attributes: { | ||||||||||||
| [Attr.GenAI.Operation.Name]: 'eval.trial', | ||||||||||||
| [Attr.Eval.ID]: evalId, | ||||||||||||
| [Attr.Eval.Name]: evalName, | ||||||||||||
| [Attr.Eval.Version]: evalVersion, | ||||||||||||
| [Attr.Eval.Case.Index]: data.index, | ||||||||||||
| [Attr.Eval.Trial.Index]: trialIndex, | ||||||||||||
| }, | ||||||||||||
| async (scorerSpan) => { | ||||||||||||
| const start = performance.now(); | ||||||||||||
| const result = await scorer({ | ||||||||||||
| input: data.input, | ||||||||||||
| output: output, | ||||||||||||
| expected: data.expected, | ||||||||||||
| }, | ||||||||||||
| async (trialSpan) => { | ||||||||||||
| const trialContext = trace.setSpan(context.active(), trialSpan); | ||||||||||||
|
|
||||||||||||
| try { | ||||||||||||
| const result = await runTask( | ||||||||||||
| trialContext, | ||||||||||||
| { | ||||||||||||
| id: evalId, | ||||||||||||
| version: evalVersion, | ||||||||||||
| name: evalName, | ||||||||||||
| }, | ||||||||||||
| { | ||||||||||||
| index: data.index, | ||||||||||||
| input: data.input, | ||||||||||||
| expected: data.expected, | ||||||||||||
| scorers: opts.scorers, | ||||||||||||
| task: opts.task, | ||||||||||||
| metadata: opts.metadata, | ||||||||||||
| configFlags: opts.configFlags, | ||||||||||||
| capability: opts.capability, | ||||||||||||
| step: opts.step, | ||||||||||||
| }, | ||||||||||||
| ); | ||||||||||||
| const { output, duration } = result; | ||||||||||||
| outOfScopeFlags = result.outOfScopeFlags; | ||||||||||||
|
|
||||||||||||
| finalConfigSnapshot = { | ||||||||||||
| flags: result.finalFlags || {}, | ||||||||||||
| pickedFlags: opts.configFlags, | ||||||||||||
| overrides: result.overrides, | ||||||||||||
| }; | ||||||||||||
|
|
||||||||||||
| const scoreList: ScoreWithName[] = await Promise.all( | ||||||||||||
| opts.scorers.map(async (scorer) => { | ||||||||||||
| const scorerName = getScorerName(scorer); | ||||||||||||
| return startActiveSpan( | ||||||||||||
| `score ${scorerName}`, | ||||||||||||
| { | ||||||||||||
| attributes: { | ||||||||||||
| [Attr.GenAI.Operation.Name]: 'eval.score', | ||||||||||||
| [Attr.Eval.ID]: evalId, | ||||||||||||
| [Attr.Eval.Name]: evalName, | ||||||||||||
| [Attr.Eval.Version]: evalVersion, | ||||||||||||
| }, | ||||||||||||
| }, | ||||||||||||
| async (scorerSpan) => { | ||||||||||||
| const start = performance.now(); | ||||||||||||
| const result = await scorer({ | ||||||||||||
| input: data.input, | ||||||||||||
| output: output, | ||||||||||||
| expected: data.expected, | ||||||||||||
| }); | ||||||||||||
|
|
||||||||||||
| const duration = Math.round(performance.now() - start); | ||||||||||||
| const scoreValue = result.score as number; | ||||||||||||
|
|
||||||||||||
| scorerSpan.setAttributes({ | ||||||||||||
| [Attr.Eval.Score.Name]: scorerName, | ||||||||||||
| [Attr.Eval.Score.Value]: scoreValue, | ||||||||||||
| }); | ||||||||||||
|
|
||||||||||||
| return { | ||||||||||||
| name: scorerName, | ||||||||||||
| ...result, | ||||||||||||
| metadata: { duration, startedAt: start, error: null }, | ||||||||||||
| }; | ||||||||||||
| }, | ||||||||||||
| trialContext, | ||||||||||||
| ); | ||||||||||||
| }), | ||||||||||||
| ); | ||||||||||||
|
|
||||||||||||
| const scores = Object.fromEntries(scoreList.map((s) => [s.name, s])); | ||||||||||||
|
|
||||||||||||
| trialSpan.setAttributes({ | ||||||||||||
| [Attr.Eval.Case.Output]: | ||||||||||||
| typeof output === 'string' ? output : JSON.stringify(output), | ||||||||||||
| [Attr.Eval.Case.Scores]: JSON.stringify(scores ? scores : {}), | ||||||||||||
| }); | ||||||||||||
|
|
||||||||||||
| const duration = Math.round(performance.now() - start); | ||||||||||||
| const scoreValue = result.score as number; | ||||||||||||
| trialResults.push({ output, scores, duration }); | ||||||||||||
|
|
||||||||||||
| scorerSpan.setAttributes({ | ||||||||||||
| [Attr.Eval.Score.Name]: scorerName, | ||||||||||||
| [Attr.Eval.Score.Value]: scoreValue, | ||||||||||||
| }); | ||||||||||||
| allOutOfScopeFlags.push(...outOfScopeFlags); | ||||||||||||
| } catch (e) { | ||||||||||||
| console.log(e); | ||||||||||||
|
||||||||||||
| console.log(e); | |
| console.error(e); |
Copilot
AI
Nov 19, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The error handling in the trial loop silently catches errors and continues to the next trial. If a trial fails, only lastError is recorded, but the loop continues. This means:
- If some trials succeed and some fail, only successful trials are averaged (which may be intended)
- If all trials fail, the error is handled in the
else if (lastError)block
However, there's a potential issue: when a trial error is caught, allOutOfScopeFlags.push(...outOfScopeFlags) at line 561 won't be executed for that trial. This inconsistency should be addressed - either push the flags before the try block ends, or ensure they're collected even on error.
Copilot
AI
Nov 19, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
When constructing the averaged score, spreading scorerValues[0] copies all properties including the metadata field. However, the metadata (containing duration, startedAt, error) is specific to the first trial and doesn't represent the averaged trials. Consider either:
- Omitting metadata from averaged scores
- Creating new metadata that reflects all trials (e.g., total duration, list of durations)
- Explicitly selecting which fields to copy:
{ name: scorerValues[0].name, score: avgScore, metadata: scorerValues[0].metadata }
The current implementation may be misleading as the metadata doesn't correspond to the averaged score.
| ...scorerValues[0], | |
| name: scorerValues[0].name, |
Copilot
AI
Nov 19, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The ternary check averagedScores ? averagedScores : {} is redundant. The averagedScores object is always defined (initialized as an empty object at line 578), so this condition will always be truthy. Consider simplifying to JSON.stringify(averagedScores).
| [Attr.Eval.Case.Scores]: JSON.stringify(averagedScores ? averagedScores : {}), | |
| [Attr.Eval.Case.Scores]: JSON.stringify(averagedScores), |
Copilot
AI
Nov 19, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
When no trials complete successfully (trialResults.length === 0) but lastError is also null, no task.meta.case is set. This can happen if an unexpected error bypasses the trial error catching. Consider adding an else clause to handle this edge case or asserting that at least one of the conditions must be true.
Copilot
AI
Nov 19, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The redundant assignment const error: Error = lastError; is unnecessary since lastError is already typed as Error | null and checked to be truthy at line 624. Consider using lastError directly instead of creating a new variable.
Copilot
AI
Nov 19, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Direct access to scorer.name may fail if the scorer doesn't have a name property. The code elsewhere uses getScorerName(scorer) helper function (defined at line 171) which provides a fallback. Consider using the same helper here for consistency and to avoid potential undefined values:
const scorerName = getScorerName(scorer);
failedScores[scorerName] = { name: scorerName, ... };| failedScores[scorer.name] = { | |
| name: scorer.name, | |
| const scorerName = getScorerName(scorer); | |
| failedScores[scorerName] = { | |
| name: scorerName, |
Copilot
AI
Nov 19, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This error handling block is nearly identical to the outer catch block at lines 661-687. The logic for creating failedScores and populating task.meta.case is duplicated. Consider extracting this into a helper function to reduce duplication and improve maintainability.
Copilot
AI
Nov 19, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Direct access to scorer.name may fail if the scorer doesn't have a name property. The code elsewhere uses getScorerName(scorer) helper function (defined at line 171) which provides a fallback. Consider using the same helper here for consistency and to avoid potential undefined values:
const scorerName = getScorerName(scorer);
failedScores[scorerName] = { name: scorerName, ... };
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The ternary check
scores ? scores : {}is redundant. Thescoresobject is always defined (created from the scoreList at line 551), so this condition will always be truthy. Consider simplifying toJSON.stringify(scores).