Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion packages/ai/src/evals/eval.service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ export const mapSpanToCase = (item: { _time: string; data: any }): Case => {
expected: getCustomOrRegularString(data.attributes, Attr.Eval.Case.Expected),
duration: duration,
status: data.status.code,
scores: scores ? (typeof scores === 'string' ? JSON.parse(scores) : scores) : undefined,
scores: scores ? (typeof scores === 'string' ? JSON.parse(scores) : scores) : {}, // undefined would be more honest, but this lets us do like `baseline.scores[name]` without crashing
runAt: item._time,
spanId: data.span_id,
traceId: data.trace_id,
Expand Down
1 change: 1 addition & 0 deletions packages/ai/src/evals/eval.types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,7 @@ export type FlagDiff = {
flag: string;
current: string | undefined;
baseline: string | undefined;
default: string | undefined;
};

export type OutOfScopeFlagAccess = {
Expand Down
87 changes: 52 additions & 35 deletions packages/ai/src/evals/reporter.console-utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ export type SuiteData = {
baseline: Evaluation | undefined | null;
configFlags?: string[];
flagConfig?: Record<string, any>;
defaultFlagConfig?: Record<string, any>;
runId: string;
orgId?: string;
cases: Array<{
Expand Down Expand Up @@ -428,26 +429,24 @@ export function printSuiteBox({
const paddedName = scorerName.padEnd(maxNameLength);
const hasAllErrors = allCasesErrored(scorerName);

if (suite.baseline) {
const baselineAvg = calculateBaselineScorerAverage(suite.baseline, scorerName);
if (baselineAvg !== null) {
const currentPercent = hasAllErrors ? c.dim('N/A') : formatPercentage(avg);
const baselinePercent = formatPercentage(baselineAvg);
const { text: diffText, color: diffColor } = formatDiff(avg, baselineAvg);
const baselineAvg = suite.baseline
? calculateBaselineScorerAverage(suite.baseline, scorerName)
: null;

const paddedBaseline = baselinePercent.padStart(7);
const paddedCurrent = hasAllErrors ? currentPercent : currentPercent.padStart(7);
const paddedDiff = hasAllErrors ? c.dim('(all cases failed)') : diffText.padStart(8);
if (baselineAvg !== null) {
const currentPercent = hasAllErrors ? c.dim('N/A') : formatPercentage(avg);
const baselinePercent = formatPercentage(baselineAvg);
const { text: diffText, color: diffColor } = formatDiff(avg, baselineAvg);

logger(
`│ ${paddedName} ${c.blueBright(paddedBaseline)} → ${hasAllErrors ? paddedCurrent : c.magentaBright(paddedCurrent)} (${hasAllErrors ? paddedDiff : diffColor(paddedDiff)})`,
);
} else {
const currentPercent = hasAllErrors
? c.red('N/A (all cases failed)')
: formatPercentage(avg);
logger(`│ ${paddedName} ${currentPercent}`);
}
const paddedBaseline = baselinePercent.padStart(7);
const paddedCurrent = hasAllErrors ? currentPercent : currentPercent.padStart(7);
const diffDisplay = hasAllErrors
? c.dim('all cases failed')
: diffColor(diffText.padStart(8));

logger(
`│ ${paddedName} ${c.blueBright(paddedBaseline)} → ${hasAllErrors ? paddedCurrent : c.magentaBright(paddedCurrent)} (${diffDisplay})`,
);
} else {
const currentPercent = hasAllErrors ? c.red('N/A (all cases failed)') : formatPercentage(avg);
logger(`│ • ${paddedName} ${currentPercent}`);
Expand All @@ -467,15 +466,17 @@ export function printSuiteBox({
logger(`│ Baseline: ${c.gray('(none)')}`);
}

if (suite.baseline) {
const hasConfigChanges = flagDiff.length > 0;

logger('│ Config changes:', hasConfigChanges ? '' : c.gray('(none)'));
if (hasConfigChanges) {
for (const { flag, current, baseline } of flagDiff) {
logger(
`│ • ${flag}: ${current ?? '<not set>'} ${c.gray(`(baseline: ${baseline ?? '<not set>'})`)}`,
);
const hasConfigChanges = flagDiff.length > 0;

logger('│ Config changes:', hasConfigChanges ? '' : c.gray('(none)'));
if (hasConfigChanges) {
for (const { flag, current, baseline, default: defaultVal } of flagDiff) {
logger(`│ • ${flag}: ${current ?? '<not set>'}`);
if (defaultVal !== undefined) {
logger(`│ ${c.gray(`default: ${defaultVal}`)}`);
}
if (suite.baseline) {
logger(`│ ${c.gray(`baseline: ${baseline ?? '<not set>'}`)}`);
}
}
}
Expand Down Expand Up @@ -548,35 +549,51 @@ export function calculateBaselineScorerAverage(
}

/**
* Calculate flag diff between current run and baseline (filtered by configFlags)
* Calculate the flag diff of the current run against the baseline and the default flag config (filtered by configFlags).
* A flag is reported when its current value differs from the baseline (if one exists) or from the default.
*/
export function calculateFlagDiff(suite: SuiteData): Array<FlagDiff> {
if (!suite.baseline || !suite.configFlags || suite.configFlags.length === 0) {
if (!suite.configFlags || suite.configFlags.length === 0) {
return [];
}

const diffs: Array<FlagDiff> = [];

const currentConfig = suite.flagConfig || {};
const baselineConfig = suite.baseline.flagConfig || {};
const baselineConfig = suite.baseline?.flagConfig || {};
const defaultConfig = suite.defaultFlagConfig || {};

const currentFlat = flattenObject(currentConfig);
const baselineFlat = flattenObject(baselineConfig);
const defaultFlat = flattenObject(defaultConfig);

const allKeys = new Set([...Object.keys(currentFlat), ...Object.keys(baselineFlat)]);
const allKeys = new Set([
...Object.keys(currentFlat),
...Object.keys(baselineFlat),
...Object.keys(defaultFlat),
]);

for (const key of allKeys) {
const isInScope = suite.configFlags.some((pattern) => key.startsWith(pattern));
if (!isInScope) continue;

const currentValue = currentFlat[key];
const baselineValue = baselineFlat[key];
const defaultValue = defaultFlat[key];

const currentStr = currentValue !== undefined ? JSON.stringify(currentValue) : undefined;
const baselineStr = baselineValue !== undefined ? JSON.stringify(baselineValue) : undefined;
const defaultStr = defaultValue !== undefined ? JSON.stringify(defaultValue) : undefined;

const diffFromBaseline = suite.baseline && currentStr !== baselineStr;
const diffFromDefault = currentStr !== defaultStr;

if (JSON.stringify(currentValue) !== JSON.stringify(baselineValue)) {
if (diffFromBaseline || diffFromDefault) {
diffs.push({
flag: key,
current: currentValue !== undefined ? JSON.stringify(currentValue) : undefined,
baseline: baselineValue !== undefined ? JSON.stringify(baselineValue) : undefined,
current: currentStr,
baseline: suite.baseline ? baselineStr : undefined,
default: defaultStr,
});
}
}
Expand All @@ -603,7 +620,7 @@ export function printFinalReport({

for (const suite of suiteData) {
const scorerAverages = calculateScorerAverages(suite);
const flagDiff = suite.baseline ? calculateFlagDiff(suite) : [];
const flagDiff = calculateFlagDiff(suite);
printSuiteBox({ suite, scorerAverages, calculateBaselineScorerAverage, flagDiff, logger });
logger('');
}
Expand Down
3 changes: 3 additions & 0 deletions packages/ai/src/evals/reporter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -122,13 +122,16 @@ export class AxiomReporter implements Reporter {
flagConfig = dotNotationToNested({ ...defaultsFlat, ...overridesFlat });
}

const defaultFlagConfig = meta.evaluation.configEnd?.flags;

this._suiteData.push({
name: meta.evaluation.name,
file: relativePath,
duration: durationSeconds + 's',
baseline: suiteBaseline || null,
configFlags: meta.evaluation.configFlags,
flagConfig,
defaultFlagConfig,
runId: meta.evaluation.runId,
orgId: meta.evaluation.orgId,
cases,
Expand Down
Loading
Loading