diff --git a/packages/ai/src/evals/eval.service.ts b/packages/ai/src/evals/eval.service.ts index ae4ec162..b35724a8 100644 --- a/packages/ai/src/evals/eval.service.ts +++ b/packages/ai/src/evals/eval.service.ts @@ -163,7 +163,7 @@ export const mapSpanToCase = (item: { _time: string; data: any }): Case => { expected: getCustomOrRegularString(data.attributes, Attr.Eval.Case.Expected), duration: duration, status: data.status.code, - scores: scores ? (typeof scores === 'string' ? JSON.parse(scores) : scores) : undefined, + scores: scores ? (typeof scores === 'string' ? JSON.parse(scores) : scores) : {}, // undefined would be more honest, but this lets us do like `baseline.scores[name]` without crashing runAt: item._time, spanId: data.span_id, traceId: data.trace_id, diff --git a/packages/ai/src/evals/eval.types.ts b/packages/ai/src/evals/eval.types.ts index 8c4c945e..cdc2a0ce 100644 --- a/packages/ai/src/evals/eval.types.ts +++ b/packages/ai/src/evals/eval.types.ts @@ -229,6 +229,7 @@ export type FlagDiff = { flag: string; current: string | undefined; baseline: string | undefined; + default: string | undefined; }; export type OutOfScopeFlagAccess = { diff --git a/packages/ai/src/evals/reporter.console-utils.ts b/packages/ai/src/evals/reporter.console-utils.ts index 3261f7e4..517ff463 100644 --- a/packages/ai/src/evals/reporter.console-utils.ts +++ b/packages/ai/src/evals/reporter.console-utils.ts @@ -24,6 +24,7 @@ export type SuiteData = { baseline: Evaluation | undefined | null; configFlags?: string[]; flagConfig?: Record; + defaultFlagConfig?: Record; runId: string; orgId?: string; cases: Array<{ @@ -428,26 +429,24 @@ export function printSuiteBox({ const paddedName = scorerName.padEnd(maxNameLength); const hasAllErrors = allCasesErrored(scorerName); - if (suite.baseline) { - const baselineAvg = calculateBaselineScorerAverage(suite.baseline, scorerName); - if (baselineAvg !== null) { - const currentPercent = hasAllErrors ? c.dim('N/A') : formatPercentage(avg); - const baselinePercent = formatPercentage(baselineAvg); - const { text: diffText, color: diffColor } = formatDiff(avg, baselineAvg); + const baselineAvg = suite.baseline + ? calculateBaselineScorerAverage(suite.baseline, scorerName) + : null; - const paddedBaseline = baselinePercent.padStart(7); - const paddedCurrent = hasAllErrors ? currentPercent : currentPercent.padStart(7); - const paddedDiff = hasAllErrors ? c.dim('(all cases failed)') : diffText.padStart(8); + if (baselineAvg !== null) { + const currentPercent = hasAllErrors ? c.dim('N/A') : formatPercentage(avg); + const baselinePercent = formatPercentage(baselineAvg); + const { text: diffText, color: diffColor } = formatDiff(avg, baselineAvg); - logger( - `│ ${paddedName} ${c.blueBright(paddedBaseline)} → ${hasAllErrors ? paddedCurrent : c.magentaBright(paddedCurrent)} (${hasAllErrors ? paddedDiff : diffColor(paddedDiff)})`, - ); - } else { - const currentPercent = hasAllErrors - ? c.red('N/A (all cases failed)') - : formatPercentage(avg); - logger(`│ • ${paddedName} ${currentPercent}`); - } + const paddedBaseline = baselinePercent.padStart(7); + const paddedCurrent = hasAllErrors ? currentPercent : currentPercent.padStart(7); + const diffDisplay = hasAllErrors + ? c.dim('all cases failed') + : diffColor(diffText.padStart(8)); + + logger( + `│ ${paddedName} ${c.blueBright(paddedBaseline)} → ${hasAllErrors ? paddedCurrent : c.magentaBright(paddedCurrent)} (${diffDisplay})`, + ); } else { const currentPercent = hasAllErrors ? c.red('N/A (all cases failed)') : formatPercentage(avg); logger(`│ • ${paddedName} ${currentPercent}`); @@ -467,15 +466,17 @@ export function printSuiteBox({ logger(`│ Baseline: ${c.gray('(none)')}`); } - if (suite.baseline) { - const hasConfigChanges = flagDiff.length > 0; - - logger('│ Config changes:', hasConfigChanges ? '' : c.gray('(none)')); - if (hasConfigChanges) { - for (const { flag, current, baseline } of flagDiff) { - logger( - `│ • ${flag}: ${current ?? ''} ${c.gray(`(baseline: ${baseline ?? ''})`)}`, - ); + const hasConfigChanges = flagDiff.length > 0; + + logger('│ Config changes:', hasConfigChanges ? '' : c.gray('(none)')); + if (hasConfigChanges) { + for (const { flag, current, baseline, default: defaultVal } of flagDiff) { + logger(`│ • ${flag}: ${current ?? ''}`); + if (defaultVal !== undefined) { + logger(`│ ${c.gray(`default: ${defaultVal}`)}`); + } + if (suite.baseline) { + logger(`│ ${c.gray(`baseline: ${baseline ?? ''}`)}`); } } } @@ -548,22 +549,29 @@ export function calculateBaselineScorerAverage( } /** - * Calculate flag diff between current run and baseline (filtered by configFlags) + * Calculate flag diff between current run vs baseline and defaults (filtered by configFlags). + * Shows a diff if current differs from at least one of baseline or default. */ export function calculateFlagDiff(suite: SuiteData): Array { - if (!suite.baseline || !suite.configFlags || suite.configFlags.length === 0) { + if (!suite.configFlags || suite.configFlags.length === 0) { return []; } const diffs: Array = []; const currentConfig = suite.flagConfig || {}; - const baselineConfig = suite.baseline.flagConfig || {}; + const baselineConfig = suite.baseline?.flagConfig || {}; + const defaultConfig = suite.defaultFlagConfig || {}; const currentFlat = flattenObject(currentConfig); const baselineFlat = flattenObject(baselineConfig); + const defaultFlat = flattenObject(defaultConfig); - const allKeys = new Set([...Object.keys(currentFlat), ...Object.keys(baselineFlat)]); + const allKeys = new Set([ + ...Object.keys(currentFlat), + ...Object.keys(baselineFlat), + ...Object.keys(defaultFlat), + ]); for (const key of allKeys) { const isInScope = suite.configFlags.some((pattern) => key.startsWith(pattern)); @@ -571,12 +579,21 @@ export function calculateFlagDiff(suite: SuiteData): Array { const currentValue = currentFlat[key]; const baselineValue = baselineFlat[key]; + const defaultValue = defaultFlat[key]; + + const currentStr = currentValue !== undefined ? JSON.stringify(currentValue) : undefined; + const baselineStr = baselineValue !== undefined ? JSON.stringify(baselineValue) : undefined; + const defaultStr = defaultValue !== undefined ? JSON.stringify(defaultValue) : undefined; + + const diffFromBaseline = suite.baseline && currentStr !== baselineStr; + const diffFromDefault = currentStr !== defaultStr; - if (JSON.stringify(currentValue) !== JSON.stringify(baselineValue)) { + if (diffFromBaseline || diffFromDefault) { diffs.push({ flag: key, - current: currentValue !== undefined ? JSON.stringify(currentValue) : undefined, - baseline: baselineValue !== undefined ? JSON.stringify(baselineValue) : undefined, + current: currentStr, + baseline: suite.baseline ? baselineStr : undefined, + default: defaultStr, }); } } @@ -603,7 +620,7 @@ export function printFinalReport({ for (const suite of suiteData) { const scorerAverages = calculateScorerAverages(suite); - const flagDiff = suite.baseline ? calculateFlagDiff(suite) : []; + const flagDiff = calculateFlagDiff(suite); printSuiteBox({ suite, scorerAverages, calculateBaselineScorerAverage, flagDiff, logger }); logger(''); } diff --git a/packages/ai/src/evals/reporter.ts b/packages/ai/src/evals/reporter.ts index 7612fabc..ca3cb20b 100644 --- a/packages/ai/src/evals/reporter.ts +++ b/packages/ai/src/evals/reporter.ts @@ -122,6 +122,8 @@ export class AxiomReporter implements Reporter { flagConfig = dotNotationToNested({ ...defaultsFlat, ...overridesFlat }); } + const defaultFlagConfig = meta.evaluation.configEnd?.flags; + this._suiteData.push({ name: meta.evaluation.name, file: relativePath, @@ -129,6 +131,7 @@ export class AxiomReporter implements Reporter { baseline: suiteBaseline || null, configFlags: meta.evaluation.configFlags, flagConfig, + defaultFlagConfig, runId: meta.evaluation.runId, orgId: meta.evaluation.orgId, cases, diff --git a/packages/ai/test/evals/reporter.console-utils.test.ts b/packages/ai/test/evals/reporter.console-utils.test.ts index e4696068..6edfdae6 100644 --- a/packages/ai/test/evals/reporter.console-utils.test.ts +++ b/packages/ai/test/evals/reporter.console-utils.test.ts @@ -16,6 +16,7 @@ import { printRuntimeFlags, printOutOfScopeFlags, printTestCaseSuccessOrFailed, + printSuiteBox, type SuiteData, } from '../../src/evals/reporter.console-utils'; import type { MetaWithCase, Case, Evaluation } from '../../src/evals/eval.types'; @@ -193,17 +194,27 @@ describe('reporter.console-utils', () => { }); describe('calculateFlagDiff', () => { - it('returns empty if no baseline or flags', () => { + it('returns empty if no configFlags', () => { const suite = { baseline: null } as unknown as SuiteData; expect(calculateFlagDiff(suite)).toEqual([]); }); - it('detects changes in scoped flags', () => { + it('returns empty if configFlags is empty', () => { + const suite = { + configFlags: [], + flagConfig: { 'feature.enabled': true }, + defaultFlagConfig: { 'feature.enabled': false }, + } as unknown as SuiteData; + expect(calculateFlagDiff(suite)).toEqual([]); + }); + + it('detects changes from baseline only', () => { const suite = { configFlags: ['feature'], - flagConfig: { 'feature.enabled': true, 'other.flag': true }, + flagConfig: { 'feature.enabled': true }, + defaultFlagConfig: { 'feature.enabled': true }, baseline: { - flagConfig: { 'feature.enabled': false, 'other.flag': false }, + flagConfig: { 'feature.enabled': false }, }, } as unknown as SuiteData; @@ -213,13 +224,66 @@ describe('reporter.console-utils', () => { flag: 'feature.enabled', current: 'true', baseline: 'false', + default: 'true', + }); + }); + + it('detects changes from default only', () => { + const suite = { + configFlags: ['feature'], + flagConfig: { 'feature.enabled': true }, + defaultFlagConfig: { 'feature.enabled': false }, + baseline: null, + } as unknown as SuiteData; + + const diff = calculateFlagDiff(suite); + expect(diff).toHaveLength(1); + expect(diff[0]).toEqual({ + flag: 'feature.enabled', + current: 'true', + baseline: undefined, + default: 'false', + }); + }); + + it('detects changes from both baseline and default', () => { + const suite = { + configFlags: ['feature'], + flagConfig: { 'feature.model': '"gpt-5"' }, + defaultFlagConfig: { 'feature.model': '"gpt-4o"' }, + baseline: { + flagConfig: { 'feature.model': '"gpt-4"' }, + }, + } as unknown as SuiteData; + + const diff = calculateFlagDiff(suite); + expect(diff).toHaveLength(1); + expect(diff[0]).toEqual({ + flag: 'feature.model', + current: '"\\"gpt-5\\""', + baseline: '"\\"gpt-4\\""', + default: '"\\"gpt-4o\\""', }); }); + it('returns empty when same as both baseline and default', () => { + const suite = { + configFlags: ['feature'], + flagConfig: { 'feature.enabled': true }, + defaultFlagConfig: { 'feature.enabled': true }, + baseline: { + flagConfig: { 'feature.enabled': true }, + }, + } as unknown as SuiteData; + + expect(calculateFlagDiff(suite)).toEqual([]); + }); + it('ignores changes outside configFlags scope', () => { const suite = { configFlags: ['feature'], flagConfig: { 'other.flag': true }, + defaultFlagConfig: { 'other.flag': false }, baseline: { flagConfig: { 'other.flag': false }, }, @@ -227,6 +291,46 @@ describe('reporter.console-utils', () => { expect(calculateFlagDiff(suite)).toEqual([]); }); + + it('handles flag missing from baseline', () => { + const suite = { + configFlags: ['feature'], + flagConfig: { 'feature.new': true }, + defaultFlagConfig: { 'feature.new': false }, + baseline: { + flagConfig: {}, + }, + } as unknown as SuiteData; + + const diff = calculateFlagDiff(suite); + expect(diff).toHaveLength(1); + expect(diff[0]).toEqual({ + flag: 'feature.new', + current: 'true', + baseline: undefined, + default: 'false', + }); + }); + + it('handles flag missing from default', () => { + const suite = { + configFlags: ['feature'], + flagConfig: { 'feature.custom': true }, + defaultFlagConfig: {}, + baseline: { + flagConfig: { 'feature.custom': false }, + }, + } as unknown as SuiteData; + + const diff = calculateFlagDiff(suite); + expect(diff).toHaveLength(1); + expect(diff[0]).toEqual({ + flag: 'feature.custom', + current: 'true', + baseline: 'false', + default: undefined, + }); + }); }); describe('printOrphanedBaselineCases', () => { @@ -508,4 +612,178 @@ describe('reporter.console-utils', () => { expect(line).toContain('(scorer not run)'); }); }); + + describe('printSuiteBox', () => { + const createBaseSuite = (overrides: Partial = {}): SuiteData => + ({ + name: 'test-suite', + file: '/path/to/test.ts', + duration: '1.23s', + runId: 'run-123', + cases: [{ index: 0, scores: { accuracy: { score: 0.8 } } }], + configFlags: ['feature'], + flagConfig: {}, + defaultFlagConfig: {}, + baseline: null, + ...overrides, + }) as SuiteData; + + const mockCalculateBaselineScorerAverage = (baseline: Evaluation, scorerName: string) => { + const scores: number[] = []; + for (const caseData of baseline.cases) { + if (caseData.scores[scorerName]) { + scores.push(caseData.scores[scorerName].value); + } + } + if (scores.length === 0) return null; + return scores.reduce((a, b) => a + b, 0) / scores.length; + }; + + it('prints config changes when different from baseline only', () => { + const { logger, getLines } = createMockLogger(); + + const suite = createBaseSuite({ + flagConfig: { 'feature.enabled': true }, + defaultFlagConfig: { 'feature.enabled': true }, + baseline: { + name: 'baseline', + version: 1, + cases: [], + flagConfig: { 'feature.enabled': false }, + } as unknown as Evaluation, + }); + + const flagDiff = calculateFlagDiff(suite); + + printSuiteBox({ + suite, + scorerAverages: { accuracy: 0.8 }, + calculateBaselineScorerAverage: mockCalculateBaselineScorerAverage, + flagDiff, + logger, + }); + + const lines = getLines().map(stripAnsi); + expect(lines.some((l) => l.includes('Config changes:'))).toBe(true); + expect(lines.some((l) => l.includes('feature.enabled: true'))).toBe(true); + expect(lines.some((l) => l.includes('default: true'))).toBe(true); + expect(lines.some((l) => l.includes('baseline: false'))).toBe(true); + }); + + it('prints config changes when different from default only (no baseline)', () => { + const { logger, getLines } = createMockLogger(); + + const suite = createBaseSuite({ + flagConfig: { 'feature.enabled': true }, + defaultFlagConfig: { 'feature.enabled': false }, + baseline: null, + }); + + const flagDiff = calculateFlagDiff(suite); + + printSuiteBox({ + suite, + scorerAverages: { accuracy: 0.8 }, + calculateBaselineScorerAverage: mockCalculateBaselineScorerAverage, + flagDiff, + logger, + }); + + const lines = getLines().map(stripAnsi); + expect(lines.some((l) => l.includes('Config changes:'))).toBe(true); + expect(lines.some((l) => l.includes('feature.enabled: true'))).toBe(true); + expect(lines.some((l) => l.includes('default: false'))).toBe(true); + expect(lines.some((l) => l.includes('baseline:'))).toBe(false); + }); + + it('prints config changes when different from both baseline and default', () => { + const { logger, getLines } = createMockLogger(); + + const suite = createBaseSuite({ + flagConfig: { 'feature.model': 'gpt-5' }, + defaultFlagConfig: { 'feature.model': 'gpt-4o' }, + baseline: { + name: 'baseline', + version: 1, + cases: [], + flagConfig: { 'feature.model': 'gpt-4' }, + } as unknown as Evaluation, + }); + + const flagDiff = calculateFlagDiff(suite); + + printSuiteBox({ + suite, + scorerAverages: { accuracy: 0.8 }, + calculateBaselineScorerAverage: mockCalculateBaselineScorerAverage, + flagDiff, + logger, + }); + + const lines = getLines().map(stripAnsi); + expect(lines.some((l) => l.includes('Config changes:'))).toBe(true); + expect(lines.some((l) => l.includes('feature.model:'))).toBe(true); + expect(lines.some((l) => l.includes('default:'))).toBe(true); + expect(lines.some((l) => l.includes('baseline:'))).toBe(true); + }); + + it('prints (none) when same as both baseline and default', () => { + const { logger, getOutput } = createMockLogger(); + + const suite = createBaseSuite({ + flagConfig: { 'feature.enabled': true }, + defaultFlagConfig: { 'feature.enabled': true }, + baseline: { + name: 'baseline', + version: 1, + cases: [], + flagConfig: { 'feature.enabled': true }, + } as unknown as Evaluation, + }); + + const flagDiff = calculateFlagDiff(suite); + + printSuiteBox({ + suite, + scorerAverages: { accuracy: 0.8 }, + calculateBaselineScorerAverage: mockCalculateBaselineScorerAverage, + flagDiff, + logger, + }); + + const output = stripAnsi(getOutput()); + expect(output).toContain('Config changes:'); + expect(output).toContain('(none)'); + }); + + it('shows baseline as when flag missing from baseline', () => { + const { logger, getLines } = createMockLogger(); + + const suite = createBaseSuite({ + flagConfig: { 'feature.new': true }, + defaultFlagConfig: { 'feature.new': false }, + baseline: { + name: 'baseline', + version: 1, + cases: [], + flagConfig: {}, + } as unknown as Evaluation, + }); + + const flagDiff = calculateFlagDiff(suite); + + printSuiteBox({ + suite, + scorerAverages: { accuracy: 0.8 }, + calculateBaselineScorerAverage: mockCalculateBaselineScorerAverage, + flagDiff, + logger, + }); + + const lines = getLines().map(stripAnsi); + expect(lines.some((l) => l.includes('feature.new: true'))).toBe(true); + expect(lines.some((l) => l.includes('default: false'))).toBe(true); + expect(lines.some((l) => l.includes('baseline: '))).toBe(true); + }); + }); });