Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
174 changes: 47 additions & 127 deletions src/utils/schema_generation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,150 +9,70 @@ export type JsonSchemaProperty = {
format?: string;
};

export type JsonSchemaArray = {
type: 'array';
items: JsonSchemaProperty;
};

export type SchemaGenerationOptions = {
/** Maximum number of items to use for schema generation. Default is 5. */
limit?: number;
/** If true, strips empty arrays from items before inference. Default is true. */
clean?: boolean;
};

// Local counterpart to the dataset API's `clean=true` — empty arrays carry no schema info.
export function removeEmptyArrays(obj: unknown): unknown {
if (Array.isArray(obj)) {
return obj.map(removeEmptyArrays);
}
if (typeof obj !== 'object' || obj === null) {
return obj;
}
return Object.entries(obj).reduce((acc, [key, value]) => {
const processed = removeEmptyArrays(value);
if (Array.isArray(processed) && processed.length === 0) {
return acc;
}
acc[key] = processed;
return acc;
}, {} as Record<string, unknown>);
}

const FORMAT_DETECTORS: [string, (s: string) => boolean][] = [
['date-time', (s) => /^\d{4}-\d{2}-\d{2}[Tt ]\d{2}:\d{2}:\d{2}(\.\d+)?([Zz]|[+-]\d{2}:\d{2})?$/.test(s)],
['date', (s) => /^\d{4}-\d{2}-\d{2}$/.test(s)],
['uuid', (s) => /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i.test(s)],
['email', (s) => /^[^\s@]+@[^\s@]+\.[^\s@]+$/.test(s)],
['uri', (s) => /^[a-zA-Z][a-zA-Z0-9+.-]*:\/\/\S+$/.test(s)],
const FORMAT_DETECTORS: [string, RegExp][] = [
['date-time', /^\d{4}-\d{2}-\d{2}[Tt ]\d{2}:\d{2}:\d{2}(\.\d+)?([Zz]|[+-]\d{2}:\d{2})?$/],
['date', /^\d{4}-\d{2}-\d{2}$/],
['uuid', /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i],
['email', /^[^\s@]+@[^\s@]+\.[^\s@]+$/],
['uri', /^[a-zA-Z][a-zA-Z0-9+.-]*:\/\/\S+$/],
];

function detectFormat(value: string): string | undefined {
for (const [name, test] of FORMAT_DETECTORS) {
if (test(value)) return name;
}
return undefined;
}
const detectFormat = (s: string) => FORMAT_DETECTORS.find(([, re]) => re.test(s))?.[0];

function inferType(value: unknown): JsonSchemaPrimitiveType {
if (value === null) return 'null';
if (Array.isArray(value)) return 'array';
if (typeof value === 'number') return Number.isInteger(value) && Number.isFinite(value) ? 'integer' : 'number';
if (typeof value === 'boolean') return 'boolean';
if (typeof value === 'string') return 'string';
return 'object';
function stripEmptyArrays(v: unknown): unknown {
if (Array.isArray(v)) return v.map(stripEmptyArrays);
if (!v || typeof v !== 'object') return v;
const out: Record<string, unknown> = {};
for (const [k, val] of Object.entries(v)) {
const p = stripEmptyArrays(val);
if (!Array.isArray(p) || p.length) out[k] = p;
}
return out;
}

function inferSchema(value: unknown): JsonSchemaProperty {
const type = inferType(value);

if (type === 'object') {
const entries = Object.entries(value as Record<string, unknown>);
if (entries.length === 0) return { type: 'object' };
function infer(v: unknown): JsonSchemaProperty {
if (v === null) return { type: 'null' };
if (Array.isArray(v)) return v.length ? { type: 'array', items: v.map(infer).reduce(merge) } : { type: 'array' };
if (typeof v === 'object') {
const properties: Record<string, JsonSchemaProperty> = {};
for (const [k, v] of entries) {
properties[k] = inferSchema(v);
}
return { type: 'object', properties };
for (const [k, val] of Object.entries(v)) properties[k] = infer(val);
return Object.keys(properties).length ? { type: 'object', properties } : { type: 'object' };
}

if (type === 'array') {
const arr = value as unknown[];
if (arr.length === 0) return { type: 'array' };
const merged = arr.map(inferSchema).reduce(mergeSchemas);
return { type: 'array', items: merged };
if (typeof v === 'number') return { type: Number.isInteger(v) && Number.isFinite(v) ? 'integer' : 'number' };
if (typeof v === 'string') {
const f = detectFormat(v);
return f ? { type: 'string', format: f } : { type: 'string' };
}

if (type === 'string') {
const format = detectFormat(value as string);
return format ? { type: 'string', format } : { type: 'string' };
}

return { type };
return { type: typeof v as JsonSchemaPrimitiveType };
}

// Merge rules:
// integer ⊕ number → number (number is a superset).
// different types → type array, e.g. ['string', 'null'].
// two objects → union of keys, recursive merge of shared values.
// two arrays → merged `items`.
// string format kept only when both sides agree.
function mergeSchemas(a: JsonSchemaProperty, b: JsonSchemaProperty): JsonSchemaProperty {
const aTypes = Array.isArray(a.type) ? a.type : [a.type];
const bTypes = Array.isArray(b.type) ? b.type : [b.type];
let typeSet = Array.from(new Set([...aTypes, ...bTypes]));

// integer ⊆ number — collapse to number when both appear.
if (typeSet.includes('integer') && typeSet.includes('number')) {
typeSet = typeSet.filter((t) => t !== 'integer');
}

const result: JsonSchemaProperty = {
type: typeSet.length === 1 ? typeSet[0] : typeSet,
};

if (a.format && a.format === b.format) {
result.format = a.format;
}

function merge(a: JsonSchemaProperty, b: JsonSchemaProperty): JsonSchemaProperty {
const at = Array.isArray(a.type) ? a.type : [a.type];
const bt = Array.isArray(b.type) ? b.type : [b.type];
let types = [...new Set([...at, ...bt])];
if (types.includes('integer') && types.includes('number')) types = types.filter((t) => t !== 'integer');
const r: JsonSchemaProperty = { type: types.length === 1 ? types[0] : types };
if (a.format && a.format === b.format) r.format = a.format;
if (a.properties || b.properties) {
const ap = a.properties ?? {};
const bp = b.properties ?? {};
const allKeys = new Set([...Object.keys(ap), ...Object.keys(bp)]);
if (allKeys.size > 0) {
const merged: Record<string, JsonSchemaProperty> = {};
for (const k of allKeys) {
const av = ap[k];
const bv = bp[k];
if (av && bv) merged[k] = mergeSchemas(av, bv);
else merged[k] = (av ?? bv) as JsonSchemaProperty;
}
result.properties = merged;
const merged: Record<string, JsonSchemaProperty> = {};
for (const k of new Set([...Object.keys(ap), ...Object.keys(bp)])) {
merged[k] = ap[k] && bp[k] ? merge(ap[k], bp[k]) : (ap[k] ?? bp[k])!;
}
if (Object.keys(merged).length) r.properties = merged;
}

if (a.items && b.items) {
result.items = mergeSchemas(a.items, b.items);
} else if (a.items || b.items) {
result.items = (a.items ?? b.items) as JsonSchemaProperty;
}

return result;
if (a.items || b.items) r.items = a.items && b.items ? merge(a.items, b.items) : (a.items ?? b.items)!;
return r;
}

export function generateSchemaFromItems(
datasetItems: unknown[],
options: SchemaGenerationOptions = {},
): JsonSchemaArray | null {
items: unknown[],
options: { limit?: number; clean?: boolean } = {},
): { type: 'array'; items: JsonSchemaProperty } | null {
const { limit = 5, clean = true } = options;

const itemsToUse = datasetItems.slice(0, limit);
if (itemsToUse.length === 0) return null;

const processed = clean ? itemsToUse.map(removeEmptyArrays) : itemsToUse;

const itemSchemas = processed.map(inferSchema);
const merged = itemSchemas.reduce(mergeSchemas);

return { type: 'array', items: merged };
const slice = items.slice(0, limit);
if (!slice.length) return null;
const processed = clean ? slice.map(stripEmptyArrays) : slice;
return { type: 'array', items: processed.map(infer).reduce(merge) };
}
Loading
Loading