Skip to content

Commit

Permalink
feat(frontend): add chunking configuration UI for knowledge base sett…
Browse files Browse the repository at this point in the history
…ings (#624)

…ings

---------

Co-authored-by: Mini256 <[email protected]>
  • Loading branch information
634750802 and Mini256 authored Feb 18, 2025
1 parent 51ef131 commit 22abfc6
Show file tree
Hide file tree
Showing 5 changed files with 309 additions and 41 deletions.
118 changes: 80 additions & 38 deletions frontend/app/src/api/knowledge-base.ts
Original file line number Diff line number Diff line change
@@ -1,40 +1,3 @@
/*
POST
/api/v1/admin/knowledge_bases
Create Knowledge Base
GET
/api/v1/admin/knowledge_bases
List Knowledge Bases
GET
/api/v1/admin/knowledge_bases/{knowledge_base_id}
Get Knowledge Base
PUT /api/v1/admin/knowledge_bases/{knowledge_base_id}
Update Knowledge Base Setting
DELETE /api/v1/admin/knowledge_bases/{knowledge_base_id}
Delete Knowledge Base
GET
/api/v1/admin/knowledge_bases/{knowledge_base_id}/overview
Get Knowledge Base Index Overview
GET
/api/v1/admin/knowledge_bases/{kb_id}/documents
List Knowledge Base Documents
GET
/api/v1/admin/knowledge_bases/{kb_id}/documents/{doc_id}/chunks
List Knowledge Base Chunks
POST
/api/v1/admin/knowledge_bases/{kb_id}/documents/reindex
Batch Reindex Knowledge Base Documents
POST
/api/v1/admin/knowledge_bases/{kb_id}/retry-failed-index-tasks
Retry Failed Tasks
*/

import { type BaseCreateDatasourceParams, type CreateDatasourceSpecParams, type Datasource, type DatasourceKgIndexError, datasourceSchema, type DatasourceVectorIndexError } from '@/api/datasources';
import { documentSchema } from '@/api/documents';
import { type EmbeddingModelSummary, embeddingModelSummarySchema } from '@/api/embedding-models';
Expand Down Expand Up @@ -78,6 +41,7 @@ export interface KnowledgeBase extends KnowledgeBaseSummary {
data_sources: Datasource[];
llm?: LLMSummary | null;
embedding_model?: EmbeddingModelSummary | null;
chunking_config: KnowledgeBaseChunkingConfig | null;
}

export type KnowledgeGraphIndexProgress = {
Expand All @@ -89,8 +53,85 @@ export type KnowledgeGraphIndexProgress = {
relationships?: IndexTotalStats
}

/** Name of a splitter implementation that a chunking rule can select. */
export type KnowledgeBaseSplitterType = KnowledgeBaseChunkingSplitterRule['splitter'];

/** Options of the `SentenceSplitter`; the same fields are also used directly by the `general` mode. */
export type KnowledgeBaseChunkingSentenceSplitterConfig = {
  chunk_size: number;
  chunk_overlap: number;
  paragraph_separator: string;
};

/** Options of the `MarkdownSplitter`. */
export type KnowledgeBaseChunkingMarkdownSplitterConfig = {
  chunk_size: number;
  chunk_header_level: number;
};

/** Rule selecting the `SentenceSplitter` together with its options. */
export type KnowledgeBaseChunkingSentenceSplitterRule = {
  splitter: 'SentenceSplitter';
  splitter_config: KnowledgeBaseChunkingSentenceSplitterConfig;
};

/** Rule selecting the `MarkdownSplitter` together with its options. */
export type KnowledgeBaseChunkingMarkdownSplitterRule = {
  splitter: 'MarkdownSplitter';
  splitter_config: KnowledgeBaseChunkingMarkdownSplitterConfig;
};

/** Any splitter rule, discriminated by the `splitter` field. */
export type KnowledgeBaseChunkingSplitterRule =
  | KnowledgeBaseChunkingSentenceSplitterRule
  | KnowledgeBaseChunkingMarkdownSplitterRule;

/** `general` mode: sentence-splitter options stored flat, next to the `mode` tag. */
export type KnowledgeBaseChunkingConfigGeneral = KnowledgeBaseChunkingSentenceSplitterConfig & {
  mode: 'general';
};

/** `advanced` mode: one splitter rule per supported MIME type. */
export type KnowledgeBaseChunkingConfigAdvanced = {
  mode: 'advanced';
  rules: {
    'text/plain': KnowledgeBaseChunkingSplitterRule;
    'text/markdown': KnowledgeBaseChunkingSplitterRule;
  };
};

/** Chunking configuration of a knowledge base, discriminated by the `mode` field. */
export type KnowledgeBaseChunkingConfig =
  | KnowledgeBaseChunkingConfigGeneral
  | KnowledgeBaseChunkingConfigAdvanced;

// Type derived via z.infer from `knowledgeGraphDocumentChunkSchema` (that schema is declared outside this excerpt).
export type KnowledgeGraphDocumentChunk = z.infer<typeof knowledgeGraphDocumentChunkSchema>;

// Zod validator for KnowledgeBaseChunkingSentenceSplitterConfig.
// chunk_size must be an integer >= 1, chunk_overlap an integer >= 0, paragraph_separator any string.
// The `satisfies` clause keeps the schema in sync with the hand-written type at compile time.
const knowledgeBaseChunkingSentenceSplitterConfigSchema = z.object({
chunk_size: z.number().int().min(1),
chunk_overlap: z.number().int().min(0),
paragraph_separator: z.string(),
}) satisfies z.ZodType<KnowledgeBaseChunkingSentenceSplitterConfig, any, any>;

// Zod validator for KnowledgeBaseChunkingMarkdownSplitterConfig.
// chunk_size must be an integer >= 1; chunk_header_level must be an integer in the range 1..6.
const knowledgeBaseChunkingMarkdownSplitterConfigSchema = z.object({
chunk_size: z.number().int().min(1),
chunk_header_level: z.number().int().min(1).max(6),
}) satisfies z.ZodType<KnowledgeBaseChunkingMarkdownSplitterConfig, any, any>;

// Zod validator for a single splitter rule.
// The `splitter` literal is the discriminator: it decides which config schema `splitter_config` is checked against.
const knowledgeBaseChunkingSplitterRuleSchema = z.discriminatedUnion('splitter', [
z.object({
splitter: z.literal('MarkdownSplitter'),
splitter_config: knowledgeBaseChunkingMarkdownSplitterConfigSchema,
}),
z.object({
splitter: z.literal('SentenceSplitter'),
splitter_config: knowledgeBaseChunkingSentenceSplitterConfigSchema,
}),
]) satisfies z.ZodType<KnowledgeBaseChunkingSplitterRule, any, any>;

/**
 * Zod validator for a knowledge base chunking configuration.
 *
 * The `mode` literal is the discriminator:
 * - `general`  — the sentence-splitter options are stored flat beside `mode`;
 * - `advanced` — one splitter rule per supported MIME type under `rules`.
 */
export const knowledgeBaseChunkingConfigSchema = z.discriminatedUnion('mode', [
  // Reuse the sentence-splitter schema so the general mode validates exactly the same fields.
  // `mode` is declared first so it stays the first key of the resulting shape.
  z.object({ mode: z.literal('general') })
    .merge(knowledgeBaseChunkingSentenceSplitterConfigSchema),
  z.object({
    mode: z.literal('advanced'),
    rules: z.object({
      'text/plain': knowledgeBaseChunkingSplitterRuleSchema,
      'text/markdown': knowledgeBaseChunkingSplitterRuleSchema,
    }),
  }),
]) satisfies z.ZodType<KnowledgeBaseChunkingConfig, any, any>;

const knowledgeBaseSummarySchema = z.object({
id: z.number(),
name: z.string(),
Expand All @@ -109,6 +150,7 @@ const knowledgeBaseSchema = knowledgeBaseSummarySchema.extend({
data_sources: datasourceSchema.array(),
llm: llmSummarySchema.nullable().optional(),
embedding_model: embeddingModelSummarySchema.nullable().optional(),
chunking_config: knowledgeBaseChunkingConfigSchema.nullable(),
}) satisfies ZodType<KnowledgeBase, any, any>;

const knowledgeGraphIndexProgressSchema = z.object({
Expand Down Expand Up @@ -154,7 +196,7 @@ const knowledgeBaseLinkedChatEngine = z.object({
id: z.number(),
name: z.string(),
is_default: z.boolean(),
})
});

export async function listKnowledgeBases ({ page = 1, size = 10 }: PageParams) {
return await fetch(requestUrl('/api/v1/admin/knowledge_bases', { page, size }), {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
import { type KnowledgeBase, type KnowledgeBaseChunkingConfig, type KnowledgeBaseChunkingConfigAdvanced, type KnowledgeBaseChunkingConfigGeneral, knowledgeBaseChunkingConfigSchema, type KnowledgeBaseChunkingMarkdownSplitterConfig, type KnowledgeBaseChunkingSentenceSplitterConfig, type KnowledgeBaseChunkingSplitterRule } from '@/api/knowledge-base';
import { FormInput } from '@/components/form/control-widget';
import { formFieldLayout } from '@/components/form/field-layout';
import { createAccessorHelper, GeneralSettingsField } from '@/components/settings-form';
import { FormField, FormItem, FormLabel } from '@/components/ui/form.beta';
import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from '@/components/ui/select';
import { ToggleGroup, ToggleGroupItem } from '@/components/ui/toggle-group';
import { cn } from '@/lib/utils';
import { cloneElement, type ReactElement } from 'react';

// Accessor helper bound to the KnowledgeBase type; used below to build the accessor for the `chunking_config` field.
const helper = createAccessorHelper<KnowledgeBase>();

/**
 * Settings-form section that edits a knowledge base's `chunking_config`.
 *
 * The field is validated with `knowledgeBaseChunkingConfigSchema` and rendered through `ModeSwitch`.
 * `defaultConfig` is handed to the accessor as its second argument
 * (presumably the fallback used when the stored value is null — confirm against `createAccessorHelper`).
 */
export function KnowledgeBaseChunkingConfigFields () {
  const chunkingConfigAccessor = helper.field('chunking_config', defaultConfig);

  return (
    <GeneralSettingsField accessor={chunkingConfigAccessor} schema={knowledgeBaseChunkingConfigSchema}>
      <ModeSwitch />
    </GeneralSettingsField>
  );
}

// Field-layout helpers, one per config variant. Each is parameterised with the form shape `{ value: ... }`,
// so the `name` paths passed to `.Basic` below refer to the general or the advanced variant respectively.
const fieldLayout = formFieldLayout<{ value: KnowledgeBaseChunkingConfigGeneral }>();
const advancedFieldLayout = formFieldLayout<{ value: KnowledgeBaseChunkingConfigAdvanced }>();

/**
 * Renders the "Chunking Mode" toggle (general / advanced) and, below it,
 * the editor that matches the currently selected mode.
 *
 * Picking a mode replaces the whole field value with that mode's defaults (see `switchMode`).
 */
function ModeSwitch () {
  // Both toggle items share the same styling.
  const toggleItemClassName = 'flex-1 border block text-left font-normal h-auto py-4 opacity-50 data-[state=on]:opacity-100 hover:opacity-100 hover:bg-transparent hover:text-foreground transition-all';

  return (
    <FormField<{ value: KnowledgeBase['chunking_config'] }, 'value'>
      name="value"
      render={(field, form) => <>
        <FormItem>
          <FormLabel>
            Chunking Mode
          </FormLabel>
          <ToggleGroup
            className="w-full flex items-center"
            type="single"
            value={field.state.value?.mode}
            onValueChange={value => {
              // NOTE(review): a single-select toggle group reports an empty string when the
              // active item is toggled off (Radix behaviour — confirm for this wrapper).
              // Passing that to `switchMode` would yield `undefined` and wipe the config,
              // so anything that is not a known mode is ignored and the selection is kept.
              if (value !== 'general' && value !== 'advanced') {
                return;
              }
              field.setValue(switchMode(value));
            }}
            onBlur={field.handleBlur}
          >
            <ToggleGroupItem className={toggleItemClassName} value="general">
              <div className="font-semibold">
                General
              </div>
              <p className="text-muted-foreground text-xs">General text chunking mode, use best practices to process different types of documents.</p>
            </ToggleGroupItem>
            <ToggleGroupItem className={toggleItemClassName} value="advanced">
              <div className="font-semibold">
                Advanced
              </div>
              <p className="text-muted-foreground text-xs">Advanced text chunking mode, customize the processing procedures for different file types by rules.</p>
            </ToggleGroupItem>
          </ToggleGroup>
          <div className="pl-4 border-l-4">
            {form.state.values.value?.mode === 'general' && <GeneralChunkingConfig />}
            {form.state.values.value?.mode === 'advanced' && <AdvancedChunkingConfig />}
          </div>
        </FormItem>
      </>}
    />
  );
}

/**
 * Editor for the `general` chunking mode: chunk size, chunk overlap and paragraph separator.
 *
 * The numeric inputs carry the same `min`/`step` constraints as the sentence-splitter inputs
 * in the advanced editor, matching the schema (chunk_size >= 1, chunk_overlap >= 0, integers).
 */
function GeneralChunkingConfig () {
  return (
    <div className="grid md:grid-cols-3 gap-4">
      <fieldLayout.Basic name="value.chunk_size" label="Chunk Size">
        <FormInputLayout suffix="tokens">
          <FormInput type="number" min={1} step={1} />
        </FormInputLayout>
      </fieldLayout.Basic>
      <fieldLayout.Basic name="value.chunk_overlap" label="Chunk Overlap">
        <FormInputLayout suffix="tokens">
          <FormInput type="number" min={0} step={1} />
        </FormInputLayout>
      </fieldLayout.Basic>
      <fieldLayout.Basic name="value.paragraph_separator" label="Paragraph Separator">
        <FormInput />
      </fieldLayout.Basic>
    </div>
  );
}

/**
 * Editor for the `advanced` chunking mode: one titled splitter-rule editor per supported MIME type.
 */
function AdvancedChunkingConfig () {
  // Rendered in this order; `rule` is the key inside `chunking_config.rules`.
  const sections = [
    { rule: 'text/plain', title: 'Plain Text (text/plain)' },
    { rule: 'text/markdown', title: 'Markdown (text/markdown)' },
  ] as const;

  return (
    <div className="space-y-4">
      {sections.map(({ rule, title }) => (
        <div key={rule} className="space-y-2">
          <div className="text-sm font-medium text-muted-foreground">{title}</div>
          <SplitterRuleConfig rule={rule} />
        </div>
      ))}
    </div>
  );
}

/**
 * Editor for one advanced-mode rule (`value.rules[rule]`).
 *
 * Shows a splitter selector and, underneath, the option inputs of the selected splitter.
 * Choosing a different splitter replaces the rule with that splitter's default options.
 *
 * @param rule MIME-type key of the rule inside `chunking_config.rules`.
 */
function SplitterRuleConfig ({ rule }: { rule: keyof KnowledgeBaseChunkingConfigAdvanced['rules'] }) {
  const name = `value.rules.${rule}` as const;

  return (
    <div className="space-y-4">
      <FormField<{ value: KnowledgeBaseChunkingConfigAdvanced }, typeof name>
        name={name}
        render={field => {
          const { splitter } = field.state.value;

          // Swap the whole rule: new splitter name plus that splitter's default options.
          const handleSplitterChange = (nextSplitter: string) => {
            const nextRule = {
              splitter: nextSplitter,
              splitter_config: switchSplitter(nextSplitter as never),
            } as KnowledgeBaseChunkingSplitterRule;
            field.setValue(nextRule);
          };

          return (
            <>
              <Select name={name} value={splitter} onValueChange={handleSplitterChange}>
                <SelectTrigger>
                  <SelectValue />
                </SelectTrigger>
                <SelectContent>
                  <SelectItem value="SentenceSplitter">SentenceSplitter</SelectItem>
                  <SelectItem value="MarkdownSplitter">MarkdownSplitter</SelectItem>
                </SelectContent>
              </Select>

              {/* Sentence splitter options */}
              {splitter === 'SentenceSplitter' && (
                <div className="pl-4 grid grid-cols-3 gap-4">
                  <advancedFieldLayout.Basic name={`value.rules.${rule}.splitter_config.chunk_size`} label="Chunk Size">
                    <FormInputLayout suffix="tokens">
                      <FormInput type="number" min={1} step={1} />
                    </FormInputLayout>
                  </advancedFieldLayout.Basic>
                  <advancedFieldLayout.Basic name={`value.rules.${rule}.splitter_config.chunk_overlap`} label="Chunk Overlap">
                    <FormInputLayout suffix="tokens">
                      <FormInput type="number" min={0} step={1} />
                    </FormInputLayout>
                  </advancedFieldLayout.Basic>
                  <advancedFieldLayout.Basic name={`value.rules.${rule}.splitter_config.paragraph_separator`} label="Paragraph Separator">
                    <FormInput />
                  </advancedFieldLayout.Basic>
                </div>
              )}

              {/* Markdown splitter options */}
              {splitter === 'MarkdownSplitter' && (
                <div className="pl-4 grid grid-cols-3 gap-4">
                  <advancedFieldLayout.Basic name={`value.rules.${rule}.splitter_config.chunk_size`} label="Chunk Size">
                    <FormInputLayout suffix="tokens">
                      <FormInput type="number" min={1} step={1} />
                    </FormInputLayout>
                  </advancedFieldLayout.Basic>
                  <advancedFieldLayout.Basic name={`value.rules.${rule}.splitter_config.chunk_header_level`} label="Chunk Header Level">
                    <FormInput type="number" min={1} max={6} step={1} />
                  </advancedFieldLayout.Basic>
                </div>
              )}
            </>
          );
        }}
      />
    </div>
  );
}

/**
 * Wraps a single input element and overlays a short unit label on its right edge.
 *
 * Every prop other than `suffix` and `children` is forwarded to the wrapped element via
 * `cloneElement` (presumably these are field props injected by the parent layout — confirm).
 *
 * @param suffix   Text shown inside the input's right edge (e.g. a unit).
 * @param children The one input element to decorate.
 */
function FormInputLayout ({ suffix, children, ...props }: { suffix: string, children: ReactElement }) {
  // The child's own className and any forwarded className are merged rather than overwritten.
  const childProps = (children.props ?? {}) as { className?: string };
  const { className: forwardedClassName, ...forwardedProps } = props as { className?: string };

  return (
    <div className="relative">
      {cloneElement(children, {
        ...forwardedProps,
        // Set after the spread so a forwarded `className` cannot drop the `pr-14`
        // padding that reserves room for the suffix label.
        className: cn(childProps.className, forwardedClassName, 'pr-14'),
      } as any)}
      <span className="absolute h-full top-0 right-1 flex items-center px-2 text-muted-foreground text-xs font-medium select-none">
        {suffix}
      </span>
    </div>
  );
}

/**
 * Builds the default chunking configuration for the given mode.
 *
 * - `general`:  the sentence-splitter defaults, stored flat beside `mode`.
 * - `advanced`: `text/plain` uses the sentence splitter and `text/markdown` the markdown splitter,
 *               each with that splitter's defaults.
 *
 * @param mode Mode to build the defaults for.
 * @returns A freshly created configuration object.
 */
function switchMode (mode: KnowledgeBaseChunkingConfig['mode']): KnowledgeBaseChunkingConfig {
  switch (mode) {
    case 'advanced': {
      const plainTextRule: KnowledgeBaseChunkingSplitterRule = {
        splitter: 'SentenceSplitter',
        splitter_config: switchSplitter('SentenceSplitter'),
      };
      const markdownRule: KnowledgeBaseChunkingSplitterRule = {
        splitter: 'MarkdownSplitter',
        splitter_config: switchSplitter('MarkdownSplitter'),
      };
      return {
        mode: 'advanced',
        rules: {
          'text/plain': plainTextRule,
          'text/markdown': markdownRule,
        },
      };
    }
    case 'general': {
      const sentenceDefaults = switchSplitter('SentenceSplitter');
      return { mode: 'general', ...sentenceDefaults };
    }
  }
}

/**
 * Returns the default options for the given splitter.
 *
 * A new object is created on every call, so callers may freely mutate the result.
 * Note that the default `paragraph_separator` is the four-character escaped text
 * (backslash, `n`, backslash, `n`), not two actual newline characters.
 *
 * @param splitter Splitter whose defaults are requested.
 */
function switchSplitter (splitter: 'SentenceSplitter'): KnowledgeBaseChunkingSentenceSplitterConfig;
function switchSplitter (splitter: 'MarkdownSplitter'): KnowledgeBaseChunkingMarkdownSplitterConfig;
function switchSplitter (splitter: 'SentenceSplitter' | 'MarkdownSplitter') {
  switch (splitter) {
    case 'MarkdownSplitter': {
      const markdownDefaults = {
        chunk_size: 1200,
        chunk_header_level: 2,
      } satisfies KnowledgeBaseChunkingMarkdownSplitterConfig;
      return markdownDefaults;
    }
    case 'SentenceSplitter': {
      const sentenceDefaults = {
        chunk_size: 1024,
        chunk_overlap: 200,
        paragraph_separator: '\\n\\n',
      } satisfies KnowledgeBaseChunkingSentenceSplitterConfig;
      return sentenceDefaults;
    }
  }
}

/** Default chunking configuration handed to the `chunking_config` accessor: the `general` mode defaults. */
const defaultConfig: KnowledgeBaseChunkingConfig = switchMode('general');
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import { EmbeddingModelSelect, LLMSelect } from '@/components/form/biz';
import { FormInput, FormSwitch, FormTextarea } from '@/components/form/control-widget';
import { formFieldLayout } from '@/components/form/field-layout';
import { mutateKnowledgeBases } from '@/components/knowledge-base/hooks';
import { KnowledgeBaseChunkingConfigFields } from '@/components/knowledge-base/knowledge-base-chunking-config-fields';
import { fieldAccessor, type GeneralSettingsFieldAccessor, GeneralSettingsForm, shallowPick } from '@/components/settings-form';
import { GeneralSettingsField as GeneralSettingsField } from '@/components/settings-form/GeneralSettingsField';
import type { KeyOfType } from '@/lib/typing-utils';
Expand Down Expand Up @@ -66,6 +67,7 @@ export function KnowledgeBaseSettingsForm ({ knowledgeBase }: { knowledgeBase: K
<FormSwitch />
</field.Contained>
</GeneralSettingsField>
<KnowledgeBaseChunkingConfigFields />
<GeneralSettingsField readonly schema={createdAtSchema} accessor={createdAtAccessor}>
<field.Basic name="value" label="Created At">
<FormInput />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ export function GeneralSettingsField<Data, FieldData> ({
},
});


return (
<Form<{ value: FieldData }, undefined> disabled={disabled || readonly || fieldReadonly} form={form}>
<form className="space-y-6" {...formDomEventHandlers(form)}>
Expand Down
Loading

0 comments on commit 22abfc6

Please sign in to comment.