From d6ca5d811d584d466a44594ee207415d8b3459eb Mon Sep 17 00:00:00 2001 From: Dens Sumesh Date: Fri, 13 Dec 2024 17:07:46 -0800 Subject: [PATCH] feature: heading based chunking --- .../search/src/components/UploadFile.tsx | 39 +++- pdf2md/server/src/operators/pdf_chunk.rs | 16 +- server/src/bin/file-worker.rs | 26 +++ server/src/lib.rs | 1 + server/src/operators/file_operator.rs | 202 ++++++++++++++---- 5 files changed, 240 insertions(+), 44 deletions(-) diff --git a/frontends/search/src/components/UploadFile.tsx b/frontends/search/src/components/UploadFile.tsx index 0126b867e3..5d6f5570a2 100644 --- a/frontends/search/src/components/UploadFile.tsx +++ b/frontends/search/src/components/UploadFile.tsx @@ -44,7 +44,10 @@ export const UploadFile = () => { const [targetSplitsPerChunk, setTargetSplitsPerChunk] = createSignal(20); const [rebalanceChunks, setRebalanceChunks] = createSignal(false); const [useGptChunking, setUseGptChunking] = createSignal(false); + const [useHeadingBasedChunking, setUseHeadingBasedChunking] = + createSignal(false); const [groupTrackingId, setGroupTrackingId] = createSignal(""); + const [systemPrompt, setSystemPrompt] = createSignal(""); const [showFileInput, setShowFileInput] = createSignal(true); const [showFolderInput, setShowFolderInput] = createSignal(false); @@ -149,7 +152,11 @@ export const UploadFile = () => { split_delimiters: splitDelimiters(), target_splits_per_chunk: targetSplitsPerChunk(), rebalance_chunks: rebalanceChunks(), - pdf2md_options: { use_pdf2md_ocr: useGptChunking() }, + pdf2md_options: { + use_pdf2md_ocr: useGptChunking(), + split_headings: useHeadingBasedChunking(), + system_prompt: systemPrompt(), + }, group_tracking_id: groupTrackingId() === "" ? undefined : groupTrackingId(), // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment @@ -343,6 +350,36 @@ export const UploadFile = () => { onInput={(e) => setUseGptChunking(e.currentTarget.checked)} class="h-4 w-4 rounded-md border border-gray-300 bg-neutral-100 px-4 py-1 dark:bg-neutral-700" /> +
+
Heading Based Chunking
+ } + tooltipText="If set to true, Trieve will use the headings in the document to chunk the text." + /> +
+ + setUseHeadingBasedChunking(e.currentTarget.checked) + } + class="h-4 w-4 rounded-md border border-gray-300 bg-neutral-100 px-4 py-1 dark:bg-neutral-700" + /> +
+
+
System Prompt
+ } + tooltipText="System prompt to use when chunking. This is an optional field which allows you to specify the system prompt to use when chunking the text. If not specified, the default system prompt is used. However, you may want to use a different system prompt." + /> +
+