Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: create option to disable chunk splitting for files #3022

Merged
merged 4 commits into from
Jan 8, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions clients/ts-sdk/openapi.json
Original file line number Diff line number Diff line change
Expand Up @@ -16961,6 +16961,11 @@
"description": "Rebalance chunks is an optional field which allows you to specify whether or not to rebalance the chunks created from the file. If not specified, the default true is used. If true, Trieve will evenly distribute remainder splits across chunks such that 66 splits with a `target_splits_per_chunk` of 20 will result in 3 chunks with 22 splits each.",
"nullable": true
},
"split_avg": {
"type": "boolean",
"description": "Split average will automatically split your file into multiple chunks and average all of the resulting vectors into a single output chunk. Default is false. Explicitly enabling this will cause each file to only produce a single chunk.",
"nullable": true
},
"split_delimiters": {
"type": "array",
"items": {
Expand Down
4 changes: 4 additions & 0 deletions clients/ts-sdk/src/types.gen.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3396,6 +3396,10 @@ export type UploadFileReqPayload = {
* Rebalance chunks is an optional field which allows you to specify whether or not to rebalance the chunks created from the file. If not specified, the default true is used. If true, Trieve will evenly distribute remainder splits across chunks such that 66 splits with a `target_splits_per_chunk` of 20 will result in 3 chunks with 22 splits each.
*/
rebalance_chunks?: (boolean) | null;
/**
* Split average will automatically split your file into multiple chunks and average all of the resulting vectors into a single output chunk. Default is false. Explicitly enabling this will cause each file to only produce a single chunk.
*/
split_avg?: (boolean) | null;
/**
* Split delimiters is an optional field which allows you to specify the delimiters to use when splitting the file before chunking the text. If not specified, the default [.!?\n] are used to split into sentences. However, you may want to use spaces or other delimiters.
*/
Expand Down
1 change: 1 addition & 0 deletions server/src/bin/csv-jsonl-worker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -516,6 +516,7 @@ async fn process_csv_jsonl_file(
split_delimiters: None,
target_splits_per_chunk: None,
pdf2md_options: None,
split_avg: None,
base64_file: "".to_string(),
},
csv_jsonl_worker_message.dataset_id,
Expand Down
44 changes: 44 additions & 0 deletions server/src/bin/file-worker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -644,6 +644,50 @@ async fn upload_file(
)
.await?;

// If chunk splitting is turned off (`split_avg` enabled), create only a single chunk from html_content
if file_worker_message
.upload_file_data
.split_avg
.unwrap_or(false)
{
let chunk = ChunkReqPayload {
chunk_html: Some(html_content.clone()),
semantic_content: None,
link: file_worker_message.upload_file_data.link.clone(),
tag_set: file_worker_message.upload_file_data.tag_set.clone(),
metadata: file_worker_message.upload_file_data.metadata.clone(),
group_ids: None,
group_tracking_ids: None,
location: None,
tracking_id: file_worker_message
.upload_file_data
.clone()
.group_tracking_id,
upsert_by_tracking_id: None,
time_stamp: file_worker_message.upload_file_data.time_stamp.clone(),
weight: None,
split_avg: Some(true),
convert_html_to_text: None,
image_urls: None,
num_value: None,
fulltext_boost: None,
semantic_boost: None,
};

create_file_chunks(
file_worker_message.file_id,
file_worker_message.upload_file_data.clone(),
vec![chunk],
dataset_org_plan_sub.clone(),
None,
web_pool.clone(),
event_queue.clone(),
redis_conn.clone(),
)
.await?;
return Ok(Some(file_id));
}

let Ok(chunk_htmls) =
preprocess_file_to_chunks(html_content, file_worker_message.upload_file_data.clone())
else {
Expand Down
9 changes: 9 additions & 0 deletions server/src/handlers/file_handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ pub struct UploadFileReqPayload {
pub group_tracking_id: Option<String>,
/// Parameter to use pdf2md_ocr. If true, the file will be converted to markdown using gpt-4o. Default is false.
pub pdf2md_options: Option<Pdf2MdOptions>,
/// Split average will automatically split your file into multiple chunks and average all of the resulting vectors into a single output chunk. Default is false. Explicitly enabling this will cause each file to only produce a single chunk.
pub split_avg: Option<bool>,
}

#[derive(Debug, Serialize, Deserialize, Clone, ToSchema)]
Expand Down Expand Up @@ -135,6 +137,13 @@ pub async fn upload_file_handler(
.await
.map_err(|err| ServiceError::BadRequest(err.to_string()))?;

// Disallow split_avg with pdf2md
if data.pdf2md_options.is_some() && data.split_avg.unwrap_or(false) {
return Err(
ServiceError::BadRequest("split_avg is not supported with pdf2md".to_string()).into(),
);
}

let file_size_sum_pool = pool.clone();
let file_size_sum = get_file_size_sum_org(
dataset_org_plan_sub.organization.organization.id,
Expand Down