Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: create option to disable chunk splitting for files #3022

Merged
merged 4 commits into from
Jan 8, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions clients/ts-sdk/openapi.json
Original file line number Diff line number Diff line change
Expand Up @@ -16961,6 +16961,11 @@
"description": "Rebalance chunks is an optional field which allows you to specify whether or not to rebalance the chunks created from the file. If not specified, the default true is used. If true, Trieve will evenly distribute remainder splits across chunks such that 66 splits with a `target_splits_per_chunk` of 20 will result in 3 chunks with 22 splits each.",
"nullable": true
},
"split_avg": {
"type": "boolean",
"description": "Split average will automatically split your file into multiple chunks and average all of the resulting vectors into a single output chunk. Default is false. Explicitly enabling this will cause each file to only produce a single chunk.",
"nullable": true
},
"split_delimiters": {
"type": "array",
"items": {
Expand Down
4 changes: 4 additions & 0 deletions clients/ts-sdk/src/types.gen.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3396,6 +3396,10 @@ export type UploadFileReqPayload = {
* Rebalance chunks is an optional field which allows you to specify whether or not to rebalance the chunks created from the file. If not specified, the default true is used. If true, Trieve will evenly distribute remainder splits across chunks such that 66 splits with a `target_splits_per_chunk` of 20 will result in 3 chunks with 22 splits each.
*/
rebalance_chunks?: (boolean) | null;
/**
* Split average will automatically split your file into multiple chunks and average all of the resulting vectors into a single output chunk. Default is false. Explicitly enabling this will cause each file to only produce a single chunk.
*/
split_avg?: (boolean) | null;
/**
* Split delimiters is an optional field which allows you to specify the delimiters to use when splitting the file before chunking the text. If not specified, the default [.!?\n] are used to split into sentences. However, you may want to use spaces or other delimiters.
*/
Expand Down
1 change: 1 addition & 0 deletions server/src/bin/csv-jsonl-worker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -516,6 +516,7 @@ async fn process_csv_jsonl_file(
split_delimiters: None,
target_splits_per_chunk: None,
pdf2md_options: None,
split_avg: None,
base64_file: "".to_string(),
},
csv_jsonl_worker_message.dataset_id,
Expand Down
44 changes: 44 additions & 0 deletions server/src/bin/file-worker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -644,6 +644,50 @@ async fn upload_file(
)
.await?;

// If chunk splitting is turned off (`split_avg` enabled), create only a single chunk from html_content
if file_worker_message
.upload_file_data
.split_avg
.unwrap_or(false)
{
let chunk = ChunkReqPayload {
chunk_html: Some(html_content.clone()),
semantic_content: None,
link: file_worker_message.upload_file_data.link.clone(),
tag_set: file_worker_message.upload_file_data.tag_set.clone(),
metadata: file_worker_message.upload_file_data.metadata.clone(),
group_ids: None,
group_tracking_ids: None,
location: None,
tracking_id: file_worker_message
.upload_file_data
.clone()
.group_tracking_id,
upsert_by_tracking_id: None,
time_stamp: file_worker_message.upload_file_data.time_stamp.clone(),
weight: None,
split_avg: Some(true),
convert_html_to_text: None,
image_urls: None,
num_value: None,
fulltext_boost: None,
semantic_boost: None,
};

create_file_chunks(
file_worker_message.file_id,
file_worker_message.upload_file_data.clone(),
vec![chunk],
dataset_org_plan_sub.clone(),
None,
web_pool.clone(),
event_queue.clone(),
redis_conn.clone(),
)
.await?;
return Ok(Some(file_id));
}

let Ok(chunk_htmls) =
preprocess_file_to_chunks(html_content, file_worker_message.upload_file_data.clone())
else {
Expand Down
9 changes: 9 additions & 0 deletions server/src/handlers/file_handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ pub struct UploadFileReqPayload {
pub group_tracking_id: Option<String>,
/// Parameter to use pdf2md_ocr. If true, the file will be converted to markdown using gpt-4o. Default is false.
pub pdf2md_options: Option<Pdf2MdOptions>,
/// Split average will automatically split your file into multiple chunks and average all of the resulting vectors into a single output chunk. Default is false. Explicitly enabling this will cause each file to only produce a single chunk.
pub split_avg: Option<bool>,
}

#[derive(Debug, Serialize, Deserialize, Clone, ToSchema)]
Expand Down Expand Up @@ -135,6 +137,13 @@ pub async fn upload_file_handler(
.await
.map_err(|err| ServiceError::BadRequest(err.to_string()))?;

// Disallow split_avg with pdf2md
if data.pdf2md_options.is_some() && data.split_avg.unwrap_or(false) {
return Err(
ServiceError::BadRequest("split_avg is not supported with pdf2md".to_string()).into(),
);
}

let file_size_sum_pool = pool.clone();
let file_size_sum = get_file_size_sum_org(
dataset_org_plan_sub.organization.organization.id,
Expand Down