Skip to content

Commit

Permalink
feature: incrementally add pages
Browse files Browse the repository at this point in the history
  • Loading branch information
densumesh committed Dec 14, 2024
1 parent c0b3c3b commit a1f0d24
Show file tree
Hide file tree
Showing 8 changed files with 264 additions and 181 deletions.
8 changes: 6 additions & 2 deletions frontends/search/src/components/UploadFile.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,11 @@ interface RequestBody {
group_tracking_id?: string;
metadata: any;
time_stamp?: string;
- use_pdf2md_ocr?: boolean;
+ pdf2md_options?: {
+ use_pdf2md_ocr: boolean;
+ system_prompt?: string;
+ split_headings?: boolean;
+ };
}

export const UploadFile = () => {
Expand Down Expand Up @@ -145,7 +149,7 @@ export const UploadFile = () => {
split_delimiters: splitDelimiters(),
target_splits_per_chunk: targetSplitsPerChunk(),
rebalance_chunks: rebalanceChunks(),
- use_pdf2md_ocr: useGptChunking(),
+ pdf2md_options: { use_pdf2md_ocr: useGptChunking() },
group_tracking_id:
groupTrackingId() === "" ? undefined : groupTrackingId(),
// eslint-disable-next-line @typescript-eslint/no-unsafe-assignment
Expand Down
6 changes: 3 additions & 3 deletions pdf2md/server/src/models.rs
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@ impl From<TaskResponse> for Vec<Chunk> {

#[derive(Debug, serde::Serialize, serde::Deserialize, Clone)]
pub struct GetTaskRequest {
- pub pagination_token: Option<uuid::Uuid>,
+ pub pagination_token: Option<u32>,
pub limit: Option<u32>,
}

Expand All @@ -265,7 +265,7 @@ pub struct GetTaskResponse {
pub status: String,
pub created_at: String,
pub pages: Option<Vec<Chunk>>,
- pub pagination_token: Option<String>,
+ pub pagination_token: Option<u32>,
}

impl GetTaskResponse {
Expand Down Expand Up @@ -296,7 +296,7 @@ impl GetTaskResponse {
pages_processed: task.pages_processed,
status: task.status,
created_at: task.created_at.to_string(),
- pagination_token: pages.last().map(|c| c.id.clone()),
+ pagination_token: pages.last().map(|c| c.page),
pages: Some(pages.into_iter().map(Chunk::from).collect()),
}
}
Expand Down
6 changes: 3 additions & 3 deletions pdf2md/server/src/operators/clickhouse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -170,18 +170,18 @@ pub async fn get_task(
pub async fn get_task_pages(
task: FileTaskClickhouse,
limit: Option<u32>,
- offset_id: Option<uuid::Uuid>,
+ offset_id: Option<u32>,
clickhouse_client: &clickhouse::Client,
) -> Result<Vec<ChunkClickhouse>, ServiceError> {
if FileTaskStatus::from(task.status.clone()) == FileTaskStatus::Completed || task.pages > 0 {
let limit = limit.unwrap_or(20);

let pages: Vec<ChunkClickhouse> = clickhouse_client
.query(
- "SELECT ?fields FROM file_chunks WHERE task_id = ? AND id > ? ORDER BY page LIMIT ?",
+ "SELECT ?fields FROM file_chunks WHERE task_id = ? AND page > ? ORDER BY page LIMIT ?",
)
.bind(task.id.clone())
- .bind(offset_id.unwrap_or(uuid::Uuid::nil()))
+ .bind(offset_id.unwrap_or(0))
.bind(limit)
.fetch_all()
.await
Expand Down
2 changes: 1 addition & 1 deletion pdf2md/server/src/routes/jinja_templates.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use crate::{
get_env, Templates,
};
use actix_web::{get, HttpResponse};
- use minijinja::{context, path_loader, Environment};
+ use minijinja::context;

#[utoipa::path(
get,
Expand Down
2 changes: 1 addition & 1 deletion server/src/bin/csv-jsonl-worker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -505,7 +505,7 @@ async fn process_csv_jsonl_file(
rebalance_chunks: Some(false),
split_delimiters: None,
target_splits_per_chunk: None,
- use_pdf2md_ocr: None,
+ pdf2md_options: None,
base64_file: "".to_string(),
},
csv_jsonl_worker_message.dataset_id,
Expand Down
Loading

0 comments on commit a1f0d24

Please sign in to comment.