Skip to content

Commit

Permalink
feature: integrate chunkr as a provider for markdown conversion
Browse files Browse the repository at this point in the history
  • Loading branch information
akhileshsharma99 authored and skeptrunedev committed Nov 21, 2024
1 parent e4f8e14 commit afeab01
Show file tree
Hide file tree
Showing 14 changed files with 509 additions and 55 deletions.
4 changes: 4 additions & 0 deletions pdf2md/.env.dist
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,7 @@ LLM_MODEL=gpt-4o-mini

# PDF2MD HTTP API server
API_KEY=admin

# Chunkr - Get your API key from https://chunkr.ai
CHUNKR_API_URL=https://api.chunkr.ai
CHUNKR_API_KEY=*********************
2 changes: 1 addition & 1 deletion pdf2md/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

```bash
cd server
cp .env.dist .env
cp ../.env.dist .env
```

### Start docker dependency services
Expand Down
4 changes: 2 additions & 2 deletions pdf2md/server/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pdf2md/server/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ utoipa = { version = "5.2.0", features = ["actix_extras", "uuid", "chrono"] }
utoipa-redoc = { version = "5.0.0", features = ["actix-web"] }
actix-web = "4.9.0"
serde = "1.0.215"
serde_json = "1.0.132"
serde_json = "1.0.133"
uuid = { version = "1", features = ["v4", "serde"] }
log = "0.4"
rust-s3 = "0.35.1"
Expand Down
5 changes: 5 additions & 0 deletions pdf2md/server/ch_migrations/1732072156_chunkr/down.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
-- Rolls back the chunkr migration by removing every column the matching
-- up migration adds to file_tasks. IF EXISTS keeps each statement safe to
-- re-run when a column is already gone.
ALTER TABLE file_tasks
DROP COLUMN IF EXISTS provider;

ALTER TABLE file_tasks
DROP COLUMN IF EXISTS chunkr_task_id;

-- The up migration also adds chunkr_api_key; drop it so the rollback is complete.
ALTER TABLE file_tasks
DROP COLUMN IF EXISTS chunkr_api_key;
8 changes: 8 additions & 0 deletions pdf2md/server/ch_migrations/1732072156_chunkr/up.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
-- Adds the columns used by the Chunkr provider integration to file_tasks.
-- Every statement uses IF NOT EXISTS, so a column that is already present is
-- left untouched.
ALTER TABLE file_tasks
ADD COLUMN IF NOT EXISTS provider String;

ALTER TABLE file_tasks
ADD COLUMN IF NOT EXISTS chunkr_task_id String;

-- Declared Nullable, unlike the two columns above.
ALTER TABLE file_tasks
ADD COLUMN IF NOT EXISTS chunkr_api_key Nullable(String);
88 changes: 87 additions & 1 deletion pdf2md/server/src/models.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use crate::operators::chunkr::{Status, TaskResponse};
use derive_more::derive::Display;
use s3::creds::time::OffsetDateTime;
use utoipa::ToSchema;
Expand Down Expand Up @@ -59,7 +60,28 @@ pub struct CreateFileTaskResponse {
pub id: uuid::Uuid,
pub file_name: String,
pub status: FileTaskStatus,
pub pos_in_queue: String,
/// Only returned if the provider is LLM.
pub pos_in_queue: Option<String>,
}

/// Provider that can be selected to process a file task.
///
/// Each variant's `Display` output is the string given in its `#[display]`
/// attribute.
#[derive(serde::Deserialize, serde::Serialize, Clone, Debug, ToSchema, Display)]
pub enum Provider {
/// Displayed as `"Chunkr"`.
#[display("Chunkr")]
Chunkr,
/// Displayed as `"LLM"`.
#[display("LLM")]
LLM,
}

impl std::str::FromStr for Provider {
type Err = String;

fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"Chunkr" => Ok(Provider::Chunkr),
"LLM" => Ok(Provider::LLM),
_ => Err(format!("Unknown provider: {}", s)),
}
}
}

#[derive(serde::Deserialize, serde::Serialize, Clone, Debug, ToSchema)]
Expand All @@ -68,6 +90,8 @@ pub struct UploadFileReqPayload {
pub file_name: String,
/// Base64 encoded file. This is the standard base64 encoding.
pub base64_file: String,
/// The provider to use for the task. If Chunkr is used then llm_model, llm_api_key, system_prompt, webhook_url, webhook_payload_template are ignored. If not provided, Chunkr will be used.
pub provider: Option<Provider>,
/// The name of the llm model to use for the task. If not provided, the default model will be used. We support all models from [OpenRouter](https://openrouter.ai/models).
pub llm_model: Option<String>,
/// The API key to use for the llm being used.
Expand All @@ -85,6 +109,8 @@ pub struct UploadFileReqPayload {
/// Example: {"status": "{{status}}", "data": {"output": "{{result}}"}}
/// If not provided, the default template will be used.
pub webhook_payload_template: Option<String>,
/// The API key to use for the Chunkr API.
pub chunkr_api_key: Option<String>,
}

#[derive(Debug)]
Expand Down Expand Up @@ -144,8 +170,11 @@ pub struct FileTaskClickhouse {
pub pages: u32,
pub pages_processed: u32,
pub status: String,
pub provider: String,
#[serde(with = "clickhouse::serde::time::datetime")]
pub created_at: OffsetDateTime,
pub chunkr_task_id: String,
pub chunkr_api_key: Option<String>,
}

#[derive(Debug, serde::Serialize, serde::Deserialize, clickhouse::Row, Clone)]
Expand Down Expand Up @@ -182,6 +211,44 @@ impl From<ChunkClickhouse> for Chunk {
}
}

/// Converts a Chunkr task response into one [`Chunk`] per page, ordered by
/// ascending page number.
///
/// The markdown of every segment on the same page is concatenated in the
/// order the segments appear in the response, separated by a blank line.
/// Segments without markdown are skipped. A response with no output yields
/// an empty vector.
impl From<TaskResponse> for Vec<Chunk> {
    fn from(response: TaskResponse) -> Self {
        let output = match response.output {
            Some(output) => output,
            None => return Vec::new(),
        };

        // A BTreeMap keeps the pages sorted by page number, so the resulting
        // chunks come out in a stable, ascending order. A HashMap would hand
        // them back in an arbitrary order that can change between calls.
        let mut page_contents: std::collections::BTreeMap<u32, String> =
            std::collections::BTreeMap::new();

        for chunk in output.chunks {
            for segment in chunk.segments {
                let page_num = segment.page_number;
                if let Some(markdown) = segment.markdown {
                    page_contents
                        .entry(page_num)
                        .and_modify(|content| {
                            content.push_str("\n\n");
                            content.push_str(&markdown);
                        })
                        .or_insert(markdown);
                }
            }
        }

        page_contents
            .into_iter()
            .map(|(page_num, content)| Chunk {
                // Chunkr does not supply per-page ids here, so a fresh one is generated.
                id: uuid::Uuid::new_v4().to_string(),
                task_id: response.task_id.clone(),
                content,
                page_num,
                usage: serde_json::json!({}),
                created_at: response.created_at.to_string(),
            })
            .collect()
    }
}

#[derive(Debug, serde::Serialize, serde::Deserialize, Clone)]
pub struct GetTaskRequest {
pub pagination_token: Option<uuid::Uuid>,
Expand Down Expand Up @@ -215,6 +282,7 @@ impl GetTaskResponse {
pages: None,
}
}

pub fn new_with_pages(
task: FileTaskClickhouse,
pages: Vec<ChunkClickhouse>,
Expand All @@ -232,6 +300,24 @@ impl GetTaskResponse {
pages: Some(pages.into_iter().map(Chunk::from).collect()),
}
}

pub fn new_with_chunkr(task: FileTaskClickhouse, chunkr_task: TaskResponse) -> Self {
let pages = Vec::from(chunkr_task.clone());
Self {
id: task.id.clone(),
file_name: task.file_name.clone(),
file_url: Some(chunkr_task.pdf_url.unwrap_or_default()),
total_document_pages: task.pages,
pages_processed: match chunkr_task.status {
Status::Succeeded => task.pages,
_ => 0,
},
status: format!("{}", chunkr_task.status),
created_at: task.created_at.to_string(),
pagination_token: None,
pages: Some(pages),
}
}
}

#[derive(Debug, serde::Serialize, serde::Deserialize, Display, Clone, PartialEq, Eq, ToSchema)]
Expand Down
Loading

0 comments on commit afeab01

Please sign in to comment.