Skip to content

Commit

Permalink
feature: all done except logic to handle whether it's 0 or 1 indexed
Browse files Browse the repository at this point in the history
  • Loading branch information
skeptrunedev authored and cdxker committed Nov 19, 2024
1 parent c8f93f3 commit f1c1c35
Show file tree
Hide file tree
Showing 5 changed files with 84 additions and 54 deletions.
15 changes: 7 additions & 8 deletions pdf2md/server/src/operators/clickhouse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,20 +65,19 @@ pub async fn insert_page(
let prev_task = get_task(task.id, clickhouse_client).await?;

log::info!(
"total_pages: {} pages processed: {}",
"processed {} of {} pages",
total_pages_processed,
prev_task.pages
);

update_task_status(
task.id,
FileTaskStatus::ChunkingFile(total_pages_processed),
clickhouse_client,
)
.await?;
if total_pages_processed >= prev_task.pages {
update_task_status(task.id, FileTaskStatus::Completed, clickhouse_client).await?;
} else {
update_task_status(
task.id,
FileTaskStatus::ProcessingFile(total_pages_processed),
clickhouse_client,
)
.await?;
}

Ok(())
Expand Down
5 changes: 4 additions & 1 deletion pdf2md/server/src/routes/get_task.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ async fn get_task(
let bucket = get_aws_bucket()?;
let file_url = get_signed_url(&bucket, format!("{}.pdf", &task.id).as_str()).await?;

let result = models::GetTaskResponse::new_with_pages(task, pages, file_url);
let mut result = models::GetTaskResponse::new_with_pages(task, pages, file_url);
if result.clone().pages.unwrap_or_default().len() < data.limit.unwrap_or(20) as usize {
result.pagination_token = None;
}
Ok(HttpResponse::Ok().json(result))
}
5 changes: 3 additions & 2 deletions pdf2md/server/src/templates/demo-ui.html
Original file line number Diff line number Diff line change
Expand Up @@ -75,14 +75,15 @@
</div>
<div class="px-4">
<div
class="mt-10 sm:mt-14 md:mt-24 grid grid-cols-2 gap-4 max-w-7xl mx-auto border border-gray-900"
id="result-container"
class="mt-10 sm:mt-14 md:mt-24 grid grid-cols-2 gap-4 max-w-7xl mx-auto"
>
<div id="my-pdf"></div>
<div id="markdown-container" class="max-h-[75vh] overflow-y-auto"></div>
</div>
</div>
<div class="flow-root">
<div class="mt-10 sm:mt-14 md:mt-24 pt-4 hidden"></div>
<div class="mt-10 sm:mt-14 md:mt-24 pt-4 border border-gray-900 hidden"></div>
<div class="my-4 animate-pulse hidden h-1 bg-gray-700"></div>
</div>
<div
Expand Down
12 changes: 7 additions & 5 deletions pdf2md/server/src/workers/supervisor-worker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -126,16 +126,17 @@ pub async fn chunk_pdf(
.map_err(|e| ServiceError::BadRequest(format!("Could not load pdf: {}", e)))?;

let all_pages = doc.get_pages();
let max_page_num = *all_pages.keys().last().unwrap();
let offset = *all_pages.keys().next().unwrap_or(&0);
let max_page_num = *all_pages.keys().last().unwrap_or(&0);
let pages_per_doc = 10;
let num_docs = (max_page_num as f64 / pages_per_doc as f64).ceil() as u32;

let mut buffer = Vec::new();

// Process each chunk
for i in 0..num_docs {
let start_page = i * pages_per_doc + 1;
let end_page = std::cmp::min((i + 1) * pages_per_doc, max_page_num);
let start_page = i * pages_per_doc + offset;
let end_page = std::cmp::min((i + 1) * pages_per_doc, max_page_num) + offset;

// Split the documentid
let mut split_doc = split_pdf(doc.clone(), start_page, end_page)
Expand Down Expand Up @@ -179,7 +180,7 @@ pub async fn chunk_pdf(

update_task_status(
task.id,
FileTaskStatus::ProcessingFile(num_docs * pages_per_doc),
FileTaskStatus::ProcessingFile(max_page_num),
&clickhouse_client,
)
.await?;
Expand All @@ -189,7 +190,7 @@ pub async fn chunk_pdf(

pub fn split_pdf(doc: Document, start_page: u32, end_page: u32) -> Result<Document, String> {
let mut new_document = Document::with_version(doc.version.clone());
let page_numbers_to_keep: Vec<u32> = (start_page..=end_page).collect();
let page_numbers_to_keep: Vec<u32> = (start_page..end_page).collect();

// Get mapping of page numbers to object IDs
let page_map = doc.get_pages();
Expand All @@ -200,6 +201,7 @@ pub fn split_pdf(doc: Document, start_page: u32, end_page: u32) -> Result<Docume

// Filter and collect pages we want to keep
for page_num in page_numbers_to_keep {
log::info!("Processing page {}", page_num);
if let Some(&object_id) = page_map.get(&page_num) {
if let Ok(page_object) = doc.get_object(object_id) {
documents_pages.insert(object_id, page_object.clone());
Expand Down
101 changes: 63 additions & 38 deletions pdf2md/server/static/pdf2md.js
Original file line number Diff line number Diff line change
Expand Up @@ -46,31 +46,42 @@ const displayTask = (task) => {
const markdownContainer = document.getElementById("markdown-container");
const taskId = markdownContainer.getAttribute("data-task-id");
const taskStatus = markdownContainer.getAttribute("data-task-status");
const taskNumPages = markdownContainer.getAttribute("data-task-num-pages");
const taskNumPagesProcessed = markdownContainer.getAttribute(
"data-task-pages-processed"
);
if (
taskId === task.id &&
taskStatus === task.status &&
taskNumPages === task.num_pages.toString()
taskNumPagesProcessed === task.pages_processed.toString()
) {
console.log("Task already displayed", task.id);
return;
}

const pages = task.pages;
if (!pages) {
return;
}
const sortedPages = pages.sort((a, b) => a.metadata.page - b.metadata.page);

PDFObject.embed(task.file_url, "#my-pdf", {
pdfOpenParams: {
view: "FitH",
},
});
const pages = task.pages;
const sortedPages = pages.sort((a, b) => a.metadata.page - b.metadata.page);
const resultContainer = document.getElementById("result-container");
resultContainer.classList.add(...["border", "border-gray-900"]);

while (markdownContainer.firstChild) {
markdownContainer.removeChild(markdownContainer.firstChild);
}

markdownContainer.setAttribute("data-task-id", task.id);
markdownContainer.setAttribute("data-task-status", task.status);
markdownContainer.setAttribute("data-task-num-pages", task.num_pages);
markdownContainer.setAttribute(
"data-task-pages-processed",
task.pages_processed
);

sortedPages.forEach((page) => {
const pageContainer = document.createElement("div");
Expand All @@ -90,6 +101,49 @@ const displayTask = (task) => {
}
};

/**
 * Fetches a task together with all of its pages by following the server's
 * `pagination_token` until none is returned, then passes the merged task to
 * `upsertTaskToStorage` and, when it is the task selected for display,
 * to `displayTask`.
 *
 * Network failures, non-2xx responses and malformed JSON are caught here,
 * logged, and surfaced to the user as an error notification.
 *
 * @param {string} taskId - Id of the task to fetch.
 * @param {string|null} taskIdToDisplay - Id of the task currently selected
 *   for display; the fetched task is rendered only when it equals `taskId`.
 * @returns {Promise<void>}
 */
const getTaskPages = async (taskId, taskIdToDisplay) => {
  try {
    // Encode the id: callers may pass a value taken straight from the URL.
    const taskUrl = `/api/task/${encodeURIComponent(taskId)}`;
    let paginationToken = "";
    let task = null;
    let receivedPages = false;
    const pages = [];

    while (true) {
      const query = paginationToken
        ? `?pagination_token=${encodeURIComponent(paginationToken)}`
        : "";
      const resp = await fetch(`${taskUrl}${query}`, {
        headers: {
          Authorization: window.TRIEVE_API_KEY,
        },
      });
      // Without this check an error body would be stored as if it were a task.
      if (!resp.ok) {
        throw new Error(
          `Request for task ${taskId} failed with status ${resp.status}`
        );
      }

      const taskWithPages = await resp.json();
      task = taskWithPages;
      // `pages` can be null (e.g. nothing processed yet); spreading null throws.
      if (Array.isArray(taskWithPages.pages)) {
        pages.push(...taskWithPages.pages);
        receivedPages = true;
      }

      const nextToken = taskWithPages.pagination_token;
      // Stop on the last page, and also if the server hands back the same
      // token again, which would otherwise loop forever on the same page.
      if (!nextToken || nextToken === paginationToken) {
        break;
      }
      paginationToken = nextToken;
    }

    // Only overwrite `task.pages` when the server actually returned page
    // arrays, so a task with no pages yet keeps its original (null) value.
    if (receivedPages) {
      pages.sort((a, b) => a.metadata.page - b.metadata.page);
      console.log("final pages", taskId, pages);
      task.pages = pages;
    }

    upsertTaskToStorage(task);
    if (taskIdToDisplay === taskId) {
      displayTask(task);
    }
  } catch (e) {
    console.error(e);
    Notyf.error({
      message: `Error fetching task pages. Please try again later. ${e}`,
      dismissible: true,
      type: "error",
      position: { x: "center", y: "top" },
    });
  }
};

const fileUploadInput = document.getElementById("file-upload");

fileUploadInput.addEventListener("change", (event) => {
Expand Down Expand Up @@ -188,6 +242,8 @@ updateTaskStatusTable();

const refreshTasks = () => {
const tasks = JSON.parse(localStorage.getItem("tasks")) || [];
const url = new URL(window.location);
const taskIdToDisplay = url.searchParams.get("taskId");
tasks.forEach((task) => {
if (
task.status.toLowerCase() === "completed" &&
Expand All @@ -197,26 +253,7 @@ const refreshTasks = () => {
return;
}

fetch(`/api/task/${task.id}`, {
headers: {
Authorization: window.TRIEVE_API_KEY,
},
})
.then((response) => response.json())
.then((data) => {
upsertTaskToStorage(data);
})
.catch((error) => {
console.error("Error:", error);
});
});

const url = new URL(window.location);
const taskId = url.searchParams.get("taskId");
tasks.forEach((task) => {
if (task.id === taskId) {
displayTask(task);
}
getTaskPages(task.id, taskIdToDisplay);
});
};

Expand All @@ -226,19 +263,7 @@ const setActiveTaskFromUrl = () => {
const url = new URL(window.location);
const taskId = url.searchParams.get("taskId");
if (taskId) {
fetch(`/api/task/${taskId}`, {
headers: {
Authorization: window.TRIEVE_API_KEY,
},
})
.then((response) => response.json())
.then((data) => {
upsertTaskToStorage(data);
displayTask(data);
})
.catch((error) => {
console.error("Error:", error);
});
getTaskPages(taskId, taskId);
}
};

Expand Down

0 comments on commit f1c1c35

Please sign in to comment.