diff --git a/node-zerox/src/index.ts b/node-zerox/src/index.ts index c752853..5bbfe81 100644 --- a/node-zerox/src/index.ts +++ b/node-zerox/src/index.ts @@ -97,7 +97,7 @@ export const zerox = async ({ if (maintainFormat) { // Use synchronous processing - for (const image of images) { + for (const [idx, image] of images.entries()) { const imagePath = path.join(tempDirectory, image); try { const { content, inputTokens, outputTokens } = await getCompletion({ @@ -106,6 +106,7 @@ export const zerox = async ({ llmParams, maintainFormat, model, + pageNumber: idx + 1, priorPage, }); const formattedMarkdown = formatMarkdown(content); @@ -124,7 +125,10 @@ export const zerox = async ({ } } else { // Process in parallel with a limit on concurrent pages - const processPage = async (image: string): Promise => { + const processPage = async ( + image: string, + pageNumber: number + ): Promise => { const imagePath = path.join(tempDirectory, image); try { const { content, inputTokens, outputTokens } = await getCompletion({ @@ -133,6 +137,7 @@ export const zerox = async ({ llmParams, maintainFormat, model, + pageNumber, priorPage, }); const formattedMarkdown = formatMarkdown(content); @@ -156,7 +161,7 @@ export const zerox = async ({ const promises = images.map((image, index) => limit(() => - processPage(image).then((result) => { + processPage(image, index + 1).then((result) => { results[index] = result; }) ) diff --git a/node-zerox/src/openAI.ts b/node-zerox/src/openAI.ts index df04456..92135d9 100644 --- a/node-zerox/src/openAI.ts +++ b/node-zerox/src/openAI.ts @@ -1,5 +1,9 @@ import { CompletionArgs, CompletionResponse } from "./types"; -import { convertKeysToSnakeCase, encodeImageToBase64, markdownToJson } from "./utils"; +import { + convertKeysToSnakeCase, + encodeImageToBase64, + markdownToJson, +} from "./utils"; import axios from "axios"; export const getCompletion = async ({ @@ -8,6 +12,7 @@ export const getCompletion = async ({ llmParams, maintainFormat, model, + pageNumber, priorPage, }: CompletionArgs): Promise => { const systemPrompt = ` @@ -58,13 +63,21 @@ export const getCompletion = async ({ const data = response.data; - const jsonOutput = await markdownToJson(data.choices[0].message.content); - console.log("====>>>>", JSON.stringify(jsonOutput)); + const jsonOutput = await markdownToJson( + data.choices[0].message.content, + pageNumber + ); + + // TODO: remove this + // Only for development + console.log('======') + console.log(JSON.stringify(jsonOutput)); return { content: data.choices[0].message.content, inputTokens: data.usage.prompt_tokens, outputTokens: data.usage.completion_tokens, + structuredContent: jsonOutput, }; } catch (err) { console.error("Error in OpenAI completion", err); diff --git a/node-zerox/src/types.ts b/node-zerox/src/types.ts index ad70170..3339169 100644 --- a/node-zerox/src/types.ts +++ b/node-zerox/src/types.ts @@ -36,6 +36,7 @@ export interface CompletionResponse { content: string; inputTokens: number; outputTokens: number; + structuredContent: ProcessedNode[]; } export interface CompletionArgs { @@ -44,6 +45,7 @@ export interface CompletionArgs { llmParams?: LLMParams; maintainFormat: boolean; model: ModelOptions | string; + pageNumber: number; priorPage: string; } @@ -54,3 +56,52 @@ export interface LLMParams { temperature?: number; topP?: number; } + +export enum MdNodeType { + break = "break", + heading = "heading", + list = "list", + paragraph = "paragraph", + strong = "strong", + table = "table", + text = "text", + thematicBreak = "thematicBreak", +} + +export enum ConvertedNodeType { + heading = "heading", + list = "list", + text = "text", +} +export interface BaseNode { + id: string; + page?: number; + parentId?: string; +} + +export interface TextNode extends BaseNode { + type: ConvertedNodeType.text; + value: string; +} + +export interface HeadingNode extends BaseNode { + type: ConvertedNodeType.heading; + value: string; +} + +export interface ListNode extends BaseNode { + type: ConvertedNodeType.list; + value: ListItem[]; +} + +export interface ListItem { + id: string; + value: string; +} + +export type ProcessedNode = TextNode | HeadingNode | ListNode; + +export interface ParentId { + depth: number; + id: string; +} diff --git a/node-zerox/src/utils.ts b/node-zerox/src/utils.ts index 5da1eb5..bb957e6 100644 --- a/node-zerox/src/utils.ts +++ b/node-zerox/src/utils.ts @@ -1,6 +1,12 @@ import { convert } from "libreoffice-convert"; import { fromPath } from "pdf2pic"; -import { LLMParams } from "./types"; +import { + LLMParams, + MdNodeType, + ConvertedNodeType, + ProcessedNode, + ParentId, +} from "./types"; import { pipeline } from "stream/promises"; import { promisify } from "util"; import * as Tesseract from "tesseract.js"; @@ -313,18 +319,13 @@ export const convertKeysToSnakeCase = ( ); }; -interface ProcessedNode { - id: string; - parentId: string | undefined; - type: string; - value: any; -} -interface parentId { - id: string; - depth: number; -} - -export const markdownToJson = async (markdownString: string) => { +/** + * + * @param markdownString String - Markdown text + * @param page Number - Page number + * @returns ProcessedNode[] - Array of processed nodes + */ +export const markdownToJson = async (markdownString: string, page: number) => { /** * Bypassing typescript transpiler using eval to use dynamic imports * @@ -341,57 +342,64 @@ export const markdownToJson = async (markdownString: string) => { console.log(JSON.stringify(parsedMd)); - const parentIdManager: parentId[] = []; + const parentIdManager: ParentId[] = []; - const jsonObj: ProcessedNode[] = []; - parsedMd.children.forEach((node: any) => { - const isHeading = node.type === "heading"; + const processedNodes: ProcessedNode[] = []; + parsedMd.children.forEach((sourceNode: any) => { + const isHeading = sourceNode.type === MdNodeType.heading; - if (isHeading && node.depth <= (parentIdManager.at(-1)?.depth || 0)) { + if (isHeading && sourceNode.depth <= (parentIdManager.at(-1)?.depth || 0)) { for (let i = parentIdManager.length; i > 0; i--) { parentIdManager.pop(); - if (node.depth > (parentIdManager.at(-1)?.depth || 0)) { + if (sourceNode.depth > (parentIdManager.at(-1)?.depth || 0)) { break; } } } - const processedNode = processNode(node, parentIdManager.at(-1)?.id); + const processedNode = processNode( + sourceNode, + page, + parentIdManager.at(-1)?.id + ); if (isHeading) { - parentIdManager.push({ id: processedNode[0].id, depth: node.depth }); + parentIdManager.push({ + id: processedNode[0].id, + depth: sourceNode.depth, + }); } - jsonObj.push(...processedNode); + processedNodes.push(...processedNode); }); - return jsonObj; -}; - -const type: Record = { - heading: "heading", - text: "text", - list: "list", + return processedNodes; }; -const processNode = (node: any, parentId?: string): ProcessedNode[] => { +const processNode = ( + node: any, + page: number, + parentId?: string +): ProcessedNode[] => { let value: any; let siblingNodes: ProcessedNode[] = []; - if (node.type === "heading") { - value = node.children - .map((childNode: any) => processText(childNode)) - .join(" "); - } else if (node.type === "paragraph") { + if ( + node.type === MdNodeType.heading || + node.type === MdNodeType.paragraph || + node.type === MdNodeType.strong + ) { value = node.children .map((childNode: any) => processText(childNode)) .join(" "); - } else if (node.type === "list") { + } else if (node.type === MdNodeType.list) { const processedNodes = node.children.map((childNode: any) => - processListItem(childNode) + processListItem(childNode, page) ); value = []; processedNodes.forEach((pn: any) => { value.push(...pn.node); + + // Store nested list nodes siblingNodes.push(...pn.siblings); }); } @@ -399,25 +407,34 @@ const processNode = (node: any, parentId?: string): ProcessedNode[] => { return [ { id: nanoid(), + page, parentId, - type: type[node.type as string] || type.text, + type: + ConvertedNodeType[node.type as ConvertedNodeType] || + ConvertedNodeType.text, value, }, ...(siblingNodes || []), ]; }; +const ignoreNodeTypes = new Set([MdNodeType.break, MdNodeType.thematicBreak]); + const processText = (node: any) => { - return node.value; + if (ignoreNodeTypes.has(node.type)) return ""; + + return node.type === MdNodeType.text + ? node.value + : node.children.map((child: any) => processText(child)).join(" "); }; -const processListItem = (node: any) => { +const processListItem = (node: any, page: number) => { let newNode: ProcessedNode[] = []; let siblings: ProcessedNode[] = []; node.children.forEach((childNode: any) => { - if (childNode.type !== "list") { - const processedNode = processNode(childNode); + if (childNode.type !== MdNodeType.list) { + const processedNode = processNode(childNode, page); if (newNode.length > 0) { newNode[0].value += processedNode.map(({ value }) => value).join(", "); } else { @@ -429,13 +446,13 @@ const processListItem = (node: any) => { newNode = [ { id: nanoid(), - type: "text", + type: ConvertedNodeType.text, value: "", parentId: undefined, }, ]; } - const processedNode = processNode(childNode, newNode[0].id); + const processedNode = processNode(childNode, page, newNode[0].id); siblings.push(...processedNode); } }); diff --git a/node-zerox/tsconfig.json b/node-zerox/tsconfig.json index f14c077..b843df0 100644 --- a/node-zerox/tsconfig.json +++ b/node-zerox/tsconfig.json @@ -6,7 +6,8 @@ "outDir": "./dist", "strict": true, "esModuleInterop": true, - "skipLibCheck": true + "skipLibCheck": true, + "downlevelIteration": true, }, "include": ["src/**/*"], "exclude": ["node_modules", "**/*.test.ts"]