Skip to content

Commit

Permalink
Improve typing, add page number to nodes and a bit of clean up
Browse files Browse the repository at this point in the history
  • Loading branch information
ZeeshanZulfiqarAli committed Nov 4, 2024
1 parent c608b96 commit 8f2a9de
Show file tree
Hide file tree
Showing 5 changed files with 138 additions and 51 deletions.
11 changes: 8 additions & 3 deletions node-zerox/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ export const zerox = async ({

if (maintainFormat) {
// Process pages sequentially, awaiting each page before starting the next
for (const image of images) {
for (const [idx, image] of images.entries()) {
const imagePath = path.join(tempDirectory, image);
try {
const { content, inputTokens, outputTokens } = await getCompletion({
Expand All @@ -106,6 +106,7 @@ export const zerox = async ({
llmParams,
maintainFormat,
model,
pageNumber: idx + 1,
priorPage,
});
const formattedMarkdown = formatMarkdown(content);
Expand All @@ -124,7 +125,10 @@ export const zerox = async ({
}
} else {
// Process in parallel with a limit on concurrent pages
const processPage = async (image: string): Promise<string | null> => {
const processPage = async (
image: string,
pageNumber: number
): Promise<string | null> => {
const imagePath = path.join(tempDirectory, image);
try {
const { content, inputTokens, outputTokens } = await getCompletion({
Expand All @@ -133,6 +137,7 @@ export const zerox = async ({
llmParams,
maintainFormat,
model,
pageNumber,
priorPage,
});
const formattedMarkdown = formatMarkdown(content);
Expand All @@ -156,7 +161,7 @@ export const zerox = async ({

const promises = images.map((image, index) =>
limit(() =>
processPage(image).then((result) => {
processPage(image, index + 1).then((result) => {
results[index] = result;
})
)
Expand Down
19 changes: 16 additions & 3 deletions node-zerox/src/openAI.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
import { CompletionArgs, CompletionResponse } from "./types";
import { convertKeysToSnakeCase, encodeImageToBase64, markdownToJson } from "./utils";
import {
convertKeysToSnakeCase,
encodeImageToBase64,
markdownToJson,
} from "./utils";
import axios from "axios";

export const getCompletion = async ({
Expand All @@ -8,6 +12,7 @@ export const getCompletion = async ({
llmParams,
maintainFormat,
model,
pageNumber,
priorPage,
}: CompletionArgs): Promise<CompletionResponse> => {
const systemPrompt = `
Expand Down Expand Up @@ -58,13 +63,21 @@ export const getCompletion = async ({

const data = response.data;

const jsonOutput = await markdownToJson(data.choices[0].message.content);
console.log("====>>>>", JSON.stringify(jsonOutput));
const jsonOutput = await markdownToJson(
data.choices[0].message.content,
pageNumber
);

// TODO: remove this
// Only for development
console.log('======')
console.log(JSON.stringify(jsonOutput));

return {
content: data.choices[0].message.content,
inputTokens: data.usage.prompt_tokens,
outputTokens: data.usage.completion_tokens,
structuredContent: jsonOutput,
};
} catch (err) {
console.error("Error in OpenAI completion", err);
Expand Down
51 changes: 51 additions & 0 deletions node-zerox/src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ export interface CompletionResponse {
content: string;
inputTokens: number;
outputTokens: number;
structuredContent: ProcessedNode[];
}

export interface CompletionArgs {
Expand All @@ -44,6 +45,7 @@ export interface CompletionArgs {
llmParams?: LLMParams;
maintainFormat: boolean;
model: ModelOptions | string;
pageNumber: number;
priorPage: string;
}

Expand All @@ -54,3 +56,52 @@ export interface LLMParams {
temperature?: number;
topP?: number;
}

/**
* Node type names that the markdown-to-JSON conversion compares against the
* `type` field of parsed markdown nodes.
* NOTE(review): these look like mdast node type names — confirm against the
* markdown parser in use.
*/
export enum MdNodeType {
break = "break",
heading = "heading",
list = "list",
paragraph = "paragraph",
strong = "strong",
table = "table",
text = "text",
thematicBreak = "thematicBreak",
}

/**
* Discriminator values for the `type` field of the converted node shapes
* (`HeadingNode`, `ListNode`, `TextNode`).
*/
export enum ConvertedNodeType {
heading = "heading",
list = "list",
text = "text",
}
/** Fields shared by every converted node shape. */
export interface BaseNode {
// Identifier of this node.
id: string;
// Page number associated with the node; optional, so it may be absent.
page?: number;
// Identifier of the parent node, if any.
parentId?: string;
}

/** Converted node tagged `ConvertedNodeType.text`; `value` is its string content. */
export interface TextNode extends BaseNode {
type: ConvertedNodeType.text;
value: string;
}

/** Converted node tagged `ConvertedNodeType.heading`; `value` is its string content. */
export interface HeadingNode extends BaseNode {
type: ConvertedNodeType.heading;
value: string;
}

/** Converted node tagged `ConvertedNodeType.list`; `value` holds the list's items. */
export interface ListNode extends BaseNode {
type: ConvertedNodeType.list;
value: ListItem[];
}

/** A single entry in a `ListNode`'s `value` array. */
export interface ListItem {
// Identifier of this list item.
id: string;
// String content of this list item.
value: string;
}

/** Union of all converted node shapes, discriminated by the `type` field. */
export type ProcessedNode = TextNode | HeadingNode | ListNode;

/**
* Pairs a node id with a depth value.
* NOTE(review): appears to be used as a stack entry for tracking the current
* heading ancestry during markdown conversion — confirm against the caller.
*/
export interface ParentId {
// Depth value associated with `id`.
depth: number;
// Identifier of the node this entry refers to.
id: string;
}
105 changes: 61 additions & 44 deletions node-zerox/src/utils.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
import { convert } from "libreoffice-convert";
import { fromPath } from "pdf2pic";
import { LLMParams } from "./types";
import {
LLMParams,
MdNodeType,
ConvertedNodeType,
ProcessedNode,
ParentId,
} from "./types";
import { pipeline } from "stream/promises";
import { promisify } from "util";
import * as Tesseract from "tesseract.js";
Expand Down Expand Up @@ -313,18 +319,13 @@ export const convertKeysToSnakeCase = (
);
};

interface ProcessedNode {
id: string;
parentId: string | undefined;
type: string;
value: any;
}
interface parentId {
id: string;
depth: number;
}

export const markdownToJson = async (markdownString: string) => {
/**
* Converts a markdown string into an array of processed nodes.
*
* @param markdownString - Markdown text to convert
* @param page - Page number the markdown came from
* @returns Promise resolving to the array of processed nodes
*/
export const markdownToJson = async (markdownString: string, page: number) => {
/**
* Bypassing typescript transpiler using eval to use dynamic imports
*
Expand All @@ -341,83 +342,99 @@ export const markdownToJson = async (markdownString: string) => {

console.log(JSON.stringify(parsedMd));

const parentIdManager: parentId[] = [];
const parentIdManager: ParentId[] = [];

const jsonObj: ProcessedNode[] = [];
parsedMd.children.forEach((node: any) => {
const isHeading = node.type === "heading";
const processedNodes: ProcessedNode[] = [];
parsedMd.children.forEach((sourceNode: any) => {
const isHeading = sourceNode.type === MdNodeType.heading;

if (isHeading && node.depth <= (parentIdManager.at(-1)?.depth || 0)) {
if (isHeading && sourceNode.depth <= (parentIdManager.at(-1)?.depth || 0)) {
for (let i = parentIdManager.length; i > 0; i--) {
parentIdManager.pop();
if (node.depth > (parentIdManager.at(-1)?.depth || 0)) {
if (sourceNode.depth > (parentIdManager.at(-1)?.depth || 0)) {
break;
}
}
}
const processedNode = processNode(node, parentIdManager.at(-1)?.id);
const processedNode = processNode(
sourceNode,
page,
parentIdManager.at(-1)?.id
);

if (isHeading) {
parentIdManager.push({ id: processedNode[0].id, depth: node.depth });
parentIdManager.push({
id: processedNode[0].id,
depth: sourceNode.depth,
});
}

jsonObj.push(...processedNode);
processedNodes.push(...processedNode);
});

return jsonObj;
};

const type: Record<string, string> = {
heading: "heading",
text: "text",
list: "list",
return processedNodes;
};

const processNode = (node: any, parentId?: string): ProcessedNode[] => {
const processNode = (
node: any,
page: number,
parentId?: string
): ProcessedNode[] => {
let value: any;
let siblingNodes: ProcessedNode[] = [];

if (node.type === "heading") {
value = node.children
.map((childNode: any) => processText(childNode))
.join(" ");
} else if (node.type === "paragraph") {
if (
node.type === MdNodeType.heading ||
node.type === MdNodeType.paragraph ||
node.type === MdNodeType.strong
) {
value = node.children
.map((childNode: any) => processText(childNode))
.join(" ");
} else if (node.type === "list") {
} else if (node.type === MdNodeType.list) {
const processedNodes = node.children.map((childNode: any) =>
processListItem(childNode)
processListItem(childNode, page)
);
value = [];
processedNodes.forEach((pn: any) => {
value.push(...pn.node);

// Store nested list nodes
siblingNodes.push(...pn.siblings);
});
}

return [
{
id: nanoid(),
page,
parentId,
type: type[node.type as string] || type.text,
type:
ConvertedNodeType[node.type as ConvertedNodeType] ||
ConvertedNodeType.text,
value,
},
...(siblingNodes || []),
];
};

const ignoreNodeTypes = new Set([MdNodeType.break, MdNodeType.thematicBreak]);

const processText = (node: any) => {
return node.value;
if (ignoreNodeTypes.has(node.type)) return "";

return node.type === MdNodeType.text
? node.value
: node.children.map((child: any) => processText(child)).join(" ");
};

const processListItem = (node: any) => {
const processListItem = (node: any, page: number) => {
let newNode: ProcessedNode[] = [];
let siblings: ProcessedNode[] = [];

node.children.forEach((childNode: any) => {
if (childNode.type !== "list") {
const processedNode = processNode(childNode);
if (childNode.type !== MdNodeType.list) {
const processedNode = processNode(childNode, page);
if (newNode.length > 0) {
newNode[0].value += processedNode.map(({ value }) => value).join(", ");
} else {
Expand All @@ -429,13 +446,13 @@ const processListItem = (node: any) => {
newNode = [
{
id: nanoid(),
type: "text",
type: ConvertedNodeType.text,
value: "",
parentId: undefined,
},
];
}
const processedNode = processNode(childNode, newNode[0].id);
const processedNode = processNode(childNode, page, newNode[0].id);
siblings.push(...processedNode);
}
});
Expand Down
3 changes: 2 additions & 1 deletion node-zerox/tsconfig.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
"outDir": "./dist",
"strict": true,
"esModuleInterop": true,
"skipLibCheck": true
"skipLibCheck": true,
"downlevelIteration": true,
},
"include": ["src/**/*"],
"exclude": ["node_modules", "**/*.test.ts"]
Expand Down

0 comments on commit 8f2a9de

Please sign in to comment.