Rename markdown output to processed content (#11)

azhong-git · web-flow · commit f59f39005987 · 2025-05-13T13:10:28.000-07:00
diff --git a/README.md b/README.md
@@ -302,11 +302,11 @@ The function returns a Promise that resolves to an `ExtractorResult<T>` object:
 ```typescript
 interface ExtractorResult<T> {
   data: T;             // Extracted structured data
-  markdown: string;    // The markdown content that was processed
+  processedContent: string;    // Processed content that was sent to the LLM. Markdown if the input was HTML (after conversion)
   usage: {             // Token usage statistics
     inputTokens?: number;
     outputTokens?: number;
-  }
+  };
 }
 ```
 
diff --git a/src/example.ts b/src/example.ts
@@ -71,7 +71,7 @@ async function example() {
     console.log(JSON.stringify(result.data, null, 2));
 
     console.log("\nMarkdown Content:");
-    console.log(result.markdown);
+    console.log(result.processedContent);
 
     console.log("\nToken Usage:");
     console.log(result.usage);
diff --git a/src/index.ts b/src/index.ts
@@ -100,7 +100,7 @@ export async function extract<T extends z.ZodTypeAny>(
   // Return the full result
   return {
     data,
-    markdown: content,
+    processedContent: content,
     usage,
   };
 }
diff --git a/src/types.ts b/src/types.ts
@@ -100,8 +100,12 @@ export interface ExtractorResult<T> {
   /** Extracted data according to the schema */
   data: T;
 
-  /** Raw markdown content that was processed */
-  markdown: string;
+  /**
+   * Processed content that was sent to the LLM.
+   * This will be markdown if the input was HTML (after conversion),
+   * or the original content if the input was already markdown or plain text.
+   */
+  processedContent: string;
 
   /** Usage statistics */
   usage: Usage;
diff --git a/tests/integration/processedContent.test.ts b/tests/integration/processedContent.test.ts
@@ -0,0 +1,86 @@
+import { z } from "zod";
+import { extract, ContentFormat, LLMProvider } from "../../src";
+
+describe("ProcessedContent Integration Tests", () => {
+  const simpleSchema = z.object({
+    title: z.string(),
+    content: z.string().nullable(),
+  });
+
+  // Skip tests if API keys are not available
+  const skipIfNoKeys = () => {
+    if (!process.env.OPENAI_API_KEY) {
+      return true;
+    }
+    return false;
+  };
+
+  it("should return original content as processedContent for TXT format", async () => {
+    if (skipIfNoKeys()) {
+      console.log("Skipping test: No API keys available");
+      return;
+    }
+
+    const plainTextContent =
+      "Title: Simple Test\n\nThis is a test of plain text extraction.";
+
+    const result = await extract({
+      content: plainTextContent,
+      format: ContentFormat.TXT,
+      schema: simpleSchema,
+      provider: LLMProvider.OPENAI,
+      openaiApiKey: process.env.OPENAI_API_KEY,
+    });
+
+    // Verify the processedContent is the same as the original content
+    expect(result.processedContent).toBe(plainTextContent);
+  }, 30000);
+
+  it("should return original content as processedContent for MARKDOWN format", async () => {
+    if (skipIfNoKeys()) {
+      console.log("Skipping test: No API keys available");
+      return;
+    }
+
+    const markdownContent =
+      "# Simple Test\n\nThis is a test of markdown extraction.";
+
+    const result = await extract({
+      content: markdownContent,
+      format: ContentFormat.MARKDOWN,
+      schema: simpleSchema,
+      provider: LLMProvider.OPENAI,
+      openaiApiKey: process.env.OPENAI_API_KEY,
+    });
+
+    // Verify the processedContent is the same as the original content
+    expect(result.processedContent).toBe(markdownContent);
+  }, 30000);
+
+  it("should return converted markdown as processedContent for HTML format", async () => {
+    if (skipIfNoKeys()) {
+      console.log("Skipping test: No API keys available");
+      return;
+    }
+
+    const htmlContent =
+      "<h1>Simple Test</h1><p>This is a test of HTML extraction.</p>";
+
+    const result = await extract({
+      content: htmlContent,
+      format: ContentFormat.HTML,
+      schema: simpleSchema,
+      provider: LLMProvider.OPENAI,
+      openaiApiKey: process.env.OPENAI_API_KEY,
+      sourceUrl: "https://example.com",
+    });
+
+    // For HTML, processedContent should be the converted markdown
+    expect(result.processedContent).toContain("Simple Test");
+    expect(result.processedContent).toContain(
+      "This is a test of HTML extraction."
+    );
+    expect(result.processedContent).not.toContain("<h1>");
+    expect(result.processedContent).not.toContain("</p>");
+  }, 30000);
+});

Original file line number	Diff line number	Diff line change
`@@ -100,7 +100,7 @@ export async function extract<T extends z.ZodTypeAny>(`
`100`	`100`	`// Return the full result`
`101`	`101`	`return {`
`102`	`102`	`data,`
`103`		`- markdown: content,`
	`103`	`+ processedContent: content,`
`104`	`104`	`usage,`
`105`	`105`	`};`
`106`	`106`	`}`