diff --git a/examples/pdf-server/server.test.ts b/examples/pdf-server/server.test.ts index f63ae6737..8a0790a80 100644 --- a/examples/pdf-server/server.test.ts +++ b/examples/pdf-server/server.test.ts @@ -4,9 +4,15 @@ import os from "node:os"; import path from "node:path"; import { Client } from "@modelcontextprotocol/sdk/client/index.js"; import { InMemoryTransport } from "@modelcontextprotocol/sdk/inMemory.js"; +import { getDocument } from "pdfjs-dist/legacy/build/pdf.mjs"; +import { PDFDocument } from "pdf-lib"; +import { makeRandomJpeg } from "../../tests/helpers/range-counting-server"; import { createPdfCache, createServer, + extractFormSchema, + PdfCacheRangeTransport, + MAX_CHUNK_BYTES, validateUrl, isAncestorDir, allowedLocalFiles, @@ -289,6 +295,266 @@ describe("PDF Cache with Timeouts", () => { // through manual testing or E2E tests. }); +describe("PdfCacheRangeTransport", () => { + it("accumulates ranges larger than MAX_CHUNK_BYTES into one onDataRange call", async () => { + const big = MAX_CHUNK_BYTES * 2 + 100; + const reads: Array<[number, number]> = []; + const t = new PdfCacheRangeTransport("u", big, async (_u, off, n) => { + reads.push([off, n]); + return { + data: new Uint8Array(Math.min(n, MAX_CHUNK_BYTES)), + totalBytes: big, + }; + }); + const delivered: Array<[number, number]> = []; + t.addRangeListener((begin: number, chunk: Uint8Array) => + delivered.push([begin, chunk.length]), + ); + t.requestDataRange(0, big); + await new Promise((r) => setTimeout(r, 10)); + // pdf.js's reader is keyed by the original begin and removed after one + // delivery, so deliver() must call onDataRange exactly once with the + // accumulated buffer — multiple calls would throw inside pdfjs. 
+ expect(delivered).toEqual([[0, big]]); + expect(reads).toEqual([ + [0, MAX_CHUNK_BYTES], + [MAX_CHUNK_BYTES, MAX_CHUNK_BYTES], + [MAX_CHUNK_BYTES * 2, 100], + ]); + }); + + it("rejects .failed when a range fetch errors instead of hanging", async () => { + const t = new PdfCacheRangeTransport("u", 1000, async () => { + throw new Error("network down"); + }); + t.requestDataRange(0, 100); + await expect( + Promise.race([ + t.failed, + new Promise((r) => setTimeout(() => r("timeout"), 200)), + ]), + ).rejects.toThrow("network down"); + }); + + it("rejects .failed on zero-length response (would otherwise spin)", async () => { + const t = new PdfCacheRangeTransport("u", 1000, async () => ({ + data: new Uint8Array(0), + totalBytes: 1000, + })); + t.requestDataRange(0, 100); + await expect(t.failed).rejects.toThrow(/empty range/); + }); + + it("getDocument resolves on a >1MB PDF when readPdfRange clamps to MAX_CHUNK_BYTES", async () => { + // pdfjs coalesces adjacent missing chunks into one requestDataRange that + // can exceed MAX_CHUNK_BYTES. deliver() must accumulate clamped reads and + // hand pdfjs a single onDataRange(begin, fullBuffer). This test fails if + // deliver() either truncates or calls onDataRange more than once per + // requestDataRange (pdf.mjs _onReceiveData matches by exact begin). + const d = await PDFDocument.create(); + const img = await d.embedJpg(makeRandomJpeg(1_100_000)); + const page = d.addPage([612, 792]); + page.drawImage(img, { x: 36, y: 36, width: 540, height: 720 }); + const bytes = await d.save(); + expect(bytes.length).toBeGreaterThan(2 * MAX_CHUNK_BYTES); + + const readClamped: PdfCache["readPdfRange"] = async (_u, off, n) => { + const len = Math.min(n, MAX_CHUNK_BYTES, bytes.length - off); + return { data: bytes.slice(off, off + len), totalBytes: bytes.length }; + }; + // Record the spans pdfjs actually requests so the test fails fast if it + // never asks for >MAX_CHUNK_BYTES (i.e. can't go vacuously green). 
+ const spans: number[] = []; + class RecordingTransport extends PdfCacheRangeTransport { + override requestDataRange(begin: number, end: number): void { + spans.push(end - begin); + super.requestDataRange(begin, end); + } + } + const transport = new RecordingTransport( + "mem://big", + bytes.length, + readClamped, + ); + + const orHang = (p: Promise, what: string): Promise => + Promise.race([ + p, + transport.failed, + new Promise((_, rej) => + setTimeout(() => rej(new Error(`${what} hung`)), 5000), + ), + ]); + + const doc = await orHang( + getDocument({ + range: transport, + length: bytes.length, + disableAutoFetch: true, + disableStream: true, + rangeChunkSize: 64 * 1024, + }).promise, + "getDocument", + ); + const p1 = await orHang(doc.getPage(1), "getPage"); + // getPage() alone doesn't decode the image XObject; getOperatorList() does, + // which is what triggers the >512KB coalesced range request. + await orHang(p1.getOperatorList(), "getOperatorList"); + expect(Math.max(...spans)).toBeGreaterThan(MAX_CHUNK_BYTES); + doc.destroy(); + }); +}); + +describe("display_pdf transport-error handling", () => { + it("returns (does not hang) when range fetches fail mid-load", async () => { + // First fetch = the 1-byte size probe → 206 with Content-Range so + // display_pdf gets totalBytes. Every subsequent fetch (made by + // PdfCacheRangeTransport via readPdfRange) rejects, which must surface + // through transport.failed → orFail() → outer catch, not hang. 
+ let calls = 0; + const mockFetch = spyOn(globalThis, "fetch").mockImplementation( + async () => { + if (calls++ === 0) { + return new Response(new Uint8Array(1), { + status: 206, + headers: { "Content-Range": "bytes 0-0/50000" }, + }); + } + throw new Error("network down"); + }, + ); + + const server = createServer(); + const client = new Client({ name: "t", version: "1" }); + const [ct, st] = InMemoryTransport.createLinkedPair(); + await Promise.all([server.connect(st), client.connect(ct)]); + + try { + const result = await Promise.race([ + client.callTool({ + name: "display_pdf", + arguments: { url: "https://arxiv.org/pdf/err-test" }, + }), + new Promise((_, rej) => + setTimeout( + () => rej(new Error("display_pdf hung on transport error")), + 3000, + ), + ), + ]); + expect(result.isError).toBeFalsy(); + const sc = result.structuredContent as { formFields?: unknown }; + expect(sc.formFields).toBeUndefined(); + expect(calls).toBeGreaterThan(1); + } finally { + mockFetch.mockRestore(); + await client.close(); + await server.close(); + } + }); +}); + +describe("extractFormSchema field-tree handling", () => { + async function schemaFor(bytes: Uint8Array) { + const doc = await getDocument({ data: bytes }).promise; + try { + const fo = (await doc.getFieldObjects()) as Parameters< + typeof extractFormSchema + >[1]; + return await extractFormSchema(doc, fo); + } finally { + doc.destroy(); + } + } + + it("handles pdf-lib separated field/widget structure", async () => { + const d = await PDFDocument.create(); + const form = d.getForm(); + d.addPage([612, 792]); + form + .createTextField("alpha") + .addToPage(d.getPage(0), { x: 50, y: 700, width: 200, height: 20 }); + form + .createCheckBox("agree") + .addToPage(d.getPage(0), { x: 50, y: 660, width: 20, height: 20 }); + form + .createDropdown("choice") + .addToPage(d.getPage(0), { x: 50, y: 620, width: 100, height: 20 }); + + const schema = await schemaFor(await d.save()); + expect(schema).not.toBeNull(); + 
expect(schema!.properties.alpha).toEqual({ + type: "string", + title: "alpha", + }); + expect(schema!.properties.agree).toEqual({ + type: "boolean", + title: "agree", + }); + expect(schema!.properties.choice.type).toBe("string"); + }); + + it("handles fields with multiple widgets across pages", async () => { + const d = await PDFDocument.create(); + const form = d.getForm(); + d.addPage([612, 792]); + d.addPage([612, 792]); + const tf = form.createTextField("shared"); + tf.addToPage(d.getPage(0), { x: 50, y: 700, width: 200, height: 20 }); + tf.addToPage(d.getPage(1), { x: 50, y: 700, width: 200, height: 20 }); + + const schema = await schemaFor(await d.save()); + expect(schema?.properties.shared).toEqual({ + type: "string", + title: "shared", + }); + }); + + it("skips container nodes and finds leaf fields (W-9 style)", async () => { + const bytes = fs.readFileSync( + path.join(__dirname, "../../tests/helpers/assets/fw9.pdf"), + ); + const doc = await getDocument({ data: new Uint8Array(bytes) }).promise; + try { + const fo = (await doc.getFieldObjects()) as Parameters< + typeof extractFormSchema + >[1]; + // Container nodes (no leaf type) should not crash extraction + expect(fo!["topmostSubform[0]"]).toBeDefined(); + // Schema is null for W-9 (mechanical names), but extraction must not throw + const schema = await extractFormSchema(doc, fo); + expect(schema).toBeNull(); + } finally { + doc.destroy(); + } + }); + + it("returns null when no AcroForm present", async () => { + const d = await PDFDocument.create(); + d.addPage([612, 792]); + const schema = await schemaFor(await d.save()); + expect(schema).toBeNull(); + }); +}); + +describe("validateUrl loopback HTTP allow (PDF_SERVER_ALLOW_LOOPBACK_HTTP)", () => { + it("rejects http://127.0.0.1 by default", () => { + expect(validateUrl("http://127.0.0.1:9999/x.pdf").valid).toBe(false); + }); + + it("accepts http://127.0.0.1 only when the env gate is set, and never non-loopback http", () => { + const prev = 
process.env.PDF_SERVER_ALLOW_LOOPBACK_HTTP; + process.env.PDF_SERVER_ALLOW_LOOPBACK_HTTP = "1"; + try { + expect(validateUrl("http://127.0.0.1:9999/x.pdf").valid).toBe(true); + expect(validateUrl("http://169.254.169.254/").valid).toBe(false); + } finally { + if (prev === undefined) delete process.env.PDF_SERVER_ALLOW_LOOPBACK_HTTP; + else process.env.PDF_SERVER_ALLOW_LOOPBACK_HTTP = prev; + } + }); +}); + describe("validateUrl with MCP roots (allowedLocalDirs)", () => { const savedFiles = new Set(allowedLocalFiles); const savedDirs = new Set(allowedLocalDirs); diff --git a/examples/pdf-server/server.ts b/examples/pdf-server/server.ts index 062b26301..93dedfdc9 100644 --- a/examples/pdf-server/server.ts +++ b/examples/pdf-server/server.ts @@ -31,6 +31,7 @@ import { import "./pdfjs-polyfill.js"; import { getDocument, + PDFDataRangeTransport, VerbosityLevel, version as PDFJS_VERSION, } from "pdfjs-dist/legacy/build/pdf.mjs"; @@ -368,9 +369,6 @@ export const viewSourcePaths = new Map(); /** Valid form field names per viewer UUID (populated during display_pdf) */ const viewFieldNames = new Map>(); -/** Detailed form field info per viewer UUID (populated during display_pdf) */ -const viewFieldInfo = new Map(); - /** * Active fs.watch per view. Only created for local files when interact is * enabled (stdio). Watcher is re-established on `rename` events to survive @@ -387,7 +385,7 @@ const viewFileWatches = new Map(); /** * Per-view heartbeat. THIS is what the sweep iterates — not commandQueues. * - * Why not commandQueues: display_pdf populates viewFieldNames/viewFieldInfo/ + * Why not commandQueues: display_pdf populates viewFieldNames/ * viewFileWatches but never touches commandQueues (only enqueueCommand does, * and it's triply gated). And dequeueCommands deletes the entry on every poll, * so even when it exists the sweep's TTL window is ~200ms wide. 
Net effect: @@ -409,7 +407,6 @@ function pruneStaleQueues(): void { viewLastActivity.delete(uuid); commandQueues.delete(uuid); viewFieldNames.delete(uuid); - viewFieldInfo.delete(uuid); viewsPolled.delete(uuid); viewSourcePaths.delete(uuid); stopFileWatch(uuid); @@ -644,10 +641,19 @@ export function validateUrl(url: string): { // Remote URL - require HTTPS try { const parsed = new URL(url); - if (parsed.protocol !== "https:") { - return { valid: false, error: `Only HTTPS URLs are allowed: ${url}` }; + if (parsed.protocol === "https:") return { valid: true }; + // Loopback HTTP is opt-in (test fixtures, local dev). Off by default so a + // remotely-deployed server can't be made to probe its own internal ports. + if ( + process.env.PDF_SERVER_ALLOW_LOOPBACK_HTTP && + parsed.protocol === "http:" && + (parsed.hostname === "127.0.0.1" || + parsed.hostname === "localhost" || + parsed.hostname === "[::1]") + ) { + return { valid: true }; } - return { valid: true }; + return { valid: false, error: `Only HTTPS URLs are allowed: ${url}` }; } catch { return { valid: false, error: `Invalid URL: ${url}` }; } @@ -882,6 +888,61 @@ export function createPdfCache( }; } +/** + * pdf.js range transport backed by {@link PdfCache.readPdfRange}. Lets + * getDocument() fetch only the byte ranges it needs (xref, /AcroForm dict) + * instead of the whole file. With disableAutoFetch, a PDF without form + * fields is opened with ~5% of bytes fetched. + * + * pdf.js has no upstream error channel on PDFDataRangeTransport (its + * `abort()` is a no-op stub it calls *on* us, not the other way). Callers + * must `Promise.race` their pdf.js awaits against {@link failed}, which + * rejects on the first fetch error. + */ +export class PdfCacheRangeTransport extends PDFDataRangeTransport { + /** Rejects on the first range-fetch error; never resolves. 
*/ + readonly failed: Promise; + private fail!: (e: unknown) => void; + + constructor( + private url: string, + length: number, + private readPdfRange: PdfCache["readPdfRange"], + ) { + super(length, null); + this.failed = new Promise((_, reject) => { + this.fail = reject; + }); + // Don't crash the process if no one is racing yet. + this.failed.catch(() => {}); + } + + override requestDataRange(begin: number, end: number): void { + void this.deliver(begin, end).catch((e) => this.fail(e)); + } + + /** + * pdf.js coalesces adjacent missing chunks into one unbounded request, but + * readPdfRange clamps each call to MAX_CHUNK_BYTES. Its reader is keyed by + * the original `begin` and removed after one delivery, so we must accumulate + * slices and call onDataRange exactly once with the full buffer. + */ + private async deliver(begin: number, end: number): Promise { + const buf = new Uint8Array(end - begin); + let off = 0; + while (off < buf.length) { + const want = Math.min(buf.length - off, MAX_CHUNK_BYTES); + const { data } = await this.readPdfRange(this.url, begin + off, want); + if (data.length === 0) { + throw new Error(`empty range at ${begin + off} for ${this.url}`); + } + buf.set(data.subarray(0, Math.min(data.length, buf.length - off)), off); + off += data.length; + } + this.onDataRange(begin, buf); + } +} + // ============================================================================= // MCP Roots // ============================================================================= @@ -957,6 +1018,60 @@ interface FormFieldInfo { options?: string[]; } +/** + * Open `url` via {@link PdfCacheRangeTransport} and return form metadata. + * Uses `disableAutoFetch` so PDFs without an AcroForm are probed with only + * the trailer/xref/catalog (~5-25% of bytes); PDFs with forms still walk + * every page via {@link extractFormFieldInfo} but those are typically small. 
+ * All errors (including range-fetch failures surfaced via
+ * {@link PdfCacheRangeTransport.failed}) resolve to empty results.
+ */
+async function probeFormFields(
+  url: string,
+  totalBytes: number,
+  readPdfRange: PdfCache["readPdfRange"],
+): Promise<{
+  formSchema: Awaited<ReturnType<typeof extractFormSchema>>;
+  fieldInfo: FormFieldInfo[];
+}> {
+  // Assigned sequentially below so a throw in extractFormFieldInfo (no per-page
+  // guard, unlike extractFormSchema) doesn't discard an already-computed schema.
+  let formSchema: Awaited<ReturnType<typeof extractFormSchema>> = null;
+  let fieldInfo: FormFieldInfo[] = [];
+  let loadingTask: ReturnType<typeof getDocument> | undefined;
+  try {
+    const transport = new PdfCacheRangeTransport(url, totalBytes, readPdfRange);
+    const orFail = <T>(p: Promise<T>): Promise<T> =>
+      Promise.race([p, transport.failed]);
+    loadingTask = getDocument({
+      range: transport,
+      length: totalBytes,
+      disableAutoFetch: true,
+      disableStream: true,
+      rangeChunkSize: 64 * 1024,
+      standardFontDataUrl: STANDARD_FONT_DATA_URL,
+      StandardFontDataFactory: FetchStandardFontDataFactory,
+      verbosity: VerbosityLevel.ERRORS,
+    });
+    const pdfDoc = await orFail(loadingTask.promise);
+    const fieldObjects = (await orFail(pdfDoc.getFieldObjects())) as Record<
+      string,
+      PdfJsFieldObject[]
+    > | null;
+    if (fieldObjects && Object.keys(fieldObjects).length > 0) {
+      formSchema = await orFail(extractFormSchema(pdfDoc, fieldObjects));
+      fieldInfo = await orFail(extractFormFieldInfo(pdfDoc));
+    }
+  } catch {
+    // Non-fatal — return whatever was assigned before the throw.
+  } finally {
+    // Destroy the task, not the document: if the transport fails before the
+    // document resolves there is no pdfDoc yet, but the worker must still go.
+    void loadingTask?.destroy().catch(() => {});
+  }
+  return { formSchema, fieldInfo };
+}
+
 /**
  * Extract detailed form field info (name, type, page, bounding box, label)
  * from a PDF. Bounding boxes are converted to model coordinates (top-left origin).
@@ -1020,27 +1135,25 @@ async function extractFormFieldInfo( return fields; } -async function extractFormSchema(pdfDoc: PDFDocumentProxy): Promise<{ +export async function extractFormSchema( + pdfDoc: PDFDocumentProxy, + fieldObjects: Record | null, +): Promise<{ type: "object"; properties: Record; required?: string[]; } | null> { - let fieldObjects: Record | null; - try { - fieldObjects = (await pdfDoc.getFieldObjects()) as Record< - string, - PdfJsFieldObject[] - > | null; - } catch { - return null; - } if (!fieldObjects || Object.keys(fieldObjects).length === 0) { return null; } const properties: Record = {}; for (const [name, fields] of Object.entries(fieldObjects)) { - const field = fields[0]; // first widget determines the type + // pdfjs returns the full field-tree array: for separated structures + // (pdf-lib) the typed widget is at [1+] behind a container at [0]; for + // merged/leaf entries (W-9, most authoring tools) it's at [0]. Pick the + // first entry that actually has a field type. + const field = fields.find((f) => f.type) ?? fields[0]; if (!field.editable) continue; switch (field.type) { @@ -1434,7 +1547,7 @@ Set \`elicit_form_inputs\` to true to prompt the user to fill form fields before const { totalBytes } = await readPdfRange(normalized, 0, 1); const uuid = randomUUID(); // Start the heartbeat now so the sweep can clean up viewFieldNames/ - // viewFieldInfo/viewFileWatches even if no interact calls ever happen. + // viewFileWatches even if no interact calls ever happen. if (!disableInteract) touchView(uuid); // Check writability (governs save button; see isWritablePath doc). @@ -1462,38 +1575,19 @@ Set \`elicit_form_inputs\` to true to prompt the user to fill form fields before } } - // Extract form field schema + detailed field info from a single - // download/parse pass. 
- let formSchema: Awaited> = null; - let fieldInfo: FormFieldInfo[] = []; - try { - const { data } = await readPdfRange(normalized, 0, totalBytes); - const pdfDoc = await getDocument({ - data, - standardFontDataUrl: STANDARD_FONT_DATA_URL, - StandardFontDataFactory: FetchStandardFontDataFactory, - verbosity: VerbosityLevel.ERRORS, - }).promise; - try { - formSchema = await extractFormSchema(pdfDoc); - fieldInfo = await extractFormFieldInfo(pdfDoc); - } finally { - pdfDoc.destroy(); - } - } catch { - // Non-fatal — PDF may not have form fields or may fail to parse - } + const { formSchema, fieldInfo } = await probeFormFields( + normalized, + totalBytes, + readPdfRange, + ); if (formSchema) { viewFieldNames.set(uuid, new Set(Object.keys(formSchema.properties))); } - if (fieldInfo.length > 0) { - viewFieldInfo.set(uuid, fieldInfo); - if (!viewFieldNames.has(uuid)) { - viewFieldNames.set( - uuid, - new Set(fieldInfo.map((f) => f.name).filter(Boolean)), - ); - } + if (fieldInfo.length > 0 && !viewFieldNames.has(uuid)) { + viewFieldNames.set( + uuid, + new Set(fieldInfo.map((f) => f.name).filter(Boolean)), + ); } // Elicit form field values if requested and client supports it diff --git a/examples/pdf-server/src/mcp-app.ts b/examples/pdf-server/src/mcp-app.ts index 500879a10..a953e242a 100644 --- a/examples/pdf-server/src/mcp-app.ts +++ b/examples/pdf-server/src/mcp-app.ts @@ -150,6 +150,11 @@ const imageCache = new Map(); /** Annotations imported from the PDF file (baseline for diff computation). */ let pdfBaselineAnnotations: PdfAnnotationDef[] = []; +/** Pages whose native annotations have already been imported into the baseline. */ +const baselineScannedPages = new Set(); +/** Native-annotation ids the user deleted (from restored localStorage diff) — + * the lazy per-page scan must NOT re-add these to annotationMap. 
*/ +const restoredRemovedIds = new Set(); // Dirty flag — tracks unsaved local changes let isDirty = false; @@ -2679,52 +2684,60 @@ function annotationStorageKey(): string | null { } /** - * Import annotations from the loaded PDF to establish the baseline. - * These are the annotations that exist in the PDF file itself. + * Import one page's native annotations into the baseline. Called lazily from + * renderPage() so we don't walk every page (and pull most of the file via + * range requests) before the user sees anything. Idempotent per page. */ -async function loadBaselineAnnotations( - doc: pdfjsLib.PDFDocumentProxy, -): Promise { - pdfBaselineAnnotations = []; - for (let pageNum = 1; pageNum <= doc.numPages; pageNum++) { +function scanPageBaselineAnnotations( + pageNum: number, + annotations: unknown[], +): void { + if (baselineScannedPages.has(pageNum)) return; + baselineScannedPages.add(pageNum); + let imported = 0; + for (let i = 0; i < annotations.length; i++) { + // Isolate each annotation: a malformed one must not bubble up to the + // caller's form-layer try in renderPage() (which would skip + // AnnotationLayer.render and hide form widgets for the whole page). try { - const page = await doc.getPage(pageNum); - const annotations = await page.getAnnotations(); - for (let i = 0; i < annotations.length; i++) { - const ann = annotations[i]; - const def = importPdfjsAnnotation(ann, pageNum, i); - if (def) { - pdfBaselineAnnotations.push(def); - // Add to annotationMap if not already present (from localStorage restore) - if (!annotationMap.has(def.id)) { - annotationMap.set(def.id, { def, elements: [] }); - } - } else if (ann.annotationType !== 20) { - // Widget (type 20) is expected to be skipped; anything else we - // don't import will still be painted by page.render() onto the - // canvas as unselectable pixels. Log so we can diagnose - // "ghost annotations" (visible but not in panel, not clickable). 
- log.info( - `[WARN] Baseline: skipped PDF annotation on page ${pageNum}`, - `type=${ann.annotationType}`, - `subtype=${ann.subtype ?? "?"}`, - `name=${ann.name ?? "?"}`, - `rect=${ann.rect ? JSON.stringify(ann.rect) : "none"}`, - ); + const ann = annotations[i] as { + annotationType?: number; + subtype?: string; + name?: string; + rect?: number[]; + }; + const def = importPdfjsAnnotation(ann, pageNum, i); + if (def) { + pdfBaselineAnnotations.push(def); + imported++; + if (!annotationMap.has(def.id) && !restoredRemovedIds.has(def.id)) { + annotationMap.set(def.id, { def, elements: [] }); } + } else if (ann.annotationType !== 20) { + // Widget (type 20) is expected to be skipped; anything else we + // don't import will still be painted by page.render() onto the + // canvas as unselectable pixels. Log so we can diagnose + // "ghost annotations" (visible but not in panel, not clickable). + log.info( + `[WARN] Baseline: skipped PDF annotation on page ${pageNum}`, + `type=${ann.annotationType}`, + `subtype=${ann.subtype ?? "?"}`, + `name=${ann.name ?? "?"}`, + `rect=${ann.rect ? JSON.stringify(ann.rect) : "none"}`, + ); } } catch (err) { - // Log the error — a thrown import for one annotation silently - // drops the REST of that page's annotations too. - log.info( - `[WARN] Baseline: page ${pageNum} annotation import failed:`, - err, - ); + log.info(`Baseline: page ${pageNum} annotation import failed`, err); + } + } + if (imported > 0) { + try { + updateAnnotationsBadge(); + renderAnnotationPanel(); + } catch (err) { + log.info(`Baseline: page ${pageNum} panel update failed`, err); } } - log.info( - `Loaded ${pdfBaselineAnnotations.length} baseline annotations from PDF`, - ); } function persistAnnotations(): void { @@ -2740,6 +2753,20 @@ function persistAnnotations(): void { pdfBaselineFormValues, ); + // computeDiff only sees baseline ids from pages we've already scanned. 
+ // Carry forward restored tombstones for unvisited pages so the first + // persist after restore doesn't drop them. Once every page is scanned the + // baseline is complete and computeDiff is authoritative on its own — + // dropping the carry-forward then also stops a stale id (no longer in the + // file) from pinning dirty=true forever. + if (baselineScannedPages.size < totalPages) { + for (const id of restoredRemovedIds) { + if (!annotationMap.has(id) && !diff.removed.includes(id)) { + diff.removed.push(id); + } + } + } + // Dirty tracks whether there are unsaved changes. Undoing back to baseline // yields an empty diff → clean again → save button disables. if (!isRestoring) setDirty(!isDiffEmpty(diff)); @@ -2765,11 +2792,11 @@ function restoreAnnotations(): void { const diff = deserializeDiff(raw); // Merge baseline + diff. The loop below is add-only, so we MUST also - // delete: loadBaselineAnnotations() runs between the two restore calls - // and re-seeds annotationMap with every baseline id — including the - // ones in diff.removed. Without this, the zombie survives the restore, - // and the next persistAnnotations() sees it in currentIds → computeDiff - // produces removed=[] → the deletion is permanently lost from storage. + // delete: the per-page baseline scan re-seeds annotationMap with every + // native id it encounters — including ones in diff.removed. Without the + // deletes here AND the restoredRemovedIds tombstones below, the zombie + // survives, and the next persistAnnotations() sees it in currentIds → + // computeDiff produces removed=[] → the deletion is permanently lost. const merged = mergeAnnotations(pdfBaselineAnnotations, diff); for (const def of merged) { if (!annotationMap.has(def.id)) { @@ -2778,6 +2805,9 @@ function restoreAnnotations(): void { } for (const id of diff.removed) { annotationMap.delete(id); + // Tombstone so the lazy per-page baseline scan (which runs AFTER this + // restore) doesn't resurrect it. 
+ restoredRemovedIds.add(id); } // Restore form fields @@ -2869,6 +2899,14 @@ async function buildFieldNameMap( // getFieldObjects may fail on some PDFs } + // No AcroForm → nothing to map. Skip the per-page widget walk so form-free + // PDFs (the common large case) don't pull every page after first paint. + // getFieldObjects() itself only reads the catalog/AcroForm dict via range + // transport, so this gate is cheap. + if (!cachedFieldObjects || Object.keys(cachedFieldObjects).length === 0) { + return false; + } + // Scan every page's widget annotations to collect the CORRECT storage keys, // plus labels, pages, positions, AND fieldValue (what the widget renders // — which can differ from getFieldObjects().value if the PDF is internally @@ -3074,11 +3112,19 @@ async function getAnnotatedPdfBytes(): Promise { } // Baseline annotations the user deleted: strip their refs from /Annots so - // they don't reappear on reload. Ids without a recoverable ref (page-index - // fallback) can't be removed by-ref and are skipped. - const removedRefs = pdfBaselineAnnotations - .filter((a) => !annotationMap.has(a.id)) - .map((a) => parseAnnotationRef(a.id)) + // they don't reappear on reload. Include restored tombstones for pages we + // haven't scanned yet — those ids aren't in pdfBaselineAnnotations but the + // ref is still parseable from the id string. Ids without a recoverable ref + // (page-index fallback) can't be removed by-ref and are skipped. + const removedIds = new Set(); + for (const a of pdfBaselineAnnotations) { + if (!annotationMap.has(a.id)) removedIds.add(a.id); + } + for (const id of restoredRemovedIds) { + if (!annotationMap.has(id)) removedIds.add(id); + } + const removedRefs = [...removedIds] + .map(parseAnnotationRef) .filter((r): r is NonNullable => r !== null); // Only write fields that actually changed vs. what's already in the PDF. 
@@ -3362,6 +3408,9 @@ async function renderPage() { formLayerEl.style.setProperty("--total-scale-factor", `${scale}`); try { const annotations = await page.getAnnotations(); + // Lazy baseline import — piggyback on the annotations we just fetched + // for this page instead of walking all pages upfront. + scanPageBaselineAnnotations(pageToRender, annotations); if (annotations.length > 0) { const linkService = { getDestinationHash: () => "#", @@ -4406,6 +4455,8 @@ async function reloadPdf(): Promise { undoStack.length = 0; redoStack.length = 0; pdfBaselineAnnotations = []; + baselineScannedPages.clear(); + restoredRemovedIds.clear(); pdfBaselineFormValues.clear(); pageTextCache.clear(); pageTextItemsCache.clear(); @@ -4449,11 +4500,11 @@ async function reloadPdf(): Promise { log.info("PDF reloaded:", totalPages, "pages,", totalBytes, "bytes"); showViewer(); - // Render immediately — annotation/form scans below are O(numPages) and - // do NOT block the canvas. See same pattern in the initial-load path. + // Render immediately — baseline-annotation scan now happens per-page + // inside renderPage(); buildFieldNameMap below early-returns when no + // AcroForm is present. See same pattern in the initial-load path. await renderPage(); - await loadBaselineAnnotations(document); const seeded = await buildFieldNameMap(document); syncFormValuesToStorage(); if (seeded) await renderPage(); @@ -4509,6 +4560,11 @@ async function loadPdfProgressively(urlToLoad: string): Promise<{ const loadingTask = pdfjsLib.getDocument({ range: transport, standardFontDataUrl: STANDARD_FONT_DATA_URL, + // Only fetch ranges renderPage()/getFieldObjects() actually ask for. + // Without these pdfjs background-prefetches the whole file regardless of + // the per-page lazy scans below. 
+ disableAutoFetch: true, + disableStream: true, }); try { @@ -4673,12 +4729,13 @@ app.ontoolresult = async (result: CallToolResult) => { scale = fitScale; log.info("Initial fit scale:", scale); } - await renderPage(); - - // Import annotations from the PDF to establish baseline - await loadBaselineAnnotations(document); - // Restore any persisted user diff + // Restore any persisted user diff BEFORE first render so the per-page + // baseline scan inside renderPage() can honour the removed-id tombstones + // and not resurrect annotations the user deleted last session. + // restoreAnnotations is sync (localStorage read) so first paint is not + // delayed. restoreAnnotations(); + await renderPage(); // Build field name → annotation ID mapping for form filling const seeded = await buildFieldNameMap(document); diff --git a/examples/pdf-server/src/pdf-annotations.test.ts b/examples/pdf-server/src/pdf-annotations.test.ts index 38ef27b7e..239777a74 100644 --- a/examples/pdf-server/src/pdf-annotations.test.ts +++ b/examples/pdf-server/src/pdf-annotations.test.ts @@ -367,6 +367,92 @@ describe("computeDiff", () => { current.map((a) => a.id).sort(), ); }); + + // Backs the post-computeDiff union step in mcp-app.ts persistAnnotations / + // getAnnotatedPdfBytes: with the lazy per-page baseline scan, computeDiff + // alone cannot produce `removed` for natives on pages not yet visited, so + // callers must union restoredRemovedIds. These tests pin the contract those + // call sites depend on. + describe("partial baseline (lazy-scan tombstone preservation)", () => { + const tombstoned = "pdf-5-0"; + const userNote: PdfAnnotationDef = { + type: "note", + id: "u1", + page: 1, + x: 10, + y: 10, + content: "unrelated edit", + }; + + /** Mirrors the union loop in mcp-app.ts persistAnnotations. 
*/ + function unionRestored( + diff: AnnotationDiff, + restored: Iterable, + currentIds: Set, + ): void { + for (const id of restored) { + if (!currentIds.has(id) && !diff.removed.includes(id)) { + diff.removed.push(id); + } + } + } + + it("computeDiff alone drops tombstones for unscanned pages; the union step preserves them", () => { + const baseline: PdfAnnotationDef[] = []; // page with the native not yet scanned + const current = [userNote]; + const diff = computeDiff(baseline, current, new Map()); + expect(diff.removed).toEqual([]); // proves the union step is load-bearing + + unionRestored(diff, [tombstoned], new Set(current.map((a) => a.id))); + + expect(diff.removed).toEqual([tombstoned]); + expect(isDiffEmpty(diff)).toBe(false); + expect(deserializeDiff(serializeDiff(diff)).removed).toEqual([ + tombstoned, + ]); + }); + + it("a tombstone the user re-added is excluded from the union", () => { + const reAdded: PdfAnnotationDef = { + type: "note", + id: tombstoned, + page: 3, + x: 0, + y: 0, + content: "back", + }; + const current = [userNote, reAdded]; + const diff = computeDiff([], current, new Map()); + unionRestored(diff, [tombstoned], new Set(current.map((a) => a.id))); + expect(diff.removed).toEqual([]); + }); + + it("union does not duplicate ids once the page is scanned and computeDiff produces them", () => { + const native: PdfAnnotationDef = { + type: "note", + id: tombstoned, + page: 3, + x: 0, + y: 0, + content: "native", + }; + const diff = computeDiff([native], [userNote], new Map()); + expect(diff.removed).toEqual([tombstoned]); + unionRestored(diff, [tombstoned], new Set([userNote.id])); + expect(diff.removed).toEqual([tombstoned]); + }); + + it("removedRefs from restored tombstones parse for buildAnnotatedPdfBytes; non-ref ids are skipped", () => { + const restored = new Set(["pdf-5-0", "pdf-12R", "pdf-2-idx-7"]); + const removedRefs = [...restored] + .map(parseAnnotationRef) + .filter((r): r is NonNullable => r !== null); + 
expect(removedRefs).toEqual([ + { objectNumber: 5, generationNumber: 0 }, + { objectNumber: 12, generationNumber: 0 }, + ]); + }); + }); }); // ============================================================================= diff --git a/playwright.config.ts b/playwright.config.ts index 6f6cb65b6..ecf1e136a 100644 --- a/playwright.config.ts +++ b/playwright.config.ts @@ -47,6 +47,10 @@ export default defineConfig({ env: { ...process.env, EXAMPLE: process.env.EXAMPLE ?? "", + // Let pdf-server fetch from the http://127.0.0.1 range-counting fixture + // (validateUrl rejects loopback HTTP unless this is set). Scoped to this + // server's check only — does not touch Node's TLS verification. + PDF_SERVER_ALLOW_LOOPBACK_HTTP: "1", }, }, // Snapshot configuration diff --git a/tests/e2e/pdf-incremental-load.spec.ts b/tests/e2e/pdf-incremental-load.spec.ts new file mode 100644 index 000000000..52a99ee9a --- /dev/null +++ b/tests/e2e/pdf-incremental-load.spec.ts @@ -0,0 +1,271 @@ +import { test, expect, type Page } from "@playwright/test"; +import { + startRangeServer, + type RangeServer, +} from "../helpers/range-counting-server"; + +/** + * Regression guard for incremental PDF loading. + * + * Asserts that display_pdf does not pull the entire file before the viewer + * starts streaming, that form schema is still returned in the initial response, + * and that no byte range is fetched server-side more than once. + * + * The "noforms <30%" test is the load-bearing regression check: it fails on the + * pre-range-transport implementation (which downloads 100% during display_pdf + * for form analysis) and passes once form extraction uses range transport. 
+ */ + +test.setTimeout(120_000); + +let rangeServer: RangeServer; + +test.beforeAll(async () => { + rangeServer = await startRangeServer(); +}); + +test.afterAll(async () => { + await rangeServer.close(); +}); + +test.beforeEach(() => { + rangeServer.resetStats(); +}); + +function getAppFrame(page: Page) { + return page.frameLocator("iframe").first().frameLocator("iframe").first(); +} + +async function waitForAppLoad(page: Page) { + const outerFrame = page.frameLocator("iframe").first(); + await expect(outerFrame.locator("iframe")).toBeVisible({ timeout: 30_000 }); +} + +/** + * Load basic-host, select PDF Server, call display_pdf with a custom URL. + * Resolves once the tool result panel appears (server-side display_pdf done); + * does NOT wait for the viewer iframe — call waitForAppLoad separately so + * byte-count assertions can isolate server-side fetches from viewer fetches. + */ +async function displayPdf(page: Page, url: string) { + await page.goto("/?theme=hide"); + await expect(page.locator("select").first()).toBeEnabled({ timeout: 30_000 }); + await page.locator("select").first().selectOption({ label: "PDF Server" }); + await page.locator("textarea").fill(JSON.stringify({ url })); + await page.click('button:has-text("Call Tool")'); + await expect(page.locator('text="📤 Tool Result"').first()).toBeVisible({ + timeout: 30_000, + }); +} + +/** Read and parse the most recent tool result's structuredContent. */ +async function readStructuredContent( + page: Page, +): Promise> { + const resultPanel = page.locator('text="📤 Tool Result"').first(); + await expect(resultPanel).toBeVisible({ timeout: 30_000 }); + await resultPanel.click(); + const pre = page.locator("pre").last(); + await expect(pre).toBeVisible({ timeout: 5_000 }); + const raw = (await pre.textContent()) ?? "{}"; + const parsed = JSON.parse(raw) as { structuredContent?: object }; + return (parsed.structuredContent ?? 
{}) as Record; +} + +async function waitForFirstPageRendered(page: Page) { + const canvas = getAppFrame(page).locator("canvas").first(); + await expect(canvas).toBeVisible({ timeout: 30_000 }); + await expect + .poll(async () => canvas.evaluate((c: HTMLCanvasElement) => c.width)) + .toBeGreaterThan(0); +} + +test.describe("PDF Server — incremental loading", () => { + test("display_pdf on a form PDF returns form fields in initial response", async ({ + page, + }) => { + await displayPdf(page, `${rangeServer.baseUrl}/forms.pdf`); + await waitForAppLoad(page); + const sc = await readStructuredContent(page); + const fields = sc.formFields as Array<{ name: string }> | undefined; + expect(fields?.map((f) => f.name).sort()).toEqual([ + "city", + "email", + "name", + "notes", + "phone", + ]); + }); + + test("display_pdf on a no-forms PDF stays under byte budget and bounded overlap", async ({ + page, + }) => { + const fileSize = rangeServer.fileSizes["/noforms.pdf"]; + await displayPdf(page, `${rangeServer.baseUrl}/noforms.pdf`); + + // Measure before the viewer iframe loads so the count reflects only the + // server-side display_pdf range fetches. + expect(rangeServer.stats().totalBytesServed).toBeLessThan(fileSize * 0.3); + + await waitForAppLoad(page); + const sc = await readStructuredContent(page); + expect(sc.formFields).toBeUndefined(); + + await waitForFirstPageRendered(page); + // Server-side display_pdf and the viewer each open the document + // independently, so the xref/trailer/catalog is fetched twice (≈25%). + // This guards against the pre-range-transport behavior where the server + // alone pulled 100% (then 200% with the double-parse), giving overlap >> + // file size once the viewer also loaded. 
+ expect(rangeServer.stats().overlapBytes).toBeLessThan(fileSize * 0.5); + }); + + test("first page renders under stall, then page 2 renders the >512KB image after release", async ({ + page, + }) => { + const fileSize = rangeServer.fileSizes["/noforms.pdf"]; + // Allow ~40% through (header + trailer/xref + page-1 content) then stall. + // The 1.1MB image stream referenced only by pages 2+ is the bulk. + const budget = Math.floor(fileSize * 0.4); + await displayPdf( + page, + `${rangeServer.baseUrl}/noforms.pdf?stallAfterBytes=${budget}`, + ); + await waitForAppLoad(page); + await waitForFirstPageRendered(page); + expect(rangeServer.stats().totalBytesServed).toBeLessThan(fileSize); + + rangeServer.release(); + const app = getAppFrame(page); + await app.locator("#next-btn").click(); + await expect(app.locator("#page-input")).toHaveValue("2", { + timeout: 30_000, + }); + // Page 2 references the ~1.1MB embedded JPEG. Rendering it exercises the + // viewer's range transport on a >MAX_CHUNK_BYTES object after the stall + // is released. (The server-side PdfCacheRangeTransport accumulate-once + // path is covered by the unit test in server.test.ts; on noforms.pdf + // display_pdf bails at the empty getFieldObjects() check before touching + // the image stream, so this test does not exercise it.) + await waitForFirstPageRendered(page); + expect(rangeServer.stats().totalBytesServed).toBeGreaterThan( + fileSize * 0.9, + ); + }); +}); + +// Kept in this spec because the test needs an HTTP-served PDF and the +// range-counting fixture is the convenient place; it does not use any +// byte-accounting features. +test.describe("PDF Server — annotation tombstone preservation", () => { + // FIXME(https://github.com/modelcontextprotocol/ext-apps/issues/642): + // basic-host doesn't replay the cached tool result on inner-iframe reload, + // and a fresh display_pdf call gets a new toolId → new storage key, so the + // restore-from-localStorage path can't be reached. 
The fix itself is covered + // by the computeDiff/serializeDiff contract tests in + // src/pdf-annotations.test.ts. + test.fixme("deleted native annotation tombstone survives a persist before its page is scanned", async ({ + page, + }) => { + // Regression for the lazy baseline scan: restoredRemovedIds must be + // unioned into persistAnnotations() and getAnnotatedPdfBytes() so a + // delete on page 2 isn't silently dropped when an unrelated edit on + // page 1 triggers a persist before page 2 has been re-scanned. + + await displayPdf(page, `${rangeServer.baseUrl}/with-native-annot.pdf`); + await waitForAppLoad(page); + await waitForFirstPageRendered(page); + const sc = await readStructuredContent(page); + const viewUUID = sc.viewUUID as string; + expect(viewUUID).toBeTruthy(); + + const app = getAppFrame(page); + + // 1. Go to page 2, open the panel, delete the native annotation via UI. + await app.locator("#next-btn").click(); + await expect(app.locator("#page-input")).toHaveValue("2"); + await app.locator("#annotations-btn").click(); + const nativeCard = app.locator( + '.annotation-card[data-annotation-id^="pdf-"]', + ); + await expect(nativeCard).toBeVisible({ timeout: 10_000 }); + const nativeId = await nativeCard.getAttribute("data-annotation-id"); + expect(nativeId).toMatch(/^pdf-\d+R?$/); + await nativeCard.locator(".annotation-card-delete").click(); + // Deleting a native annotation re-renders the card as a crossed-out + // tombstone (annotation-panel.ts createRemovedAnnotationCard) with the + // same data-annotation-id — it doesn't disappear from the DOM. + await expect(nativeCard).toHaveClass(/annotation-card-cleared/); + + // 2. Back to page 1 so the post-reload viewer restores there (page 2 + // must stay unscanned until the very end). + await app.locator("#page-input").fill("1"); + await app.locator("#page-input").press("Enter"); + await expect(app.locator("#page-input")).toHaveValue("1"); + + // 3. 
Capture the annotation localStorage key and confirm the delete was + // persisted. + const storageKey = await app + .locator("body") + .evaluate( + () => + Object.keys(localStorage).find( + (k) => k.startsWith("pdf-annot:") || k.endsWith(":annotations"), + ) ?? null, + ); + expect(storageKey).toBeTruthy(); + const diffBefore = await app + .locator("body") + .evaluate((_, k) => localStorage.getItem(k), storageKey!); + expect(JSON.parse(diffBefore!).removed).toContain(nativeId); + + // 4. Reload the inner viewer iframe ONLY (basic-host keeps the same + // cached tool result → same viewUUID/toolId → same storage key). + // restoreAnnotations() now seeds restoredRemovedIds from localStorage + // while the lazy scan has only seen page 1. + await app.locator("body").evaluate(() => location.reload()); + await waitForFirstPageRendered(page); + await expect(app.locator("#page-input")).toHaveValue("1"); + + // 5. Trigger persistAnnotations() via an unrelated edit on page 1 — the + // bug scenario: page 2 has not been scanned yet. + const toolSelect = page.locator("select").nth(1); + await toolSelect.selectOption("interact"); + await page.locator("textarea").fill( + JSON.stringify({ + viewUUID, + action: "add_annotations", + annotations: [ + { + id: "probe-on-page-1", + type: "highlight", + page: 1, + rects: [{ x: 50, y: 700, width: 100, height: 12 }], + }, + ], + }), + ); + await page.click('button:has-text("Call Tool")'); + await expect( + app.locator('[data-annotation-id="probe-on-page-1"]'), + ).toHaveCount(1, { timeout: 10_000 }); + + // 6. Load-bearing assertion: the persisted diff still carries the + // tombstone. Pre-fix, computeDiff() over the page-1-only baseline + // yielded removed=[], overwriting it. + const diffAfter = await app + .locator("body") + .evaluate((_, k) => localStorage.getItem(k), storageKey!); + const removedAfter: string[] = JSON.parse(diffAfter!).removed; + expect(removedAfter).toContain(nativeId); + + // 7. 
Belt-and-suspenders: navigate to page 2 (lazy scan now sees the + // native annotation) and confirm the panel shows it as a cleared + // tombstone, not a live (resurrected) card. + await app.locator("#next-btn").click(); + await expect(app.locator("#page-input")).toHaveValue("2"); + await expect( + app.locator(`.annotation-card[data-annotation-id="${nativeId}"]`), + ).toHaveClass(/annotation-card-cleared/); + }); +}); diff --git a/tests/helpers/assets/fw9.pdf b/tests/helpers/assets/fw9.pdf new file mode 100644 index 000000000..c5eb64678 Binary files /dev/null and b/tests/helpers/assets/fw9.pdf differ diff --git a/tests/helpers/range-counting-server.ts b/tests/helpers/range-counting-server.ts new file mode 100644 index 000000000..529ef332d --- /dev/null +++ b/tests/helpers/range-counting-server.ts @@ -0,0 +1,249 @@ +/** + * HTTP test fixture serving programmatically-generated PDFs with byte-range + * accounting. Used by pdf-incremental-load.spec.ts to assert that display_pdf + * doesn't pull the whole file before the viewer starts streaming. + * + * Plain HTTP on loopback — playwright.config.ts sets + * PDF_SERVER_ALLOW_LOOPBACK_HTTP=1 so validateUrl accepts http://127.0.0.1. + */ +import http from "node:http"; +import type { AddressInfo } from "node:net"; +import { PDFDocument, PDFName, PDFString, StandardFonts } from "pdf-lib"; + +export interface RangeServerStats { + /** Total bytes written across all responses (sum of slice lengths). */ + totalBytesServed: number; + /** Bytes that were served more than once for the same path. */ + overlapBytes: number; +} + +export interface RangeServer { + port: number; + baseUrl: string; + /** Map of served path → byte length. */ + fileSizes: Record; + stats(): RangeServerStats; + resetStats(): void; + /** Resolve any requests currently stalled by ?stallAfterBytes=N. */ + release(): void; + close(): Promise; +} + +const LOREM = + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
Sed do eiusmod " + + "tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim " + + "veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea " + + "commodo consequat. "; + +async function buildNoFormsPdf(): Promise { + const doc = await PDFDocument.create(); + const font = await doc.embedFont(StandardFonts.Helvetica); + // Page 1 is text-only (small) so first paint needs minimal bytes. Pages 2+ + // each reference a large embedded JPEG so the bulk of the file is in image + // streams page 1 doesn't need. The stallAfterBytes test holds those back + // and asserts page 1 still renders. The image is >MAX_CHUNK_BYTES (512KB) + // so rendering page 2 also exercises the viewer's >512KB range path. + const big = await doc.embedJpg(makeRandomJpeg(1_100 * 1024)); + const page1 = doc.addPage([612, 792]); + for (let line = 0; line < 30; line++) { + page1.drawText(`1.${line + 1} ${LOREM}`, { + x: 36, + y: 760 - line * 22, + size: 10, + font, + }); + } + for (let p = 1; p < 20; p++) { + const page = doc.addPage([612, 792]); + page.drawImage(big, { x: 36, y: 200, width: 540, height: 540 }); + page.drawText(`Page ${p + 1}`, { x: 36, y: 760, size: 10, font }); + } + return doc.save(); +} + +/** Minimal valid JPEG with `len` bytes of deterministic filler scan data (the low byte of an affine function of the byte index, so at the sizes used here it repeats every 256 bytes and is NOT actually incompressible). */ +export function makeRandomJpeg(len: number): Uint8Array { + // SOI, APP0 (JFIF), SOF0 (baseline 8x8 1-component), DHT (minimal), + // SOS, `len` bytes of scan data, EOI. pdf-lib only needs to parse the headers + // to embed; the scan data is opaque. 
+ const header = Uint8Array.from([ + 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01, + 0x01, 0x00, 0x00, 0x01, 0x00, 0x01, 0x00, 0x00, 0xff, 0xc0, 0x00, 0x0b, + 0x08, 0x00, 0x08, 0x00, 0x08, 0x01, 0x01, 0x11, 0x00, 0xff, 0xc4, 0x00, + 0x14, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xda, 0x00, 0x08, 0x01, + 0x01, 0x00, 0x00, 0x3f, 0x00, + ]); + const scan = new Uint8Array(len); + for (let i = 0; i < len; i++) scan[i] = (i * 1103515245 + 12345) & 0xff; + // Avoid 0xFF in scan data so we don't accidentally form a marker. + for (let i = 0; i < len; i++) if (scan[i] === 0xff) scan[i] = 0xfe; + const eoi = Uint8Array.from([0xff, 0xd9]); + const out = new Uint8Array(header.length + scan.length + eoi.length); + out.set(header, 0); + out.set(scan, header.length); + out.set(eoi, header.length + scan.length); + return out; +} + +/** + * Two pages, page 1 text-only, page 2 carries one native /Text (sticky-note) + * annotation. Used by the tombstone-preservation e2e: the viewer's lazy + * baseline scan must not have visited page 2 when persistAnnotations runs. 
+ */ +async function buildWithNativeAnnotPdf(): Promise { + const doc = await PDFDocument.create(); + const font = await doc.embedFont(StandardFonts.Helvetica); + const page1 = doc.addPage([612, 792]); + page1.drawText("Page 1 — no native annotations here.", { + x: 36, + y: 740, + size: 12, + font, + }); + const page2 = doc.addPage([612, 792]); + page2.drawText("Page 2 — has one native /Text annot.", { + x: 36, + y: 740, + size: 12, + font, + }); + const annotRef = doc.context.register( + doc.context.obj({ + Type: "Annot", + Subtype: "Text", + Rect: [100, 700, 120, 720], + Contents: PDFString.of("native sticky note"), + Open: false, + Name: "Comment", + }), + ); + page2.node.set(PDFName.of("Annots"), doc.context.obj([annotRef])); + return doc.save(); +} + +async function buildFormsPdf(): Promise { + const doc = await PDFDocument.create(); + const form = doc.getForm(); + for (let p = 0; p < 2; p++) doc.addPage([612, 792]); + const [page1] = doc.getPages(); + const fields = ["name", "email", "phone", "city", "notes"]; + fields.forEach((name, i) => { + const f = form.createTextField(name); + f.addToPage(page1, { x: 100, y: 650 - i * 60, width: 300, height: 24 }); + }); + return doc.save(); +} + +export async function startRangeServer(): Promise { + if (process.env.NODE_ENV === "production") { + throw new Error( + "range-counting-server is a test fixture; refusing to start with NODE_ENV=production", + ); + } + const files: Record = { + "/noforms.pdf": await buildNoFormsPdf(), + "/forms.pdf": await buildFormsPdf(), + "/with-native-annot.pdf": await buildWithNativeAnnotPdf(), + }; + const fileSizes = Object.fromEntries( + Object.entries(files).map(([k, v]) => [k, v.length]), + ); + + // Per-path hit count per byte, for overlap accounting. 
+ const hitCounts: Record = {}; + const initHits = () => { + for (const [k, v] of Object.entries(files)) { + hitCounts[k] = new Uint8Array(v.length); + } + }; + initHits(); + + let totalBytesServed = 0; + let releaseResolve: (() => void) | undefined; + let releasePromise = new Promise((r) => (releaseResolve = r)); + + const server = http.createServer(async (req, res) => { + const url = new URL(req.url ?? "/", "http://127.0.0.1"); + const body = files[url.pathname]; + if (!body) { + res.writeHead(404).end(); + return; + } + + const stallAfterBytes = url.searchParams.get("stallAfterBytes"); + const total = body.length; + const range = req.headers.range; + + let begin = 0; + let end = total; // exclusive + let status = 200; + if (range) { + const m = /^bytes=(\d+)-(\d*)$/.exec(range); + if (m) { + begin = parseInt(m[1], 10); + end = m[2] ? parseInt(m[2], 10) + 1 : total; + begin = Math.min(begin, total); + end = Math.min(end, total); + status = 206; + } + } + + // Stall once N bytes have already been served — lets pdfjs read the + // header/trailer/xref (scattered across the file) before blocking the + // bulk content streams. 
+ if (stallAfterBytes !== null) { + if (totalBytesServed >= parseInt(stallAfterBytes, 10)) { + await releasePromise; + } + } + + const slice = body.subarray(begin, end); + totalBytesServed += slice.length; + const hits = hitCounts[url.pathname]; + for (let i = begin; i < end; i++) hits[i]++; + + const headers: Record = { + "Content-Type": "application/pdf", + "Accept-Ranges": "bytes", + "Content-Length": String(slice.length), + }; + if (status === 206) { + headers["Content-Range"] = `bytes ${begin}-${end - 1}/${total}`; + } + res.writeHead(status, headers); + res.end(slice); + }); + + await new Promise((resolve) => server.listen(0, resolve)); + const port = (server.address() as AddressInfo).port; + + return { + port, + baseUrl: `http://127.0.0.1:${port}`, + fileSizes, + stats() { + let overlapBytes = 0; + for (const hits of Object.values(hitCounts)) { + for (let i = 0; i < hits.length; i++) if (hits[i] > 1) overlapBytes++; + } + return { totalBytesServed, overlapBytes }; + }, + resetStats() { + totalBytesServed = 0; + initHits(); + // Unblock any handlers parked on the previous stall before re-arming, + // otherwise they hold sockets open forever and close() hangs. + releaseResolve?.(); + releasePromise = new Promise((r) => (releaseResolve = r)); + }, + release() { + releaseResolve?.(); + }, + close() { + releaseResolve?.(); + server.closeAllConnections?.(); + return new Promise((resolve) => server.close(() => resolve())); + }, + }; +}