From 8ac6b81e2aa8c74f44fed11c2de81b2f7595e4b9 Mon Sep 17 00:00:00 2001 From: txrp0x9 Date: Mon, 28 Oct 2024 16:27:22 +0530 Subject: [PATCH 1/2] fix formatting e2e v1 tests --- .../__tests__/e2e_v1_withAuth/index.test.ts | 1436 ++++++++--------- 1 file changed, 718 insertions(+), 718 deletions(-) diff --git a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts index a41634726..11aef2a5d 100644 --- a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts @@ -37,8 +37,8 @@ describe("E2E Tests for v1 API Routes", () => { describe("POST /v1/scrape", () => { it.concurrent("should require authorization", async () => { const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post("/v1/scrape") - .send({ url: "https://firecrawl.dev"}) + .post("/v1/scrape") + .send({ url: "https://firecrawl.dev" }) expect(response.statusCode).toBe(401); }); @@ -177,7 +177,7 @@ describe("E2E Tests for v1 API Routes", () => { .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") .send(scrapeRequest); - + expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("data"); if (!("data" in response.body)) { @@ -194,812 +194,812 @@ describe("E2E Tests for v1 API Routes", () => { 30000 ); it.concurrent('should return a successful response for a valid scrape with PDF file', async () => { - const scrapeRequest: ScrapeRequest = { - url: "https://arxiv.org/pdf/astro-ph/9301001.pdf" + const scrapeRequest: ScrapeRequest = { + url: "https://arxiv.org/pdf/astro-ph/9301001.pdf" // formats: ["markdown", "html"], + }; + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post('/v1/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send(scrapeRequest); + await new Promise((r) => setTimeout(r, 6000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.markdown).toContain('Broad Line Radio Galaxy'); + expect(response.body.data.metadata.statusCode).toBe(200); + expect(response.body.data.metadata.error).toBeUndefined(); + }, 60000); + + it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => { + const scrapeRequest: ScrapeRequest = { + url: "https://arxiv.org/pdf/astro-ph/9301001" + }; + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post('/v1/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send(scrapeRequest); + await new Promise((r) => setTimeout(r, 6000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).toHaveProperty('markdown'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.markdown).toContain('Broad Line Radio Galaxy'); + expect(response.body.data.metadata.statusCode).toBe(200); + expect(response.body.data.metadata.error).toBeUndefined(); + }, 60000); + + it.concurrent("should return a successful response with a valid API key with removeTags option", async () => { + const scrapeRequest: ScrapeRequest = { + url: "https://www.scrapethissite.com/", + onlyMainContent: false // default is true + }; + const responseWithoutRemoveTags: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + expect(responseWithoutRemoveTags.statusCode).toBe(200); + expect(responseWithoutRemoveTags.body).toHaveProperty("data"); + + if (!("data" in responseWithoutRemoveTags.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown"); + expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata"); + expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html"); + expect(responseWithoutRemoveTags.body.data.markdown).toContain("[FAQ](/faq/)"); // .nav + expect(responseWithoutRemoveTags.body.data.markdown).toContain("Hartley Brody 2023"); // #footer + + const scrapeRequestWithRemoveTags: ScrapeRequest = { + url: "https://www.scrapethissite.com/", + excludeTags: ['.nav', '#footer', 'strong'], + onlyMainContent: false // default is true + }; + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequestWithRemoveTags); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data).not.toHaveProperty("html"); + expect(response.body.data.markdown).not.toContain("Hartley Brody 2023"); + expect(response.body.data.markdown).not.toContain("[FAQ](/faq/)"); // + }, 30000); + + it.concurrent('should return a successful response for a scrape with 400 page', async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post('/v1/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/400' }); + await new Promise((r) => setTimeout(r, 5000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).toHaveProperty('markdown'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.statusCode).toBe(400); + }, 60000); + + + it.concurrent('should return a successful response for a scrape with 401 page', async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post('/v1/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/401' }); + await new Promise((r) => setTimeout(r, 5000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).toHaveProperty('markdown'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.statusCode).toBe(401); + }, 60000); + + // Removed it as we want to retry fallback to the next scraper + // it.concurrent('should return a successful response for a scrape with 403 page', async () => { + // const response: ScrapeResponseRequestTest = await request(TEST_URL) + // .post('/v1/scrape') + // .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + // .set('Content-Type', 'application/json') + // .send({ url: 'https://httpstat.us/403' }); + // await new Promise((r) => setTimeout(r, 5000)); + + // expect(response.statusCode).toBe(200); + // expect(response.body).toHaveProperty('data'); + // if (!("data" in response.body)) { + // throw new Error("Expected response body to have 'data' property"); + // } + // expect(response.body.data).toHaveProperty('markdown'); + // expect(response.body.data).toHaveProperty('metadata'); + // expect(response.body.data.metadata.statusCode).toBe(403); + // }, 60000); + + it.concurrent('should return a successful response for a scrape with 404 page', async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post('/v1/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/404' }); + await new Promise((r) => setTimeout(r, 5000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).toHaveProperty('markdown'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.statusCode).toBe(404); + }, 60000); + + // it.concurrent('should return a successful response for a scrape with 405 page', async () => { + // const response: ScrapeResponseRequestTest = await request(TEST_URL) + // .post('/v1/scrape') + // .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + // .set('Content-Type', 'application/json') + // .send({ url: 'https://httpstat.us/405' }); + // await new Promise((r) => setTimeout(r, 5000)); + + // expect(response.statusCode).toBe(200); + // expect(response.body).toHaveProperty('data'); + // if (!("data" in response.body)) { + // throw new Error("Expected response body to have 'data' property"); + // } + // expect(response.body.data).toHaveProperty('markdown'); + // expect(response.body.data).toHaveProperty('metadata'); + // expect(response.body.data.metadata.statusCode).toBe(405); + // }, 60000); + + // it.concurrent('should return a successful response for a scrape with 500 page', async () => { + // const response: ScrapeResponseRequestTest = await request(TEST_URL) + // .post('/v1/scrape') + // .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + // .set('Content-Type', 'application/json') + // .send({ url: 'https://httpstat.us/500' }); + // await new Promise((r) => setTimeout(r, 5000)); + + // expect(response.statusCode).toBe(200); + // expect(response.body).toHaveProperty('data'); + // if (!("data" in response.body)) { + // throw new Error("Expected response body to have 'data' property"); + // } + // expect(response.body.data).toHaveProperty('markdown'); + // expect(response.body.data).toHaveProperty('metadata'); + // expect(response.body.data.metadata.statusCode).toBe(500); + // }, 60000); + + it.concurrent("should return a timeout error when scraping takes longer than the specified timeout", async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev", timeout: 1000 }); + + expect(response.statusCode).toBe(408); + }, 3000); + + it.concurrent( + "should return a successful response with a valid API key and includeHtml set to true", + async () => { + const scrapeRequest: ScrapeRequest = { + url: "https://roastmywebsite.ai", + formats: ["html", "rawHtml"], }; + const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post('/v1/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") .send(scrapeRequest); - await new Promise((r) => setTimeout(r, 6000)); - + expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); + expect(response.body).toHaveProperty("data"); if (!("data" in response.body)) { throw new Error("Expected response body to have 'data' property"); } - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.markdown).toContain('Broad Line Radio Galaxy'); + expect(response.body.data).not.toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("html"); + expect(response.body.data).toHaveProperty("rawHtml"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.html).toContain(" { + it.concurrent( + "should return a successful response with waitFor", + async () => { const scrapeRequest: ScrapeRequest = { - url: "https://arxiv.org/pdf/astro-ph/9301001" + url: "https://ycombinator.com/companies", + formats: ["markdown"], + waitFor: 8000 }; + const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post('/v1/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") .send(scrapeRequest); - await new Promise((r) => setTimeout(r, 6000)); - + expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); + expect(response.body).toHaveProperty("data"); if (!("data" in response.body)) { throw new Error("Expected response body to have 'data' property"); } - expect(response.body.data).toHaveProperty('markdown'); - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.markdown).toContain('Broad Line Radio Galaxy'); + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).not.toHaveProperty("html"); + expect(response.body.data).not.toHaveProperty("links"); + expect(response.body.data).not.toHaveProperty("rawHtml"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.markdown).toContain("PagerDuty"); expect(response.body.data.metadata.statusCode).toBe(200); expect(response.body.data.metadata.error).toBeUndefined(); - }, 60000); - it.concurrent("should return a successful response with a valid API key with removeTags option", async () => { + }, + 30000 + ); + + it.concurrent( + "should return a successful response with a valid links on page", + async () => { const scrapeRequest: ScrapeRequest = { - url: "https://www.scrapethissite.com/", - onlyMainContent: false // default is true + url: "https://roastmywebsite.ai", + formats: ["links"], }; - const responseWithoutRemoveTags: ScrapeResponseRequestTest = await request(TEST_URL) - .post("/v1/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send(scrapeRequest); - expect(responseWithoutRemoveTags.statusCode).toBe(200); - expect(responseWithoutRemoveTags.body).toHaveProperty("data"); - if (!("data" in responseWithoutRemoveTags.body)) { - throw new Error("Expected response body to have 'data' property"); - } - expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown"); - expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata"); - expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html"); - expect(responseWithoutRemoveTags.body.data.markdown).toContain("[FAQ](/faq/)"); // .nav - expect(responseWithoutRemoveTags.body.data.markdown).toContain("Hartley Brody 2023"); // #footer - - const scrapeRequestWithRemoveTags: ScrapeRequest = { - url: "https://www.scrapethissite.com/", - excludeTags: ['.nav', '#footer', 'strong'], - onlyMainContent: false // default is true - }; const response: ScrapeResponseRequestTest = await request(TEST_URL) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") - .send(scrapeRequestWithRemoveTags); + .send(scrapeRequest); expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("data"); if (!("data" in response.body)) { throw new Error("Expected response body to have 'data' property"); } - expect(response.body.data).toHaveProperty("markdown"); - expect(response.body.data).toHaveProperty("metadata"); expect(response.body.data).not.toHaveProperty("html"); - expect(response.body.data.markdown).not.toContain("Hartley Brody 2023"); - expect(response.body.data.markdown).not.toContain("[FAQ](/faq/)"); // - }, 30000); + expect(response.body.data).not.toHaveProperty("rawHtml"); + expect(response.body.data).toHaveProperty("links"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.links).toContain("https://firecrawl.dev"); + expect(response.body.data.metadata.statusCode).toBe(200); + expect(response.body.data.metadata.error).toBeUndefined(); + }, + 30000 + ); - it.concurrent('should return a successful response for a scrape with 400 page', async () => { - const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post('/v1/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://httpstat.us/400' }); - await new Promise((r) => setTimeout(r, 5000)); - - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); - if (!("data" in response.body)) { - throw new Error("Expected response body to have 'data' property"); - } - expect(response.body.data).toHaveProperty('markdown'); - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.metadata.statusCode).toBe(400); - }, 60000); + }); - it.concurrent('should return a successful response for a scrape with 401 page', async () => { - const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post('/v1/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://httpstat.us/401' }); - await new Promise((r) => setTimeout(r, 5000)); - - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); - if (!("data" in response.body)) { - throw new Error("Expected response body to have 'data' property"); - } - expect(response.body.data).toHaveProperty('markdown'); - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.metadata.statusCode).toBe(401); - }, 60000); - - // Removed it as we want to retry fallback to the next scraper - // it.concurrent('should return a successful response for a scrape with 403 page', async () => { - // const response: ScrapeResponseRequestTest = await request(TEST_URL) - // .post('/v1/scrape') - // .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - // .set('Content-Type', 'application/json') - // .send({ url: 'https://httpstat.us/403' }); - // await new Promise((r) => setTimeout(r, 5000)); - - // expect(response.statusCode).toBe(200); - // expect(response.body).toHaveProperty('data'); - // if (!("data" in response.body)) { - // throw new Error("Expected response body to have 'data' property"); - // } - // expect(response.body.data).toHaveProperty('markdown'); - // expect(response.body.data).toHaveProperty('metadata'); - // expect(response.body.data.metadata.statusCode).toBe(403); - // }, 60000); - - it.concurrent('should return a successful response for a scrape with 404 page', async () => { - const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post('/v1/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://httpstat.us/404' }); - await new Promise((r) => setTimeout(r, 5000)); - - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); - if (!("data" in response.body)) { - throw new Error("Expected response body to have 'data' property"); - } - expect(response.body.data).toHaveProperty('markdown'); - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.metadata.statusCode).toBe(404); - }, 60000); - - // it.concurrent('should return a successful response for a scrape with 405 page', async () => { - // const response: ScrapeResponseRequestTest = await request(TEST_URL) - // .post('/v1/scrape') - // .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - // .set('Content-Type', 'application/json') - // .send({ url: 'https://httpstat.us/405' }); - // await new Promise((r) => setTimeout(r, 5000)); - - // expect(response.statusCode).toBe(200); - // expect(response.body).toHaveProperty('data'); - // if (!("data" in response.body)) { - // throw new Error("Expected response body to have 'data' property"); - // } - // expect(response.body.data).toHaveProperty('markdown'); - // expect(response.body.data).toHaveProperty('metadata'); - // expect(response.body.data.metadata.statusCode).toBe(405); - // }, 60000); - - // it.concurrent('should return a successful response for a scrape with 500 page', async () => { - // const response: ScrapeResponseRequestTest = await request(TEST_URL) - // .post('/v1/scrape') - // .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - // .set('Content-Type', 'application/json') - // .send({ url: 'https://httpstat.us/500' }); - // await new Promise((r) => setTimeout(r, 5000)); - - // expect(response.statusCode).toBe(200); - // expect(response.body).toHaveProperty('data'); - // if (!("data" in response.body)) { - // throw new Error("Expected response body to have 'data' property"); - // } - // expect(response.body.data).toHaveProperty('markdown'); - // expect(response.body.data).toHaveProperty('metadata'); - // expect(response.body.data.metadata.statusCode).toBe(500); - // }, 60000); - - it.concurrent("should return a timeout error when scraping takes longer than the specified timeout", async () => { - const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post("/v1/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev", timeout: 1000 }); - - expect(response.statusCode).toBe(408); - }, 3000); - - it.concurrent( - "should return a successful response with a valid API key and includeHtml set to true", - async () => { - const scrapeRequest: ScrapeRequest = { - url: "https://roastmywebsite.ai", - formats: ["html","rawHtml"], - }; - - const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post("/v1/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send(scrapeRequest); - - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("data"); - if (!("data" in response.body)) { - throw new Error("Expected response body to have 'data' property"); - } - expect(response.body.data).not.toHaveProperty("markdown"); - expect(response.body.data).toHaveProperty("html"); - expect(response.body.data).toHaveProperty("rawHtml"); - expect(response.body.data).toHaveProperty("metadata"); - expect(response.body.data.html).toContain(" { + it.concurrent("should require authorization", async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + }); - it.concurrent( - "should return a successful response with waitFor", - async () => { - const scrapeRequest: ScrapeRequest = { - url: "https://ycombinator.com/companies", - formats: ["markdown"], - waitFor: 8000 - }; - - const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post("/v1/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send(scrapeRequest); - - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("data"); - if (!("data" in response.body)) { - throw new Error("Expected response body to have 'data' property"); - } - expect(response.body.data).toHaveProperty("markdown"); - expect(response.body.data).not.toHaveProperty("html"); - expect(response.body.data).not.toHaveProperty("links"); - expect(response.body.data).not.toHaveProperty("rawHtml"); - expect(response.body.data).toHaveProperty("metadata"); - expect(response.body.data.markdown).toContain("PagerDuty"); - expect(response.body.data.metadata.statusCode).toBe(200); - expect(response.body.data.metadata.error).toBeUndefined(); - - }, - 30000 - ); + it.concurrent("should return an error response with an invalid API key", async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + }); - it.concurrent( - "should return a successful response with a valid links on page", - async () => { - const scrapeRequest: ScrapeRequest = { - url: "https://roastmywebsite.ai", - formats: ["links"], - }; - - const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post("/v1/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send(scrapeRequest); - - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("data"); - if (!("data" in response.body)) { - throw new Error("Expected response body to have 'data' property"); - } - expect(response.body.data).not.toHaveProperty("html"); - expect(response.body.data).not.toHaveProperty("rawHtml"); - expect(response.body.data).toHaveProperty("links"); - expect(response.body.data).toHaveProperty("metadata"); - expect(response.body.data.links).toContain("https://firecrawl.dev"); - expect(response.body.data.metadata.statusCode).toBe(200); - expect(response.body.data.metadata.error).toBeUndefined(); - }, - 30000 - ); - + it.concurrent("should return a successful response with a valid API key", async () => { + const mapRequest = { + url: "https://roastmywebsite.ai" + }; - }); + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(mapRequest); -describe("POST /v1/map", () => { - it.concurrent("should require authorization", async () => { - const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post("/v1/map") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(401); - }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("success", true); + expect(response.body).toHaveProperty("links"); + if (!("links" in response.body)) { + throw new Error("Expected response body to have 'links' property"); + } + const links = response.body.links as unknown[]; + expect(Array.isArray(links)).toBe(true); + expect(links.length).toBeGreaterThan(0); + }); - it.concurrent("should return an error response with an invalid API key", async () => { - const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post("/v1/map") - .set("Authorization", `Bearer invalid-api-key`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(401); - }); + it.concurrent("should return a successful response with a valid API key and search", async () => { + const mapRequest = { + url: "https://usemotion.com", + search: "pricing" + }; - it.concurrent("should return a successful response with a valid API key", async () => { - const mapRequest = { - url: "https://roastmywebsite.ai" - }; - - const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post("/v1/map") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send(mapRequest); - - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("success", true); - expect(response.body).toHaveProperty("links"); - if (!("links" in response.body)) { - throw new Error("Expected response body to have 'links' property"); - } - const links = response.body.links as unknown[]; - expect(Array.isArray(links)).toBe(true); - expect(links.length).toBeGreaterThan(0); - }); + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(mapRequest); - it.concurrent("should return a successful response with a valid API key and search", async () => { - const mapRequest = { - url: "https://usemotion.com", - search: "pricing" - }; - - const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post("/v1/map") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send(mapRequest); - - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("success", true); - expect(response.body).toHaveProperty("links"); - if (!("links" in response.body)) { - throw new Error("Expected response body to have 'links' property"); - } - const links = response.body.links as unknown[]; - expect(Array.isArray(links)).toBe(true); - expect(links.length).toBeGreaterThan(0); - expect(links[0]).toContain("usemotion.com/pricing"); - }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("success", true); + expect(response.body).toHaveProperty("links"); + if (!("links" in response.body)) { + throw new Error("Expected response body to have 'links' property"); + } + const links = response.body.links as unknown[]; + expect(Array.isArray(links)).toBe(true); + expect(links.length).toBeGreaterThan(0); + expect(links[0]).toContain("usemotion.com/pricing"); + }); - it.concurrent("should return a successful response with a valid API key and search and allowSubdomains", async () => { - const mapRequest = { - url: "https://firecrawl.dev", - search: "docs", - includeSubdomains: true - }; - - const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post("/v1/map") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send(mapRequest); - - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("success", true); - expect(response.body).toHaveProperty("links"); - if (!("links" in response.body)) { - throw new Error("Expected response body to have 'links' property"); - } - const links = response.body.links as unknown[]; - expect(Array.isArray(links)).toBe(true); - expect(links.length).toBeGreaterThan(0); - - const containsDocsFirecrawlDev = links.some((link: string) => link.includes("docs.firecrawl.dev")); - expect(containsDocsFirecrawlDev).toBe(true); - }); + it.concurrent("should return a successful response with a valid API key and search and allowSubdomains", async () => { + const mapRequest = { + url: "https://firecrawl.dev", + search: "docs", + includeSubdomains: true + }; - it.concurrent("should return a successful response with a valid API key and search and allowSubdomains and www", async () => { - const mapRequest = { - url: "https://www.firecrawl.dev", - search: "docs", - includeSubdomains: true - }; - - const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post("/v1/map") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send(mapRequest); - - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("success", true); - expect(response.body).toHaveProperty("links"); - if (!("links" in response.body)) { - throw new Error("Expected response body to have 'links' property"); - } - const links = response.body.links as unknown[]; - expect(Array.isArray(links)).toBe(true); - expect(links.length).toBeGreaterThan(0); - - const containsDocsFirecrawlDev = links.some((link: string) => link.includes("docs.firecrawl.dev")); - expect(containsDocsFirecrawlDev).toBe(true); - }, 10000) - - it.concurrent("should return a successful response with a valid API key and search and not allowSubdomains and www", async () => { - const mapRequest = { - url: "https://www.firecrawl.dev", - search: "docs", - includeSubdomains: false - }; - - const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post("/v1/map") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send(mapRequest); - - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("success", true); - expect(response.body).toHaveProperty("links"); - if (!("links" in response.body)) { - throw new Error("Expected response body to have 'links' property"); - } - const links = response.body.links as unknown[]; - expect(Array.isArray(links)).toBe(true); - expect(links.length).toBeGreaterThan(0); - expect(links[0]).not.toContain("docs.firecrawl.dev"); - }) - - it.concurrent("should return an error for invalid URL", async () => { - const mapRequest = { - url: "invalid-url", - includeSubdomains: true, - search: "test", - }; - - const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post("/v1/map") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send(mapRequest); - - expect(response.statusCode).toBe(400); - expect(response.body).toHaveProperty("success", false); - expect(response.body).toHaveProperty("error"); - }); -}); + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(mapRequest); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("success", true); + expect(response.body).toHaveProperty("links"); + if (!("links" in response.body)) { + throw new Error("Expected response body to have 'links' property"); + } + const links = response.body.links as unknown[]; + expect(Array.isArray(links)).toBe(true); + expect(links.length).toBeGreaterThan(0); -describe("POST /v1/crawl", () => { - it.concurrent("should require authorization", async () => { - const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post("/v1/crawl") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(401); - }); - - it.concurrent("should throw error for blocklisted URL", async () => { - const scrapeRequest: ScrapeRequest = { - url: "https://facebook.com/fake-test", - }; - - const response = await request(TEST_URL) - .post("/v1/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send(scrapeRequest); - - expect(response.statusCode).toBe(403); - expect(response.body.error).toBe("URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions."); + const containsDocsFirecrawlDev = links.some((link: string) => link.includes("docs.firecrawl.dev")); + expect(containsDocsFirecrawlDev).toBe(true); + }); + + it.concurrent("should return a successful response with a valid API key and search and allowSubdomains and www", async () => { + const mapRequest = { + url: "https://www.firecrawl.dev", + search: "docs", + includeSubdomains: true + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(mapRequest); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("success", true); + expect(response.body).toHaveProperty("links"); + if (!("links" in response.body)) { + throw new Error("Expected response body to have 'links' property"); + } + const links = response.body.links as unknown[]; + expect(Array.isArray(links)).toBe(true); + expect(links.length).toBeGreaterThan(0); + + const containsDocsFirecrawlDev = links.some((link: string) => link.includes("docs.firecrawl.dev")); + expect(containsDocsFirecrawlDev).toBe(true); + }, 10000) + + it.concurrent("should return a successful response with a valid API key and search and not allowSubdomains and www", async () => { + const mapRequest = { + url: "https://www.firecrawl.dev", + search: "docs", + includeSubdomains: false + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(mapRequest); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("success", true); + expect(response.body).toHaveProperty("links"); + if (!("links" in response.body)) { + throw new Error("Expected response body to have 'links' property"); + } + const links = response.body.links as unknown[]; + expect(Array.isArray(links)).toBe(true); + expect(links.length).toBeGreaterThan(0); + expect(links[0]).not.toContain("docs.firecrawl.dev"); + }) + + it.concurrent("should return an error for invalid URL", async () => { + const mapRequest = { + url: "invalid-url", + includeSubdomains: true, + search: "test", + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(mapRequest); + + expect(response.statusCode).toBe(400); + expect(response.body).toHaveProperty("success", false); + expect(response.body).toHaveProperty("error"); + }); }); - it.concurrent( - "should return an error response with an invalid API key", - async () => { + + describe("POST /v1/crawl", () => { + it.concurrent("should require authorization", async () => { const response: ScrapeResponseRequestTest = await request(TEST_URL) .post("/v1/crawl") - .set("Authorization", `Bearer invalid-api-key`) - .set("Content-Type", "application/json") .send({ url: "https://firecrawl.dev" }); expect(response.statusCode).toBe(401); - } - ); - - it.concurrent("should return a successful response", async () => { - const response = await request(TEST_URL) - .post("/v1/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("id"); - expect(response.body.id).toMatch( - /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ - ); - expect(response.body).toHaveProperty("success", true); - expect(response.body).toHaveProperty("url"); - expect(response.body.url).toContain("/v1/crawl/"); - }); + }); + + it.concurrent("should throw error for blocklisted URL", async () => { + const scrapeRequest: ScrapeRequest = { + url: "https://facebook.com/fake-test", + }; - it.concurrent( - "should return a successful response with a valid API key and valid includes option", - async () => { - const crawlResponse = await request(TEST_URL) + const response = await request(TEST_URL) .post("/v1/crawl") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") - .send({ - url: "https://firecrawl.dev", - limit: 40, - includePaths: ["blog/*"], - }); - - let response; - let isFinished = false; - - while (!isFinished) { - response = await request(TEST_URL) - .get(`/v1/crawl/${crawlResponse.body.id}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + .send(scrapeRequest); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - isFinished = response.body.status === "completed"; + expect(response.statusCode).toBe(403); + expect(response.body.error).toBe("URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions."); + }); - if (!isFinished) { - await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again - } + it.concurrent( + "should return an error response with an invalid API key", + async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/crawl") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); } + ); - await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database - const completedResponse = await request(TEST_URL) - .get(`/v1/crawl/${crawlResponse.body.id}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - - const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL - ); - expect(urls.length).toBeGreaterThan(5); - urls.forEach((url: string) => { - expect(url).toContain("firecrawl.dev/blog"); - }); - - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0]).not.toHaveProperty("content"); // v0 - expect(completedResponse.body.data[0].metadata.statusCode).toBe(200); - expect(completedResponse.body.data[0].metadata.error).toBeUndefined(); - }, - 180000 - ); // 180 seconds - - it.concurrent( - "should return a successful response with a valid API key and valid excludes option", - async () => { - const crawlResponse = await request(TEST_URL) + it.concurrent("should return a successful response", async () => { + const response = await request(TEST_URL) .post("/v1/crawl") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") - .send({ - url: "https://firecrawl.dev", - limit: 40, - excludePaths: ["blog/*"], - }); + .send({ url: "https://firecrawl.dev" }); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("id"); + expect(response.body.id).toMatch( + /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ + ); + expect(response.body).toHaveProperty("success", true); + expect(response.body).toHaveProperty("url"); + expect(response.body.url).toContain("/v1/crawl/"); + }); + + it.concurrent( + "should return a successful response with a valid API key and valid includes option", + async () => { + const crawlResponse = await request(TEST_URL) + .post("/v1/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://firecrawl.dev", + limit: 40, + includePaths: ["blog/*"], + }); + + let response; + let isFinished = false; - let isFinished = false; - let response; + while (!isFinished) { + response = await request(TEST_URL) + .get(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - while (!isFinished) { - response = await request(TEST_URL) + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + isFinished = response.body.status === "completed"; + + if (!isFinished) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } + + await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database + const completedResponse = await request(TEST_URL) .get(`/v1/crawl/${crawlResponse.body.id}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - isFinished = response.body.status === "completed"; + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThan(5); + urls.forEach((url: string) => { + expect(url).toContain("firecrawl.dev/blog"); + }); + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0]).not.toHaveProperty("content"); // v0 + expect(completedResponse.body.data[0].metadata.statusCode).toBe(200); + expect(completedResponse.body.data[0].metadata.error).toBeUndefined(); + }, + 180000 + ); // 180 seconds + + it.concurrent( + "should return a successful response with a valid API key and valid excludes option", + async () => { + const crawlResponse = await request(TEST_URL) + .post("/v1/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://firecrawl.dev", + limit: 40, + excludePaths: ["blog/*"], + }); + + let isFinished = false; + let response; - if (!isFinished) { - await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + while (!isFinished) { + response = await request(TEST_URL) + .get(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + isFinished = response.body.status === "completed"; + + if (!isFinished) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } } - } - await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database - const completedResponse = await request( - TEST_URL - ) - .get(`/v1/crawl/${crawlResponse.body.id}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database + const completedResponse = await request( + TEST_URL + ) + .get(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL - ); - expect(urls.length).toBeGreaterThan(3); - urls.forEach((url: string) => { - expect(url.startsWith("https://www.firecrawl.dev/blog/")).toBeFalsy(); - }); - }, - 90000 - ); // 90 seconds - - it.concurrent( - "should return a successful response with max depth option for a valid crawl job", - async () => { - const crawlResponse = await request(TEST_URL) - .post("/v1/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://www.scrapethissite.com", - maxDepth: 1, + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThan(3); + urls.forEach((url: string) => { + expect(url.startsWith("https://www.firecrawl.dev/blog/")).toBeFalsy(); }); - expect(crawlResponse.statusCode).toBe(200); + }, + 90000 + ); // 90 seconds - const response = await request(TEST_URL) - .get(`/v1/crawl/${crawlResponse.body.id}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - expect(["active", "waiting", "completed", "scraping"]).toContain(response.body.status); - // wait for 60 seconds - let isCompleted = false; - while (!isCompleted) { - const statusCheckResponse = await request(TEST_URL) + it.concurrent( + "should return a successful response with max depth option for a valid crawl job", + async () => { + const crawlResponse = await request(TEST_URL) + .post("/v1/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://www.scrapethissite.com", + maxDepth: 1, + }); + expect(crawlResponse.statusCode).toBe(200); + + const response = await request(TEST_URL) .get(`/v1/crawl/${crawlResponse.body.id}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(statusCheckResponse.statusCode).toBe(200); - isCompleted = statusCheckResponse.body.status === "completed"; - if (!isCompleted) { - await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + expect(["active", "waiting", "completed", "scraping"]).toContain(response.body.status); + // wait for 60 seconds + let isCompleted = false; + while (!isCompleted) { + const statusCheckResponse = await request(TEST_URL) + .get(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(statusCheckResponse.statusCode).toBe(200); + isCompleted = statusCheckResponse.body.status === "completed"; + if (!isCompleted) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } } - } - const completedResponse = await request( - TEST_URL - ) - .get(`/v1/crawl/${crawlResponse.body.id}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).not.toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].metadata.statusCode).toBe(200); - expect(completedResponse.body.data[0].metadata.error).toBeUndefined(); - const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL - ); - expect(urls.length).toBeGreaterThanOrEqual(1); - - // Check if all URLs have a maximum depth of 1 - urls.forEach((url: string) => { - const pathSplits = new URL(url).pathname.split("/"); - const depth = - pathSplits.length - - (pathSplits[0].length === 0 && - pathSplits[pathSplits.length - 1].length === 0 - ? 1 - : 0); - expect(depth).toBeLessThanOrEqual(2); - }); - }, - 180000 - ); -}) - -describe("GET /v1/crawl/:jobId", () => { - it.concurrent("should require authorization", async () => { - const response = await request(TEST_URL).get("/v1/crawl/123"); - expect(response.statusCode).toBe(401); + const completedResponse = await request( + TEST_URL + ) + .get(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).not.toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].metadata.statusCode).toBe(200); + expect(completedResponse.body.data[0].metadata.error).toBeUndefined(); + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThanOrEqual(1); + + // Check if all URLs have a maximum depth of 1 + urls.forEach((url: string) => { + const pathSplits = new URL(url).pathname.split("/"); + const depth = + pathSplits.length - + (pathSplits[0].length === 0 && + pathSplits[pathSplits.length - 1].length === 0 + ? 1 + : 0); + expect(depth).toBeLessThanOrEqual(2); + }); + }, + 180000 + ); }); - it.concurrent( - "should return an error response with an invalid API key", - async () => { - const response = await request(TEST_URL) - .get("/v1/crawl/123") - .set("Authorization", `Bearer invalid-api-key`); + describe("GET /v1/crawl/:jobId", () => { + it.concurrent("should require authorization", async () => { + const response = await request(TEST_URL).get("/v1/crawl/123"); expect(response.statusCode).toBe(401); - } - ); - - it.concurrent( - "should return Job not found for invalid job ID", - async () => { - const response = await request(TEST_URL) - .get("/v1/crawl/invalidJobId") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(404); - } - ); - - it.concurrent( - "should return a successful crawl status response for a valid crawl job", - async () => { - const crawlResponse = await request(TEST_URL) - .post("/v1/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://docs.firecrawl.dev" }); - expect(crawlResponse.statusCode).toBe(200); + }); - let isCompleted = false; + it.concurrent( + "should return an error response with an invalid API key", + async () => { + const response = await request(TEST_URL) + .get("/v1/crawl/123") + .set("Authorization", `Bearer invalid-api-key`); + expect(response.statusCode).toBe(401); + } + ); - while (!isCompleted) { + it.concurrent( + "should return Job not found for invalid job ID", + async () => { const response = await request(TEST_URL) - .get(`/v1/crawl/${crawlResponse.body.id}`) + .get("/v1/crawl/invalidJobId") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); + expect(response.statusCode).toBe(404); + } + ); - if (response.body.status === "completed") { - isCompleted = true; - } else { - await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again + it.concurrent( + "should return a successful crawl status response for a valid crawl job", + async () => { + const crawlResponse = await request(TEST_URL) + .post("/v1/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://docs.firecrawl.dev" }); + expect(crawlResponse.statusCode).toBe(200); + + let isCompleted = false; + + while (!isCompleted) { + const response = await request(TEST_URL) + .get(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + + if (response.body.status === "completed") { + isCompleted = true; + } else { + await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again + } } - } - await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database - const completedResponse = await request(TEST_URL) - .get(`/v1/crawl/${crawlResponse.body.id}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).not.toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].metadata.statusCode).toBe(200); - expect( - completedResponse.body.data[0].metadata.error - ).toBeUndefined(); - - const childrenLinks = completedResponse.body.data.filter( - (doc) => - doc.metadata && - doc.metadata.sourceURL - ); + await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database + const completedResponse = await request(TEST_URL) + .get(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(childrenLinks.length).toBe(completedResponse.body.data.length); - }, - 180000 - ); // 120 seconds + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).not.toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].metadata.statusCode).toBe(200); + expect( + completedResponse.body.data[0].metadata.error + ).toBeUndefined(); + + const childrenLinks = completedResponse.body.data.filter( + (doc) => + doc.metadata && + doc.metadata.sourceURL + ); - it.concurrent( - "If someone cancels a crawl job, it should turn into failed status", - async () => { - const crawlResponse = await request(TEST_URL) - .post("/v1/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://docs.firecrawl.dev", limit: 10 }); - - expect(crawlResponse.statusCode).toBe(200); - - await new Promise((r) => setTimeout(r, 10000)); - - const responseCancel = await request(TEST_URL) - .delete(`/v1/crawl/${crawlResponse.body.id}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(responseCancel.statusCode).toBe(200); - expect(responseCancel.body).toHaveProperty("status"); - expect(responseCancel.body.status).toBe("cancelled"); - - await new Promise((r) => setTimeout(r, 10000)); - const completedResponse = await request(TEST_URL) - .get(`/v1/crawl/${crawlResponse.body.id}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("cancelled"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].metadata.statusCode).toBe(200); - expect(completedResponse.body.data[0].metadata.error).toBeUndefined(); - }, - 60000 - ); // 60 seconds -}) + expect(childrenLinks.length).toBe(completedResponse.body.data.length); + }, + 180000 + ); // 120 seconds + + it.concurrent( + "If someone cancels a crawl job, it should turn into failed status", + async () => { + const crawlResponse = await request(TEST_URL) + .post("/v1/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://docs.firecrawl.dev", limit: 10 }); + + expect(crawlResponse.statusCode).toBe(200); + + await new Promise((r) => setTimeout(r, 10000)); + + const responseCancel = await request(TEST_URL) + .delete(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(responseCancel.statusCode).toBe(200); + expect(responseCancel.body).toHaveProperty("status"); + expect(responseCancel.body.status).toBe("cancelled"); + + await new Promise((r) => setTimeout(r, 10000)); + const completedResponse = await request(TEST_URL) + .get(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("cancelled"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].metadata.statusCode).toBe(200); + expect(completedResponse.body.data[0].metadata.error).toBeUndefined(); + }, + 60000 + ); // 60 seconds + }); }); From db90b676677c84569fba70f14a1f361f0b99eda8 Mon Sep 17 00:00:00 2001 From: txrp0x9 Date: Mon, 28 Oct 2024 21:52:57 +0530 Subject: [PATCH 2/2] added batch scrape e2e tests --- .../__tests__/e2e_v1_withAuth/index.test.ts | 115 ++++++++++++++++++ 1 file changed, 115 insertions(+) diff --git a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts index 11aef2a5d..8bf2f54f4 100644 --- a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts @@ -1,6 +1,7 @@ import request from "supertest"; import { configDotenv } from "dotenv"; import { + BatchScrapeRequest, ScrapeRequest, ScrapeResponseRequestTest, } from "../../controllers/v1/types"; @@ -1002,4 +1003,118 @@ describe("E2E Tests for v1 API Routes", () => { 60000 ); // 60 seconds }); + + describe("POST /v1/batch/scrape", () => { + it.concurrent("should require authorization", async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/batch/scrape") + .send({ urls: ["https://firecrawl.dev"] }); + expect(response.statusCode).toBe(401); + }); + + it.concurrent("should throw error for blocklisted URL", async () => { + const scrapeRequest: BatchScrapeRequest = { + urls: ["https://firecrawl.dev", "https://scrapethissite.com", "https://facebook.com/fake-test"], + }; + const response = await request(TEST_URL) + .post("/v1/batch/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + expect(response.statusCode).toBe(400); + expect(response.body.details[0].message).toContain("does not support social media scraping") + }); + + it.concurrent("should return a successful response with a valid API key", async () => { + const scrapeRequest: BatchScrapeRequest = { + urls: ["https://mendable.ai", "https://www.google.com", "https://example.com"], + }; + const response = await request(TEST_URL) + .post("/v1/batch/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + expect(response.statusCode).toBe(200); + }); + }); + + describe("GET /v1/batch/scrape/:jobId:", () => { + it.concurrent("should require authorization", async () => { + const response = await request(TEST_URL).get("/v1/batch/scrape/123"); + expect(response.statusCode).toBe(401); + }); + + it.concurrent("should return an error response with an invalid API key", async () => { + const response = await request(TEST_URL) + .get("/v1/batch/scrape/123") + .set("Authorization", `Bearer invalid-api-key`); + expect(response.statusCode).toBe(401); + }); + + it.concurrent( + "should return Job not found for invalid job ID", + async () => { + const response = await request(TEST_URL) + .get("/v1/batch/scrape/invalidJobId") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(404); + } + ); + + it.concurrent( + "should return successful data for a valid batch scrape", + async () => { + const batchScrapeRequest: BatchScrapeRequest = { + urls: ["https://firecrawl.dev", "https://www.scrapethissite.com", "https://example.com"], + }; + const batchResponse = await request(TEST_URL) + .post("/v1/batch/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(batchScrapeRequest); + expect(batchResponse.statusCode).toBe(200); + + let isCompleted = false; + + while (!isCompleted) { + const response = await request(TEST_URL) + .get(`/v1/batch/scrape/${batchResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + + if (response.body.status === "completed") { + isCompleted = true; + } else { + await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again + } + } + + await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database + const completedResponse = await request(TEST_URL) + .get(`/v1/batch/scrape/${batchResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + + completedResponse.body.data.forEach(data => { + expect(data).toHaveProperty("markdown"); + expect(data).toHaveProperty("metadata"); + expect(data.metadata.statusCode).toBe(200); + expect(data.metadata.error).toBeUndefined(); + }) + + const phrases = ["Scrape This Site", "Firecrawl", "Example Domain"]; + phrases.forEach(phrase => + expect(completedResponse.body.data.some(item => item.markdown && item.markdown.includes(phrase))).toBe(true) + ); + + }, + 180000 + ); + }); });