
Commit 3a530b8

Fix BrowserCrawler tests

1 parent 5bb45f2

6 files changed (+64 −21 lines)

packages/basic-crawler/src/internals/basic-crawler.ts

Lines changed: 7 additions & 0 deletions
```diff
@@ -1336,6 +1336,8 @@ export class BasicCrawler<
         const statisticsId = request.id || request.uniqueKey;
         this.stats.startJob(statisticsId);
 
+        const deferredCleanup: (() => Promise<unknown>)[] = [];
+
         const crawlingContext: CrawlingContext = {
             id: cryptoRandomObjectId(10),
             log: this.log,
@@ -1357,6 +1359,9 @@
             useState: this.useState.bind(this),
             sendRequest: createSendRequest(this.httpClient, request!, session, () => crawlingContext.proxyInfo?.url),
             getKeyValueStore: async (idOrName?: string) => KeyValueStore.open(idOrName, { config: this.config }),
+            registerDeferredCleanup: (cleanup) => {
+                deferredCleanup.push(cleanup);
+            },
         };
 
         let isRequestLocked = true;
@@ -1419,6 +1424,8 @@
             // decrease the session score if the request fails (but the error handler did not throw)
             crawlingContext.session?.markBad();
         } finally {
+            await Promise.all(deferredCleanup.map((cleanup) => cleanup()));
+
             // Safety net - release the lock if nobody managed to do it before
             if (isRequestLocked && source instanceof RequestProvider) {
                 try {
```
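
The pattern introduced here is straightforward: cleanup callbacks registered during request handling accumulate in an array and are flushed in the `finally` block, after the `catch`-based error handling has run. A minimal standalone sketch of that control flow (the `handleRequest` wrapper and the resource are illustrative stand-ins, not Crawlee code):

```ts
type Cleanup = () => Promise<unknown>;

// Illustrative stand-in for the crawler's request-handling wrapper.
async function handleRequest(
    handler: (registerDeferredCleanup: (cleanup: Cleanup) => void) => Promise<void>,
) {
    const deferredCleanup: Cleanup[] = [];

    try {
        await handler((cleanup) => deferredCleanup.push(cleanup));
    } finally {
        // Runs last, after any catch-based error handlers,
        // whether the handler succeeded or threw.
        await Promise.all(deferredCleanup.map((cleanup) => cleanup()));
    }
}

// Usage: the resource outlives the handler body and is released at the very end.
await handleRequest(async (registerDeferredCleanup) => {
    const resource = { close: async () => console.log('resource closed') };
    registerDeferredCleanup(() => resource.close());
    console.log('handling request');
});
```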

packages/browser-crawler/src/internals/browser-crawler.ts

Lines changed: 17 additions & 5 deletions
```diff
@@ -1,6 +1,7 @@
 import type {
     Awaitable,
     BasicCrawlerOptions,
+    BasicCrawlingContext,
     CrawlingContext,
     Dictionary,
     EnqueueLinksOptions,
@@ -354,7 +355,8 @@
                 contextPipelineBuilder: () =>
                     contextPipelineBuilder()
                         .compose({ action: this.performNavigation.bind(this) })
-                        .compose({ action: this.handleBlockedRequestByContent.bind(this) }),
+                        .compose({ action: this.handleBlockedRequestByContent.bind(this) })
+                        .compose({ action: this.restoreRequestState.bind(this) }),
                 contextPipelineEnhancer: options.contextPipelineEnhancer,
             },
             config,
@@ -405,10 +407,15 @@
     > {
         return ContextPipeline.create<CrawlingContext>().compose({
             action: this.preparePage.bind(this),
-            cleanup: async (context: { page: Page }) => {
-                await context.page
-                    .close()
-                    .catch((error: Error) => this.log.debug('Error while closing page', { error }));
+            cleanup: async (context: {
+                page: Page;
+                registerDeferredCleanup: BasicCrawlingContext['registerDeferredCleanup'];
+            }) => {
+                context.registerDeferredCleanup(async () => {
+                    await context.page
+                        .close()
+                        .catch((error: Error) => this.log.debug('Error while closing page', { error }));
+                });
             },
         });
     }
@@ -592,6 +599,11 @@
         return {};
     }
 
+    private async restoreRequestState(crawlingContext: CrawlingContext) {
+        crawlingContext.request.state = RequestState.REQUEST_HANDLER;
+        return {};
+    }
+
     protected async _applyCookies(
         { session, request, page, browserController }: BrowserCrawlingContext,
         preHooksCookies: string,
```
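
The net effect for browser crawlers: the pipeline's cleanup no longer closes the page as soon as the context pipeline unwinds; it only registers the close as a deferred cleanup, so the page should still be open while `errorHandler` or `failedRequestHandler` runs. A hedged sketch of what that enables (the screenshot logic is illustrative, not part of this commit):

```ts
import { PuppeteerCrawler } from 'crawlee';

const crawler = new PuppeteerCrawler({
    requestHandler: async ({ page }) => {
        // Any throw here used to race against the page being closed.
        throw new Error('boom');
    },
    // After this commit, the page close is deferred past the error handlers,
    // so the handler can still inspect the live page.
    errorHandler: async ({ page, request }) => {
        await page.screenshot({ path: `error-${request.id}.png` });
    },
    maxRequestRetries: 0,
});

await crawler.run(['https://example.com']);
```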

packages/core/src/crawlers/crawler_commons.ts

Lines changed: 5 additions & 0 deletions
```diff
@@ -157,6 +157,11 @@ export interface CrawlingContext<UserData extends Dictionary = Dictionary> exten
      * ```
      */
     sendRequest<Response = string>(overrideOptions?: Partial<OptionsInit>): Promise<GotResponse<Response>>;
+
+    /**
+     * Register a function to be called at the very end of the request handling process. This is useful for resources that should be accessible to error handlers, for instance.
+     */
+    registerDeferredCleanup(cleanup: () => Promise<unknown>): void;
 }
 
 /**
```
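
From user code, the new helper could be used as sketched below; `registerDeferredCleanup` is the API added by this commit, while the crawler choice and the audit-log resource are assumptions for illustration:

```ts
import { CheerioCrawler } from 'crawlee';

// Hypothetical resource whose lifetime must cover the error handlers too.
async function openAuditLog(url: string) {
    return {
        record: async (entry: string) => console.log(`[audit ${url}]`, entry),
        close: async () => console.log(`[audit ${url}] closed`),
    };
}

const crawler = new CheerioCrawler({
    requestHandler: async ({ request, registerDeferredCleanup }) => {
        const audit = await openAuditLog(request.url);
        // Closed only at the very end of request handling, after any
        // errorHandler/failedRequestHandler has had a chance to use it.
        registerDeferredCleanup(() => audit.close());
        await audit.record('request handled');
    },
});

await crawler.run(['https://example.com']);
```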

packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts

Lines changed: 5 additions & 0 deletions
```diff
@@ -470,13 +470,16 @@ export class AdaptivePlaywrightCrawler<
         const result = new RequestHandlerResult(this.config, AdaptivePlaywrightCrawler.CRAWLEE_STATE_KEY);
         const logs: LogProxyCall[] = [];
 
+        const deferredCleanup: (() => Promise<unknown>)[] = [];
+
         const resultBoundContextHelpers = {
             addRequests: result.addRequests,
             pushData: result.pushData,
             useState: this.allowStorageAccess(useStateFunction),
             getKeyValueStore: this.allowStorageAccess(result.getKeyValueStore),
             enqueueLinks: result.enqueueLinks,
             log: this.createLogProxy(context.log, logs),
+            registerDeferredCleanup: (cleanup: () => Promise<unknown>) => deferredCleanup.push(cleanup),
         };
 
         try {
@@ -510,6 +513,8 @@
             return { result, ok: true, logs };
         } catch (error) {
             return { error, ok: false, logs };
+        } finally {
+            await Promise.all(deferredCleanup.map((cleanup) => cleanup()));
         }
     }
```

test/core/crawlers/basic_browser_crawler.ts

Lines changed: 16 additions & 4 deletions
```diff
@@ -1,18 +1,30 @@
 import type { PuppeteerController, PuppeteerPlugin } from '@crawlee/browser-pool';
-import type { PuppeteerCrawlerOptions, PuppeteerCrawlingContext, PuppeteerGoToOptions } from '@crawlee/puppeteer';
+import type {
+    BrowserCrawlerOptions,
+    BrowserCrawlingContext,
+    PuppeteerCrawlingContext,
+    PuppeteerGoToOptions,
+} from '@crawlee/puppeteer';
 import { BrowserCrawler } from '@crawlee/puppeteer';
 import type { HTTPResponse, LaunchOptions, Page } from 'puppeteer';
 
+export type TestCrawlingContext = BrowserCrawlingContext<Page, HTTPResponse, PuppeteerController>;
+
 export class BrowserCrawlerTest extends BrowserCrawler<
     Page,
     HTTPResponse,
     PuppeteerController,
     { browserPlugins: [PuppeteerPlugin] },
     LaunchOptions,
-    PuppeteerCrawlingContext
+    TestCrawlingContext
 > {
-    constructor(options: Partial<PuppeteerCrawlerOptions> = {}) {
-        super(options as any);
+    constructor(
+        options: Partial<BrowserCrawlerOptions<Page, HTTPResponse, PuppeteerController, TestCrawlingContext>> = {},
+    ) {
+        super({
+            ...options,
+            contextPipelineBuilder: () => this.buildContextPipeline(),
+        });
     }
 
     protected async _navigationHandler(
```

test/core/crawlers/browser_crawler.test.ts

Lines changed: 14 additions & 12 deletions
```diff
@@ -2,7 +2,7 @@ import type { Server } from 'node:http';
 
 import { BROWSER_POOL_EVENTS, OperatingSystemsName, PuppeteerPlugin } from '@crawlee/browser-pool';
 import { BLOCKED_STATUS_CODES } from '@crawlee/core';
-import type { PuppeteerCrawlingContext, PuppeteerGoToOptions } from '@crawlee/puppeteer';
+import type { PuppeteerGoToOptions } from '@crawlee/puppeteer';
 import { EnqueueStrategy, ProxyConfiguration, Request, RequestList, RequestState, Session } from '@crawlee/puppeteer';
 import { sleep } from '@crawlee/utils';
 import type { HTTPResponse } from 'puppeteer';
@@ -13,6 +13,7 @@ import { MemoryStorageEmulator } from 'test/shared/MemoryStorageEmulator.js';
 import { ENV_VARS } from '@apify/consts';
 import log from '@apify/log';
 
+import type { TestCrawlingContext } from './basic_browser_crawler.js';
 import { BrowserCrawlerTest } from './basic_browser_crawler.js';
 
 describe('BrowserCrawler', () => {
@@ -64,7 +65,7 @@
         const processed: Request[] = [];
         const failed: Request[] = [];
         const requestList = await RequestList.open(null, sources);
-        const requestHandler = async ({ page, request, response }: PuppeteerCrawlingContext) => {
+        const requestHandler = async ({ page, request, response }: TestCrawlingContext) => {
             await page.waitForSelector('title');
 
             expect(response!.status()).toBe(200);
@@ -124,7 +125,7 @@
         let sessionGoto!: Session;
         const browserCrawler = new (class extends BrowserCrawlerTest {
             protected override async _navigationHandler(
-                ctx: PuppeteerCrawlingContext,
+                ctx: TestCrawlingContext,
             ): Promise<HTTPResponse | null | undefined> {
                 vitest.spyOn(ctx.session!, 'markBad');
                 sessionGoto = ctx.session!;
@@ -184,6 +185,7 @@
             },
             requestList,
             useSessionPool: true,
+            requestHandler: async () => {},
             maxRequestRetries: 0,
             postNavigationHooks: [hook],
         });
@@ -270,7 +272,7 @@
         let optionsGoto: PuppeteerGoToOptions;
         const browserCrawler = new (class extends BrowserCrawlerTest {
             protected override async _navigationHandler(
-                ctx: PuppeteerCrawlingContext,
+                ctx: TestCrawlingContext,
                 gotoOptions: PuppeteerGoToOptions,
             ): Promise<HTTPResponse | null | undefined> {
                 optionsGoto = gotoOptions;
@@ -617,7 +619,7 @@
         let called = false;
         const browserCrawler = new (class extends BrowserCrawlerTest {
             protected override async _navigationHandler(
-                ctx: PuppeteerCrawlingContext,
+                ctx: TestCrawlingContext,
             ): Promise<HTTPResponse | null | undefined> {
                 browserCrawler.browserPool.on(BROWSER_POOL_EVENTS.BROWSER_RETIRED, () => {
                     resolve();
@@ -841,7 +843,7 @@
 
         const browserCrawler = new (class extends BrowserCrawlerTest {
             protected override async _navigationHandler(
-                ctx: PuppeteerCrawlingContext,
+                ctx: TestCrawlingContext,
             ): Promise<HTTPResponse | null | undefined> {
                 const { session } = ctx;
                 const proxyInfo = await this.proxyConfiguration!.newProxyInfo(session?.id);
@@ -880,7 +882,7 @@
         let numberOfRotations = -requestList!.length();
         const browserCrawler = new (class extends BrowserCrawlerTest {
             protected override async _navigationHandler(
-                ctx: PuppeteerCrawlingContext,
+                ctx: TestCrawlingContext,
             ): Promise<HTTPResponse | null | undefined> {
                 const { session } = ctx;
                 const proxyInfo = await this.proxyConfiguration!.newProxyInfo(session?.id);
@@ -918,7 +920,7 @@
 
         const crawler = new (class extends BrowserCrawlerTest {
             protected override async _navigationHandler(
-                ctx: PuppeteerCrawlingContext,
+                ctx: TestCrawlingContext,
             ): Promise<HTTPResponse | null | undefined> {
                 const { session } = ctx;
                 const proxyInfo = await this.proxyConfiguration!.newProxyInfo(session?.id);
@@ -967,16 +969,16 @@
         });
 
         test('uses correct crawling context', async () => {
-            let prepareCrawlingContext: PuppeteerCrawlingContext;
+            let prepareCrawlingContext: TestCrawlingContext;
 
-            const gotoFunction = async (crawlingContext: PuppeteerCrawlingContext) => {
+            const gotoFunction = async (crawlingContext: TestCrawlingContext) => {
                 prepareCrawlingContext = crawlingContext;
                 expect(crawlingContext.request).toBeInstanceOf(Request);
                 expect(crawlingContext.session).toBeInstanceOf(Session);
                 expect(typeof crawlingContext.page).toBe('object');
             };
 
-            const requestHandler = async (crawlingContext: PuppeteerCrawlingContext) => {
+            const requestHandler = async (crawlingContext: TestCrawlingContext) => {
                 expect(crawlingContext === prepareCrawlingContext).toEqual(true);
                 expect(crawlingContext.request).toBeInstanceOf(Request);
                 expect(crawlingContext.session).toBeInstanceOf(Session);
@@ -986,7 +988,7 @@
                 throw new Error('some error');
             };
 
-            const failedRequestHandler = async (crawlingContext: Partial<PuppeteerCrawlingContext>, error: Error) => {
+            const failedRequestHandler = async (crawlingContext: Partial<TestCrawlingContext>, error: Error) => {
                 expect(crawlingContext).toBe(prepareCrawlingContext);
                 expect(crawlingContext.request).toBeInstanceOf(Request);
                 expect(crawlingContext.session).toBeInstanceOf(Session);
```
