Skip to content

Commit a409af2

Browse files
barjinjanbuchar
authored and committed
feat!: use native Request / Response interface (#3163)
Phasing out `got-scraping`-specific interfaces in favour of native `fetch` API. Related to #3071
1 parent c65c218 commit a409af2

File tree

23 files changed

+326
-473
lines changed

23 files changed

+326
-473
lines changed

docs/upgrading/upgrading_v4.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,3 +94,16 @@ This experimental option relied on an outdated manifest version for browser exte
9494
## Available resource detection
9595

9696
In v3, we introduced a new way to detect available resources for the crawler, available via `systemInfoV2` flag. In v4, this is the default way to detect available resources. The old way is removed completely together with the `systemInfoV2` flag.
97+
98+
## `HttpClient` instances return `Response` objects
99+
100+
The interface of `HttpClient` instances was changed to return the [native `Response` objects](https://developer.mozilla.org/en-US/docs/Web/API/Response) instead of custom `HttpResponse` objects.
101+
102+
## `CrawlingContext.response` is now of type `Response`
103+
104+
The `CrawlingContext.response` property is now of type [`Response`](https://developer.mozilla.org/en-US/docs/Web/API/Response) instead of `HttpResponse`. `CrawlingContext.sendRequest` method now returns `Response` objects as well.
105+
106+
## Crawling context in the `FileDownload` crawler no longer includes `body` and `stream` properties
107+
108+
The crawling context in the `FileDownload` crawler no longer includes the `body` and `stream` properties. These can be accessed directly via the `response` property instead, e.g. `context.response.bytes()` or `context.response.body`.
109+

packages/basic-crawler/src/internals/basic-crawler.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1578,7 +1578,7 @@ export class BasicCrawler<
15781578
},
15791579
pushData: this.pushData.bind(this),
15801580
useState: this.useState.bind(this),
1581-
sendRequest: createSendRequest(this.httpClient, request!, session),
1581+
sendRequest: createSendRequest(this.httpClient, request!, session) as CrawlingContext['sendRequest'],
15821582
getKeyValueStore: async (idOrName?: string) => KeyValueStore.open(idOrName, { config: this.config }),
15831583
registerDeferredCleanup: (cleanup) => {
15841584
deferredCleanup.push(cleanup);

packages/basic-crawler/src/internals/send-request.ts

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ import {
55
type Request,
66
type Session,
77
} from '@crawlee/core';
8-
import type { Method, Response as GotResponse } from 'got-scraping';
98

109
/**
1110
* Prepares a function to be used as the `sendRequest` context helper.
@@ -17,10 +16,7 @@ import type { Method, Response as GotResponse } from 'got-scraping';
1716
* @param getProxyUrl A function that will return the proxy URL that should be used for handling the request.
1817
*/
1918
export function createSendRequest(httpClient: BaseHttpClient, originRequest: Request, session: Session | undefined) {
20-
return async <Response = string>(
21-
// TODO the type information here (and in crawler_commons) is outright wrong... for BC - replace this with generic HttpResponse in v4
22-
overrideOptions: Partial<HttpRequestOptions> = {},
23-
): Promise<GotResponse<Response>> => {
19+
return async (overrideOptions: Partial<HttpRequestOptions> = {}): Promise<Response> => {
2420
const cookieJar = session
2521
? {
2622
getCookieString: async (url: string) => session.getCookieString(url),
@@ -31,7 +27,7 @@ export function createSendRequest(httpClient: BaseHttpClient, originRequest: Req
3127

3228
const requestOptions = processHttpRequestOptions({
3329
url: originRequest.url,
34-
method: originRequest.method as Method, // Narrow type to omit CONNECT
30+
method: originRequest.method,
3531
headers: originRequest.headers,
3632
proxyUrl: session?.proxyInfo?.url,
3733
sessionToken: session,
@@ -43,6 +39,6 @@ export function createSendRequest(httpClient: BaseHttpClient, originRequest: Req
4339
// Fill in body as the last step - `processHttpRequestOptions` may use either `body`, `json` or `form` so we cannot override it beforehand
4440
requestOptions.body ??= originRequest.payload;
4541

46-
return httpClient.sendRequest<any>(requestOptions) as unknown as GotResponse<Response>;
42+
return httpClient.sendRequest(requestOptions);
4743
};
4844
}

packages/core/src/cookie_utils.ts

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,16 +12,14 @@ export interface ResponseLike {
1212
/**
1313
* @internal
1414
*/
15-
export function getCookiesFromResponse(response: ResponseLike): Cookie[] {
16-
const headers = typeof response.headers === 'function' ? response.headers() : response.headers;
17-
const cookieHeader = headers?.['set-cookie'] || '';
15+
export function getCookiesFromResponse(response: Response): Cookie[] {
16+
const headers = response.headers;
17+
const cookieHeaders = headers.getSetCookie();
1818

1919
try {
20-
return Array.isArray(cookieHeader)
21-
? cookieHeader.map((cookie) => Cookie.parse(cookie)!)
22-
: [Cookie.parse(cookieHeader)!];
20+
return cookieHeaders.map((cookie) => Cookie.parse(cookie)!);
2321
} catch (e) {
24-
throw new CookieParseError(cookieHeader);
22+
throw new CookieParseError(cookieHeaders);
2523
}
2624
}
2725

packages/core/src/crawlers/crawler_commons.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import type { Dictionary } from '@crawlee/types';
2-
import type { OptionsInit, Response as GotResponse } from 'got-scraping';
2+
import type { OptionsInit } from 'got-scraping';
33
import type { ReadonlyDeep, SetRequired } from 'type-fest';
44

55
import type { Configuration } from '../configuration.js';
@@ -156,7 +156,7 @@ export interface CrawlingContext<UserData extends Dictionary = Dictionary> exten
156156
* },
157157
* ```
158158
*/
159-
sendRequest<Response = string>(overrideOptions?: Partial<OptionsInit>): Promise<GotResponse<Response>>;
159+
sendRequest(overrideOptions?: Partial<OptionsInit>): Promise<Response>;
160160

161161
/**
162162
* Register a function to be called at the very end of the request handling process. This is useful for resources that should be accessible to error handlers, for instance.

packages/core/src/http_clients/base-http-client.ts

Lines changed: 13 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import type { Readable } from 'node:stream';
22

3+
import type { AllowedHttpMethods } from '@crawlee/types';
34
import { applySearchParams, type SearchParams } from '@crawlee/utils';
45

56
import type { FormDataLike } from './form-data-like.js';
@@ -15,24 +16,6 @@ type Timeout =
1516
}
1617
| { request: number };
1718

18-
type Method =
19-
| 'GET'
20-
| 'POST'
21-
| 'PUT'
22-
| 'PATCH'
23-
| 'HEAD'
24-
| 'DELETE'
25-
| 'OPTIONS'
26-
| 'TRACE'
27-
| 'get'
28-
| 'post'
29-
| 'put'
30-
| 'patch'
31-
| 'head'
32-
| 'delete'
33-
| 'options'
34-
| 'trace';
35-
3619
/**
3720
* Maps permitted values of the `responseType` option on {@apilink HttpRequest} to the types that they produce.
3821
*/
@@ -79,7 +62,7 @@ export interface HttpRequest<TResponseType extends keyof ResponseTypes = 'text'>
7962
[k: string]: unknown; // TODO BC with got - remove in 4.0
8063

8164
url: string | URL;
82-
method?: Method;
65+
method?: AllowedHttpMethods;
8366
headers?: SimpleHeaders;
8467
body?: string | Buffer | Readable | Generator | AsyncGenerator | FormDataLike;
8568

@@ -146,6 +129,14 @@ interface HttpResponseWithoutBody<TResponseType extends keyof ResponseTypes = ke
146129
request: HttpRequest<TResponseType>;
147130
}
148131

132+
export class ResponseWithUrl extends Response {
133+
override url: string;
134+
constructor(body: BodyInit | null, init: ResponseInit & { url?: string }) {
135+
super(body, init);
136+
this.url = init.url ?? '';
137+
}
138+
}
139+
149140
/**
150141
* HTTP response data as returned by the {@apilink BaseHttpClient.sendRequest} method.
151142
*/
@@ -169,7 +160,7 @@ export interface StreamingHttpResponse extends HttpResponseWithoutBody {
169160
* Type of a function called when an HTTP redirect takes place. It is allowed to mutate the `updatedRequest` argument.
170161
*/
171162
export type RedirectHandler = (
172-
redirectResponse: BaseHttpResponseData,
163+
redirectResponse: Response,
173164
updatedRequest: { url?: string | URL; headers: SimpleHeaders },
174165
) => void;
175166

@@ -182,12 +173,12 @@ export interface BaseHttpClient {
182173
*/
183174
sendRequest<TResponseType extends keyof ResponseTypes = 'text'>(
184175
request: HttpRequest<TResponseType>,
185-
): Promise<HttpResponse<TResponseType>>;
176+
): Promise<Response>;
186177

187178
/**
188179
* Perform an HTTP Request and return after the response headers are received. The body may be read from a stream contained in the response.
189180
*/
190-
stream(request: HttpRequest, onRedirect?: RedirectHandler): Promise<StreamingHttpResponse>;
181+
stream(request: HttpRequest, onRedirect?: RedirectHandler): Promise<Response>;
191182
}
192183

193184
/**

packages/core/src/http_clients/got-scraping-http-client.ts

Lines changed: 62 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,40 @@
1+
import { Readable } from 'node:stream';
2+
13
import type { Options, PlainResponse } from 'got-scraping';
24
import { gotScraping } from 'got-scraping';
35

4-
import type {
5-
BaseHttpClient,
6-
HttpRequest,
7-
HttpResponse,
8-
RedirectHandler,
9-
ResponseTypes,
10-
StreamingHttpResponse,
6+
import {
7+
type BaseHttpClient,
8+
type HttpRequest,
9+
type RedirectHandler,
10+
type ResponseTypes,
11+
ResponseWithUrl,
1112
} from './base-http-client.js';
1213

1314
/**
1415
* A HTTP client implementation based on the `got-scraping` library.
1516
*/
1617
export class GotScrapingHttpClient implements BaseHttpClient {
18+
/**
19+
* Type guard that validates the HTTP method (excluding CONNECT).
20+
* @param request - The HTTP request to validate
21+
*/
22+
private validateRequest<TResponseType extends keyof ResponseTypes, T extends HttpRequest<TResponseType>>(
23+
request: T,
24+
): request is T & { method: Exclude<T['method'], 'CONNECT' | 'connect'> } {
25+
return !['CONNECT', 'connect'].includes(request.method!);
26+
}
27+
1728
/**
1829
* @inheritDoc
1930
*/
2031
async sendRequest<TResponseType extends keyof ResponseTypes>(
2132
request: HttpRequest<TResponseType>,
22-
): Promise<HttpResponse<TResponseType>> {
33+
): Promise<Response> {
34+
if (!this.validateRequest(request)) {
35+
throw new Error(`The HTTP method CONNECT is not supported by the GotScrapingHttpClient.`);
36+
}
37+
2338
const gotResult = await gotScraping({
2439
...request,
2540
// `HttpCrawler` reads the cookies beforehand and sets them in `request.gotOptions`.
@@ -31,23 +46,45 @@ export class GotScrapingHttpClient implements BaseHttpClient {
3146
},
3247
});
3348

34-
return {
35-
...gotResult,
36-
body: gotResult.body as ResponseTypes[TResponseType],
37-
request: { url: request.url, ...gotResult.request },
38-
};
49+
const parsedHeaders = Object.entries(gotResult.headers)
50+
.map(([key, value]) => {
51+
if (value === undefined) return [];
52+
53+
if (Array.isArray(value)) {
54+
return value.map((v) => [key, v]);
55+
}
56+
57+
return [[key, value]];
58+
})
59+
.flat() as [string, string][];
60+
61+
return new ResponseWithUrl(new Uint8Array(gotResult.rawBody), {
62+
headers: new Headers(parsedHeaders),
63+
status: gotResult.statusCode,
64+
statusText: gotResult.statusMessage ?? '',
65+
url: gotResult.url,
66+
});
3967
}
4068

4169
/**
4270
* @inheritDoc
4371
*/
44-
async stream(request: HttpRequest, handleRedirect?: RedirectHandler): Promise<StreamingHttpResponse> {
72+
async stream(request: HttpRequest, handleRedirect?: RedirectHandler): Promise<Response> {
73+
if (!this.validateRequest(request)) {
74+
throw new Error(`The HTTP method CONNECT is not supported by the GotScrapingHttpClient.`);
75+
}
4576
// eslint-disable-next-line no-async-promise-executor
4677
return new Promise(async (resolve, reject) => {
4778
const stream = gotScraping({ ...request, isStream: true, cookieJar: undefined });
4879

49-
stream.on('redirect', (updatedOptions: Options, redirectResponse: PlainResponse) => {
50-
handleRedirect?.(redirectResponse, updatedOptions);
80+
stream.on('redirect', (updatedOptions: Options, redirectResponse: any) => {
81+
const nativeRedirectResponse = new ResponseWithUrl(redirectResponse.rawBody, {
82+
headers: redirectResponse.headers,
83+
status: redirectResponse.statusCode,
84+
statusText: redirectResponse.statusMessage,
85+
url: redirectResponse.url,
86+
});
87+
handleRedirect?.(nativeRedirectResponse, updatedOptions);
5188
});
5289

5390
// We need to end the stream for DELETE requests, otherwise it will hang.
@@ -58,37 +95,15 @@ export class GotScrapingHttpClient implements BaseHttpClient {
5895
stream.on('error', reject);
5996

6097
stream.on('response', (response: PlainResponse) => {
61-
const result: StreamingHttpResponse = {
62-
stream,
63-
request,
64-
redirectUrls: response.redirectUrls,
65-
url: response.url,
66-
ip: response.ip,
67-
statusCode: response.statusCode,
68-
headers: response.headers,
69-
trailers: response.trailers,
70-
complete: response.complete,
71-
get downloadProgress() {
72-
return stream.downloadProgress;
73-
},
74-
get uploadProgress() {
75-
return stream.uploadProgress;
76-
},
77-
};
78-
79-
Object.assign(result, response); // TODO BC - remove in 4.0
80-
81-
resolve(result);
82-
83-
stream.on('end', () => {
84-
result.complete = response.complete;
85-
86-
result.trailers ??= {};
87-
Object.assign(result.trailers, response.trailers);
88-
89-
(result as any).rawTrailers ??= []; // TODO BC - remove in 4.0
90-
Object.assign((result as any).rawTrailers, response.rawTrailers);
91-
});
98+
// Cast shouldn't be needed here, undici might have a different `ReadableStream` type
99+
resolve(
100+
new ResponseWithUrl(Readable.toWeb(stream) as any, {
101+
status: response.statusCode,
102+
statusText: response.statusMessage ?? '',
103+
headers: response.headers as HeadersInit,
104+
url: response.url,
105+
}),
106+
);
92107
});
93108
});
94109
}

packages/core/src/request.ts

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ export enum RequestState {
8181
* ```
8282
* @category Sources
8383
*/
84-
export class Request<UserData extends Dictionary = Dictionary> {
84+
class CrawleeRequest<UserData extends Dictionary = Dictionary> {
8585
/** Request ID */
8686
id?: string;
8787

@@ -196,7 +196,8 @@ export class Request<UserData extends Dictionary = Dictionary> {
196196
this.url = url;
197197
this.loadedUrl = loadedUrl;
198198
this.uniqueKey =
199-
uniqueKey || Request.computeUniqueKey({ url, method, payload, keepUrlFragment, useExtendedUniqueKey });
199+
uniqueKey ||
200+
CrawleeRequest.computeUniqueKey({ url, method, payload, keepUrlFragment, useExtendedUniqueKey });
200201
this.method = method;
201202
this.payload = payload;
202203
this.noRetry = noRetry;
@@ -259,6 +260,18 @@ export class Request<UserData extends Dictionary = Dictionary> {
259260
}
260261
}
261262

263+
/**
264+
* Converts the Crawlee Request object to a `fetch` API Request object.
265+
* @returns The native `fetch` API Request object.
266+
*/
267+
public intoFetchAPIRequest(): Request {
268+
return new Request(this.url, {
269+
method: this.method,
270+
headers: this.headers,
271+
body: this.payload,
272+
});
273+
}
274+
262275
/** Tells the crawler processing this request to skip the navigation and process the request directly. */
263276
get skipNavigation(): boolean {
264277
return this.userData.__crawlee?.skipNavigation ?? false;
@@ -419,7 +432,7 @@ export class Request<UserData extends Dictionary = Dictionary> {
419432
}
420433
return normalizedUrl;
421434
}
422-
const payloadHash = payload ? Request.hashPayload(payload) : '';
435+
const payloadHash = payload ? CrawleeRequest.hashPayload(payload) : '';
423436
return `${normalizedMethod}(${payloadHash}):${normalizedUrl}`;
424437
}
425438

@@ -561,10 +574,12 @@ interface ComputeUniqueKeyOptions {
561574
useExtendedUniqueKey?: boolean;
562575
}
563576

564-
export type Source = (Partial<RequestOptions> & { requestsFromUrl?: string; regex?: RegExp }) | Request;
577+
export type Source = (Partial<RequestOptions> & { requestsFromUrl?: string; regex?: RegExp }) | CrawleeRequest;
565578

566579
/** @internal */
567580
export interface InternalSource {
568581
requestsFromUrl: string;
569582
regex?: RegExp;
570583
}
584+
585+
export { CrawleeRequest as Request };

0 commit comments

Comments (0)