From 7962b253408df4d7c8959b51bb15eb9d053b7117 Mon Sep 17 00:00:00 2001 From: Cuadrix <51061675+Cuadrix@users.noreply.github.com> Date: Tue, 30 Jun 2020 00:46:41 +0100 Subject: [PATCH] v1.2.7 --- README.md | 111 +++++++++++++----------------------- changelog.md | 5 ++ package.json | 8 +-- src/core/lookup.js | 43 +++++++------- src/core/proxy.js | 65 ++++++++++++--------- src/lib/cdp.js | 18 ++++++ src/lib/cookies.js | 113 ++++++++++++++++++++++++++++++++++++- src/lib/options.js | 37 +++--------- src/{lib => util}/types.js | 0 9 files changed, 243 insertions(+), 157 deletions(-) create mode 100644 src/lib/cdp.js rename src/{lib => util}/types.js (100%) diff --git a/README.md b/README.md index efbd38f..9dd8f3f 100644 --- a/README.md +++ b/README.md @@ -5,10 +5,10 @@ Forwards intercepted requests from the browser to Node.js where it handles the r ## Features -- Proxy per page **and** per request -- Supports **(** http, https, socks4, socks5 **)** proxies -- Authentication -- Cookie handling internally +- Proxy per page and proxy per request +- Supports **http**, **https**, **socks4** and **socks5** proxies +- Supports authentication +- Handles cookies ## Installation ``` @@ -20,9 +20,9 @@ npm i puppeteer-page-proxy - `pageOrReq` <[object](https://developer.mozilla.org/en-US/docs/Glossary/Object)> 'Page' or 'Request' object to set a proxy for. - `proxy` <[string](https://developer.mozilla.org/en-US/docs/Glossary/String)|[object](https://developer.mozilla.org/en-US/docs/Glossary/Object)> Proxy to use in the current page. * Begins with a protocol (e.g. http://, https://, socks://) - * In the case of [proxy per request](https://github.com/Cuadrix/puppeteer-page-proxy#proxy-per-request), this can be an object with optional properites for overriding requests:\ + * In the case of [proxy per request](https://github.com/Cuadrix/puppeteer-page-proxy#proxy-per-request), this can be an object with optional properties for overriding requests:\ `url`, `method`, `postData`, `headers`\ -See [request.continue](https://github.com/puppeteer/puppeteer/blob/master/docs/api.md#requestcontinueoverrides) for more info about the above properties. +See [httpRequest.continue](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#httprequestcontinueoverrides) for more info about the above properties. #### PageProxy.lookup(page[, lookupService, isJSON, timeout]) @@ -38,26 +38,14 @@ See [request.continue](https://github.com/puppeteer/puppeteer/blob/master/docs/a **NOTE:** By default this method expects a response in [JSON](https://en.wikipedia.org/wiki/JSON#Example) format and [JSON.parse](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/JSON/parse)'s it to a usable javascript object. To disable this functionality, set `isJSON` to `false`. ## Usage -#### Proxy per page: +#### Importing: ```js -const puppeteer = require('puppeteer'); const useProxy = require('puppeteer-page-proxy'); +``` -(async () => { - const site = 'https://example.com'; - const proxy = 'http://host:port'; - const proxy2 = 'https://host:port'; - - const browser = await puppeteer.launch({headless: false}); - - const page = await browser.newPage(); - await useProxy(page, proxy); - await page.goto(site); - - const page2 = await browser.newPage(); - await useProxy(page2, proxy2); - await page2.goto(site); -})(); +#### Proxy per page: +```js +await useProxy(page, 'http://127.0.0.1:80'); ``` To remove proxy, omit or pass in falsy value (e.g `null`): ```js @@ -66,33 +54,21 @@ await useProxy(page, null); #### Proxy per request: ```js -const puppeteer = require('puppeteer'); -const useProxy = require('puppeteer-page-proxy'); - -(async () => { - const site = 'https://example.com'; - const proxy = 'socks://host:port'; - - const browser = await puppeteer.launch({headless: false}); - const page = await browser.newPage(); - - await page.setRequestInterception(true); - page.on('request', async req => { - await useProxy(req, proxy); - }); - await page.goto(site); -})(); +await page.setRequestInterception(true); +page.on('request', async request => { + await useProxy(request, 'https://127.0.0.1:443'); +}); ``` The request object itself is passed as the first argument. The proxy can now be changed every request. Using it along with other interception methods: ```js await page.setRequestInterception(true); -page.on('request', async req => { +page.on('request', async request => { if (req.resourceType() === 'image') { req.abort(); } else { - await useProxy(req, proxy); + await useProxy(request, 'socks4://127.0.0.1:1080'); } }); ``` @@ -100,9 +76,9 @@ page.on('request', async req => { Overriding requests: ```js await page.setRequestInterception(true); -page.on('request', async req => { - await useProxy(req, { - proxy: proxy, +page.on('request', async request => { + await useProxy(request, { + proxy: 'socks5://127.0.0.1:1080', url: 'https://example.com', method: 'POST', postData: '404', @@ -113,40 +89,29 @@ page.on('request', async req => { }); ``` -**NOTE:** It is necessary to set [page.setRequestInterception](https://github.com/puppeteer/puppeteer/blob/master/docs/api.md#pagesetrequestinterceptionvalue) to true when setting proxies per request, otherwise the function will fail. +**NOTE:** It is necessary to set [page.setRequestInterception](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagesetrequestinterceptionvalue) to true when setting proxies per request, otherwise the function will fail. -#### Authentication: +#### Authenticating: ```js -const proxy = 'https://login:pass@host:port'; +const proxy = 'https://user:pass@host:port'; ``` -#### Lookup IP used by proxy: +#### IP lookup: ```js -const puppeteer = require('puppeteer'); -const useProxy = require('puppeteer-page-proxy'); - -(async () => { - const site = 'https://example.com'; - const proxy1 = 'http://host:port'; - const proxy2 = 'https://host:port'; - - const browser = await puppeteer.launch({headless: false}); - - // 1 - const page1 = await browser.newPage(); - await useProxy(page1, proxy1); - let data = await useProxy.lookup(page1); // Waits until done, 'then' continues - console.log(data.ip); - await page1.goto(site); +// 1. Waits until done, 'then' continues +const data = await useProxy.lookup(page1); + console.log(data.ip); - // 2 - const page2 = await browser.newPage(); - await useProxy(page2, proxy2); - useProxy.lookup(page2).then(data => { // Executes and 'comes back' once done - console.log(data.ip); - }); - await page2.goto(site); -})(); +// 2. Executes and 'comes back' once done +useProxy.lookup(page2).then(data => { + console.log(data.ip); +}); +``` +In case of any [CORS](https://developer.mozilla.org/en-US/docs/Web/HTTP/CORS) errors, use `--disable-web-security` launch flag: +```js +const browser = await puppeteer.launch({ + args: ['--disable-web-security'] +}); ``` ## FAQ @@ -156,7 +121,7 @@ It takes over the task of requesting content **from** the browser to do it inter #### Why am I getting _"Request is already handled!"_? -This happens when there is an attempt to handle the same request more than once. An intercepted request is handled by either [request.abort](https://github.com/puppeteer/puppeteer/blob/master/docs/api.md#requestaborterrorcode), [request.continue](https://github.com/puppeteer/puppeteer/blob/master/docs/api.md#requestcontinueoverrides) or [request.respond](https://github.com/puppeteer/puppeteer/blob/master/docs/api.md#requestrespondresponse) methods. Each of these methods 'send' the request to its destination. A request that has already reached its destination cannot be intercepted or handled. +This happens when there is an attempt to handle the same request more than once. An intercepted request is handled by either [httpRequest.abort](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#httprequestaborterrorcode), [httpRequest.continue](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#httprequestcontinueoverrides) or [httpRequest.respond](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#httprequestrespondresponse) methods. Each of these methods 'send' the request to its destination. A request that has already reached its destination cannot be intercepted or handled. #### Why does the browser show _"Your connection to this site is not secure"_? diff --git a/changelog.md b/changelog.md index f5853f3..df26636 100644 --- a/changelog.md +++ b/changelog.md @@ -1,4 +1,9 @@ # Change log +### [1.2.7] - 2020-06-30 +#### Changes +- Reimplement cookie handling to account for deletion and addition of browser cookies +- Changed default lookup fetch source to **api64.ipify.org** +- Update documentation ### [1.2.6] - 2020-06-18 #### Changes - Updated for Puppeteer's v4.0.0 [breaking changes](https://github.com/puppeteer/puppeteer/releases/tag/v4.0.0) ([#22](https://github.com/Cuadrix/puppeteer-page-proxy/issues/22), [#23](https://github.com/Cuadrix/puppeteer-page-proxy/issues/23)) diff --git a/package.json b/package.json index 63e784d..3bc174a 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "puppeteer-page-proxy", "description": "Additional Node.js module to use with 'puppeteer' for setting proxies per page basis.", - "version": "1.2.6", + "version": "1.2.7", "author": "Cuadrix (https://github.com/Cuadrix)", "homepage": "https://github.com/Cuadrix/puppeteer-page-proxy", "main": "./src/index.js", @@ -10,8 +10,8 @@ "test": "echo \"Error: no test specified\" && exit 1" }, "repository": { - "type" : "git", - "url" : "https://github.com/Cuadrix/puppeteer-page-proxy.git" + "type": "git", + "url": "https://github.com/Cuadrix/puppeteer-page-proxy.git" }, "keywords": [ "puppeteer", @@ -28,4 +28,4 @@ "socks-proxy-agent": "^5.0.0", "tough-cookie": "^4.0.0" } -} \ No newline at end of file +} diff --git a/src/core/lookup.js b/src/core/lookup.js index 7883c58..2b1d95b 100644 --- a/src/core/lookup.js +++ b/src/core/lookup.js @@ -1,39 +1,36 @@ -const lookup = async (page, lookupService = "https://api.ipify.org?format=json", isJSON = true, timeout = 30000) => { +const lookup = async (page, lookupService = "https://api64.ipify.org?format=json", isJSON = true, timeout = 30000) => { const doLookup = async () => { return await page.evaluate((lookupService, timeout, isJSON) => { return new Promise((resolve) => { - const req = new XMLHttpRequest(); - req.timeout = timeout; - req.onload = () => { - if (req.status >= 200 && req.status <= 299) { - resolve(isJSON ? JSON.parse(req.responseText) : req.responseText); - } else { - resolve(onLookupFailed(`Request from ${window.location.href} to ${lookupService} failed with status code ${req.status}`)); - } + const request = new XMLHttpRequest(); + request.timeout = timeout; + request.onload = () => { + if (request.status >= 200 && request.status <= 299) { + resolve(isJSON ? JSON.parse(request.responseText) : request.responseText); + } else {resolve(onLookupFailed( + `Request from ${window.location.href} to ` + + `${lookupService} failed with status code ${request.status}` + ))} }; - req.ontimeout = (error) => { - resolve(onLookupFailed(`Request from ${window.location.href} to ${lookupService} timed out -> ${req.timeout} ms`)); - }; - req.open("GET", lookupService, true); - req.send(); + request.ontimeout = (error) => {resolve(onLookupFailed( + `Request from ${window.location.href} to ` + + `${lookupService} timed out at ${request.timeout} ms` + ))}; + request.open("GET", lookupService, true); + request.send(); }); }, lookupService, timeout, isJSON); }; try { await page.setBypassCSP(true); - const functionName = "onLookupFailed"; + const functionName = "$ppp_on_lookup_failed"; if (!page._pageBindings.has(functionName)) { - await page.exposeFunction(functionName, (reason) => { - console.error(reason); - return; + await page.exposeFunction(functionName, (failReason) => { + console.error(failReason); return; }); } return await doLookup(); - } catch(error) { - if (error.message.startsWith("Execution context was destroyed")) { - return await doLookup(); - } - } + } catch(error) {console.error(error)} }; module.exports = lookup; \ No newline at end of file diff --git a/src/core/proxy.js b/src/core/proxy.js index 1c9f5f9..3bde32d 100644 --- a/src/core/proxy.js +++ b/src/core/proxy.js @@ -1,28 +1,41 @@ -const request = require("got"); -const type = require("../lib/types"); -const cookieJar = require("../lib/cookies"); -const {setOverrides, setHeaders, setAgent} = require("../lib/options"); +const got = require("got"); +const CookieHandler = require("../lib/cookies"); +const {setHeaders, setAgent} = require("../lib/options"); +const type = require("../util/types"); // Responsible for applying proxy -const proxyHandler = async (req, proxy) => { +const requestHandler = async (request, proxy, overrides = {}) => { + // Reject non http(s) URI schemes + if (!request.url().startsWith("http") && !request.url().startsWith("https")) { + request.continue(); return; + } + const cookieHandler = new CookieHandler(request); + // Request options for Got accounting for overrides const options = { - cookieJar, - method: req.method(), - body: req.postData(), - headers: setHeaders(req), + cookieJar: await cookieHandler.getCookies(), + method: overrides.method || request.method(), + body: overrides.postData || request.postData(), + headers: overrides.headers || setHeaders(request), agent: setAgent(proxy), responseType: "buffer", maxRedirects: 15, throwHttpErrors: false }; try { - const res = await request(req.url(), options); - await req.respond({ - status: res.statusCode, - headers: res.headers, - body: res.body + const response = await got(overrides.url || request.url(), options); + // Set cookies manually because "set-cookie" doesn't set all cookies (?) + // Perhaps related to https://github.com/puppeteer/puppeteer/issues/5364 + const setCookieHeader = response.headers["set-cookie"]; + if (setCookieHeader) { + await cookieHandler.setCookies(setCookieHeader); + response.headers["set-cookie"] = undefined; + } + await request.respond({ + status: response.statusCode, + headers: response.headers, + body: response.body }); - } catch(error) {await req.abort()} + } catch(error) {await request.abort()} }; // For reassigning proxy of page @@ -41,7 +54,7 @@ const removeRequestListener = (page, listenerName) => { }; // Calls this if request object passed -const proxyPerRequest = async (req, data) => { +const proxyPerRequest = async (request, data) => { let proxy, overrides; // Separate proxy and overrides if (type(data) === "object") { @@ -51,21 +64,21 @@ const proxyPerRequest = async (req, data) => { overrides = data; } } else {proxy = data} - req = setOverrides(req, overrides); // Skip request if proxy omitted - if (proxy) {await proxyHandler(req, proxy)} - else {req.continue(overrides)} + if (proxy) {await requestHandler(request, proxy, overrides)} + else {request.continue(overrides)} }; // Calls this if page object passed const proxyPerPage = async (page, proxy) => { await page.setRequestInterception(true); - removeRequestListener(page, "$ppp"); - if (proxy) { - page.on("request", $ppp = async (req) => { - await proxyHandler(req, proxy); - }); - } else {await page.setRequestInterception(false)} + const listener = "$ppp_request_listener"; + removeRequestListener(page, listener); + const f = {[listener]: async (request) => { + await requestHandler(request, proxy); + }}; + if (proxy) {page.on("request", f[listener])} + else {await page.setRequestInterception(false)} }; // Main function @@ -74,7 +87,7 @@ const useProxy = async (target, data) => { if (targetType === "HTTPRequest") { await proxyPerRequest(target, data); } else if (targetType === "Page") { - await proxyPerPage(target, data) + await proxyPerPage(target, data); } }; diff --git a/src/lib/cdp.js b/src/lib/cdp.js new file mode 100644 index 0000000..b366126 --- /dev/null +++ b/src/lib/cdp.js @@ -0,0 +1,18 @@ +class CDP { + constructor(client) { + // Network domain: https://chromedevtools.github.io/devtools-protocol/1-3/Network/ + this.Network = { + async getCookies(urls) { + return (await client.send("Network.getCookies", urls)).cookies; + }, + async setCookies(cookies) { + await client.send("Network.setCookies", cookies); + }, + async deleteCookies(cookies) { + await client.send("Network.deleteCookies", cookies); + } + } + } +} + +module.exports = CDP; \ No newline at end of file diff --git a/src/lib/cookies.js b/src/lib/cookies.js index df280cc..1bfdcbe 100644 --- a/src/lib/cookies.js +++ b/src/lib/cookies.js @@ -1,3 +1,114 @@ const {CookieJar} = require("tough-cookie"); +const CDP = require("./cdp"); -module.exports = new CookieJar(); \ No newline at end of file +// Parse single raw cookie string to a cookie object for the browser +const parseCookie = (rawCookie, domain) => { + const cookie = {name: "", value: "", domain, path: "/", secure: false, httpOnly: false, sameSite: "Lax", expires: undefined}; + const pairs = rawCookie.split(/; */); + for (let i = 0; i < pairs.length; i++) { + // Split to key value pair e.g. key=value + const pair = pairs[i].split(/=(.*)/, 2); + // Trim and assign key and value + let key = pair[0].trim(); + let value = pair[1] ? pair[1].trim() : ""; + // Remove surrounding quotes from value if exists + value = value.replace(/^"(.*)"$/, "$1"); + switch (key.toLowerCase()) { + case "domain": cookie.domain = value; break; + case "path": cookie.path = value; break; + case "secure": cookie.secure = true; break; + case "httponly": cookie.httpOnly = true; break; + case "samesite": + const firstChar = value[0].toUpperCase(); + const restChars = value.slice(1).toLowerCase(); + cookie.sameSite = firstChar + restChars; + break; + case "max-age": + // Current time and 'max-age' in seconds + const currentTime = new Date().getTime() / 1000; + const maxAge = parseInt(value); + cookie.expires = Math.round(currentTime + maxAge); + break; + case "expires": + // If cookie expires hasn't already been set by 'max-age' + if (!cookie.expires) { + const time = new Date(value).getTime(); + cookie.expires = Math.round(time / 1000); + } + break; + default: if (i < 1) {cookie.name = key; cookie.value = value} + } + } + return cookie; +} + +// Format single browser cookie object to tough-cookie object +const formatCookie = (cookie) => { + const currentDate = new Date().toISOString(); + return { + key: cookie.name, + value: cookie.value, + expires: (cookie.expires === -1) ? "Infinity" : new Date(cookie.expires * 1000).toISOString(), + domain: cookie.domain.replace(/^\./, ""), + path: cookie.path, + secure: cookie.secure, + httpOnly: cookie.httpOnly, + sameSite: cookie.sameSite, + hostOnly: !cookie.domain.startsWith("."), + creation: currentDate, + lastAccessed: currentDate + }; +}; + +// Responsible for getting and setting browser cookies +class CookieHandler extends CDP { + constructor(request) { + super(request._client); + this.url = request.isNavigationRequest() ? request.url() : request.frame().url(); + this.domain = new URL(this.url).hostname; + } + // Parse an array of raw cookies to an array of cookie objects + parseCookies(rawCookies) { + return rawCookies.map((rawCookie) => { + return parseCookie(rawCookie, this.domain); + }); + }; + // Format browser cookies to tough-cookies + formatCookies(cookies) { + return cookies.map((cookie) => { + return formatCookie(cookie); + }); + }; + // Get browser cookies of current page/url + async getCookies() { + const browserCookies = await this.Network.getCookies({urls: [this.url]}); + const toughCookies = this.formatCookies(browserCookies); + // Add cookies to cookieJar + const cookieJar = CookieJar.deserializeSync({ + version: 'tough-cookie@4.0.0', + storeType: 'MemoryCookieStore', + rejectPublicSuffixes: true, + cookies: toughCookies + }); + return cookieJar; + } + // Set cookies to browser from "set-cookie" header + async setCookies(rawCookies) { + const browserCookies = this.parseCookies(rawCookies); + // Delete old cookies before setting new ones + for (let i = 0; i < browserCookies.length; i++) { + const cookie = browserCookies[i]; + const badCookie = { + name: cookie.name, + url: this.url, + domain: cookie.domain, + path: cookie.path + }; + await this.Network.deleteCookies(badCookie); + } + // Store cookies in the browser + await this.Network.setCookies({cookies: browserCookies}); + } +} + +module.exports = CookieHandler; \ No newline at end of file diff --git a/src/lib/options.js b/src/lib/options.js index 74a66fc..21b63d0 100644 --- a/src/lib/options.js +++ b/src/lib/options.js @@ -2,39 +2,16 @@ const HttpProxyAgent = require("http-proxy-agent"); const HttpsProxyAgent = require("https-proxy-agent"); const SocksProxyAgent = require("socks-proxy-agent"); -// For overriding request objects -const setOverrides = (req, overrides) => { - const map = { - url: true, - method: true, - postData: true, - headers: true - }; - for (const key in overrides) { - if (map[key]) { - if (key === "headers") { - req["$" + key] = () => overrides[key]; - } else { - req[key] = () => overrides[key]; - } - } - } - return req; -}; - -// Some extra headers -const setHeaders = (req) => { - // If headers have been overriden - if (req.$headers) - return req.$headers(); - // Extended default headers +// Set some extra headers because Puppeteer doesn't capture all request headers +// Related: https://github.com/puppeteer/puppeteer/issues/5364 +const setHeaders = (request) => { const headers = { - ...req.headers(), + ...request.headers(), "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", "accept-encoding": "gzip, deflate, br", - "host": new URL(req.url()).hostname + "host": new URL(request.url()).hostname } - if (req.isNavigationRequest()) { + if (request.isNavigationRequest()) { headers["sec-fetch-mode"] = "navigate"; headers["sec-fetch-site"] = "none"; headers["sec-fetch-user"] = "?1"; @@ -59,4 +36,4 @@ const setAgent = (proxy) => { }; }; -module.exports = {setOverrides, setHeaders, setAgent}; \ No newline at end of file +module.exports = {setHeaders, setAgent}; \ No newline at end of file diff --git a/src/lib/types.js b/src/util/types.js similarity index 100% rename from src/lib/types.js rename to src/util/types.js