From 1975618d1cd5067f26319c81e279097344a411f6 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Wed, 5 Apr 2023 09:49:39 -0700
Subject: [PATCH] origin override: add --originOverride source=dest

Add --originOverride source=dest to allow routing where
https://src-host:src-port/path/page.html -> http://dest-host:dest-port/path/page.html
where source=https://src-host:src-port and dest=http://dest-host:dest-port
---
Review notes (this section is ignored by `git am`):
- initPage() is now async and awaits page.route() / route.fulfill(), so the
  routes are registered before the crawler navigates and failures are caught.
- When the override fetch fails, the intercepted request is continued to its
  original origin instead of being left pending.
- A malformed --originOverride entry (missing "=") is reported with the
  offending value.
- util/originoverride.js was amended during review; its `index` line still
  carries the blob id of the originally submitted file.

 crawler.js             |  9 ++++++
 util/argParser.js      |  8 ++++-
 util/originoverride.js | 68 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 84 insertions(+), 1 deletion(-)
 create mode 100644 util/originoverride.js

diff --git a/crawler.js b/crawler.js
index 6528a2d5e..09a60d2da 100644
--- a/crawler.js
+++ b/crawler.js
@@ -27,6 +27,7 @@ import { Browser } from "./util/browser.js";
 
 import { BEHAVIOR_LOG_FUNC, HTML_TYPES, DEFAULT_SELECTORS } from "./util/constants.js";
 import { AdBlockRules, BlockRules } from "./util/blockrules.js";
+import { OriginOverride } from "./util/originoverride.js";
 
 // to ignore HTTPS error for HEAD check
 import { Agent as HTTPAgent } from "http";
@@ -709,6 +710,10 @@ export class Crawler {
 
     this.screencaster = this.initScreenCaster();
 
+    if (this.params.originOverride) {
+      this.originOverride = new OriginOverride(this.params.originOverride);
+    }
+
     for (let i = 0; i < this.params.scopedSeeds.length; i++) {
       const seed = this.params.scopedSeeds[i];
       if (!await this.queueUrl(i, seed.url, 0, 0)) {
@@ -965,6 +970,10 @@ export class Crawler {
       await this.blockRules.initPage(page);
     }
 
+    if (this.originOverride) {
+      await this.originOverride.initPage(page);
+    }
+
     let ignoreAbort = false;
 
     // Detect if ERR_ABORTED is actually caused by trying to load a non-page (eg. downloadable PDF),
diff --git a/util/argParser.js b/util/argParser.js
index 14e697397..ae4d69abc 100644
--- a/util/argParser.js
+++ b/util/argParser.js
@@ -360,7 +360,13 @@ class ArgParser {
         alias: ["desc"],
         describe: "If set, write supplied description into WACZ datapackage.json metadata",
         type: "string"
-      }
+      },
+
+      "originOverride": {
+        describe: "if set, will redirect requests from each origin in key to origin in the value, eg. --originOverride https://host:port=http://alt-host:alt-port",
+        type: "array",
+        default: [],
+      },
     };
   }
 
diff --git a/util/originoverride.js b/util/originoverride.js
new file mode 100644
index 000000000..95ed842ce
--- /dev/null
+++ b/util/originoverride.js
@@ -0,0 +1,68 @@
+import { errJSON, logger } from "./logger.js";
+
+/**
+ * Reroutes page requests made to a source origin so that they are fetched
+ * from a destination origin instead, keeping the rest of the URL unchanged.
+ */
+export class OriginOverride
+{
+  /**
+   * @param {string[]} originOverride entries of the form "<source url>=<dest url>";
+   *   only the origin of each url is kept
+   * @throws {TypeError} if an entry lacks "=" or either side is not a valid URL
+   */
+  constructor(originOverride) {
+    this.originOverride = originOverride.map((override) => {
+      const [orig, dest] = override.split("=");
+
+      if (!orig || !dest) {
+        throw new TypeError(`Invalid origin override, expected <source>=<dest>: ${override}`);
+      }
+
+      return {orig: new URL(orig).origin, dest: new URL(dest).origin};
+    });
+  }
+
+  /**
+   * Registers a route on the page for every configured override.
+   * @param page page to intercept; must provide url() and route()
+   * @returns {Promise<void>} resolves once all routes are registered
+   */
+  async initPage(page) {
+    for (const {orig, dest} of this.originOverride) {
+      const logDetails = {page: page.url(), orig, dest};
+
+      logger.debug(`Adding override ${orig} => ${dest}`);
+
+      await page.route(orig + "/**", async (route) => {
+        try {
+          const request = route.request();
+          const url = request.url();
+
+          // NOTE(review): only the headers are forwarded, so the method and body
+          // of the intercepted request are not replayed (fetch defaults to GET)
+          const newUrl = dest + url.slice(orig.length);
+          const resp = await fetch(newUrl, {headers: request.headers()});
+
+          const body = Buffer.from(await resp.arrayBuffer());
+          const headers = Object.fromEntries(resp.headers);
+          const status = resp.status;
+
+          logger.debug("Origin overridden", {orig: url, dest: newUrl, status, body: body.length}, "originoverride");
+
+          await route.fulfill({body, headers, status});
+
+        } catch (e) {
+          logger.warn("Error overriding origin", {...errJSON(e), ...logDetails}, "originoverride");
+
+          // never leave the request pending: load it from its original origin instead
+          try {
+            await route.continue();
+          } catch (e2) {
+            logger.debug("Could not continue request after failed override", {...errJSON(e2), ...logDetails}, "originoverride");
+          }
+        }
+      });
+    }
+  }
+}