
Commit 71b618f

Switch back to Puppeteer from Playwright (#301)
- reduced memory usage, avoids memory leak issues caused by using playwright (see #298)
- browser: split Browser into Browser and BaseBrowser
- browser: puppeteer-specific functions added to Browser for additional flexibility if need to change again later
- browser: use defaultArgs from playwright
- browser: attempt to recover if initial target is gone
- logging: add debug logging from process.memoryUsage() after every page
- request interception: use priorities for cooperative request interception
- request interception: move to setupPage() to run once per page, enable if any of blockrules, adblockrules or originOverrides are used
- request interception: fix originOverrides enabled check, fix to work with catch-all request interception
- default args: set --waitUntil back to 'load,networkidle2'
- Update README with changes for puppeteer
- tests: fix extra hops depth test to ensure more than one page crawled

Co-authored-by: Tessa Walsh <[email protected]>
1 parent d4e222f commit 71b618f

12 files changed: +561 −204 lines

README.md

+21 −14
@@ -1,6 +1,6 @@
 # Browsertrix Crawler
 
-Browsertrix Crawler is a simplified (Chrome) browser-based high-fidelity crawling system, designed to run a complex, customizable browser-based crawl in a single Docker container. Browsertrix Crawler uses [Playwright](https://github.com/microsoft/playwright) to control one or more browser windows in parallel.
+Browsertrix Crawler is a simplified (Chrome) browser-based high-fidelity crawling system, designed to run a complex, customizable browser-based crawl in a single Docker container. Browsertrix Crawler uses [Puppeteer](https://github.com/puppeteer/puppeteer) to control one or more browser windows in parallel.
 
 ## Features
 
@@ -14,7 +14,7 @@ Thus far, Browsertrix Crawler supports:
 - Screencasting: Ability to watch crawling in real-time (experimental).
 - Screenshotting: Ability to take thumbnails, full page screenshots, and/or screenshots of the initial page view.
 - Optimized (non-browser) capture of non-HTML resources.
-- Extensible Playwright driver script for customizing behavior per crawl or page.
+- Extensible Puppeteer driver script for customizing behavior per crawl or page.
 - Ability to create and reuse browser profiles interactively or via automated user/password login using an embedded browser.
 - Multi-platform support -- prebuilt Docker images available for Intel/AMD and Apple Silicon (M1/M2) CPUs.
 
@@ -69,13 +69,14 @@ Options:
       --crawlId, --id       A user provided ID for this crawl or crawl
                             configuration (can also be set via CRAWL_ID env var)
-                            [string] [default: "454230b33b8f"]
+                            [string] [default: "97792ef37eaf"]
       --newContext          Deprecated as of 0.8.0, any values passed will be
                             ignored  [string] [default: null]
-      --waitUntil           Playwright page.goto() condition to wait for before
-                            continuing  [default: "load"]
+      --waitUntil           Puppeteer page.goto() condition to wait for before
+                            continuing, can be multiple separated by ','
+                            [default: "load,networkidle2"]
       --depth               The depth of the crawl for all seeds
                             [number] [default: -1]
       --extraHops           Number of extra 'hops' to follow, be
@@ -150,10 +151,9 @@ Options:
                             o process.cwd()  [string] [default: "/crawls"]
       --mobileDevice        Emulate mobile device by name from:
-                            https://github.com/microsoft/playwright/blob/main/
-                            packages/playwright-core/src/server/
-                            deviceDescriptorsSource.json  [string]
+                            https://github.com/puppeteer/puppeteer/blob/main/
+                            src/common/DeviceDescriptors.ts  [string]
       --userAgent           Override user-agent with specified string  [string]
       --userAgentSuffix     Append suffix to existing browser us
@@ -240,6 +240,13 @@ Options:
       --description, --desc If set, write supplied description into WACZ
                             datapackage.json metadata  [string]
+      --originOverride      if set, will redirect requests from each origin in
+                            key to origin in the value, eg. --originOverride
+                            https://host:port=http://alt-host:alt-port
+                            [array] [default: []]
+      --logErrorsToRedis    If set, write error messages to redis
+                            [boolean] [default: false]
       --config              Path to YAML config file
 
 ```
@@ -250,9 +257,9 @@ Options:
 
 One of the key nuances of browser-based crawling is determining when a page is finished loading. This can be configured with the `--waitUntil` flag.
 
-The default is `load`, which waits until page load, but for static sites, `--wait-until domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load, for example). `--waitUntil networkidle` may make sense for sites where absolutely all requests must complete before proceeding.
+The default is `load,networkidle2`, which waits until page load and until no more than two network requests remain in flight, but for static sites, `--wait-until domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load, for example). `--waitUntil networkidle0` may make sense for sites where absolutely all requests must complete before proceeding.
 
-See [page.goto waitUntil options](https://playwright.dev/docs/api/class-page#page-goto-option-wait-until) in the Playwright docs for more info on the options that can be used with this flag.
+See [page.goto waitUntil options](https://pptr.dev/api/puppeteer.page.goto#remarks) in the Puppeteer docs for more info on the options that can be used with this flag.
 
 The `--pageLoadTimeout`/`--timeout` option sets the timeout in seconds for page load, defaulting to 90 seconds. Behaviors will run on the page once either the page load condition or the page load timeout is met, whichever happens first.

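The comma-separated `--waitUntil` value maps directly onto the array form of Puppeteer's `waitUntil` option. A minimal sketch of the equivalent call (the URL is a placeholder):

```js
// Equivalent of --waitUntil load,networkidle2: goto() resolves once the
// load event has fired AND no more than 2 network connections have been
// active for at least 500 ms.
await page.goto("https://example.com/", {
  waitUntil: ["load", "networkidle2"],
});
```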
@@ -543,11 +550,11 @@ The webhook URL can be an HTTP URL which receives a JSON POST request OR a Redis
 
 </details>
 
-### Configuring Chromium / Playwright / pywb
+### Configuring Chromium / Puppeteer / pywb
 
 There are a few environment variables you can set to configure chromium and pywb:
 
-- CHROME_FLAGS will be split by spaces and passed to Chromium (via `args` in Playwright). Note that setting some options is not supported, such as `--proxy-server`, since they are set by browsertrix itself.
+- CHROME_FLAGS will be split by spaces and passed to Chromium (via `args` in Puppeteer). Note that setting some options is not supported, such as `--proxy-server`, since they are set by browsertrix itself.
 - SOCKS_HOST and SOCKS_PORT are read by pywb to proxy upstream traffic
 
 Here are some example use cases:

crawler.js

+40 −26
@@ -355,6 +355,24 @@ export class Crawler {
   async setupPage({page, cdp, workerid}) {
     await this.browser.setupPage({page, cdp});
 
+    if ((this.adBlockRules && this.params.blockAds) ||
+        this.blockRules || this.originOverride) {
+
+      await page.setRequestInterception(true);
+
+      if (this.adBlockRules && this.params.blockAds) {
+        await this.adBlockRules.initPage(this.browser, page);
+      }
+
+      if (this.blockRules) {
+        await this.blockRules.initPage(this.browser, page);
+      }
+
+      if (this.originOverride) {
+        await this.originOverride.initPage(this.browser, page);
+      }
+    }
+
     if (this.params.logging.includes("jserrors")) {
       page.on("console", (msg) => {
         if (msg.type() === "error") {
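The hunk above only switches interception on; the block rules, ad-block rules, and origin overrides then register their own request handlers in modules not shown in this diff. Puppeteer's cooperative interception mode (the "priorities" mentioned in the commit message) lets several handlers share a page: each handler resolves every request with an explicit priority, and the highest-priority resolution wins. A minimal sketch of the pattern — the handler logic and URL check are illustrative, not the crawler's actual BlockRules code:

```js
// Cooperative request interception sketch. Assumes
// page.setRequestInterception(true) was already called in setupPage().
function initBlockingSketch(page) {
  page.on("request", (request) => {
    if (request.url().endsWith(".mp4")) {
      // Passing a priority (2nd arg) opts into cooperative mode: another
      // handler can still override this decision with a higher priority.
      request.abort("blockedbyclient", 1);
    } else {
      // Defer to other handlers by continuing at the default priority 0.
      request.continue(request.continueRequestOverrides(), 0);
    }
  });
}
```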
@@ -374,7 +392,7 @@ export class Crawler {
 
     if (this.params.behaviorOpts) {
       await page.exposeFunction(BEHAVIOR_LOG_FUNC, (logdata) => this._behaviorLog(logdata, page.url(), workerid));
-      await page.addInitScript(behaviors + `;\nself.__bx_behaviors.init(${this.params.behaviorOpts});`);
+      await this.browser.addInitScript(page, behaviors + `;\nself.__bx_behaviors.init(${this.params.behaviorOpts});`);
     }
   }
 
@@ -404,7 +422,7 @@ export class Crawler {
       logger.debug("Skipping screenshots for non-HTML page", logDetails);
     }
     const archiveDir = path.join(this.collDir, "archive");
-    const screenshots = new Screenshots({page, url, directory: archiveDir});
+    const screenshots = new Screenshots({browser: this.browser, page, url, directory: archiveDir});
     if (this.params.screenshot.includes("view")) {
       await screenshots.take();
     }
@@ -430,7 +448,7 @@ export class Crawler {
       logger.info("Skipping behaviors for slow page", logDetails, "behavior");
     } else {
       const res = await timedRun(
-        this.runBehaviors(page, data.filteredFrames, logDetails),
+        this.runBehaviors(page, cdp, data.filteredFrames, logDetails),
         this.params.behaviorTimeout,
         "Behaviors timed out",
         logDetails,
@@ -495,16 +513,14 @@ export class Crawler {
     }
   }
 
-  async runBehaviors(page, frames, logDetails) {
+  async runBehaviors(page, cdp, frames, logDetails) {
     try {
       frames = frames || page.frames();
 
-      const context = page.context();
-
       logger.info("Running behaviors", {frames: frames.length, frameUrls: frames.map(frame => frame.url()), ...logDetails}, "behavior");
 
       return await Promise.allSettled(
-        frames.map(frame => this.browser.evaluateWithCLI(context, frame, "self.__bx_behaviors.run();", logDetails, "behavior"))
+        frames.map(frame => this.browser.evaluateWithCLI(page, frame, cdp, "self.__bx_behaviors.run();", logDetails, "behavior"))
       );
 
     } catch (e) {
@@ -711,7 +727,7 @@ export class Crawler {
 
     this.screencaster = this.initScreenCaster();
 
-    if (this.params.originOverride) {
+    if (this.params.originOverride.length) {
       this.originOverride = new OriginOverride(this.params.originOverride);
     }
 
@@ -905,6 +921,14 @@ export class Crawler {
     });
   }
 
+  logMemory() {
+    const memUsage = process.memoryUsage();
+    const { heapUsed, heapTotal } = memUsage;
+    this.maxHeapUsed = Math.max(this.maxHeapUsed || 0, heapUsed);
+    this.maxHeapTotal = Math.max(this.maxHeapTotal || 0, heapTotal);
+    logger.debug("Memory", {maxHeapUsed: this.maxHeapUsed, maxHeapTotal: this.maxHeapTotal, ...memUsage}, "memory");
+  }
+
   async writeStats(toFile=false) {
     if (!this.params.logging.includes("stats")) {
       return;
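For reference, the fields logged by `logMemory()` come straight from Node's standard `process.memoryUsage()` API; all values are byte counts:

```js
// Standard Node.js API, no imports needed; all values are byte counts.
const { rss, heapTotal, heapUsed, external, arrayBuffers } = process.memoryUsage();
console.log({ rss, heapTotal, heapUsed, external, arrayBuffers });
// e.g. { rss: 123456789, heapTotal: 45678901, heapUsed: 34567890, ... }
// (values illustrative)
```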
@@ -926,6 +950,7 @@ export class Crawler {
     };
 
     logger.info("Crawl statistics", stats, "crawlStatus");
+    this.logMemory();
 
     if (toFile && this.params.statsFilename) {
       try {
@@ -965,18 +990,6 @@ export class Crawler {
       }
     }
 
-    if (this.adBlockRules && this.params.blockAds) {
-      await this.adBlockRules.initPage(page);
-    }
-
-    if (this.blockRules) {
-      await this.blockRules.initPage(page);
-    }
-
-    if (this.originOverride) {
-      await this.originOverride.initPage(page);
-    }
-
     let ignoreAbort = false;
 
     // Detect if ERR_ABORTED is actually caused by trying to load a non-page (eg. downloadable PDF),
@@ -998,7 +1011,7 @@ export class Crawler {
     try {
       const resp = await page.goto(url, gotoOpts);
 
-      const contentType = await resp.headerValue("content-type");
+      const contentType = await this.browser.responseHeader(resp, "content-type");
 
       isHTMLPage = this.isHTMLContentType(contentType);
 
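The new `browser.responseHeader()` helper is defined outside this diff. In Puppeteer, `HTTPResponse.headers()` returns a plain object keyed by lower-cased header names, so a plausible implementation is a one-liner (a sketch, not the actual code):

```js
// Hypothetical sketch of the Browser helper: Puppeteer lower-cases
// header names, so a direct property lookup suffices.
responseHeader(resp, header) {
  return resp.headers()[header];
}
```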
@@ -1068,7 +1081,7 @@ export class Crawler {
     await sleep(0.5);
 
     try {
-      await page.waitForLoadState("networkidle", {timeout: this.params.netIdleWait * 1000});
+      await this.browser.waitForNetworkIdle(page, {timeout: this.params.netIdleWait * 1000});
     } catch (e) {
       logger.debug("waitForNetworkIdle timed out, ignoring", details);
       // ignore, continue
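Puppeteer ships `page.waitForNetworkIdle()` natively, so the `browser.waitForNetworkIdle()` wrapper presumably just delegates to it; a sketch under that assumption (the actual wrapper is outside this diff):

```js
// Hypothetical Browser method; options here is {timeout} as passed above,
// and page.waitForNetworkIdle() also accepts an idleTime option.
async waitForNetworkIdle(page, options) {
  return await page.waitForNetworkIdle(options);
}
```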
@@ -1095,7 +1108,7 @@ export class Crawler {
     try {
       const linkResults = await Promise.allSettled(
         frames.map(frame => timedRun(
-          frame.evaluate(loadFunc, {selector: selector, extract: extract}),
+          frame.evaluate(loadFunc, {selector, extract}),
          PAGE_OP_TIMEOUT_SECS,
          "Link extraction timed out",
          logDetails,
@@ -1152,9 +1165,10 @@ export class Crawler {
     try {
       logger.debug("Check CF Blocking", logDetails);
 
-      const cloudflare = page.locator("div.cf-browser-verification.cf-im-under-attack");
-
-      while (await cloudflare.waitFor({timeout: PAGE_OP_TIMEOUT_SECS})) {
+      while (await timedRun(
+        page.$("div.cf-browser-verification.cf-im-under-attack"),
+        PAGE_OP_TIMEOUT_SECS
+      )) {
         logger.debug("Cloudflare Check Detected, waiting for reload...", logDetails);
         await sleep(5.5);
       }

create-login-profile.js

+7 −10
@@ -158,10 +158,8 @@ async function main() {
 
   const browser = new Browser();
 
-  const profileDir = await browser.loadProfile(params.profile);
-
   await browser.launch({
-    dataDir: profileDir,
+    profileUrl: params.profile,
     headless: params.headless,
     signals: true,
     chromeOptions: {
@@ -191,18 +189,17 @@ async function main() {
     params.password = await promptInput("Enter password: ", true);
   }
 
-  const { page, cdp } = await browser.getFirstPageWithCDP();
+  const { page, cdp } = await browser.newWindowPageWithCDP();
 
   const waitUntil = "load";
 
-  //await page.setCacheEnabled(false);
-  await cdp.send("Network.setCacheDisabled", {cacheDisabled: true});
+  await page.setCacheEnabled(false);
 
   if (!params.automated) {
     await browser.setupPage({page, cdp});
 
     // for testing, inject browsertrix-behaviors
-    await page.addInitScript(behaviors + ";\nself.__bx_behaviors.init();");
+    await browser.addInitScript(page, behaviors + ";\nself.__bx_behaviors.init();");
   }
 
   logger.info(`Loading page: ${params.url}`);
@@ -384,7 +381,7 @@ class InteractiveBrowser {
       return;
     }
 
-    const cookies = await this.browser.context.cookies(url);
+    const cookies = await this.browser.getCookies(this.page, url);
     for (const cookie of cookies) {
       cookie.expires = (new Date().getTime() / 1000) + this.params.cookieDays * 86400;
       delete cookie.size;
@@ -396,7 +393,7 @@ class InteractiveBrowser {
         cookie.url = url;
       }
     }
-    await this.browser.context.addCookies(cookies);
+    await this.browser.setCookies(this.page, cookies);
   } catch (e) {
     logger.error("Save Cookie Error: ", e);
   }
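The `getCookies`/`setCookies` wrappers replace Playwright's context-level cookie API with Puppeteer's page-level one. Their implementation is outside this diff; a plausible sketch, assuming they delegate to `page.cookies()` and `page.setCookie()`:

```js
// Hypothetical sketches of the Browser cookie wrappers.
async getCookies(page, url) {
  // page.cookies(url) returns the cookies visible to that URL
  return await page.cookies(url);
}

async setCookies(page, cookies) {
  // page.setCookie() accepts one or more cookie objects
  await page.setCookie(...cookies);
}
```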

package.json

+1 −1
@@ -17,7 +17,7 @@
     "ioredis": "^4.27.1",
     "js-yaml": "^4.1.0",
     "minio": "7.0.26",
-    "playwright-core": "^1.31.2",
+    "puppeteer-core": "^19.11.1",
     "sitemapper": "^3.1.2",
     "uuid": "8.3.2",
     "warcio": "^1.6.0",

tests/extra_hops_depth.test.js

+6 −2
@@ -16,7 +16,8 @@ test("check that URLs are crawled 2 extra hops beyond depth", async () => {
     console.log(error);
   }
 
-  const crawled_pages = fs.readFileSync("test-crawls/collections/extra-hops-beyond/pages/pages.jsonl", "utf8");
+  const crawledPages = fs.readFileSync("test-crawls/collections/extra-hops-beyond/pages/pages.jsonl", "utf8");
+  const crawledPagesArray = crawledPages.trim().split("\n");
 
   const expectedPages = [
     "https://webrecorder.net/",
@@ -28,7 +29,10 @@ test("check that URLs are crawled 2 extra hops beyond depth", async () => {
     "https://webrecorder.net/faq",
   ];
 
-  for (const page of crawled_pages.trim().split("\n")) {
+  // the first line of pages.jsonl is the header, not a page, hence the -1
+  expect(expectedPages.length).toEqual(crawledPagesArray.length - 1);
+
+  for (const page of crawledPagesArray) {
     const url = JSON.parse(page).url;
     if (!url) {
       continue;
