Apply exclusions to redirects (#745)
- if redirected page is excluded, block loading of page
- mark page as excluded, don't retry, and don't write to page list
- support generic blocking of pages based on initial page response
- fixes #744 (see the sketch below)
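
In outline: when the recorder intercepts the top-level page response and it is a redirect, the redirect target is resolved against the seed's exclusion rules; if the target is excluded, the request is failed with BlockedByResponse, the navigation error surfaces as net::ERR_BLOCKED_BY_RESPONSE, and the page is marked excluded rather than failed. A minimal standalone sketch of the core check (helper names like redirectTarget and errorReasonFor are illustrative, not from this diff; the real logic lives in Recorder.blockPageResponse below):

import { Protocol } from "puppeteer-core";

// Resolve a redirect's Location header against the request URL (it may be relative).
function redirectTarget(
  url: string,
  headers: Protocol.Fetch.HeaderEntry[] = [],
): string | null {
  const loc = headers.find((h) => h.name.toLowerCase() === "location");
  return loc ? new URL(loc.value, url).href : null;
}

// A seed excludes a URL if any of its exclusion regexes match.
function isExcluded(exclude: RegExp[], url: string): boolean {
  return exclude.some((rx) => rx.test(url));
}

// Decide whether to fail a paused page response that redirects to an excluded URL.
function errorReasonFor(
  exclude: RegExp[],
  url: string,
  headers?: Protocol.Fetch.HeaderEntry[],
): Protocol.Network.ErrorReason | undefined {
  const target = redirectTarget(url, headers);
  if (target && isExcluded(exclude, target)) {
    return "BlockedByResponse";
  }
  return undefined;
}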
ikreymer authored Jan 28, 2025
1 parent f7cbf96 commit a00866b
Showing 6 changed files with 136 additions and 40 deletions.
62 changes: 39 additions & 23 deletions src/crawler.ts
@@ -192,6 +192,7 @@ export class Crawler {
| ((opts: {
page: Page;
data: PageState;
seed: ScopedSeed;
// eslint-disable-next-line no-use-before-define
crawler: Crawler;
}) => Promise<void>)
@@ -930,7 +931,7 @@ self.__bx_behaviors.selectMainBehavior();
async crawlPage(opts: WorkerState): Promise<void> {
await this.writeStats();

const { page, cdp, data, workerid, callbacks, directFetchCapture } = opts;
const { page, cdp, data, workerid, callbacks, recorder } = opts;
data.callbacks = callbacks;

const { url, seedId } = data;
@@ -948,14 +949,14 @@ self.__bx_behaviors.selectMainBehavior();
data.logDetails = logDetails;
data.workerid = workerid;

if (directFetchCapture) {
if (recorder) {
try {
const headers = auth
? { Authorization: auth, ...this.headers }
: this.headers;

const result = await timedRun(
directFetchCapture({ url, headers, cdp }),
recorder.directFetchCapture({ url, headers, cdp }),
this.params.pageLoadTimeout,
"Direct fetch of page URL timed out",
logDetails,
@@ -1013,11 +1014,21 @@ self.__bx_behaviors.selectMainBehavior();
await page.setExtraHTTPHeaders({});
}

const seed = await this.crawlState.getSeedAt(
this.seeds,
this.numOriginalSeeds,
seedId,
);

if (recorder) {
recorder.pageSeed = seed;
}

// run custom driver here, if any
if (this.driver) {
await this.driver({ page, data, crawler: this });
await this.driver({ page, data, crawler: this, seed });
} else {
await this.loadPage(page, data);
await this.loadPage(page, data, seed);
}

data.title = await timedRun(
@@ -1155,7 +1166,7 @@ self.__bx_behaviors.selectMainBehavior();
async pageFinished(data: PageState) {
// if page loaded, considered page finished successfully
// (even if behaviors timed out)
const { loadState, logDetails, depth, url, retry } = data;
const { loadState, logDetails, depth, url, retry, pageSkipped } = data;

if (data.loadState >= LoadState.FULL_PAGE_LOADED) {
await this.writePage(data);
Expand All @@ -1172,11 +1183,14 @@ self.__bx_behaviors.selectMainBehavior();

await this.checkLimits();
} else {
if (retry >= MAX_RETRY_FAILED) {
if (retry >= MAX_RETRY_FAILED && !pageSkipped) {
await this.writePage(data);
}
await this.crawlState.markFailed(url);

if (pageSkipped) {
await this.crawlState.markExcluded(url);
} else {
await this.crawlState.markFailed(url);
}
if (this.healthChecker) {
this.healthChecker.incError();
}
@@ -1861,7 +1875,7 @@ self.__bx_behaviors.selectMainBehavior();
}
}

async loadPage(page: Page, data: PageState) {
async loadPage(page: Page, data: PageState, seed: ScopedSeed) {
const { url, depth } = data;

const logDetails = data.logDetails;
@@ -1889,8 +1903,8 @@ self.__bx_behaviors.selectMainBehavior();

// store the first successful non-redirect response, even if page doesn't load fully
const waitFirstResponse = (resp: HTTPResponse) => {
firstResponse = resp;
if (!isRedirectStatus(firstResponse.status())) {
if (!isRedirectStatus(resp.status())) {
firstResponse = resp;
// don't listen to any additional responses
page.off("response", waitFirstResponse);
}
@@ -1949,11 +1963,19 @@ self.__bx_behaviors.selectMainBehavior();
} else if (!downloadResponse) {
// log if not already log and rethrow, consider page failed
if (msg !== "logged") {
logger.error("Page Load Failed, will retry", {
msg,
loadState: data.loadState,
...logDetails,
});
const loadState = data.loadState;

// excluded in recorder
if (msg.startsWith("net::ERR_BLOCKED_BY_RESPONSE")) {
data.pageSkipped = true;
logger.warn("Page Load Blocked, skipping", { msg, loadState });
} else {
logger.error("Page Load Failed, will retry", {
msg,
loadState,
...logDetails,
});
}
e.message = "logged";
}
throw e;
@@ -2064,12 +2086,6 @@

const { seedId, extraHops } = data;

const seed = await this.crawlState.getSeedAt(
this.seeds,
this.numOriginalSeeds,
seedId,
);

if (!seed) {
logger.error(
"Seed not found, likely invalid crawl state - skipping link extraction and behaviors",
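
Since the driver callback type now receives the resolved seed (per the type change at the top of this file) and loadPage takes it as a third argument, a custom driver would be invoked roughly like this — a sketch assuming the usual default-export driver convention:

// hypothetical driver.mjs passed to the crawler via --driver
export default async ({ page, data, crawler, seed }) => {
  // seed is the ScopedSeed resolved from data.seedId in crawlPage()
  await crawler.loadPage(page, data, seed);
};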
66 changes: 63 additions & 3 deletions src/util/recorder.ts
@@ -24,6 +24,7 @@ import { RedisCrawlState, WorkerId } from "./state.js";
import { CDPSession, Protocol } from "puppeteer-core";
import { Crawler } from "../crawler.js";
import { getProxyDispatcher } from "./proxy.js";
import { ScopedSeed } from "./seeds.js";

const MAX_BROWSER_DEFAULT_FETCH_SIZE = 5_000_000;
const MAX_TEXT_REWRITE_SIZE = 25_000_000;
@@ -148,6 +149,8 @@ export class Recorder {
pageUrl!: string;
pageid!: string;

pageSeed?: ScopedSeed;

frameIdToExecId: Map<string, number> | null;

constructor({
@@ -691,11 +694,27 @@

reqresp.fetchContinued = true;

reqresp.fillFetchRequestPaused(params);

if (
url === this.pageUrl &&
(!this.pageInfo.ts ||
(responseStatusCode && responseStatusCode < this.pageInfo.tsStatus))
(responseStatusCode && responseStatusCode <= this.pageInfo.tsStatus))
) {
const errorReason = await this.blockPageResponse(
url,
reqresp,
responseHeaders,
);

if (errorReason) {
await cdp.send("Fetch.failRequest", {
requestId,
errorReason,
});
return true;
}

logger.debug("Setting page timestamp", {
ts: reqresp.ts,
url,
@@ -706,8 +725,6 @@
this.mainFrameId = params.frameId;
}

reqresp.fillFetchRequestPaused(params);

if (this.noResponseForStatus(responseStatusCode)) {
reqresp.payload = new Uint8Array();
return false;
@@ -866,6 +883,34 @@
return true;
}

async blockPageResponse(
url: string,
reqresp: RequestResponseInfo,
responseHeaders?: Protocol.Fetch.HeaderEntry[],
): Promise<Protocol.Network.ErrorReason | undefined> {
if (reqresp.isRedirectStatus()) {
try {
let loc = this.getLocation(responseHeaders);
if (loc) {
loc = new URL(loc, url).href;

if (this.pageSeed && this.pageSeed.isExcluded(loc)) {
logger.warn(
"Skipping page that redirects to excluded URL",
{ newUrl: loc, origUrl: this.pageUrl },
"recorder",
);

return "BlockedByResponse";
}
}
} catch (e) {
// ignore
logger.debug("Redirect check error", e, "recorder");
}
}
}

startPage({ pageid, url }: { pageid: string; url: string }) {
this.pageid = pageid;
this.pageUrl = url;
@@ -1187,6 +1232,21 @@
return null;
}

protected getLocation(
headers?: Protocol.Fetch.HeaderEntry[] | { name: string; value: string }[],
) {
if (!headers) {
return null;
}
for (const header of headers) {
if (header.name.toLowerCase() === "location") {
return header.value;
}
}

return null;
}

protected _getContentLen(headers?: Protocol.Fetch.HeaderEntry[]) {
if (!headers) {
return -1;
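
For context on the Fetch.failRequest call above: a minimal, self-contained sketch of blocking an intercepted response over CDP, with shouldBlock as a hypothetical stand-in for the redirect-exclusion check (the recorder's real wiring is more involved):

import { CDPSession, Protocol } from "puppeteer-core";

// Hypothetical predicate standing in for the exclusion check.
const shouldBlock = (url: string): boolean => /help/.test(url);

async function interceptResponses(cdp: CDPSession) {
  // Pause requests at the response stage so headers (e.g. Location) are visible.
  await cdp.send("Fetch.enable", {
    patterns: [{ requestStage: "Response" }],
  });

  cdp.on("Fetch.requestPaused", async (params: Protocol.Fetch.RequestPausedEvent) => {
    if (shouldBlock(params.request.url)) {
      // Surfaces in the page as net::ERR_BLOCKED_BY_RESPONSE,
      // which loadPage() above now treats as skipped rather than failed.
      await cdp.send("Fetch.failRequest", {
        requestId: params.requestId,
        errorReason: "BlockedByResponse",
      });
    } else {
      await cdp.send("Fetch.continueResponse", { requestId: params.requestId });
    }
  });
}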
12 changes: 10 additions & 2 deletions src/util/seeds.ts
@@ -280,15 +280,23 @@ export class ScopedSeed {
}
}

if (this.isExcluded(url)) {
return false;
}

return { url, isOOS };
}

isExcluded(url: string) {
// check exclusions
for (const e of this.exclude) {
if (e.test(url)) {
//console.log(`Skipping ${url} excluded by ${e}`);
return false;
return true;
}
}

return { url, isOOS };
return false;
}
}

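
The refactor above extracts the exclusion loop from isScopedUrl into a reusable isExcluded method; note the return value flips, since a regex match now means "excluded: true" rather than isScopedUrl's "in scope: false". Its semantics, in an equivalent standalone form:

const exclude: RegExp[] = [/help/];
const isExcluded = (url: string): boolean => exclude.some((rx) => rx.test(url));

isExcluded("https://www.iana.org/help/example-domains"); // true  -> blocked
isExcluded("https://www.iana.org/domains/example");      // false -> allowed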
1 change: 1 addition & 0 deletions src/util/state.ts
@@ -74,6 +74,7 @@ export class PageState {
favicon?: string;

skipBehaviors = false;
pageSkipped = false;
filteredFrames: Frame[] = [];
loadState: LoadState = LoadState.FAILED;

14 changes: 2 additions & 12 deletions src/util/worker.ts
@@ -2,11 +2,7 @@ import os from "os";

import { logger, formatErr } from "./logger.js";
import { sleep, timedRun } from "./timing.js";
import {
DirectFetchRequest,
DirectFetchResponse,
Recorder,
} from "./recorder.js";
import { Recorder } from "./recorder.js";
import { rxEscape } from "./seeds.js";
import { CDPSession, Page } from "puppeteer-core";
import { PageState, WorkerId } from "./state.js";
@@ -24,9 +20,6 @@ export type WorkerState = {
workerid: WorkerId;
// eslint-disable-next-line @typescript-eslint/ban-types
callbacks: Record<string, Function>;
directFetchCapture:
| ((request: DirectFetchRequest) => Promise<DirectFetchResponse>)
| null;
recorder: Recorder | null;
markPageUsed: () => void;
frameIdToExecId: Map<string, number>;
@@ -175,16 +168,13 @@ export class PageWorker {
this.page = page;
this.cdp = cdp;
this.callbacks = {};
const directFetchCapture = this.recorder
? (req: DirectFetchRequest) => this.recorder!.directFetchCapture(req)
: null;

this.opts = {
page,
cdp,
workerid,
callbacks: this.callbacks,
recorder: this.recorder,
directFetchCapture,
frameIdToExecId: new Map<string, number>(),
markPageUsed: () => {
if (!this.alwaysReuse) {
21 changes: 21 additions & 0 deletions tests/exclude-redirected.test.js
@@ -0,0 +1,21 @@
import fs from "fs";
import { execSync } from "child_process";

// example.com includes a link to 'https://www.iana.org/domains/example' which redirects to 'https://www.iana.org/help/example-domains'
// page loading should be blocked on the redirect, due to the exclusion of 'help', even though the initial link itself is queued

test("ensure exclusion is applied on redirected URL, which contains 'help', so it is not crawled", () => {
execSync(
"docker run -p 9037:9037 -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --exclude help --collection redir-exclude-test --extraHops 1");

// no entries besides header
expect(
fs
.readFileSync(
"test-crawls/collections/redir-exclude-test/pages/extraPages.jsonl",
"utf8",
).trim().split("\n").length
).toBe(1);

});
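
To run only this test locally, something like the following should work, assuming the project's Jest setup and a locally built webrecorder/browsertrix-crawler image (the exact command is an assumption, not from this diff):

yarn test tests/exclude-redirected.test.js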
