Apply exclusions to redirects (#745)
- if redirected page is excluded, block loading of page
- mark page as excluded, don't retry, and don't write to page list
- support generic blocking of pages based on initial page response
- fixes #744 (see the sketch below)
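
In outline: when the recorder intercepts the top-level page response and it is a redirect, the redirect target is resolved against the seed's exclusion rules; if the target is excluded, the request is failed with BlockedByResponse, the navigation error surfaces as net::ERR_BLOCKED_BY_RESPONSE, and the page is marked excluded rather than failed. A minimal standalone sketch of the core check (helper names like redirectTarget and errorReasonFor are illustrative, not from this diff; the real logic lives in Recorder.blockPageResponse below):

import { Protocol } from "puppeteer-core";

// Resolve a redirect's Location header against the request URL (it may be relative).
function redirectTarget(
  url: string,
  headers: Protocol.Fetch.HeaderEntry[] = [],
): string | null {
  const loc = headers.find((h) => h.name.toLowerCase() === "location");
  return loc ? new URL(loc.value, url).href : null;
}

// A seed excludes a URL if any of its exclusion regexes match.
function isExcluded(exclude: RegExp[], url: string): boolean {
  return exclude.some((rx) => rx.test(url));
}

// Decide whether to fail a paused page response that redirects to an excluded URL.
function errorReasonFor(
  exclude: RegExp[],
  url: string,
  headers?: Protocol.Fetch.HeaderEntry[],
): Protocol.Network.ErrorReason | undefined {
  const target = redirectTarget(url, headers);
  if (target && isExcluded(exclude, target)) {
    return "BlockedByResponse";
  }
  return undefined;
}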
ikreymer authored Jan 28, 2025
1 parent f7cbf96 commit a00866b
Showing 6 changed files with 136 additions and 40 deletions.
62 changes: 39 additions & 23 deletions src/crawler.ts
@@ -192,6 +192,7 @@ export class Crawler {
| ((opts: {
page: Page;
data: PageState;
seed: ScopedSeed;
// eslint-disable-next-line no-use-before-define
crawler: Crawler;
}) => Promise<void>)
@@ -930,7 +931,7 @@ self.__bx_behaviors.selectMainBehavior();
async crawlPage(opts: WorkerState): Promise<void> {
await this.writeStats();

const { page, cdp, data, workerid, callbacks, directFetchCapture } = opts;
const { page, cdp, data, workerid, callbacks, recorder } = opts;
data.callbacks = callbacks;

const { url, seedId } = data;
@@ -948,14 +949,14 @@ self.__bx_behaviors.selectMainBehavior();
data.logDetails = logDetails;
data.workerid = workerid;

if (directFetchCapture) {
if (recorder) {
try {
const headers = auth
? { Authorization: auth, ...this.headers }
: this.headers;

const result = await timedRun(
directFetchCapture({ url, headers, cdp }),
recorder.directFetchCapture({ url, headers, cdp }),
this.params.pageLoadTimeout,
"Direct fetch of page URL timed out",
logDetails,
@@ -1013,11 +1014,21 @@ self.__bx_behaviors.selectMainBehavior();
await page.setExtraHTTPHeaders({});
}

const seed = await this.crawlState.getSeedAt(
this.seeds,
this.numOriginalSeeds,
seedId,
);

if (recorder) {
recorder.pageSeed = seed;
}

// run custom driver here, if any
if (this.driver) {
await this.driver({ page, data, crawler: this });
await this.driver({ page, data, crawler: this, seed });
} else {
await this.loadPage(page, data);
await this.loadPage(page, data, seed);
}

data.title = await timedRun(
@@ -1155,7 +1166,7 @@ self.__bx_behaviors.selectMainBehavior();
async pageFinished(data: PageState) {
// if page loaded, considered page finished successfully
// (even if behaviors timed out)
const { loadState, logDetails, depth, url, retry } = data;
const { loadState, logDetails, depth, url, retry, pageSkipped } = data;

if (data.loadState >= LoadState.FULL_PAGE_LOADED) {
await this.writePage(data);
Expand All @@ -1172,11 +1183,14 @@ self.__bx_behaviors.selectMainBehavior();

await this.checkLimits();
} else {
if (retry >= MAX_RETRY_FAILED) {
if (retry >= MAX_RETRY_FAILED && !pageSkipped) {
await this.writePage(data);
}
await this.crawlState.markFailed(url);

if (pageSkipped) {
await this.crawlState.markExcluded(url);
} else {
await this.crawlState.markFailed(url);
}
if (this.healthChecker) {
this.healthChecker.incError();
}
@@ -1861,7 +1875,7 @@ self.__bx_behaviors.selectMainBehavior();
}
}

async loadPage(page: Page, data: PageState) {
async loadPage(page: Page, data: PageState, seed: ScopedSeed) {
const { url, depth } = data;

const logDetails = data.logDetails;
@@ -1889,8 +1903,8 @@ self.__bx_behaviors.selectMainBehavior();

// store the first successful non-redirect response, even if page doesn't load fully
const waitFirstResponse = (resp: HTTPResponse) => {
firstResponse = resp;
if (!isRedirectStatus(firstResponse.status())) {
if (!isRedirectStatus(resp.status())) {
firstResponse = resp;
// don't listen to any additional responses
page.off("response", waitFirstResponse);
}
@@ -1949,11 +1963,19 @@ self.__bx_behaviors.selectMainBehavior();
} else if (!downloadResponse) {
// log if not already log and rethrow, consider page failed
if (msg !== "logged") {
logger.error("Page Load Failed, will retry", {
msg,
loadState: data.loadState,
...logDetails,
});
const loadState = data.loadState;

// excluded in recorder
if (msg.startsWith("net::ERR_BLOCKED_BY_RESPONSE")) {
data.pageSkipped = true;
logger.warn("Page Load Blocked, skipping", { msg, loadState });
} else {
logger.error("Page Load Failed, will retry", {
msg,
loadState,
...logDetails,
});
}
e.message = "logged";
}
throw e;
@@ -2064,12 +2086,6 @@

const { seedId, extraHops } = data;

const seed = await this.crawlState.getSeedAt(
this.seeds,
this.numOriginalSeeds,
seedId,
);

if (!seed) {
logger.error(
"Seed not found, likely invalid crawl state - skipping link extraction and behaviors",
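
Since the driver callback type now receives the resolved seed (per the type change at the top of this file) and loadPage takes it as a third argument, a custom driver would be invoked roughly like this — a sketch assuming the usual default-export driver convention:

// hypothetical driver.mjs passed to the crawler via --driver
export default async ({ page, data, crawler, seed }) => {
  // seed is the ScopedSeed resolved from data.seedId in crawlPage()
  await crawler.loadPage(page, data, seed);
};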
66 changes: 63 additions & 3 deletions src/util/recorder.ts
@@ -24,6 +24,7 @@ import { RedisCrawlState, WorkerId } from "./state.js";
import { CDPSession, Protocol } from "puppeteer-core";
import { Crawler } from "../crawler.js";
import { getProxyDispatcher } from "./proxy.js";
import { ScopedSeed } from "./seeds.js";

const MAX_BROWSER_DEFAULT_FETCH_SIZE = 5_000_000;
const MAX_TEXT_REWRITE_SIZE = 25_000_000;
@@ -148,6 +149,8 @@ export class Recorder {
pageUrl!: string;
pageid!: string;

pageSeed?: ScopedSeed;

frameIdToExecId: Map<string, number> | null;

constructor({
@@ -691,11 +694,27 @@

reqresp.fetchContinued = true;

reqresp.fillFetchRequestPaused(params);

if (
url === this.pageUrl &&
(!this.pageInfo.ts ||
(responseStatusCode && responseStatusCode < this.pageInfo.tsStatus))
(responseStatusCode && responseStatusCode <= this.pageInfo.tsStatus))
) {
const errorReason = await this.blockPageResponse(
url,
reqresp,
responseHeaders,
);

if (errorReason) {
await cdp.send("Fetch.failRequest", {
requestId,
errorReason,
});
return true;
}

logger.debug("Setting page timestamp", {
ts: reqresp.ts,
url,
@@ -706,8 +725,6 @@
this.mainFrameId = params.frameId;
}

reqresp.fillFetchRequestPaused(params);

if (this.noResponseForStatus(responseStatusCode)) {
reqresp.payload = new Uint8Array();
return false;
@@ -866,6 +883,34 @@
return true;
}

async blockPageResponse(
url: string,
reqresp: RequestResponseInfo,
responseHeaders?: Protocol.Fetch.HeaderEntry[],
): Promise<Protocol.Network.ErrorReason | undefined> {
if (reqresp.isRedirectStatus()) {
try {
let loc = this.getLocation(responseHeaders);
if (loc) {
loc = new URL(loc, url).href;

if (this.pageSeed && this.pageSeed.isExcluded(loc)) {
logger.warn(
"Skipping page that redirects to excluded URL",
{ newUrl: loc, origUrl: this.pageUrl },
"recorder",
);

return "BlockedByResponse";
}
}
} catch (e) {
// ignore
logger.debug("Redirect check error", e, "recorder");
}
}
}

startPage({ pageid, url }: { pageid: string; url: string }) {
this.pageid = pageid;
this.pageUrl = url;
@@ -1187,6 +1232,21 @@
return null;
}

protected getLocation(
headers?: Protocol.Fetch.HeaderEntry[] | { name: string; value: string }[],
) {
if (!headers) {
return null;
}
for (const header of headers) {
if (header.name.toLowerCase() === "location") {
return header.value;
}
}

return null;
}

protected _getContentLen(headers?: Protocol.Fetch.HeaderEntry[]) {
if (!headers) {
return -1;
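
For context on the Fetch.failRequest call above: a minimal, self-contained sketch of blocking an intercepted response over CDP, with shouldBlock as a hypothetical stand-in for the redirect-exclusion check (the recorder's real wiring is more involved):

import { CDPSession, Protocol } from "puppeteer-core";

// Hypothetical predicate standing in for the exclusion check.
const shouldBlock = (url: string): boolean => /help/.test(url);

async function interceptResponses(cdp: CDPSession) {
  // Pause requests at the response stage so headers (e.g. Location) are visible.
  await cdp.send("Fetch.enable", {
    patterns: [{ requestStage: "Response" }],
  });

  cdp.on("Fetch.requestPaused", async (params: Protocol.Fetch.RequestPausedEvent) => {
    if (shouldBlock(params.request.url)) {
      // Surfaces in the page as net::ERR_BLOCKED_BY_RESPONSE,
      // which loadPage() above now treats as skipped rather than failed.
      await cdp.send("Fetch.failRequest", {
        requestId: params.requestId,
        errorReason: "BlockedByResponse",
      });
    } else {
      await cdp.send("Fetch.continueResponse", { requestId: params.requestId });
    }
  });
}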
12 changes: 10 additions & 2 deletions src/util/seeds.ts
@@ -280,15 +280,23 @@ export class ScopedSeed {
}
}

if (this.isExcluded(url)) {
return false;
}

return { url, isOOS };
}

isExcluded(url: string) {
// check exclusions
for (const e of this.exclude) {
if (e.test(url)) {
//console.log(`Skipping ${url} excluded by ${e}`);
return false;
return true;
}
}

return { url, isOOS };
return false;
}
}

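
The refactor above extracts the exclusion loop from isScopedUrl into a reusable isExcluded method; note the return value flips, since a regex match now means "excluded: true" rather than isScopedUrl's "in scope: false". Its semantics, in an equivalent standalone form:

const exclude: RegExp[] = [/help/];
const isExcluded = (url: string): boolean => exclude.some((rx) => rx.test(url));

isExcluded("https://www.iana.org/help/example-domains"); // true  -> blocked
isExcluded("https://www.iana.org/domains/example");      // false -> allowed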
1 change: 1 addition & 0 deletions src/util/state.ts
@@ -74,6 +74,7 @@ export class PageState {
favicon?: string;

skipBehaviors = false;
pageSkipped = false;
filteredFrames: Frame[] = [];
loadState: LoadState = LoadState.FAILED;

14 changes: 2 additions & 12 deletions src/util/worker.ts
@@ -2,11 +2,7 @@ import os from "os";

import { logger, formatErr } from "./logger.js";
import { sleep, timedRun } from "./timing.js";
import {
DirectFetchRequest,
DirectFetchResponse,
Recorder,
} from "./recorder.js";
import { Recorder } from "./recorder.js";
import { rxEscape } from "./seeds.js";
import { CDPSession, Page } from "puppeteer-core";
import { PageState, WorkerId } from "./state.js";
@@ -24,9 +20,6 @@ export type WorkerState = {
workerid: WorkerId;
// eslint-disable-next-line @typescript-eslint/ban-types
callbacks: Record<string, Function>;
directFetchCapture:
| ((request: DirectFetchRequest) => Promise<DirectFetchResponse>)
| null;
recorder: Recorder | null;
markPageUsed: () => void;
frameIdToExecId: Map<string, number>;
@@ -175,16 +168,13 @@ export class PageWorker {
this.page = page;
this.cdp = cdp;
this.callbacks = {};
const directFetchCapture = this.recorder
? (req: DirectFetchRequest) => this.recorder!.directFetchCapture(req)
: null;

this.opts = {
page,
cdp,
workerid,
callbacks: this.callbacks,
recorder: this.recorder,
directFetchCapture,
frameIdToExecId: new Map<string, number>(),
markPageUsed: () => {
if (!this.alwaysReuse) {
21 changes: 21 additions & 0 deletions tests/exclude-redirected.test.js
@@ -0,0 +1,21 @@
import fs from "fs";
import { execSync } from "child_process";

// example.com includes a link to 'https://www.iana.org/domains/example' which redirects to 'https://www.iana.org/help/example-domains'
// page loading should be blocked on the redirect, due to the exclusion of 'help', even though the initial link itself is queued

test("ensure exclusion is applied on redirected URL, which contains 'help', so it is not crawled", () => {
execSync(
"docker run -p 9037:9037 -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --exclude help --collection redir-exclude-test --extraHops 1");

// no entries besides header
expect(
fs
.readFileSync(
"test-crawls/collections/redir-exclude-test/pages/extraPages.jsonl",
"utf8",
).trim().split("\n").length
).toBe(1);

});
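
To run only this test locally, something like the following should work, assuming the project's Jest setup and a locally built webrecorder/browsertrix-crawler image (the exact command is an assumption, not from this diff):

yarn test tests/exclude-redirected.test.js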
