Skip to content

Commit aac36e2

Browse files
committed
apply exclusions to redirects:
- if redirected page is excluded, block loading of page - mark page as excluded, don't retry, and don't write to page list - support generic blocking of pages based on initial page response - fixes #744
1 parent f7cbf96 commit aac36e2

File tree

5 files changed

+119
-40
lines changed

5 files changed

+119
-40
lines changed

src/crawler.ts

+41-23
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,7 @@ export class Crawler {
192192
| ((opts: {
193193
page: Page;
194194
data: PageState;
195+
seed: ScopedSeed;
195196
// eslint-disable-next-line no-use-before-define
196197
crawler: Crawler;
197198
}) => Promise<void>)
@@ -930,7 +931,7 @@ self.__bx_behaviors.selectMainBehavior();
930931
async crawlPage(opts: WorkerState): Promise<void> {
931932
await this.writeStats();
932933

933-
const { page, cdp, data, workerid, callbacks, directFetchCapture } = opts;
934+
const { page, cdp, data, workerid, callbacks, recorder } = opts;
934935
data.callbacks = callbacks;
935936

936937
const { url, seedId } = data;
@@ -948,14 +949,14 @@ self.__bx_behaviors.selectMainBehavior();
948949
data.logDetails = logDetails;
949950
data.workerid = workerid;
950951

951-
if (directFetchCapture) {
952+
if (recorder) {
952953
try {
953954
const headers = auth
954955
? { Authorization: auth, ...this.headers }
955956
: this.headers;
956957

957958
const result = await timedRun(
958-
directFetchCapture({ url, headers, cdp }),
959+
recorder.directFetchCapture({ url, headers, cdp }),
959960
this.params.pageLoadTimeout,
960961
"Direct fetch of page URL timed out",
961962
logDetails,
@@ -1013,11 +1014,21 @@ self.__bx_behaviors.selectMainBehavior();
10131014
await page.setExtraHTTPHeaders({});
10141015
}
10151016

1017+
const seed = await this.crawlState.getSeedAt(
1018+
this.seeds,
1019+
this.numOriginalSeeds,
1020+
seedId,
1021+
);
1022+
1023+
if (recorder) {
1024+
recorder.pageSeed = seed;
1025+
}
1026+
10161027
// run custom driver here, if any
10171028
if (this.driver) {
1018-
await this.driver({ page, data, crawler: this });
1029+
await this.driver({ page, data, crawler: this, seed });
10191030
} else {
1020-
await this.loadPage(page, data);
1031+
await this.loadPage(page, data, seed);
10211032
}
10221033

10231034
data.title = await timedRun(
@@ -1155,7 +1166,7 @@ self.__bx_behaviors.selectMainBehavior();
11551166
async pageFinished(data: PageState) {
11561167
// if page loaded, considered page finished successfully
11571168
// (even if behaviors timed out)
1158-
const { loadState, logDetails, depth, url, retry } = data;
1169+
const { loadState, logDetails, depth, url, retry, pageSkipped } = data;
11591170

11601171
if (data.loadState >= LoadState.FULL_PAGE_LOADED) {
11611172
await this.writePage(data);
@@ -1172,11 +1183,14 @@ self.__bx_behaviors.selectMainBehavior();
11721183

11731184
await this.checkLimits();
11741185
} else {
1175-
if (retry >= MAX_RETRY_FAILED) {
1186+
if (retry >= MAX_RETRY_FAILED && !pageSkipped) {
11761187
await this.writePage(data);
11771188
}
1178-
await this.crawlState.markFailed(url);
1179-
1189+
if (pageSkipped) {
1190+
await this.crawlState.markExcluded(url);
1191+
} else {
1192+
await this.crawlState.markFailed(url);
1193+
}
11801194
if (this.healthChecker) {
11811195
this.healthChecker.incError();
11821196
}
@@ -1861,7 +1875,7 @@ self.__bx_behaviors.selectMainBehavior();
18611875
}
18621876
}
18631877

1864-
async loadPage(page: Page, data: PageState) {
1878+
async loadPage(page: Page, data: PageState, seed: ScopedSeed) {
18651879
const { url, depth } = data;
18661880

18671881
const logDetails = data.logDetails;
@@ -1889,8 +1903,8 @@ self.__bx_behaviors.selectMainBehavior();
18891903

18901904
// store the first successful non-redirect response, even if page doesn't load fully
18911905
const waitFirstResponse = (resp: HTTPResponse) => {
1892-
firstResponse = resp;
1893-
if (!isRedirectStatus(firstResponse.status())) {
1906+
if (!isRedirectStatus(resp.status())) {
1907+
firstResponse = resp;
18941908
// don't listen to any additional responses
18951909
page.off("response", waitFirstResponse);
18961910
}
@@ -1949,11 +1963,21 @@ self.__bx_behaviors.selectMainBehavior();
19491963
} else if (!downloadResponse) {
19501964
// log if not already log and rethrow, consider page failed
19511965
if (msg !== "logged") {
1952-
logger.error("Page Load Failed, will retry", {
1953-
msg,
1954-
loadState: data.loadState,
1955-
...logDetails,
1956-
});
1966+
const loadState = data.loadState;
1967+
if (msg.startsWith("net::ERR_BLOCKED_BY_RESPONSE")) {
1968+
logger.error("Page Load Blocked, skipping", {
1969+
msg,
1970+
loadState,
1971+
...logDetails,
1972+
});
1973+
data.pageSkipped = true;
1974+
} else {
1975+
logger.error("Page Load Failed, will retry", {
1976+
msg,
1977+
loadState,
1978+
...logDetails,
1979+
});
1980+
}
19571981
e.message = "logged";
19581982
}
19591983
throw e;
@@ -2064,12 +2088,6 @@ self.__bx_behaviors.selectMainBehavior();
20642088

20652089
const { seedId, extraHops } = data;
20662090

2067-
const seed = await this.crawlState.getSeedAt(
2068-
this.seeds,
2069-
this.numOriginalSeeds,
2070-
seedId,
2071-
);
2072-
20732091
if (!seed) {
20742092
logger.error(
20752093
"Seed not found, likely invalid crawl state - skipping link extraction and behaviors",

src/util/recorder.ts

+65-3
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import { RedisCrawlState, WorkerId } from "./state.js";
2424
import { CDPSession, Protocol } from "puppeteer-core";
2525
import { Crawler } from "../crawler.js";
2626
import { getProxyDispatcher } from "./proxy.js";
27+
import { ScopedSeed } from "./seeds.js";
2728

2829
const MAX_BROWSER_DEFAULT_FETCH_SIZE = 5_000_000;
2930
const MAX_TEXT_REWRITE_SIZE = 25_000_000;
@@ -148,6 +149,8 @@ export class Recorder {
148149
pageUrl!: string;
149150
pageid!: string;
150151

152+
pageSeed?: ScopedSeed;
153+
151154
frameIdToExecId: Map<string, number> | null;
152155

153156
constructor({
@@ -691,11 +694,27 @@ export class Recorder {
691694

692695
reqresp.fetchContinued = true;
693696

697+
reqresp.fillFetchRequestPaused(params);
698+
694699
if (
695700
url === this.pageUrl &&
696701
(!this.pageInfo.ts ||
697-
(responseStatusCode && responseStatusCode < this.pageInfo.tsStatus))
702+
(responseStatusCode && responseStatusCode <= this.pageInfo.tsStatus))
698703
) {
704+
const errorReason = await this.blockPageResponse(
705+
url,
706+
reqresp,
707+
responseHeaders,
708+
);
709+
710+
if (errorReason) {
711+
await cdp.send("Fetch.failRequest", {
712+
requestId,
713+
errorReason,
714+
});
715+
return true;
716+
}
717+
699718
logger.debug("Setting page timestamp", {
700719
ts: reqresp.ts,
701720
url,
@@ -706,8 +725,6 @@ export class Recorder {
706725
this.mainFrameId = params.frameId;
707726
}
708727

709-
reqresp.fillFetchRequestPaused(params);
710-
711728
if (this.noResponseForStatus(responseStatusCode)) {
712729
reqresp.payload = new Uint8Array();
713730
return false;
@@ -866,6 +883,36 @@ export class Recorder {
866883
return true;
867884
}
868885

886+
/**
 * Decide whether the response for the current page should be blocked.
 *
 * Only redirect responses are inspected. When the response carries a
 * `Location` header, it is resolved against `url`, stored as the new
 * `pageUrl`, and checked against the exclusions of `pageSeed` (if set).
 *
 * @param url - URL of the paused request, used as the base for resolving
 *   a relative `Location` header.
 * @param reqresp - request/response info; consulted only via
 *   `isRedirectStatus()`.
 * @param responseHeaders - response headers to search for `Location`.
 * @returns `"BlockedByResponse"` if the redirect target is excluded by the
 *   page seed, otherwise `undefined`. Errors raised during the check are
 *   logged at debug level and treated as "do not block".
 */
async blockPageResponse(
  url: string,
  reqresp: RequestResponseInfo,
  responseHeaders?: Protocol.Fetch.HeaderEntry[],
): Promise<Protocol.Network.ErrorReason | undefined> {
  if (!reqresp.isRedirectStatus()) {
    return undefined;
  }

  try {
    const location = this.getLocation(responseHeaders);
    if (!location) {
      return undefined;
    }

    // Location may be relative, resolve it against the request URL
    const loc = new URL(location, url).href;

    // remember the pre-redirect URL before it is overwritten, so that
    // the warning below reports the real original URL and not the target
    const origUrl = this.pageUrl;
    this.pageUrl = loc;

    if (this.pageSeed && this.pageSeed.isExcluded(loc)) {
      logger.warn(
        "Skipping page that redirects to excluded URL",
        { newUrl: loc, origUrl },
        "recorder",
      );

      return "BlockedByResponse";
    }
  } catch (e) {
    // best-effort check: a malformed Location must not fail the request
    logger.debug("Redirect check error", e, "recorder");
  }

  return undefined;
}
915+
869916
startPage({ pageid, url }: { pageid: string; url: string }) {
870917
this.pageid = pageid;
871918
this.pageUrl = url;
@@ -1187,6 +1234,21 @@ export class Recorder {
11871234
return null;
11881235
}
11891236

1237+
/**
 * Find the value of the `Location` header, matching the header name
 * case-insensitively.
 *
 * @param headers - list of `{ name, value }` header entries, may be omitted.
 * @returns the value of the first matching header, or `null` when no
 *   headers were given or none is named `location`.
 */
protected getLocation(
  headers?: Protocol.Fetch.HeaderEntry[] | { name: string; value: string }[],
) {
  if (headers) {
    for (const { name, value } of headers) {
      if (name.toLowerCase() === "location") {
        return value;
      }
    }
  }

  return null;
}
1251+
11901252
protected _getContentLen(headers?: Protocol.Fetch.HeaderEntry[]) {
11911253
if (!headers) {
11921254
return -1;

src/util/seeds.ts

+10-2
Original file line numberDiff line numberDiff line change
@@ -280,15 +280,23 @@ export class ScopedSeed {
280280
}
281281
}
282282

283+
if (this.isExcluded(url)) {
284+
return false;
285+
}
286+
287+
return { url, isOOS };
288+
}
289+
290+
isExcluded(url: string) {
283291
// check exclusions
284292
for (const e of this.exclude) {
285293
if (e.test(url)) {
286294
//console.log(`Skipping ${url} excluded by ${e}`);
287-
return false;
295+
return true;
288296
}
289297
}
290298

291-
return { url, isOOS };
299+
return false;
292300
}
293301
}
294302

src/util/state.ts

+1
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ export class PageState {
7474
favicon?: string;
7575

7676
skipBehaviors = false;
77+
pageSkipped = false;
7778
filteredFrames: Frame[] = [];
7879
loadState: LoadState = LoadState.FAILED;
7980

src/util/worker.ts

+2-12
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,7 @@ import os from "os";
22

33
import { logger, formatErr } from "./logger.js";
44
import { sleep, timedRun } from "./timing.js";
5-
import {
6-
DirectFetchRequest,
7-
DirectFetchResponse,
8-
Recorder,
9-
} from "./recorder.js";
5+
import { Recorder } from "./recorder.js";
106
import { rxEscape } from "./seeds.js";
117
import { CDPSession, Page } from "puppeteer-core";
128
import { PageState, WorkerId } from "./state.js";
@@ -24,9 +20,6 @@ export type WorkerState = {
2420
workerid: WorkerId;
2521
// eslint-disable-next-line @typescript-eslint/ban-types
2622
callbacks: Record<string, Function>;
27-
directFetchCapture:
28-
| ((request: DirectFetchRequest) => Promise<DirectFetchResponse>)
29-
| null;
3023
recorder: Recorder | null;
3124
markPageUsed: () => void;
3225
frameIdToExecId: Map<string, number>;
@@ -175,16 +168,13 @@ export class PageWorker {
175168
this.page = page;
176169
this.cdp = cdp;
177170
this.callbacks = {};
178-
const directFetchCapture = this.recorder
179-
? (req: DirectFetchRequest) => this.recorder!.directFetchCapture(req)
180-
: null;
171+
181172
this.opts = {
182173
page,
183174
cdp,
184175
workerid,
185176
callbacks: this.callbacks,
186177
recorder: this.recorder,
187-
directFetchCapture,
188178
frameIdToExecId: new Map<string, number>(),
189179
markPageUsed: () => {
190180
if (!this.alwaysReuse) {

0 commit comments

Comments
 (0)