Commit d4bc9e8

Catch 4xx and 5xx page.goto() responses to mark invalid URLs as failed (#300)
* Catch 400 pywb errors on page load and mark page failed
* Add --failOnFailedSeed option to fail crawl with exit code 1 if seed doesn't load, resolves #207
* Handle 4xx or 5xx page.goto responses as page load errors
1 parent 71b618f commit d4bc9e8

File tree

2 files changed: +31 -4 lines changed
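A minimal standalone sketch (not part of the commit) of the status check the crawler.js diff below introduces: page.goto() resolves to a response whose status() returns a number, and comparing its string form against a "4" or "5" prefix is equivalent to this numeric range check over HTTP status codes.

// Sketch only: isFailedLoad is a hypothetical helper, not a crawler function.
function isFailedLoad(statusCode) {
  // Any 4xx (client error) or 5xx (server error) status counts as a failed load.
  return statusCode >= 400 && statusCode <= 599;
}

console.log(isFailedLoad(200)); // false
console.log(isFailedLoad(404)); // true
console.log(isFailedLoad(503)); // true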

crawler.js (+25 -4)
@@ -966,6 +966,8 @@ export class Crawler {
 
     const logDetails = data.logDetails;
 
+    const failCrawlOnError = ((depth === 0) && this.params.failOnFailedSeed);
+
     let isHTMLPage = await timedRun(
       this.isHTML(url),
       FETCH_TIMEOUT_SECS,
@@ -1011,6 +1013,17 @@ export class Crawler {
     try {
       const resp = await page.goto(url, gotoOpts);
 
+      // Handle 4xx or 5xx response as a page load error
+      const statusCode = resp.status();
+      if (statusCode.toString().startsWith("4") || statusCode.toString().startsWith("5")) {
+        if (failCrawlOnError) {
+          logger.fatal("Seed Page Load Error, failing crawl", {statusCode, ...logDetails});
+        } else {
+          logger.error("Page Load Error, skipping page", {statusCode, ...logDetails});
+          throw new Error(`Page ${url} returned status code ${statusCode}`);
+        }
+      }
+
       const contentType = await this.browser.responseHeader(resp, "content-type");
 
       isHTMLPage = this.isHTMLContentType(contentType);
@@ -1020,15 +1033,23 @@ export class Crawler {
       if (!msg.startsWith("net::ERR_ABORTED") || !ignoreAbort) {
         if (e.name === "TimeoutError") {
           if (data.loadState !== LoadState.CONTENT_LOADED) {
-            logger.error("Page Load Timeout, skipping page", {msg, ...logDetails});
-            throw e;
+            if (failCrawlOnError) {
+              logger.fatal("Seed Page Load Timeout, failing crawl", {msg, ...logDetails});
+            } else {
+              logger.error("Page Load Timeout, skipping page", {msg, ...logDetails});
+              throw e;
+            }
           } else {
             logger.warn("Page Loading Slowly, skipping behaviors", {msg, ...logDetails});
             data.skipBehaviors = true;
           }
         } else {
-          logger.error("Page Load Error, skipping page", {msg, ...logDetails});
-          throw e;
+          if (failCrawlOnError) {
+            logger.fatal("Seed Page Load Error, failing crawl", {msg, ...logDetails});
+          } else {
+            logger.error("Page Load Error, skipping page", {msg, ...logDetails});
+            throw e;
+          }
         }
       }
     }

util/argParser.js (+6)
@@ -373,6 +373,12 @@ class ArgParser {
         type: "boolean",
         default: false,
       },
+
+      "failOnFailedSeed": {
+        describe: "If set, crawler will fail with exit code 1 if any seed fails",
+        type: "boolean",
+        default: false
+      }
     };
   }
 
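The option table above follows yargs' option format. A standalone sketch of how a boolean option declared this way behaves, using only yargs' documented options()/parse() API rather than the crawler's actual argument wiring:

import yargs from "yargs";
import { hideBin } from "yargs/helpers";

// Declaring the option exactly as above: it defaults to false and flips
// to true when --failOnFailedSeed is passed on the command line.
const argv = yargs(hideBin(process.argv))
  .options({
    "failOnFailedSeed": {
      describe: "If set, crawler will fail with exit code 1 if any seed fails",
      type: "boolean",
      default: false,
    },
  })
  .parse();

console.log(argv.failOnFailedSeed); // true only if --failOnFailedSeed was given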

0 commit comments
