Commit 1f93942

Add --failOnFailedSeed option to fail crawl if seed doesn't load
Resolves issue #207
1 parent c44e6e3 commit 1f93942
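
Usage sketch (illustrative, not part of the commit): assuming the standard browsertrix-crawler Docker entry point, the new option is passed like any other boolean flag; the volume mount and seed URL below are placeholders.

# Hypothetical invocation: fail the whole crawl if https://example.com/ doesn't load
docker run -v $PWD/crawls:/crawls/ -it webrecorder/browsertrix-crawler crawl \
  --url https://example.com/ --failOnFailedSeed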

File tree

2 files changed: +35 -7 lines changed

crawler.js (+29 -7)
@@ -352,6 +352,14 @@ export class Crawler {
     return seed.isIncluded(url, depth, extraHops, logDetails);
   }
 
+  isSeedUrl(url) {
+    const seeds = this.params.scopedSeeds.map(seed => seed.url);
+    if (seeds.indexOf(url) != -1) {
+      return true;
+    }
+    return false;
+  }
+
   async setupPage({page, cdp, workerid}) {
     await this.browser.setupPage({page, cdp});

@@ -941,6 +949,8 @@ export class Crawler {
 
     const logDetails = data.logDetails;
 
+    const failCrawlOnError = (this.isSeedUrl(url) && this.params.failOnFailedSeed);
+
     let isHTMLPage = await timedRun(
       this.isHTML(url),
       FETCH_TIMEOUT_SECS,

@@ -1000,9 +1010,13 @@ export class Crawler {
 
     const status = resp.status();
     if (status === 400) {
-      // pywb error, mark as page load failed
-      logger.error("Page Load Error, skipping page", {status, ...logDetails});
-      throw new Error(`Page ${url} returned 400 error`);
+      if (failCrawlOnError) {
+        logger.fatal("Page Load Error on Seed, failing crawl", {status, ...logDetails});
+      } else {
+        logger.error("Page Load Error, skipping page", {status, ...logDetails});
+        throw new Error(`Page ${url} returned 400 error`);
+      }
+
     }
 
     const contentType = await resp.headerValue("content-type");

@@ -1014,15 +1028,23 @@ export class Crawler {
       if (!msg.startsWith("net::ERR_ABORTED") || !ignoreAbort) {
         if (e.name === "TimeoutError") {
           if (data.loadState !== LoadState.CONTENT_LOADED) {
-            logger.error("Page Load Timeout, skipping page", {msg, ...logDetails});
-            throw e;
+            if (failCrawlOnError) {
+              logger.fatal("Page Load Timeout on Seed, failing crawl", {msg, ...logDetails});
+            } else {
+              logger.error("Page Load Timeout, skipping page", {msg, ...logDetails});
+              throw e;
+            }
           } else {
             logger.warn("Page Loading Slowly, skipping behaviors", {msg, ...logDetails});
             data.skipBehaviors = true;
           }
         } else {
-          logger.error("Page Load Error, skipping page", {msg, ...logDetails});
-          throw e;
+          if (failCrawlOnError) {
+            logger.fatal("Page Load Error on Seed, failing crawl", {msg, ...logDetails});
+          } else {
+            logger.error("Page Load Error, skipping page", {msg, ...logDetails});
+            throw e;
+          }
         }
       }
     }
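
Note on the mechanism: none of the failCrawlOnError branches above throw, so the crawl-failing behavior depends on logger.fatal() terminating the process rather than merely logging. A minimal sketch of such a method, as an assumption about the project's logger rather than code from this commit:

// Hypothetical fatal(): log at error level, then exit non-zero so the
// whole crawl fails, matching the option's "exit code 1" description.
fatal(message, details = {}) {
  this.error(message, details);  // assumed structured-logging helper
  process.exit(1);
}

As a style aside, isSeedUrl() could avoid building the intermediate array: return this.params.scopedSeeds.some((seed) => seed.url === url);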

util/argParser.js (+6)
@@ -372,6 +372,12 @@ class ArgParser {
         describe: "If set, write error messages to redis",
         type: "boolean",
         default: false,
+      },
+
+      "failOnFailedSeed": {
+        describe: "If set, crawler will fail with exit code 1 if initial seed fails",
+        type: "boolean",
+        default: false
       }
     };
   }
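
The option itself is plain yargs configuration. A self-contained sketch of how such a boolean option parses, assuming yargs v17 with its ESM helpers (a standalone example, not the project's actual parser):

import yargs from "yargs";
import { hideBin } from "yargs/helpers";

// Hypothetical standalone parser: failOnFailedSeed defaults to false and
// flips to true when --failOnFailedSeed is passed on the command line.
const argv = yargs(hideBin(process.argv))
  .options({
    failOnFailedSeed: {
      describe: "If set, crawler will fail with exit code 1 if initial seed fails",
      type: "boolean",
      default: false,
    },
  })
  .parseSync();

console.log(argv.failOnFailedSeed);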
