Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Catch 4xx and 5xx page.goto() responses to mark invalid URLs as failed #300

Merged
merged 5 commits into from
Apr 26, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 33 additions & 4 deletions crawler.js
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,14 @@ export class Crawler {
return seed.isIncluded(url, depth, extraHops, logDetails);
}

isSeedUrl(url) {
const seeds = this.params.scopedSeeds.map(seed => seed.url);
if (seeds.indexOf(url) != -1){
return true;
}
return false;
}

async setupPage({page, cdp, workerid}) {
await this.browser.setupPage({page, cdp});

Expand Down Expand Up @@ -941,6 +949,8 @@ export class Crawler {

const logDetails = data.logDetails;

const failCrawlOnError = (this.isSeedUrl(url) && this.params.failOnFailedSeed);

let isHTMLPage = await timedRun(
this.isHTML(url),
FETCH_TIMEOUT_SECS,
Expand Down Expand Up @@ -998,6 +1008,17 @@ export class Crawler {
try {
const resp = await page.goto(url, gotoOpts);

// Handle 4xx or 5xx response as a page load error
const statusCode = resp.status();
if (statusCode.toString().startsWith("4") || statusCode.toString().startsWith("5")) {
if (failCrawlOnError) {
logger.fatal("Seed Page Load Error, failing crawl", {statusCode, ...logDetails});
} else {
logger.error("Page Load Error, skipping page", {statusCode, ...logDetails});
throw new Error(`Page ${url} returned status code ${statusCode}`);
}
}

const contentType = await resp.headerValue("content-type");

isHTMLPage = this.isHTMLContentType(contentType);
Expand All @@ -1007,15 +1028,23 @@ export class Crawler {
if (!msg.startsWith("net::ERR_ABORTED") || !ignoreAbort) {
if (e.name === "TimeoutError") {
if (data.loadState !== LoadState.CONTENT_LOADED) {
logger.error("Page Load Timeout, skipping page", {msg, ...logDetails});
throw e;
if (failCrawlOnError) {
logger.fatal("Seed Page Load Timeout, failing crawl", {msg, ...logDetails});
} else {
logger.error("Page Load Timeout, skipping page", {msg, ...logDetails});
throw e;
}
} else {
logger.warn("Page Loading Slowly, skipping behaviors", {msg, ...logDetails});
data.skipBehaviors = true;
}
} else {
logger.error("Page Load Error, skipping page", {msg, ...logDetails});
throw e;
if (failCrawlOnError) {
            logger.fatal("Seed Page Load Error, failing crawl", {msg, ...logDetails});
} else {
logger.error("Page Load Error, skipping page", {msg, ...logDetails});
throw e;
}
}
}
}
Expand Down
6 changes: 6 additions & 0 deletions util/argParser.js
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,12 @@ class ArgParser {
describe: "If set, write error messages to redis",
type: "boolean",
default: false,
},

"failOnFailedSeed": {
describe: "If set, crawler will fail with exit code 1 if initial seed fails",
type: "boolean",
default: false
}
};
}
Expand Down