From c44e6e37f289787113fa0e45b83d9a89a1219737 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Tue, 25 Apr 2023 16:19:39 -0400
Subject: [PATCH 1/4] Catch 400 pywb errors on page load and mark page failed

---
 crawler.js | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/crawler.js b/crawler.js
index 56404e71b..a60b6fd7b 100644
--- a/crawler.js
+++ b/crawler.js
@@ -998,6 +998,13 @@ export class Crawler {
     try {
       const resp = await page.goto(url, gotoOpts);
 
+      const status = resp.status();
+      if (status === 400) {
+        // pywb error, mark as page load failed
+        logger.error("Page Load Error, skipping page", {status, ...logDetails});
+        throw new Error(`Page ${url} returned 400 error`);
+      }
+
       const contentType = await resp.headerValue("content-type");
 
       isHTMLPage = this.isHTMLContentType(contentType);

From 818d4c925f9a39115701c73d5f1bc98c231ccab2 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Tue, 25 Apr 2023 16:35:32 -0400
Subject: [PATCH 2/4] Add --failOnFailedSeed option to fail crawl if seed doesn't load

Resolves issue #207
---
 crawler.js        | 36 +++++++++++++++++++++++++++++-------
 util/argParser.js |  6 ++++++
 2 files changed, 35 insertions(+), 7 deletions(-)

diff --git a/crawler.js b/crawler.js
index a60b6fd7b..0cfcfcc21 100644
--- a/crawler.js
+++ b/crawler.js
@@ -352,6 +352,14 @@ export class Crawler {
     return seed.isIncluded(url, depth, extraHops, logDetails);
   }
 
+  isSeedUrl(url) {
+    const seeds = this.params.scopedSeeds.map(seed => seed.url);
+    if (seeds.indexOf(url) != -1){
+      return true;
+    }
+    return false;
+  }
+
   async setupPage({page, cdp, workerid}) {
     await this.browser.setupPage({page, cdp});
 
@@ -941,6 +949,8 @@ export class Crawler {
 
     const logDetails = data.logDetails;
 
+    const failCrawlOnError = (this.isSeedUrl(url) && this.params.failOnFailedSeed);
+
     let isHTMLPage = await timedRun(
       this.isHTML(url),
       FETCH_TIMEOUT_SECS,
@@ -1000,9 +1010,13 @@ export class Crawler {
 
       const status = resp.status();
       if (status === 400) {
-        // pywb error, mark as page load failed
-        logger.error("Page Load Error, skipping page", {status, ...logDetails});
-        throw new Error(`Page ${url} returned 400 error`);
+        if (failCrawlOnError) {
+          logger.fatal("Page Load Error on Seed, failing crawl", {status, ...logDetails});
+        } else {
+          logger.error("Page Load Error, skipping page", {status, ...logDetails});
+          throw new Error(`Page ${url} returned 400 error`);
+        }
+
       }
 
       const contentType = await resp.headerValue("content-type");
@@ -1014,15 +1028,23 @@ export class Crawler {
       if (!msg.startsWith("net::ERR_ABORTED") || !ignoreAbort) {
         if (e.name === "TimeoutError") {
           if (data.loadState !== LoadState.CONTENT_LOADED) {
-            logger.error("Page Load Timeout, skipping page", {msg, ...logDetails});
-            throw e;
+            if (failCrawlOnError) {
+              logger.fatal("Page Load Timeout on Seed, failing crawl", {msg, ...logDetails});
+            } else {
+              logger.error("Page Load Timeout, skipping page", {msg, ...logDetails});
+              throw e;
+            }
           } else {
             logger.warn("Page Loading Slowly, skipping behaviors", {msg, ...logDetails});
             data.skipBehaviors = true;
           }
         } else {
-          logger.error("Page Load Error, skipping page", {msg, ...logDetails});
-          throw e;
+          if (failCrawlOnError) {
+            logger.fatal("Page Load Timeout on Seed, failing crawl", {msg, ...logDetails});
+          } else {
+            logger.error("Page Load Error, skipping page", {msg, ...logDetails});
+            throw e;
+          }
         }
       }
     }
diff --git a/util/argParser.js b/util/argParser.js
index c755d9075..606053060 100644
--- a/util/argParser.js
+++ b/util/argParser.js
@@ -372,6 +372,12 @@ class ArgParser {
         descripe: "If set, write error messages to redis",
         type: "boolean",
         default: false,
+      },
+
+      "failOnFailedSeed": {
+        describe: "If set, crawler will fail with exit code 1 if initial seed fails",
+        type: "boolean",
+        default: false
       }
     };
   }
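
The two patches above gate the hard-failure path on failCrawlOnError, so a
failed load aborts the whole crawl only when the page is a seed and
--failOnFailedSeed is set. A minimal standalone sketch of that decision
(shouldFailCrawl is a hypothetical name; the patch computes the same boolean
inline from isSeedUrl() and this.params):

    // Sketch of the gating logic introduced above: a failed page load is
    // fatal only for a seed URL, and only when --failOnFailedSeed is enabled.
    function shouldFailCrawl(url, scopedSeeds, params) {
      const isSeed = scopedSeeds.some((seed) => seed.url === url);
      return isSeed && params.failOnFailedSeed;
    }

    // shouldFailCrawl("https://example.com/", [{url: "https://example.com/"}], {failOnFailedSeed: true})  -> true
    // shouldFailCrawl("https://example.com/a", [{url: "https://example.com/"}], {failOnFailedSeed: true}) -> false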

From 26d83df58f516d0220cba4251f2b7ac01d2b5fdc Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Tue, 25 Apr 2023 16:52:57 -0400
Subject: [PATCH 3/4] Handle 4xx or 5xx page.goto responses as page load errors

---
 crawler.js | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/crawler.js b/crawler.js
index 0cfcfcc21..7557083c3 100644
--- a/crawler.js
+++ b/crawler.js
@@ -1008,15 +1008,15 @@ export class Crawler {
     try {
       const resp = await page.goto(url, gotoOpts);
 
-      const status = resp.status();
-      if (status === 400) {
+      // Handle 4xx or 5xx response as a page load error
+      const statusCode = resp.status();
+      if (statusCode.toString().startsWith("4") || statusCode.toString().startsWith("5")) {
         if (failCrawlOnError) {
-          logger.fatal("Page Load Error on Seed, failing crawl", {status, ...logDetails});
+          logger.fatal("Seed Page Load Error, failing crawl", {statusCode, ...logDetails});
         } else {
-          logger.error("Page Load Error, skipping page", {status, ...logDetails});
-          throw new Error(`Page ${url} returned 400 error`);
+          logger.error("Page Load Error, skipping page", {statusCode, ...logDetails});
+          throw new Error(`Page ${url} returned status code ${statusCode}`);
         }
-
       }
 
       const contentType = await resp.headerValue("content-type");
@@ -1029,7 +1029,7 @@ export class Crawler {
         if (e.name === "TimeoutError") {
           if (data.loadState !== LoadState.CONTENT_LOADED) {
             if (failCrawlOnError) {
-              logger.fatal("Page Load Timeout on Seed, failing crawl", {msg, ...logDetails});
+              logger.fatal("Seed Page Load Timeout, failing crawl", {msg, ...logDetails});
             } else {
               logger.error("Page Load Timeout, skipping page", {msg, ...logDetails});
               throw e;
@@ -1040,7 +1040,7 @@ export class Crawler {
           }
         } else {
           if (failCrawlOnError) {
-            logger.fatal("Page Load Timeout on Seed, failing crawl", {msg, ...logDetails});
+            logger.fatal("Seed Page Load Timeout, failing crawl", {msg, ...logDetails});
           } else {
             logger.error("Page Load Error, skipping page", {msg, ...logDetails});
             throw e;
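
The string-prefix test above means every 4xx and 5xx response is now treated
as a page load error, not just pywb's 400. HTTP status codes are three digits,
so a leading "4" or "5" covers exactly 400-599; an equivalent numeric check,
standalone for illustration (isPageLoadError is a hypothetical name):

    // Equivalent to the statusCode.toString().startsWith("4") ||
    // startsWith("5") test in the patch, for three-digit status codes.
    function isPageLoadError(statusCode) {
      return statusCode >= 400 && statusCode < 600;
    }

    console.log(isPageLoadError(404)); // true
    console.log(isPageLoadError(503)); // true
    console.log(isPageLoadError(200)); // false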

From fa52432493aad9fcbe4304f47c91af5c4145da38 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Wed, 26 Apr 2023 17:04:44 -0400
Subject: [PATCH 4/4] Code review changes

---
 crawler.js        | 10 +---------
 util/argParser.js |  2 +-
 2 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/crawler.js b/crawler.js
index 7557083c3..89d39b5f3 100644
--- a/crawler.js
+++ b/crawler.js
@@ -352,14 +352,6 @@ export class Crawler {
     return seed.isIncluded(url, depth, extraHops, logDetails);
   }
 
-  isSeedUrl(url) {
-    const seeds = this.params.scopedSeeds.map(seed => seed.url);
-    if (seeds.indexOf(url) != -1){
-      return true;
-    }
-    return false;
-  }
-
   async setupPage({page, cdp, workerid}) {
     await this.browser.setupPage({page, cdp});
 
@@ -949,7 +941,7 @@ export class Crawler {
 
     const logDetails = data.logDetails;
 
-    const failCrawlOnError = (this.isSeedUrl(url) && this.params.failOnFailedSeed);
+    const failCrawlOnError = ((depth === 0) && this.params.failOnFailedSeed);
 
     let isHTMLPage = await timedRun(
       this.isHTML(url),
diff --git a/util/argParser.js b/util/argParser.js
index 606053060..eed65d27f 100644
--- a/util/argParser.js
+++ b/util/argParser.js
@@ -375,7 +375,7 @@ class ArgParser {
       },
 
       "failOnFailedSeed": {
-        describe: "If set, crawler will fail with exit code 1 if initial seed fails",
+        describe: "If set, crawler will fail with exit code 1 if any seed fails",
         type: "boolean",
         default: false
       }
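
After review, seed detection no longer scans this.params.scopedSeeds: seeds
are exactly the pages queued at depth 0, so (depth === 0) identifies them
directly and the isSeedUrl() helper is dropped. A sketch of the final check
(shouldFailCrawl is again a hypothetical wrapper around the inline
expression):

    // Final form after review: a failed load is fatal only for pages queued
    // at depth 0 (the seeds), and only when --failOnFailedSeed is set.
    function shouldFailCrawl(depth, params) {
      return depth === 0 && params.failOnFailedSeed;
    }

    console.log(shouldFailCrawl(0, {failOnFailedSeed: true})); // true: a failed seed fails the crawl
    console.log(shouldFailCrawl(2, {failOnFailedSeed: true})); // false: non-seed pages are skipped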