@@ -352,6 +352,14 @@ export class Crawler {
352
352
return seed . isIncluded ( url , depth , extraHops , logDetails ) ;
353
353
}
354
354
355
+ isSeedUrl ( url ) {
356
+ const seeds = this . params . scopedSeeds . map ( seed => seed . url ) ;
357
+ if ( seeds . indexOf ( url ) != - 1 ) {
358
+ return true ;
359
+ }
360
+ return false ;
361
+ }
362
+
355
363
async setupPage ( { page, cdp, workerid} ) {
356
364
await this . browser . setupPage ( { page, cdp} ) ;
357
365
@@ -941,6 +949,8 @@ export class Crawler {
941
949
942
950
const logDetails = data . logDetails ;
943
951
952
+ const failCrawlOnError = ( this . isSeedUrl ( url ) && this . params . failOnFailedSeed ) ;
953
+
944
954
let isHTMLPage = await timedRun (
945
955
this . isHTML ( url ) ,
946
956
FETCH_TIMEOUT_SECS ,
@@ -1000,9 +1010,13 @@ export class Crawler {
1000
1010
1001
1011
const status = resp . status ( ) ;
1002
1012
if ( status === 400 ) {
1003
- // pywb error, mark as page load failed
1004
- logger . error ( "Page Load Error, skipping page" , { status, ...logDetails } ) ;
1005
- throw new Error ( `Page ${ url } returned 400 error` ) ;
1013
+ if ( failCrawlOnError ) {
1014
+ logger . fatal ( "Page Load Error on Seed, failing crawl" , { status, ...logDetails } ) ;
1015
+ } else {
1016
+ logger . error ( "Page Load Error, skipping page" , { status, ...logDetails } ) ;
1017
+ throw new Error ( `Page ${ url } returned 400 error` ) ;
1018
+ }
1019
+
1006
1020
}
1007
1021
1008
1022
const contentType = await resp . headerValue ( "content-type" ) ;
@@ -1014,15 +1028,23 @@ export class Crawler {
1014
1028
if ( ! msg . startsWith ( "net::ERR_ABORTED" ) || ! ignoreAbort ) {
1015
1029
if ( e . name === "TimeoutError" ) {
1016
1030
if ( data . loadState !== LoadState . CONTENT_LOADED ) {
1017
- logger . error ( "Page Load Timeout, skipping page" , { msg, ...logDetails } ) ;
1018
- throw e ;
1031
+ if ( failCrawlOnError ) {
1032
+ logger . fatal ( "Page Load Timeout on Seed, failing crawl" , { msg, ...logDetails } ) ;
1033
+ } else {
1034
+ logger . error ( "Page Load Timeout, skipping page" , { msg, ...logDetails } ) ;
1035
+ throw e ;
1036
+ }
1019
1037
} else {
1020
1038
logger . warn ( "Page Loading Slowly, skipping behaviors" , { msg, ...logDetails } ) ;
1021
1039
data . skipBehaviors = true ;
1022
1040
}
1023
1041
} else {
1024
- logger . error ( "Page Load Error, skipping page" , { msg, ...logDetails } ) ;
1025
- throw e ;
1042
+ if ( failCrawlOnError ) {
1043
+ logger . fatal ( "Page Load Timeout on Seed, failing crawl" , { msg, ...logDetails } ) ;
1044
+ } else {
1045
+ logger . error ( "Page Load Error, skipping page" , { msg, ...logDetails } ) ;
1046
+ throw e ;
1047
+ }
1026
1048
}
1027
1049
}
1028
1050
}
0 commit comments