@@ -23,8 +23,11 @@ import { TempFileBuffer, WARCSerializer } from "warcio/node";
23
23
import { WARCWriter } from "./warcwriter.js" ;
24
24
import { RedisCrawlState , WorkerId } from "./state.js" ;
25
25
import { CDPSession , Protocol } from "puppeteer-core" ;
26
+ import { Crawler } from "../crawler.js" ;
27
+
28
+ const MAX_BROWSER_DEFAULT_FETCH_SIZE = 5_000_000 ;
29
+ const MAX_BROWSER_TEXT_FETCH_SIZE = 25_000_000 ;
26
30
27
- const MAX_BROWSER_FETCH_SIZE = 2_000_000 ;
28
31
const MAX_NETWORK_LOAD_SIZE = 200_000_000 ;
29
32
30
33
const ASYNC_FETCH_DUPE_KEY = "s:fetchdupe" ;
@@ -44,9 +47,8 @@ function logNetwork(msg: string, data: any) {
44
47
export class Recorder {
45
48
workerid : WorkerId ;
46
49
collDir : string ;
47
- // TODO: Fix this the next time the file is edited.
48
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
49
- crawler : any ;
50
+
51
+ crawler : Crawler ;
50
52
51
53
crawlState : RedisCrawlState ;
52
54
@@ -75,6 +77,7 @@ export class Recorder {
75
77
76
78
writer : WARCWriter ;
77
79
80
+ pageUrl ! : string ;
78
81
pageid ! : string ;
79
82
80
83
constructor ( {
@@ -85,8 +88,8 @@ export class Recorder {
85
88
workerid : WorkerId ;
86
89
collDir : string ;
87
90
// NOTE: the earlier any-typed `crawler` TODO is resolved; `crawler` is now typed as `Crawler`.
88
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
89
- crawler : any ;
91
+
92
+ crawler : Crawler ;
90
93
} ) {
91
94
this . workerid = workerid ;
92
95
this . crawler = crawler ;
@@ -463,23 +466,28 @@ export class Recorder {
463
466
464
467
let streamingConsume = false ;
465
468
466
- if ( contentLen < 0 || contentLen > MAX_BROWSER_FETCH_SIZE ) {
469
+ const contentType = this . _getContentType ( responseHeaders ) ;
470
+
471
+ // allow a larger fetch size for text-like responses (HTML, JSON, JavaScript), see allowLargeContent()
472
+ const matchFetchSize = this . allowLargeContent ( contentType )
473
+ ? MAX_BROWSER_TEXT_FETCH_SIZE
474
+ : MAX_BROWSER_DEFAULT_FETCH_SIZE ;
475
+
476
+ if ( contentLen < 0 || contentLen > matchFetchSize ) {
467
477
// Options handed to the async fetchers used for responses that are too
// large (or of unknown length) to be fetched directly in the browser.
const opts = {
  tempdir: this.tempdir,
  reqresp,
  expectedSize: contentLen,
  recorder: this,
  networkId,
  cdp,
  requestId,
  // AsyncFetcher's constructor destructures `maxFetchSize`; passing the
  // limit under any other key is silently ignored and the fetcher falls
  // back to MAX_BROWSER_DEFAULT_FETCH_SIZE.
  maxFetchSize: matchFetchSize,
};
475
487
476
488
// fetching using response stream, await here and then either call fulFill, or if not started, return false
477
489
if ( contentLen < 0 ) {
478
- const fetcher = new ResponseStreamAsyncFetcher ( {
479
- ...opts ,
480
- requestId,
481
- cdp,
482
- } ) ;
490
+ const fetcher = new ResponseStreamAsyncFetcher ( opts ) ;
483
491
const res = await fetcher . load ( ) ;
484
492
switch ( res ) {
485
493
case "dupe" :
@@ -533,7 +541,7 @@ export class Recorder {
533
541
}
534
542
}
535
543
536
- const rewritten = await this . rewriteResponse ( reqresp ) ;
544
+ const rewritten = await this . rewriteResponse ( reqresp , contentType ) ;
537
545
538
546
// if in service worker, serialize here
539
547
// as won't be getting a loadingFinished message
@@ -590,6 +598,7 @@ export class Recorder {
590
598
591
599
startPage ( { pageid, url } : { pageid : string ; url : string } ) {
592
600
this . pageid = pageid ;
601
+ this . pageUrl = url ;
593
602
this . logDetails = { page : url , workerid : this . workerid } ;
594
603
if ( this . pendingRequests && this . pendingRequests . size ) {
595
604
logger . debug (
@@ -700,8 +709,11 @@ export class Recorder {
700
709
return false ;
701
710
}
702
711
703
- async rewriteResponse ( reqresp : RequestResponseInfo ) {
704
- const { url, responseHeadersList, extraOpts, payload } = reqresp ;
712
+ async rewriteResponse (
713
+ reqresp : RequestResponseInfo ,
714
+ contentType : string | null ,
715
+ ) {
716
+ const { url, extraOpts, payload } = reqresp ;
705
717
706
718
if ( ! payload || ! payload . length ) {
707
719
return false ;
@@ -710,9 +722,7 @@ export class Recorder {
710
722
let newString = null ;
711
723
let string = null ;
712
724
713
- const ct = this . _getContentType ( responseHeadersList ) ;
714
-
715
- switch ( ct ) {
725
+ switch ( contentType ) {
716
726
case "application/x-mpegURL" :
717
727
case "application/vnd.apple.mpegurl" :
718
728
string = payload . toString ( ) ;
@@ -759,6 +769,18 @@ export class Recorder {
759
769
//return Buffer.from(newString).toString("base64");
760
770
}
761
771
772
/**
 * Reports whether a response with the given content type qualifies for the
 * larger browser fetch size limit.
 *
 * Only HTML, JSON and JavaScript media types qualify. Media type names are
 * case-insensitive and may be followed by parameters (for example
 * `; charset=utf-8`), so the value is reduced to its bare, lower-cased
 * media type before the lookup.
 *
 * @param contentType - Content-Type value of the response, or `null` when
 *   none is available.
 * @returns `true` if the media type is on the allow-list, `false` otherwise
 *   (including for `null` or an empty string).
 */
allowLargeContent(contentType: string | null): boolean {
  const allowLargeCTs = [
    "text/html",
    "application/json",
    "text/javascript",
    "application/javascript",
    "application/x-javascript",
  ];

  const raw = contentType ?? "";

  // drop any parameters after the first ';', then surrounding whitespace and casing
  const paramStart = raw.indexOf(";");
  const mediaType = (paramStart === -1 ? raw : raw.slice(0, paramStart))
    .trim()
    .toLowerCase();

  return allowLargeCTs.includes(mediaType);
}
783
+
762
784
_getContentType (
763
785
headers ?: Protocol . Fetch . HeaderEntry [ ] | { name : string ; value : string } [ ] ,
764
786
) {
@@ -916,6 +938,8 @@ class AsyncFetcher {
916
938
filter ?: ( resp : Response ) => boolean ;
917
939
ignoreDupe = false ;
918
940
941
+ maxFetchSize : number ;
942
+
919
943
recorder : Recorder ;
920
944
921
945
tempdir : string ;
@@ -929,6 +953,7 @@ class AsyncFetcher {
929
953
networkId,
930
954
filter = undefined ,
931
955
ignoreDupe = false ,
956
+ maxFetchSize = MAX_BROWSER_DEFAULT_FETCH_SIZE ,
932
957
} : {
933
958
tempdir : string ;
934
959
reqresp : RequestResponseInfo ;
@@ -937,6 +962,7 @@ class AsyncFetcher {
937
962
networkId : string ;
938
963
filter ?: ( resp : Response ) => boolean ;
939
964
ignoreDupe ?: boolean ;
965
+ maxFetchSize ?: number ;
940
966
} ) {
941
967
this . reqresp = reqresp ;
942
968
this . reqresp . expectedSize = expectedSize ;
@@ -953,6 +979,8 @@ class AsyncFetcher {
953
979
this . tempdir ,
954
980
`${ timestampNow ( ) } -${ uuidv4 ( ) } .data` ,
955
981
) ;
982
+
983
+ this . maxFetchSize = maxFetchSize ;
956
984
}
957
985
958
986
async load ( ) {
@@ -983,7 +1011,7 @@ class AsyncFetcher {
983
1011
984
1012
const serializer = new WARCSerializer ( responseRecord , {
985
1013
gzip,
986
- maxMemSize : MAX_BROWSER_FETCH_SIZE ,
1014
+ maxMemSize : this . maxFetchSize ,
987
1015
} ) ;
988
1016
989
1017
try {
0 commit comments