Skip to content

Commit ab0f66a

Browse files
authored
Raise size limit for large HTML pages (#430)
Previously, responses >2MB were streamed to disk and an empty response was returned to the browser, to avoid holding large responses in memory. This limit was too small, as some HTML pages may be >2MB, resulting in no content being loaded. This PR sets different limits by content type: - HTML, as well as JSON and JS that may be necessary for the page to load: 25MB - All other content: 5MB. Also includes some more type fixes.
1 parent 783d006 commit ab0f66a

File tree

2 files changed

+57
-41
lines changed

2 files changed

+57
-41
lines changed

src/util/recorder.ts

+47-19
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,11 @@ import { TempFileBuffer, WARCSerializer } from "warcio/node";
2323
import { WARCWriter } from "./warcwriter.js";
2424
import { RedisCrawlState, WorkerId } from "./state.js";
2525
import { CDPSession, Protocol } from "puppeteer-core";
26+
import { Crawler } from "../crawler.js";
27+
28+
const MAX_BROWSER_DEFAULT_FETCH_SIZE = 5_000_000;
29+
const MAX_BROWSER_TEXT_FETCH_SIZE = 25_000_000;
2630

27-
const MAX_BROWSER_FETCH_SIZE = 2_000_000;
2831
const MAX_NETWORK_LOAD_SIZE = 200_000_000;
2932

3033
const ASYNC_FETCH_DUPE_KEY = "s:fetchdupe";
@@ -44,9 +47,8 @@ function logNetwork(msg: string, data: any) {
4447
export class Recorder {
4548
workerid: WorkerId;
4649
collDir: string;
47-
// TODO: Fix this the next time the file is edited.
48-
// eslint-disable-next-line @typescript-eslint/no-explicit-any
49-
crawler: any;
50+
51+
crawler: Crawler;
5052

5153
crawlState: RedisCrawlState;
5254

@@ -75,6 +77,7 @@ export class Recorder {
7577

7678
writer: WARCWriter;
7779

80+
pageUrl!: string;
7881
pageid!: string;
7982

8083
constructor({
@@ -85,8 +88,8 @@ export class Recorder {
8588
workerid: WorkerId;
8689
collDir: string;
8790
// TODO: Fix this the next time the file is edited.
88-
// eslint-disable-next-line @typescript-eslint/no-explicit-any
89-
crawler: any;
91+
92+
crawler: Crawler;
9093
}) {
9194
this.workerid = workerid;
9295
this.crawler = crawler;
@@ -463,23 +466,28 @@ export class Recorder {
463466

464467
let streamingConsume = false;
465468

466-
if (contentLen < 0 || contentLen > MAX_BROWSER_FETCH_SIZE) {
469+
const contentType = this._getContentType(responseHeaders);
470+
471+
// set max fetch size higher for HTML, JSON and JS responses (matched by content type), as the page may need them to load
472+
const matchFetchSize = this.allowLargeContent(contentType)
473+
? MAX_BROWSER_TEXT_FETCH_SIZE
474+
: MAX_BROWSER_DEFAULT_FETCH_SIZE;
475+
476+
if (contentLen < 0 || contentLen > matchFetchSize) {
467477
const opts = {
468478
tempdir: this.tempdir,
469479
reqresp,
470480
expectedSize: contentLen,
471481
recorder: this,
472482
networkId,
473483
cdp,
484+
requestId,
485+
matchFetchSize,
474486
};
475487

476488
// fetching using response stream, await here and then either call fulFill, or if not started, return false
477489
if (contentLen < 0) {
478-
const fetcher = new ResponseStreamAsyncFetcher({
479-
...opts,
480-
requestId,
481-
cdp,
482-
});
490+
const fetcher = new ResponseStreamAsyncFetcher(opts);
483491
const res = await fetcher.load();
484492
switch (res) {
485493
case "dupe":
@@ -533,7 +541,7 @@ export class Recorder {
533541
}
534542
}
535543

536-
const rewritten = await this.rewriteResponse(reqresp);
544+
const rewritten = await this.rewriteResponse(reqresp, contentType);
537545

538546
// if in service worker, serialize here
539547
// as won't be getting a loadingFinished message
@@ -590,6 +598,7 @@ export class Recorder {
590598

591599
startPage({ pageid, url }: { pageid: string; url: string }) {
592600
this.pageid = pageid;
601+
this.pageUrl = url;
593602
this.logDetails = { page: url, workerid: this.workerid };
594603
if (this.pendingRequests && this.pendingRequests.size) {
595604
logger.debug(
@@ -700,8 +709,11 @@ export class Recorder {
700709
return false;
701710
}
702711

703-
async rewriteResponse(reqresp: RequestResponseInfo) {
704-
const { url, responseHeadersList, extraOpts, payload } = reqresp;
712+
async rewriteResponse(
713+
reqresp: RequestResponseInfo,
714+
contentType: string | null,
715+
) {
716+
const { url, extraOpts, payload } = reqresp;
705717

706718
if (!payload || !payload.length) {
707719
return false;
@@ -710,9 +722,7 @@ export class Recorder {
710722
let newString = null;
711723
let string = null;
712724

713-
const ct = this._getContentType(responseHeadersList);
714-
715-
switch (ct) {
725+
switch (contentType) {
716726
case "application/x-mpegURL":
717727
case "application/vnd.apple.mpegurl":
718728
string = payload.toString();
@@ -759,6 +769,18 @@ export class Recorder {
759769
//return Buffer.from(newString).toString("base64");
760770
}
761771

772+
allowLargeContent(contentType: string | null) {
773+
const allowLargeCTs = [
774+
"text/html",
775+
"application/json",
776+
"text/javascript",
777+
"application/javascript",
778+
"application/x-javascript",
779+
];
780+
781+
return allowLargeCTs.includes(contentType || "");
782+
}
783+
762784
_getContentType(
763785
headers?: Protocol.Fetch.HeaderEntry[] | { name: string; value: string }[],
764786
) {
@@ -916,6 +938,8 @@ class AsyncFetcher {
916938
filter?: (resp: Response) => boolean;
917939
ignoreDupe = false;
918940

941+
maxFetchSize: number;
942+
919943
recorder: Recorder;
920944

921945
tempdir: string;
@@ -929,6 +953,7 @@ class AsyncFetcher {
929953
networkId,
930954
filter = undefined,
931955
ignoreDupe = false,
956+
maxFetchSize = MAX_BROWSER_DEFAULT_FETCH_SIZE,
932957
}: {
933958
tempdir: string;
934959
reqresp: RequestResponseInfo;
@@ -937,6 +962,7 @@ class AsyncFetcher {
937962
networkId: string;
938963
filter?: (resp: Response) => boolean;
939964
ignoreDupe?: boolean;
965+
maxFetchSize?: number;
940966
}) {
941967
this.reqresp = reqresp;
942968
this.reqresp.expectedSize = expectedSize;
@@ -953,6 +979,8 @@ class AsyncFetcher {
953979
this.tempdir,
954980
`${timestampNow()}-${uuidv4()}.data`,
955981
);
982+
983+
this.maxFetchSize = maxFetchSize;
956984
}
957985

958986
async load() {
@@ -983,7 +1011,7 @@ class AsyncFetcher {
9831011

9841012
const serializer = new WARCSerializer(responseRecord, {
9851013
gzip,
986-
maxMemSize: MAX_BROWSER_FETCH_SIZE,
1014+
maxMemSize: this.maxFetchSize,
9871015
});
9881016

9891017
try {

src/util/reqresp.ts

+10-22
Original file line numberDiff line numberDiff line change
@@ -63,28 +63,16 @@ export class RequestResponseInfo {
6363
this.requestId = requestId;
6464
}
6565

66-
// TODO: Fix this the next time the file is edited.
67-
// eslint-disable-next-line @typescript-eslint/no-explicit-any
68-
fillRequest(params: Record<string, any>) {
66+
fillFetchRequestPaused(params: Protocol.Fetch.RequestPausedEvent) {
6967
this.url = params.request.url;
7068
this.method = params.request.method;
7169
if (!this.requestHeaders) {
7270
this.requestHeaders = params.request.headers;
7371
}
7472
this.postData = params.request.postData;
75-
this.hasPostData = params.request.hasPostData;
76-
77-
if (params.type) {
78-
this.resourceType = params.type;
79-
}
80-
}
81-
82-
// TODO: Fix this the next time the file is edited.
83-
// eslint-disable-next-line @typescript-eslint/no-explicit-any
84-
fillFetchRequestPaused(params: Record<string, any>) {
85-
this.fillRequest(params);
73+
this.hasPostData = params.request.hasPostData || false;
8674

87-
this.status = params.responseStatusCode;
75+
this.status = params.responseStatusCode || 0;
8876
this.statusText = params.responseStatusText || getStatusText(this.status);
8977

9078
this.responseHeadersList = params.responseHeaders;
@@ -153,25 +141,25 @@ export class RequestResponseInfo {
153141
}
154142
}
155143

156-
fillResponseReceivedExtraInfo(params: Record<string, string>) {
144+
fillResponseReceivedExtraInfo(
145+
params: Protocol.Network.ResponseReceivedExtraInfoEvent,
146+
) {
157147
// this.responseHeaders = params.headers;
158148
// if (params.headersText) {
159149
// this.responseHeadersText = params.headersText;
160150
// }
161151
this.extraOpts.ipType = params.resourceIPAddressSpace;
162152
}
163153

164-
// TODO: Fix this the next time the file is edited.
165-
// eslint-disable-next-line @typescript-eslint/no-explicit-any
166-
fillFetchResponse(response: Record<string, any>) {
154+
fillFetchResponse(response: Response) {
167155
this.responseHeaders = Object.fromEntries(response.headers);
168156
this.status = response.status;
169157
this.statusText = response.statusText || getStatusText(this.status);
170158
}
171159

172-
// TODO: Fix this the next time the file is edited.
173-
// eslint-disable-next-line @typescript-eslint/no-explicit-any
174-
fillRequestExtraInfo(params: Record<string, any>) {
160+
fillRequestExtraInfo(
161+
params: Protocol.Network.RequestWillBeSentExtraInfoEvent,
162+
) {
175163
this.requestHeaders = params.headers;
176164
}
177165

0 commit comments

Comments
 (0)