Skip to content

Commit dd577e2

Browse files
committed
range request follow up to #709:
- fix: prefer streaming current response via takeStream, not only when size is unknown - ensure partial range requests are not async fetched, only full responses - don't serialize zero-payload responses - don't serialize 206 responses if there is size mismatch
1 parent ea05307 commit dd577e2

File tree

2 files changed

+35
-23
lines changed

2 files changed

+35
-23
lines changed

src/util/recorder.ts

+33-21
Original file line numberDiff line numberDiff line change
@@ -697,19 +697,18 @@ export class Recorder {
697697
requestId,
698698
};
699699

700-
// fetching using response stream, await here and then either call fulFill, or if not started, return false
701-
if (contentLen < 0) {
702-
const fetcher = new ResponseStreamAsyncFetcher(opts);
703-
const res = await fetcher.load();
704-
switch (res) {
705-
case "dupe":
706-
this.removeReqResp(networkId);
707-
return false;
708-
709-
case "fetched":
710-
streamingConsume = true;
711-
break;
712-
}
700+
// fetching using response stream as first attempt,
701+
// await here and then either call fulFill, or if dupe, return false
702+
const fetcher = new ResponseStreamAsyncFetcher(opts);
703+
const res = await fetcher.load();
704+
switch (res) {
705+
case "dupe":
706+
this.removeReqResp(networkId);
707+
return false;
708+
709+
case "fetched":
710+
streamingConsume = true;
711+
break;
713712
}
714713

715714
// if not consumed via takeStream, attempt async loading
@@ -750,7 +749,12 @@ export class Recorder {
750749

751750
// if in browser context, and not also intercepted in page context
752751
// serialize here, as won't be getting a loadingFinished message for it
753-
if (isBrowserContext && !reqresp.inPageContext && reqresp.payload) {
752+
if (
753+
isBrowserContext &&
754+
!reqresp.inPageContext &&
755+
reqresp.payload &&
756+
reqresp.payload.length > 0
757+
) {
754758
this.removeReqResp(networkId);
755759
await this.serializeToWARC(reqresp);
756760
}
@@ -788,7 +792,7 @@ export class Recorder {
788792
? "document not loaded in browser, possibly other URLs missing"
789793
: "URL not loaded in browser";
790794

791-
logger.debug(msg, { url, resourceType }, "recorder");
795+
logger.debug(msg, { url, resourceType, e }, "recorder");
792796
}
793797

794798
return true;
@@ -797,7 +801,11 @@ export class Recorder {
797801
addAsyncFetch(opts: NetworkLoadAsyncFetchOptions, contentLen: number) {
798802
let fetcher: AsyncFetcher;
799803

800-
if (opts.reqresp.method !== "GET" || contentLen > MAX_NETWORK_LOAD_SIZE) {
804+
if (
805+
opts.reqresp.method !== "GET" ||
806+
contentLen > MAX_NETWORK_LOAD_SIZE ||
807+
!opts.reqresp.inPageContext
808+
) {
801809
fetcher = new AsyncFetcher(opts);
802810
} else {
803811
fetcher = new NetworkLoadStreamAsyncFetcher(opts);
@@ -866,7 +874,7 @@ export class Recorder {
866874

867875
async awaitPageResources() {
868876
for (const [requestId, reqresp] of this.pendingRequests.entries()) {
869-
if (reqresp.payload) {
877+
if (reqresp.payload && reqresp.payload.length > 0) {
870878
this.removeReqResp(requestId);
871879
await this.serializeToWARC(reqresp);
872880
// if no url, and not fetch intercept or async loading,
@@ -1455,8 +1463,10 @@ class AsyncFetcher {
14551463
},
14561464
"recorder",
14571465
);
1458-
//await crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url);
1459-
//return fetched;
1466+
if (status === 206) {
1467+
await crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url, status);
1468+
return "notfetched";
1469+
}
14601470
}
14611471

14621472
const externalBuffer: TempFileBuffer =
@@ -1521,8 +1531,10 @@ class AsyncFetcher {
15211531
const { method, url } = reqresp;
15221532
logger.debug("Async started: fetch", { url }, "recorder");
15231533

1524-
const headers = reqresp.getRequestHeadersDict();
1525-
1534+
const headers = new Headers(reqresp.getRequestHeadersDict());
1535+
if (headers.has("range")) {
1536+
headers.set("range", "bytes=0-");
1537+
}
15261538
const dispatcher = getGlobalDispatcher().compose((dispatch) => {
15271539
return (opts, handler) => {
15281540
if (opts.headers) {

src/util/worker.ts

+2-2
Original file line numberDiff line numberDiff line change
@@ -438,9 +438,9 @@ export async function runWorkers(
438438

439439
await Promise.allSettled(workers.map((worker) => worker.run()));
440440

441-
await crawler.browser.close();
442-
443441
await closeWorkers();
442+
443+
await crawler.browser.close();
444444
}
445445

446446
// ===========================================================================

0 commit comments

Comments
 (0)