Commit b0e93cb

Add option for sleep interval after behaviors run + timing cleanup (#257)

* Add --pageExtraDelay option to add extra delay/wait time after every page (fixes #131)
* Store total page time in 'maxPageTime', include pageExtraDelay
* Rename timeout -> pageLoadTimeout
* Cleanup:
  - store seconds for most interval checks, convert to ms only for API calls, remove most sec <-> ms conversions
  - add secondsElapsed() utility function to help with checking elapsed time
  - clean up comments

Co-authored-by: Ilya Kreymer <[email protected]>
1 parent 02fb137 commit b0e93cb

File tree

6 files changed: +72 -48 lines changed

README.md (+26 -18)

@@ -68,13 +68,12 @@ Browsertrix Crawler includes a number of additional command-line options, explai…
   --crawlId, --id                A user provided ID for this crawl or crawl configuration (can also be set via CRAWL_ID env var)
-                                 [string] [default: "06bf9a4df9f7"]
+                                 [string] [default: "ce75810e6874"]
   --newContext                   Deprecated as of 0.8.0, any values passed will be ignored [string] [default: null]
-  --waitUntil                    Puppeteer page.goto() condition to wait for before continuing, can be multiple separate by ','
+  --waitUntil                    Playwright page.goto() condition to wait for before continuing
                                  [default: "load"]
   --depth                        The depth of the crawl for all seeds [number] [default: -1]

@@ -83,11 +82,11 @@
   …[number] [default: 0]
   --limit                        Limit crawl to this number of pages [number] [default: 0]
-  --timeout                      Timeout for each page to load (in seconds) [number] [default: 90]
+  --pageLoadTimeout, --timeout   Timeout for each page to load (in seconds) [number] [default: 90]
   --scopeType                    A predefined scope of the crawl. For more customization, use 'custom' and set scopeIncludeRx regexes
                                  [string] [choices: "page", "page-spa", "prefix", "host", "domain", "any", "custom"]
   --scopeIncludeRx, --include    Regex of page URLs that should be included…

@@ -131,19 +130,20 @@
   --generateWACZ, --generatewacz, --generateWacz
                                  If set, generate wacz [boolean] [default: false]
-  --logging                      Logging options for crawler, can include: stats, pywb, behaviors, behaviors-debug, jserrors [string] [default: "stats"]
+  --logging                      Logging options for crawler, can include: stats (enabled by default), jserrors, pywb, debug [string] [default: "stats"]
-  --text                         If set, extract text to the pages.jsonly file [boolean] [default: false]
+  --text                         If set, extract text to the pages.jsonl file [boolean] [default: false]
   --cwd                          Crawl working directory for captures (pywb root). If not set, defaults to process.cwd() [string] [default: "/crawls"]
-  --mobileDevice                 Emulate mobile device by name from: https://github.com/puppeteer/puppeteer/blob/main/src/common/DeviceDescriptors.ts [string]
+  --mobileDevice                 Emulate mobile device by name from: https://github.com/microsoft/playwright/blob/main/packages/playwright-core/src/server/deviceDescriptorsSource.json [string]
   --userAgent                    Override user-agent with specified string [string]
   --userAgentSuffix              Append suffix to existing browser user-agent…

@@ -162,12 +162,16 @@
   …behavior will run on each page. If 0, a behavior can run until finish. [number] [default: 90]
+  --pageExtraDelay, --delay      If >0, amount of time to sleep (in seconds) after behaviors before moving on to next page [number] [default: 0]
   --profile                      Path to tar.gz file which will be extracted and used as the browser profile [string]
   --screenshot                   Screenshot options for crawler, can include: view, thumbnail, fullPage (comma-separated list) [string] [default: ""]
   --screencastPort               If set to a non-zero value, starts an HTTP server with screencast access…

@@ -181,9 +185,10 @@
   …to record in combined WARCs
-  --redisStoreUrl                If set, url for remote redis server to store state. Otherwise, using in-memory store [string]
+  --redisStoreUrl                If set, url for remote redis server to store state. Otherwise, using in-memory store [string] [default: "redis://localhost:6379/0"]
-  --saveState                    If the crawl state should be serialized to the crawls/ directory. Defaultts to 'partial', only saved when crawl is interrupted
+  --saveState                    If the crawl state should be serialized to the crawls/ directory. Defaults to 'partial', only saved when crawl is interrupted
                                  [string] [choices: "never", "partial", "always"] [default: "partial"]

@@ -212,8 +217,11 @@
-  --netIdleWait                  if set, wait for network idle after page load and after behaviors are done (in seconds). if -1 (default), dedetermine based on scope [number] [default: -1]
+  --netIdleWait                  if set, wait for network idle after page load and after behaviors are done (in seconds). if -1 (default), determine based on scope [number] [default: -1]
+  --lang                         if set, sets the language used by the browser, should be ISO 639 language[-country] code [string]
   --config                       Path to YAML config file
   ```
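
Taken together, the user-facing changes above are: `--timeout` survives as an alias for the renamed `--pageLoadTimeout`, and the new `--pageExtraDelay` (alias `--delay`) sleeps for a fixed number of seconds after behaviors finish on each page. For example, a crawl run with `--pageLoadTimeout 60 --pageExtraDelay 10` would allow up to 60 seconds per page load and then linger 10 extra seconds on each page, while existing configurations that pass `--timeout` keep working unchanged.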

crawler.js (+23 -13)

@@ -20,7 +20,7 @@ import { parseArgs } from "./util/argParser.js";
 import { initRedis } from "./util/redis.js";
 import { logger, errJSON } from "./util/logger.js";
 import { runWorkers } from "./util/worker.js";
-import { sleep, timedRun } from "./util/timing.js";
+import { sleep, timedRun, secondsElapsed } from "./util/timing.js";

 import { Browser } from "./util/browser.js";

@@ -76,7 +76,11 @@
     this.saveStateFiles = [];
     this.lastSaveTime = 0;
-    this.saveStateInterval = this.params.saveStateInterval * 1000;
+
+    // sum of page load + behavior timeouts + 2 x fetch + cloudflare + link extraction timeouts + extra page delay
+    // if exceeded, will interrupt and move on to next page (likely behaviors or some other operation is stuck)
+    this.maxPageTime = this.params.pageLoadTimeout + this.params.behaviorTimeout +
+      FETCH_TIMEOUT_SECS*2 + PAGE_OP_TIMEOUT_SECS*2 + this.params.pageExtraDelay;

     this.emulateDevice = this.params.emulateDevice || {};

@@ -85,7 +89,7 @@
     this.gotoOpts = {
       waitUntil: this.params.waitUntil,
-      timeout: this.params.timeout
+      timeout: this.params.pageLoadTimeout * 1000
     };

     // pages directory

@@ -152,7 +156,9 @@
     logger.debug(`Storing state via Redis ${redisUrl} @ key prefix "${this.crawlId}"`, {}, "state");

-    this.crawlState = new RedisCrawlState(redis, this.params.crawlId, this.params.behaviorTimeout + this.params.timeout, os.hostname());
+    logger.debug(`Max Page Time: ${this.maxPageTime} seconds`, {}, "state");
+
+    this.crawlState = new RedisCrawlState(redis, this.params.crawlId, this.maxPageTime, os.hostname());

     if (this.params.saveState === "always" && this.params.saveStateInterval) {
       logger.debug(`Saving crawl state every ${this.params.saveStateInterval} seconds, keeping last ${this.params.saveStateHistory} states`, {}, "state");

@@ -406,11 +412,9 @@
     } else if (data.skipBehaviors) {
       logger.info("Skipping behaviors for slow page", logDetails, "behavior");
     } else {
-      const behaviorTimeout = this.params.behaviorTimeout / 1000;
-
       const res = await timedRun(
         this.runBehaviors(page, data.filteredFrames, logDetails),
-        behaviorTimeout,
+        this.params.behaviorTimeout,
         "Behaviors timed out",
         logDetails,
         "behavior"

@@ -423,6 +427,11 @@
       }
     }

+    if (this.params.pageExtraDelay) {
+      logger.info(`Waiting ${this.params.pageExtraDelay} seconds before moving on to next page`, logDetails);
+      await sleep(this.params.pageExtraDelay);
+    }
+
     return true;
   }

@@ -557,8 +566,8 @@
     }

     if (this.params.timeLimit) {
-      const elapsed = (Date.now() - this.startTime) / 1000;
-      if (elapsed > this.params.timeLimit) {
+      const elapsed = secondsElapsed(this.startTime);
+      if (elapsed >= this.params.timeLimit) {
         logger.info(`Time threshold reached ${elapsed} > ${this.params.timeLimit}, stopping`);
         interrupt = true;
       }

@@ -683,9 +692,10 @@
       }
     });

-    const totalPageTimeout = (this.params.behaviorTimeout + this.params.timeout) / 1000 + 60;
-
-    await runWorkers(this, this.params.workers, totalPageTimeout);
+    // --------------
+    // Run Crawl Here!
+    await runWorkers(this, this.params.workers, this.maxPageTime);
+    // --------------

     await this.serializeConfig(true);

@@ -1359,7 +1369,7 @@
     if (!done) {
       // if not done, save state only after specified interval has elapsed
-      if ((now.getTime() - this.lastSaveTime) < this.saveStateInterval) {
+      if (secondsElapsed(this.lastSaveTime, now) < this.params.saveStateInterval) {
         return;
       }
     }
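
The heart of the timing cleanup is the `maxPageTime` budget computed in the constructor above and threaded through to both the Redis state and the workers, replacing the previous ad-hoc `(behaviorTimeout + timeout) / 1000 + 60` calculation. A minimal sketch of the arithmetic, using assumed values for the `FETCH_TIMEOUT_SECS` and `PAGE_OP_TIMEOUT_SECS` constants (their real definitions live elsewhere in the codebase and are not visible in this diff):

```js
// Assumed stand-ins for the crawler's internal constants; the real values
// are defined elsewhere in the codebase and are not shown in this diff.
const FETCH_TIMEOUT_SECS = 30;
const PAGE_OP_TIMEOUT_SECS = 5;

// Mirrors the constructor logic above: page load + behaviors + 2x fetch +
// 2x page-op (cloudflare check, link extraction) + the new extra page delay.
function computeMaxPageTime({ pageLoadTimeout, behaviorTimeout, pageExtraDelay }) {
  return pageLoadTimeout + behaviorTimeout +
    FETCH_TIMEOUT_SECS * 2 + PAGE_OP_TIMEOUT_SECS * 2 + pageExtraDelay;
}

// With the documented defaults (90s load, 90s behaviors) plus a 10s delay:
console.log(computeMaxPageTime({ pageLoadTimeout: 90, behaviorTimeout: 90, pageExtraDelay: 10 }));
// -> 260 seconds, given the assumed constants
```

If a page exceeds this budget, the worker interrupts it and moves on, on the theory that behaviors or some other operation are stuck.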

util/argParser.js (+10 -8)

@@ -74,7 +74,8 @@ class ArgParser {
       type: "number",
     },

-    "timeout": {
+    "pageLoadTimeout": {
+      alias: "timeout",
       describe: "Timeout for each page to load (in seconds)",
       default: 90,
       type: "number",

@@ -223,6 +224,13 @@
       type: "number",
     },

+    "pageExtraDelay": {
+      alias: "delay",
+      describe: "If >0, amount of time to sleep (in seconds) after behaviors before moving on to next page",
+      default: 0,
+      type: "number",
+    },
+
     "profile": {
       describe: "Path to tar.gz file which will be extracted and used as the browser profile",
       type: "string",

@@ -354,10 +362,7 @@
       logger.fatal(`\n${argv.collection} is an invalid collection name. Please supply a collection name only using alphanumeric characters and the following characters [_ - ]\n`);
     }

-    argv.timeout *= 1000;
-
-    // waitUntil condition must be: load, domcontentloaded, networkidle
-    // TODO: Playwright migration - for now, can only support one
+    // waitUntil condition must be one of WAIT_UNTIL_OPTS: load, domcontentloaded, networkidle
     // (see: https://playwright.dev/docs/api/class-page#page-goto-option-wait-until)
     if (!WAIT_UNTIL_OPTS.includes(argv.waitUntil)) {
       logger.fatal("Invalid waitUntil option, must be one of: " + WAIT_UNTIL_OPTS.join(","));

@@ -385,9 +390,6 @@
       argv.behaviors = argv.behaviors.split(",");
     }
     argv.behaviors.forEach((x) => behaviorOpts[x] = true);
-    if (argv.behaviorTimeout) {
-      behaviorOpts.timeout = argv.behaviorTimeout *= 1000;
-    }
     behaviorOpts.log = BEHAVIOR_LOG_FUNC;
     argv.behaviorOpts = JSON.stringify(behaviorOpts);
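
The rename stays backward compatible because yargs treats an alias as the same option: both `--timeout` and `--pageLoadTimeout` populate both keys on `argv`. A self-contained sketch of that behavior, assuming yargs v17 (this is not the crawler's actual parser wiring):

```js
import yargs from "yargs";
import { hideBin } from "yargs/helpers";

const argv = yargs(hideBin(process.argv))
  .option("pageLoadTimeout", {
    alias: "timeout",
    describe: "Timeout for each page to load (in seconds)",
    default: 90,
    type: "number",
  })
  .parseSync();

// `--timeout 120` and `--pageLoadTimeout 120` are equivalent:
console.log(argv.pageLoadTimeout, argv.timeout); // 120 120
```

Note also that the two `* 1000` conversions are gone from the parser: the parsed values now stay in seconds everywhere, and callers convert to milliseconds only at API boundaries such as `page.goto()`.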

util/state.js (+3 -3)

@@ -39,7 +39,7 @@ export class PageState
 // ============================================================================
 export class RedisCrawlState
 {
-  constructor(redis, key, pageTimeout, uid) {
+  constructor(redis, key, maxPageTime, uid) {
     this.redis = redis;

     this.maxRetryPending = 1;

@@ -48,7 +48,7 @@

     this.uid = uid;
     this.key = key;
-    this.pageTimeout = pageTimeout / 1000;
+    this.maxPageTime = maxPageTime;

     this.qkey = this.key + ":q";
     this.pkey = this.key + ":p";

@@ -152,7 +152,7 @@ return 0;
   async markStarted(url) {
     const started = this._timestamp();

-    return await this.redis.markstarted(this.pkey, this.pkey + ":" + url, url, started, this.pageTimeout);
+    return await this.redis.markstarted(this.pkey, this.pkey + ":" + url, url, started, this.maxPageTime);
   }

   async markFinished(url) {
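
Because `maxPageTime` is already in seconds, `RedisCrawlState` can pass it straight through instead of dividing by 1000; Redis expiry takes seconds natively. A rough sketch of the underlying idea, assuming ioredis and simplified key handling (the crawler's actual `markstarted` is a custom Redis script with a different shape):

```js
import Redis from "ioredis";

const redis = new Redis("redis://localhost:6379/0");

// Illustrative only: mark a page as claimed, with a marker that expires after
// maxPageTime seconds so a page held by a stalled worker becomes retryable.
async function markStarted(pkey, url, maxPageTimeSecs) {
  await redis.set(`${pkey}:${url}`, new Date().toISOString(), "EX", maxPageTimeSecs);
}
```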

util/timing.js (+4)

@@ -24,4 +24,8 @@ export function timedRun(promise, seconds, message="Promise timed out", logDetai…
   });
 }

+export function secondsElapsed(startTime, nowDate = null) {
+  nowDate = nowDate || new Date();

+  return (nowDate.getTime() - startTime) / 1000;
+}
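
`secondsElapsed()` replaces the scattered `(Date.now() - start) / 1000` expressions in crawler.js. A short usage sketch, run as an ES module to match the project:

```js
import { sleep, secondsElapsed } from "./util/timing.js";

// startTime is a millisecond epoch timestamp, matching Date.now()
// and Date#getTime().
const startTime = Date.now();

// sleep() takes seconds, matching how crawler.js calls it after this change.
await sleep(2);

console.log(secondsElapsed(startTime)); // ~2

// An explicit "now" can be passed, as the save-state check does with its Date:
console.log(secondsElapsed(startTime, new Date()));
```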

util/worker.js (+6 -6)

@@ -6,14 +6,14 @@ const MAX_REUSE = 5;
 const NEW_WINDOW_TIMEOUT = 10;

 // ===========================================================================
-export function runWorkers(crawler, numWorkers, timeout) {
+export function runWorkers(crawler, numWorkers, maxPageTime) {
   logger.info(`Creating ${numWorkers} workers`, {}, "worker");

   const workers = [];

   for (let i = 0; i < numWorkers; i++) {
-    //workers.push(new PageWorker(`worker-${i+1}`, crawler, timeout));
-    workers.push(new PageWorker(i, crawler, timeout));
+    //workers.push(new PageWorker(`worker-${i+1}`, crawler, maxPageTime));
+    workers.push(new PageWorker(i, crawler, maxPageTime));
   }

   return Promise.allSettled(workers.map((worker) => worker.run()));

@@ -23,10 +23,10 @@
 // ===========================================================================
 export class PageWorker
 {
-  constructor(id, crawler, timeout) {
+  constructor(id, crawler, maxPageTime) {
     this.id = id;
     this.crawler = crawler;
-    this.timeout = timeout;
+    this.maxPageTime = maxPageTime;

     this.reuseCount = 0;
     this.page = null;

@@ -134,7 +134,7 @@
     await Promise.race([
       timedRun(
         this.crawler.crawlPage(opts),
-        this.timeout,
+        this.maxPageTime,
         "Page Worker Timeout",
         {workerid},
         "worker"
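
Each worker now bounds the entire `crawlPage()` call with `timedRun()` and the shared `maxPageTime` budget, rather than a separately computed timeout. The diff shows only `timedRun`'s call sites and part of its signature, so here is a hedged reimplementation of the pattern it relies on, not the actual util/timing.js code:

```js
// Race the work against a timer measured in seconds; clear the timer so a
// promise that finishes early doesn't leave a pending timeout behind.
function timedRun(promise, seconds, message = "Promise timed out") {
  let timer;
  const timeout = new Promise((resolve) => {
    timer = setTimeout(() => {
      console.warn(`${message} (after ${seconds} seconds)`);
      resolve(null);
    }, seconds * 1000);
  });
  return Promise.race([promise, timeout]).finally(() => clearTimeout(timer));
}

// Usage mirroring the worker loop (crawler, opts, and maxPageTime are
// placeholders for the crawler's real objects):
// await timedRun(crawler.crawlPage(opts), maxPageTime, "Page Worker Timeout");
```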