Commit b0e93cb

Add option for sleep interval after behaviors run + timing cleanup (#257)

* Add --pageExtraDelay option to add extra delay/wait time after every page (fixes #131)
* Store total page time in 'maxPageTime', include pageExtraDelay
* Rename timeout -> pageLoadTimeout
* Cleanup:
  - store seconds for most interval checks, convert to ms only for API calls, remove most sec <-> ms conversions
  - add secondsElapsed() utility function to help with checking elapsed time
  - clean up comments

Co-authored-by: Ilya Kreymer <[email protected]>
1 parent 02fb137 commit b0e93cb

File tree

6 files changed: +72 -48 lines changed

README.md (+26 -18)

@@ -68,13 +68,12 @@ Browsertrix Crawler includes a number of additional command-line options, explai…
   --crawlId, --id                A user provided ID for this crawl or crawl configuration (can also be set via CRAWL_ID env var)
-                                 [string] [default: "06bf9a4df9f7"]
+                                 [string] [default: "ce75810e6874"]
   --newContext                   Deprecated as of 0.8.0, any values passed will be ignored [string] [default: null]
-  --waitUntil                    Puppeteer page.goto() condition to wait for before continuing, can be multiple separate by ','
+  --waitUntil                    Playwright page.goto() condition to wait for before continuing
                                  [default: "load"]
   --depth                        The depth of the crawl for all seeds [number] [default: -1]

@@ -83,11 +82,11 @@
   …[number] [default: 0]
   --limit                        Limit crawl to this number of pages [number] [default: 0]
-  --timeout                      Timeout for each page to load (in seconds) [number] [default: 90]
+  --pageLoadTimeout, --timeout   Timeout for each page to load (in seconds) [number] [default: 90]
   --scopeType                    A predefined scope of the crawl. For more customization, use 'custom' and set scopeIncludeRx regexes
                                  [string] [choices: "page", "page-spa", "prefix", "host", "domain", "any", "custom"]
   --scopeIncludeRx, --include    Regex of page URLs that should be included…

@@ -131,19 +130,20 @@
   --generateWACZ, --generatewacz, --generateWacz
                                  If set, generate wacz [boolean] [default: false]
-  --logging                      Logging options for crawler, can include: stats, pywb, behaviors, behaviors-debug, jserrors [string] [default: "stats"]
+  --logging                      Logging options for crawler, can include: stats (enabled by default), jserrors, pywb, debug [string] [default: "stats"]
-  --text                         If set, extract text to the pages.jsonly file [boolean] [default: false]
+  --text                         If set, extract text to the pages.jsonl file [boolean] [default: false]
   --cwd                          Crawl working directory for captures (pywb root). If not set, defaults to process.cwd() [string] [default: "/crawls"]
-  --mobileDevice                 Emulate mobile device by name from: https://github.com/puppeteer/puppeteer/blob/main/src/common/DeviceDescriptors.ts [string]
+  --mobileDevice                 Emulate mobile device by name from: https://github.com/microsoft/playwright/blob/main/packages/playwright-core/src/server/deviceDescriptorsSource.json [string]
   --userAgent                    Override user-agent with specified string [string]
   --userAgentSuffix              Append suffix to existing browser user-agent…

@@ -162,12 +162,16 @@
   …behavior will run on each page. If 0, a behavior can run until finish. [number] [default: 90]
+  --pageExtraDelay, --delay      If >0, amount of time to sleep (in seconds) after behaviors before moving on to next page [number] [default: 0]
   --profile                      Path to tar.gz file which will be extracted and used as the browser profile [string]
   --screenshot                   Screenshot options for crawler, can include: view, thumbnail, fullPage (comma-separated list) [string] [default: ""]
   --screencastPort               If set to a non-zero value, starts an HTTP server with screencast access…

@@ -181,9 +185,10 @@
   …to record in combined WARCs
-  --redisStoreUrl                If set, url for remote redis server to store state. Otherwise, using in-memory store [string]
+  --redisStoreUrl                If set, url for remote redis server to store state. Otherwise, using in-memory store [string] [default: "redis://localhost:6379/0"]
-  --saveState                    If the crawl state should be serialized to the crawls/ directory. Defaultts to 'partial', only saved when crawl is interrupted
+  --saveState                    If the crawl state should be serialized to the crawls/ directory. Defaults to 'partial', only saved when crawl is interrupted
                                  [string] [choices: "never", "partial", "always"] [default: "partial"]

@@ -212,8 +217,11 @@
-  --netIdleWait                  if set, wait for network idle after page load and after behaviors are done (in seconds). if -1 (default), dedetermine based on scope [number] [default: -1]
+  --netIdleWait                  if set, wait for network idle after page load and after behaviors are done (in seconds). if -1 (default), determine based on scope [number] [default: -1]
+  --lang                         if set, sets the language used by the browser, should be ISO 639 language[-country] code [string]
   --config                       Path to YAML config file
   ```
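
Taken together, the user-facing changes above are: `--timeout` survives as an alias for the renamed `--pageLoadTimeout`, and the new `--pageExtraDelay` (alias `--delay`) sleeps for a fixed number of seconds after behaviors finish on each page. For example, a crawl run with `--pageLoadTimeout 60 --pageExtraDelay 10` would allow up to 60 seconds per page load and then linger 10 extra seconds on each page, while existing configurations that pass `--timeout` keep working unchanged.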

crawler.js (+23 -13)

@@ -20,7 +20,7 @@ import { parseArgs } from "./util/argParser.js";
 import { initRedis } from "./util/redis.js";
 import { logger, errJSON } from "./util/logger.js";
 import { runWorkers } from "./util/worker.js";
-import { sleep, timedRun } from "./util/timing.js";
+import { sleep, timedRun, secondsElapsed } from "./util/timing.js";

 import { Browser } from "./util/browser.js";

@@ -76,7 +76,11 @@
     this.saveStateFiles = [];
     this.lastSaveTime = 0;
-    this.saveStateInterval = this.params.saveStateInterval * 1000;
+
+    // sum of page load + behavior timeouts + 2 x fetch + cloudflare + link extraction timeouts + extra page delay
+    // if exceeded, will interrupt and move on to next page (likely behaviors or some other operation is stuck)
+    this.maxPageTime = this.params.pageLoadTimeout + this.params.behaviorTimeout +
+      FETCH_TIMEOUT_SECS*2 + PAGE_OP_TIMEOUT_SECS*2 + this.params.pageExtraDelay;

     this.emulateDevice = this.params.emulateDevice || {};

@@ -85,7 +89,7 @@
     this.gotoOpts = {
       waitUntil: this.params.waitUntil,
-      timeout: this.params.timeout
+      timeout: this.params.pageLoadTimeout * 1000
     };

     // pages directory

@@ -152,7 +156,9 @@
     logger.debug(`Storing state via Redis ${redisUrl} @ key prefix "${this.crawlId}"`, {}, "state");

-    this.crawlState = new RedisCrawlState(redis, this.params.crawlId, this.params.behaviorTimeout + this.params.timeout, os.hostname());
+    logger.debug(`Max Page Time: ${this.maxPageTime} seconds`, {}, "state");
+
+    this.crawlState = new RedisCrawlState(redis, this.params.crawlId, this.maxPageTime, os.hostname());

     if (this.params.saveState === "always" && this.params.saveStateInterval) {
       logger.debug(`Saving crawl state every ${this.params.saveStateInterval} seconds, keeping last ${this.params.saveStateHistory} states`, {}, "state");

@@ -406,11 +412,9 @@
     } else if (data.skipBehaviors) {
       logger.info("Skipping behaviors for slow page", logDetails, "behavior");
     } else {
-      const behaviorTimeout = this.params.behaviorTimeout / 1000;
-
       const res = await timedRun(
         this.runBehaviors(page, data.filteredFrames, logDetails),
-        behaviorTimeout,
+        this.params.behaviorTimeout,
         "Behaviors timed out",
         logDetails,
         "behavior"

@@ -423,6 +427,11 @@
       }
     }

+    if (this.params.pageExtraDelay) {
+      logger.info(`Waiting ${this.params.pageExtraDelay} seconds before moving on to next page`, logDetails);
+      await sleep(this.params.pageExtraDelay);
+    }
+
     return true;
   }

@@ -557,8 +566,8 @@
     }

     if (this.params.timeLimit) {
-      const elapsed = (Date.now() - this.startTime) / 1000;
-      if (elapsed > this.params.timeLimit) {
+      const elapsed = secondsElapsed(this.startTime);
+      if (elapsed >= this.params.timeLimit) {
         logger.info(`Time threshold reached ${elapsed} > ${this.params.timeLimit}, stopping`);
         interrupt = true;
       }

@@ -683,9 +692,10 @@
       }
     });

-    const totalPageTimeout = (this.params.behaviorTimeout + this.params.timeout) / 1000 + 60;
-
-    await runWorkers(this, this.params.workers, totalPageTimeout);
+    // --------------
+    // Run Crawl Here!
+    await runWorkers(this, this.params.workers, this.maxPageTime);
+    // --------------

     await this.serializeConfig(true);

@@ -1359,7 +1369,7 @@
     if (!done) {
       // if not done, save state only after specified interval has elapsed
-      if ((now.getTime() - this.lastSaveTime) < this.saveStateInterval) {
+      if (secondsElapsed(this.lastSaveTime, now) < this.params.saveStateInterval) {
         return;
       }
     }
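
The heart of the timing cleanup is the `maxPageTime` budget computed in the constructor above and threaded through to both the Redis state and the workers, replacing the previous ad-hoc `(behaviorTimeout + timeout) / 1000 + 60` calculation. A minimal sketch of the arithmetic, using assumed values for the `FETCH_TIMEOUT_SECS` and `PAGE_OP_TIMEOUT_SECS` constants (their real definitions live elsewhere in the codebase and are not visible in this diff):

```js
// Assumed stand-ins for the crawler's internal constants; the real values
// are defined elsewhere in the codebase and are not shown in this diff.
const FETCH_TIMEOUT_SECS = 30;
const PAGE_OP_TIMEOUT_SECS = 5;

// Mirrors the constructor logic above: page load + behaviors + 2x fetch +
// 2x page-op (cloudflare check, link extraction) + the new extra page delay.
function computeMaxPageTime({ pageLoadTimeout, behaviorTimeout, pageExtraDelay }) {
  return pageLoadTimeout + behaviorTimeout +
    FETCH_TIMEOUT_SECS * 2 + PAGE_OP_TIMEOUT_SECS * 2 + pageExtraDelay;
}

// With the documented defaults (90s load, 90s behaviors) plus a 10s delay:
console.log(computeMaxPageTime({ pageLoadTimeout: 90, behaviorTimeout: 90, pageExtraDelay: 10 }));
// -> 260 seconds, given the assumed constants
```

If a page exceeds this budget, the worker interrupts it and moves on, on the theory that behaviors or some other operation are stuck.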

util/argParser.js (+10 -8)

@@ -74,7 +74,8 @@ class ArgParser {
       type: "number",
     },

-    "timeout": {
+    "pageLoadTimeout": {
+      alias: "timeout",
       describe: "Timeout for each page to load (in seconds)",
       default: 90,
       type: "number",

@@ -223,6 +224,13 @@
       type: "number",
     },

+    "pageExtraDelay": {
+      alias: "delay",
+      describe: "If >0, amount of time to sleep (in seconds) after behaviors before moving on to next page",
+      default: 0,
+      type: "number",
+    },
+
     "profile": {
       describe: "Path to tar.gz file which will be extracted and used as the browser profile",
       type: "string",

@@ -354,10 +362,7 @@
       logger.fatal(`\n${argv.collection} is an invalid collection name. Please supply a collection name only using alphanumeric characters and the following characters [_ - ]\n`);
     }

-    argv.timeout *= 1000;
-
-    // waitUntil condition must be: load, domcontentloaded, networkidle
-    // TODO: Playwright migration - for now, can only support one
+    // waitUntil condition must be one of WAIT_UNTIL_OPTS: load, domcontentloaded, networkidle
     // (see: https://playwright.dev/docs/api/class-page#page-goto-option-wait-until)
     if (!WAIT_UNTIL_OPTS.includes(argv.waitUntil)) {
       logger.fatal("Invalid waitUntil option, must be one of: " + WAIT_UNTIL_OPTS.join(","));

@@ -385,9 +390,6 @@
       argv.behaviors = argv.behaviors.split(",");
     }
     argv.behaviors.forEach((x) => behaviorOpts[x] = true);
-    if (argv.behaviorTimeout) {
-      behaviorOpts.timeout = argv.behaviorTimeout *= 1000;
-    }
     behaviorOpts.log = BEHAVIOR_LOG_FUNC;
     argv.behaviorOpts = JSON.stringify(behaviorOpts);
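
The rename stays backward compatible because yargs treats an alias as the same option: both `--timeout` and `--pageLoadTimeout` populate both keys on `argv`. A self-contained sketch of that behavior, assuming yargs v17 (this is not the crawler's actual parser wiring):

```js
import yargs from "yargs";
import { hideBin } from "yargs/helpers";

const argv = yargs(hideBin(process.argv))
  .option("pageLoadTimeout", {
    alias: "timeout",
    describe: "Timeout for each page to load (in seconds)",
    default: 90,
    type: "number",
  })
  .parseSync();

// `--timeout 120` and `--pageLoadTimeout 120` are equivalent:
console.log(argv.pageLoadTimeout, argv.timeout); // 120 120
```

Note also that the two `* 1000` conversions are gone from the parser: the parsed values now stay in seconds everywhere, and callers convert to milliseconds only at API boundaries such as `page.goto()`.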

util/state.js (+3 -3)

@@ -39,7 +39,7 @@ export class PageState
 // ============================================================================
 export class RedisCrawlState
 {
-  constructor(redis, key, pageTimeout, uid) {
+  constructor(redis, key, maxPageTime, uid) {
     this.redis = redis;

     this.maxRetryPending = 1;

@@ -48,7 +48,7 @@

     this.uid = uid;
     this.key = key;
-    this.pageTimeout = pageTimeout / 1000;
+    this.maxPageTime = maxPageTime;

     this.qkey = this.key + ":q";
     this.pkey = this.key + ":p";

@@ -152,7 +152,7 @@ return 0;
   async markStarted(url) {
     const started = this._timestamp();

-    return await this.redis.markstarted(this.pkey, this.pkey + ":" + url, url, started, this.pageTimeout);
+    return await this.redis.markstarted(this.pkey, this.pkey + ":" + url, url, started, this.maxPageTime);
   }

   async markFinished(url) {
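
Because `maxPageTime` is already in seconds, `RedisCrawlState` can pass it straight through instead of dividing by 1000; Redis expiry takes seconds natively. A rough sketch of the underlying idea, assuming ioredis and simplified key handling (the crawler's actual `markstarted` is a custom Redis script with a different shape):

```js
import Redis from "ioredis";

const redis = new Redis("redis://localhost:6379/0");

// Illustrative only: mark a page as claimed, with a marker that expires after
// maxPageTime seconds so a page held by a stalled worker becomes retryable.
async function markStarted(pkey, url, maxPageTimeSecs) {
  await redis.set(`${pkey}:${url}`, new Date().toISOString(), "EX", maxPageTimeSecs);
}
```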

util/timing.js (+4)

@@ -24,4 +24,8 @@ export function timedRun(promise, seconds, message="Promise timed out", logDetai…
   });
 }

+export function secondsElapsed(startTime, nowDate = null) {
+  nowDate = nowDate || new Date();

+  return (nowDate.getTime() - startTime) / 1000;
+}
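
`secondsElapsed()` replaces the scattered `(Date.now() - start) / 1000` expressions in crawler.js. A short usage sketch, run as an ES module to match the project:

```js
import { sleep, secondsElapsed } from "./util/timing.js";

// startTime is a millisecond epoch timestamp, matching Date.now()
// and Date#getTime().
const startTime = Date.now();

// sleep() takes seconds, matching how crawler.js calls it after this change.
await sleep(2);

console.log(secondsElapsed(startTime)); // ~2

// An explicit "now" can be passed, as the save-state check does with its Date:
console.log(secondsElapsed(startTime, new Date()));
```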

util/worker.js (+6 -6)

@@ -6,14 +6,14 @@ const MAX_REUSE = 5;
 const NEW_WINDOW_TIMEOUT = 10;

 // ===========================================================================
-export function runWorkers(crawler, numWorkers, timeout) {
+export function runWorkers(crawler, numWorkers, maxPageTime) {
   logger.info(`Creating ${numWorkers} workers`, {}, "worker");

   const workers = [];

   for (let i = 0; i < numWorkers; i++) {
-    //workers.push(new PageWorker(`worker-${i+1}`, crawler, timeout));
-    workers.push(new PageWorker(i, crawler, timeout));
+    //workers.push(new PageWorker(`worker-${i+1}`, crawler, maxPageTime));
+    workers.push(new PageWorker(i, crawler, maxPageTime));
   }

   return Promise.allSettled(workers.map((worker) => worker.run()));

@@ -23,10 +23,10 @@
 // ===========================================================================
 export class PageWorker
 {
-  constructor(id, crawler, timeout) {
+  constructor(id, crawler, maxPageTime) {
     this.id = id;
     this.crawler = crawler;
-    this.timeout = timeout;
+    this.maxPageTime = maxPageTime;

     this.reuseCount = 0;
     this.page = null;

@@ -134,7 +134,7 @@
     await Promise.race([
       timedRun(
         this.crawler.crawlPage(opts),
-        this.timeout,
+        this.maxPageTime,
         "Page Worker Timeout",
         {workerid},
         "worker"
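
Each worker now bounds the entire `crawlPage()` call with `timedRun()` and the shared `maxPageTime` budget, rather than a separately computed timeout. The diff shows only `timedRun`'s call sites and part of its signature, so here is a hedged reimplementation of the pattern it relies on, not the actual util/timing.js code:

```js
// Race the work against a timer measured in seconds; clear the timer so a
// promise that finishes early doesn't leave a pending timeout behind.
function timedRun(promise, seconds, message = "Promise timed out") {
  let timer;
  const timeout = new Promise((resolve) => {
    timer = setTimeout(() => {
      console.warn(`${message} (after ${seconds} seconds)`);
      resolve(null);
    }, seconds * 1000);
  });
  return Promise.race([promise, timeout]).finally(() => clearTimeout(timer));
}

// Usage mirroring the worker loop (crawler, opts, and maxPageTime are
// placeholders for the crawler's real objects):
// await timedRun(crawler.crawlPage(opts), maxPageTime, "Page Worker Timeout");
```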