Skip to content

Commit f4c4203

Browse files
authored
crawl stopping / additional states: (#303)
* crawl stopping / additional states:
- adds a check for `isCrawlStopped()`, which reads a redis key to see whether the crawl has been stopped externally; this interrupts the work loop and prevents the crawl from starting on load
- adds additional crawl states — 'generate-wacz', 'generate-cdx', 'generate-warc', 'uploading-wacz', and 'pending-wait' — to indicate that the crawl is no longer running but the crawler is still performing work
- addresses part of webrecorder/browsertrix#263 and webrecorder/browsertrix#637
1 parent d4bc9e8 commit f4c4203

File tree

3 files changed

+24
-2
lines changed

3 files changed

+24
-2
lines changed

crawler.js

+13-1
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ const behaviors = fs.readFileSync(new URL("./node_modules/browsertrix-behaviors/
4444
const FETCH_TIMEOUT_SECS = 30;
4545
const PAGE_OP_TIMEOUT_SECS = 5;
4646

47+
const POST_CRAWL_STATES = ["generate-wacz", "uploading-wacz", "generate-cdx", "generate-warc"];
48+
4749

4850
// ============================================================================
4951
export class Crawler {
@@ -706,7 +708,12 @@ export class Crawler {
706708
this.storage = initStorage();
707709
}
708710

709-
if (initState === "finalize") {
711+
if (POST_CRAWL_STATES.includes(initState)) {
712+
logger.info("crawl already finished, running post-crawl tasks", {state: initState});
713+
await this.postCrawl();
714+
return;
715+
} else if (await this.crawlState.isCrawlStopped()) {
716+
logger.info("crawl stopped, running post-crawl tasks");
710717
await this.postCrawl();
711718
return;
712719
}
@@ -784,6 +791,7 @@ export class Crawler {
784791

785792
if (this.params.generateCDX) {
786793
logger.info("Generating CDX");
794+
await this.crawlState.setStatus("generate-cdx");
787795
await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd}));
788796
}
789797

@@ -824,6 +832,7 @@ export class Crawler {
824832

825833
async generateWACZ() {
826834
logger.info("Generating WACZ");
835+
await this.crawlState.setStatus("generate-wacz");
827836

828837
const archiveDir = path.join(this.collDir, "archive");
829838

@@ -901,6 +910,7 @@ export class Crawler {
901910
}
902911
*/
903912
if (this.storage) {
913+
await this.crawlState.setStatus("uploading-wacz");
904914
const filename = process.env.STORE_FILENAME || "@ts-@id.wacz";
905915
const targetFilename = interpolateFilename(filename, this.crawlId);
906916

@@ -1318,6 +1328,7 @@ export class Crawler {
13181328

13191329
async awaitPendingClear() {
13201330
logger.info("Waiting to ensure pending data is written to WARCs...");
1331+
await this.crawlState.setStatus("pending-wait");
13211332

13221333
const redis = await initRedis("redis://localhost/0");
13231334

@@ -1353,6 +1364,7 @@ export class Crawler {
13531364

13541365
async combineWARC() {
13551366
logger.info("Generating Combined WARCs");
1367+
await this.crawlState.setStatus("generate-warc");
13561368

13571369
// Get the list of created Warcs
13581370
const warcLists = await fsp.readdir(path.join(this.collDir, "archive"));

util/state.js

+10
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,16 @@ return 0;
208208
return await this.redis.hset(this.crawlSizeKey, this.uid, size);
209209
}
210210

211+
async isCrawlStopped() {
212+
return await this.redis.get("crawl-stop") === "1";
213+
}
214+
215+
// note: not currently called in crawler, but could be
216+
// crawl may be stopped by setting this elsewhere in shared redis
217+
async stopCrawl() {
218+
await this.redis.set("crawl-stop", "1");
219+
}
220+
211221
async incFailCount() {
212222
const key = `${this.key}:status:failcount:${this.uid}`;
213223
const res = await this.redis.incr(key);

util/worker.js

+1-1
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ export class PageWorker
180180
async runLoop() {
181181
const crawlState = this.crawler.crawlState;
182182

183-
while (!this.crawler.interrupted) {
183+
while (!this.crawler.interrupted && !await crawlState.isCrawlStopped()) {
184184
const data = await crawlState.nextFromQueue();
185185

186186
// see if any work data in the queue

0 commit comments

Comments
 (0)