diff --git a/crawler.js b/crawler.js
index 4f83ae023..468e927e1 100644
--- a/crawler.js
+++ b/crawler.js
@@ -13,7 +13,7 @@ import * as warcio from "warcio";
 
 import { HealthChecker } from "./util/healthcheck.js";
 import { TextExtract } from "./util/textextract.js";
-import { initStorage, getFileSize, getDirSize, interpolateFilename } from "./util/storage.js";
+import { initStorage, getFileSize, getDirSize, interpolateFilename, getDiskUsage } from "./util/storage.js";
 import { ScreenCaster, WSTransport, RedisPubSubTransport } from "./util/screencaster.js";
 import { Screenshots } from "./util/screenshots.js";
 import { parseArgs } from "./util/argParser.js";
@@ -553,11 +553,14 @@ export class Crawler {
   async checkLimits() {
     let interrupt = false;
 
-    if (this.params.sizeLimit) {
-      const dir = path.join(this.collDir, "archive");
-
-      const size = await getDirSize(dir);
+    let dir;
+    let size;
+    if (this.params.sizeLimit || this.params.diskUtilization) {
+      dir = path.join(this.collDir, "archive");
+      size = await getDirSize(dir);
+    }
 
+    if (this.params.sizeLimit) {
       if (size >= this.params.sizeLimit) {
         logger.info(`Size threshold reached ${size} >= ${this.params.sizeLimit}, stopping`);
         interrupt = true;
@@ -573,6 +576,36 @@ export class Crawler {
       }
     }
 
+    if (this.params.diskUtilization) {
+      // Check that disk usage isn't already above threshold
+      const diskUsage = await getDiskUsage();
+      const usedPercentage = parseInt(diskUsage["Use%"].slice(0, -1), 10);
+      if (usedPercentage >= this.params.diskUtilization) {
+        logger.info(`Disk utilization threshold reached ${usedPercentage}% >= ${this.params.diskUtilization}%, stopping`);
+        interrupt = true;
+      }
+
+      // Check that disk usage isn't likely to cross threshold
+      const kbUsed = parseInt(diskUsage["Used"], 10);
+      const kbTotal = parseInt(diskUsage["1K-blocks"], 10);
+      // NOTE(review): the x2 / x4 multipliers presumably reserve room for the
+      // extra copies written by combineWARC and/or generateWACZ -- confirm.
+      let kbArchiveDirSize = Math.floor(size/1024);
+      if (this.params.combineWARC && this.params.generateWACZ) {
+        kbArchiveDirSize *= 4;
+      } else if (this.params.combineWARC || this.params.generateWACZ) {
+        kbArchiveDirSize *= 2;
+      }
+
+      // Projected usage as a percentage of total capacity (used / total * 100)
+      const projectedTotal = kbUsed + kbArchiveDirSize;
+      const projectedUsedPercentage = Math.floor((projectedTotal / kbTotal) * 100);
+      if (projectedUsedPercentage >= this.params.diskUtilization) {
+        logger.info(`Disk utilization projected to reach threshold ${projectedUsedPercentage}% >= ${this.params.diskUtilization}%, stopping`);
+        interrupt = true;
+      }
+    }
+
     if (interrupt) {
       this.gracefulFinish();
     }
diff --git a/util/argParser.js b/util/argParser.js
index 2ca6c7baa..4bc6c1e2d 100644
--- a/util/argParser.js
+++ b/util/argParser.js
@@ -291,6 +291,12 @@ class ArgParser {
         default: 0,
       },
 
+      "diskUtilization": {
+        describe: "If set, save state and exit if disk utilization exceeds this percentage value",
+        type: "number",
+        default: 90,
+      },
+
       "timeLimit": {
         describe: "If set, save state and exit after time limit, in seconds",
         type: "number",
@@ -465,6 +471,11 @@ class ArgParser {
       argv.statsFilename = path.resolve(argv.cwd, argv.statsFilename);
     }
 
+    // Percentages outside 0-99 fall back to the default of 90
+    if (argv.diskUtilization < 0 || argv.diskUtilization > 99) {
+      argv.diskUtilization = 90;
+    }
+
     return true;
   }
 }
diff --git a/util/storage.js b/util/storage.js
index 3f3c15441..b5a0e46f2 100644
--- a/util/storage.js
+++ b/util/storage.js
@@ -1,5 +1,7 @@
+import child_process from "child_process";
 import fs from "fs";
 import fsp from "fs/promises";
+import util from "util";
 
 import os from "os";
 import { createHash } from "crypto";
@@ -148,6 +150,30 @@ export async function getDirSize(dir) {
   return size;
 }
 
+/**
+ * Report filesystem usage for the given path by running `df` and parsing
+ * the first data row of its output.
+ *
+ * @param {string} [path="/"] - Path whose filesystem should be inspected.
+ * @returns {Promise<Object<string, string>>} Column values keyed by the `df`
+ *   header names (checkLimits reads "Use%", "Used" and "1K-blocks").
+ */
+export async function getDiskUsage(path="/") {
+  // execFile passes path as an argument rather than through a shell
+  const execFile = util.promisify(child_process.execFile);
+  const result = await execFile("df", [path]);
+  const lines = result.stdout.split("\n");
+  const keys = lines[0].split(/\s+/);
+  const rows = lines.slice(1).map(line => {
+    const values = line.split(/\s+/);
+    return keys.reduce((o, k, index) => {
+      o[k] = values[index];
+      return o;
+    }, {});
+  });
+  return rows[0];
+}
+
 function checksumFile(hashName, path) {
   return new Promise((resolve, reject) => {
     const hash = createHash(hashName);