From 2ff573dc51eb831aaddebcc0c949fe311ba3b8ef Mon Sep 17 00:00:00 2001 From: Rowan Manning <138944+rowanmanning@users.noreply.github.com> Date: Mon, 13 May 2024 13:48:53 +0100 Subject: [PATCH] Remove the graphite checks This is a breaking change. We're removing the various health check types that rely on reading Graphite metrics because, as of Q3 2024, we will no longer be able to read this data. For now we recommend removing these checks from your apps. The alerts aren't very commonly-used and we have better ways of monitoring uptime now. If you still need alerts based on Graphite metrics then you can do so by creating a dashboard in the grafana monorepo and adding alerts: https://github.com/Financial-Times/grafana/tree/main/terraform/dashboards See-Also: https://financialtimes.atlassian.net/browse/CPREL-1056 --- README.md | 26 +- src/checks/graphiteSpike.check.js | 175 ------ src/checks/graphiteThreshold.check.js | 144 ----- src/checks/graphiteWorking.check.js | 120 ---- src/checks/index.js | 3 - test/fixtures/config/aggregate.js | 18 +- test/fixtures/config/graphiteSpikeFixture.js | 16 - .../config/graphiteThresholdFixture.js | 16 - .../fixtures/config/graphiteWorkingFixture.js | 13 - test/fixtures/config/paywall.js | 9 +- test/graphiteSpike.check.spec.js | 242 -------- test/graphiteThreshold.check.spec.js | 541 ------------------ test/graphiteworking.spec.js | 137 ----- test/healthchecks.spec.js | 4 +- test/startup.spec.js | 5 - 15 files changed, 29 insertions(+), 1440 deletions(-) delete mode 100644 src/checks/graphiteSpike.check.js delete mode 100644 src/checks/graphiteThreshold.check.js delete mode 100644 src/checks/graphiteWorking.check.js delete mode 100644 test/fixtures/config/graphiteSpikeFixture.js delete mode 100644 test/fixtures/config/graphiteThresholdFixture.js delete mode 100644 test/fixtures/config/graphiteWorkingFixture.js delete mode 100644 test/graphiteSpike.check.spec.js delete mode 100644 test/graphiteThreshold.check.spec.js delete mode 100644 test/graphiteworking.spec.js diff --git a/README.md b/README.md index 05d721e..c0b28da 100644 --- a/README.md +++ b/README.md @@ -96,34 +96,20 @@ Reports on the status of other checks. Useful if you have a multi-region service - `'atLeastOne'` the check succeeds if at least one of its subchecks succeeds #### `graphiteSpike` -Compares current and historical graphite metrics to see if there is a spike -* `numerator`: [required] Name of main graphite metric to count (may contain wildcards) -* `divisor`: [optional] Name of graphite metric to divide by (may contain wildcards) -* `normalize`: [optional] Boolean indicating whether to normalize to adjust for difference in size between sample and baseline timescales. Default is `true` if no divisor specified, `false` otherwise. -* `samplePeriod`: [default: `'10min'`] Length of time to count metrics for a sample of current behaviour -* `baselinePeriod`: [default: `'7d'`] Length of time to count metrics for to establish baseline behaviour -* `direction`: [default: `'up'`] Direction in which to look for spikes; 'up' = sharp increase in activity, 'down' = sharp decrease in activity -* `threshold`: [default: `3`] Amount of difference between current and baseline activity which registers as a spike e.g. 5 means current activity must be 5 times greater/less than the baseline activity +> [!CAUTION] +> The `graphiteSpike` check type has been removed. If you need to create alerts based on Graphite data you must now do so in the [grafana repo](https://github.com/Financial-Times/grafana). #### `graphiteThreshold` Checks whether the value of a graphite metric has crossed a threshold -* `metric`: [required] Name of graphite metric to count (may contain wildcards) -* `threshold`: [required] Value to check the metrics against -* `samplePeriod`: [default: `'10min'`] Length of time to count metrics for a sample of current behaviour -* `isConsistentBreach`: [default: `false`] Boolan value to dictate whether all data points need to have breached the defined threshold to fail the check -* `direction`: [default: `'above'`] Direction on which to trigger the healthcheck: - - `'above'` = alert if value goes above the threshold - - `'below'` = alert if value goes below the threshold +> [!CAUTION] +> The `graphiteThreshold` check type has been removed. If you need to create alerts based on Graphite data you must now do so in the [grafana repo](https://github.com/Financial-Times/grafana). #### `graphiteWorking` -Checks if the value of a graphite metric has received data recently. - -* `metric`: [required] Name of graphite metric to count (may contain wildcards) - - Use `summarize` if the metric receives data infrequently, e.g. `summarize(next.heroku.next-article.some-infrequent-periodic-metric, '30mins', 'sum', true)` -* `time`: [default: `'-5minutes'`] Length of time to count metrics +> [!CAUTION] +> The `graphiteWorking` check type has been removed. If you need to create alerts based on Graphite data you must now do so in the [grafana repo](https://github.com/Financial-Times/grafana). #### `cloudWatchThreshold` Checks whether the value of a CloudWatch metric has crossed a threshold diff --git a/src/checks/graphiteSpike.check.js b/src/checks/graphiteSpike.check.js deleted file mode 100644 index ffefe47..0000000 --- a/src/checks/graphiteSpike.check.js +++ /dev/null @@ -1,175 +0,0 @@ -'use strict'; -/** - * Detect spikes/troughs in a given metric compared to baseline historical data. - * - * Let's say for example you want to get a notification if there's a sudden increase in the average network response time for your API. - * - * You can detect data spikes by comparing a "sample" (say, the last ten minutes of data) against a longer "baseline" (say, the last seven days of data). - * - * If the sample of the thing you're measuring is significantly larger — or significantly smaller — than the baseline, then - * it's considered to be an abberation; that is, a "spike". - * - * Sometimes the thing you want to measure is a ratio of two properties. For example: What's the amount of 5xx network responses (errors), compared - * to the amount of 2xx network responses (normal traffic). You'd expect a small percent of errors, but you probably care about error spikes. - * - * numerator: The property you want to focus on; e.g. Response times or errors. - * divisor: (optional) The property you want to compare against; e.g. Normal responses. - * samplePeriod: How much time you want to look for spikes in. Too brief, and you might miss gradual spikes. Too long, and you might get false positives. - * baselinePeriod: How long it takes to get a good average of your metric. It needs to be long enough to smooth out any past spikes. - * direction: Whether you care about sample metric increases (up) or decreases (down). - */ - -const logger = require('@dotcom-reliability-kit/logger'); -const status = require('./status'); -const Check = require('./check'); -const fetch = require('node-fetch'); -const fetchres = require('fetchres'); -const ms = require('ms'); - -const logEventPrefix = 'GRAPHITE_SPIKE_CHECK'; - -/** Detects spikes/troughs in a given metric compared to baseline historical data */ - -class GraphiteSpikeCheck extends Check { - constructor(options) { - super(options); - this.threshold = options.threshold || 3; - this.direction = options.direction || 'up'; - this.id = options.id; - this.name = options.name; - - this.samplePeriod = options.samplePeriod || '10min'; - this.baselinePeriod = options.baselinePeriod || '7d'; - this.seriesFunction = options.seriesFunction || 'sumSeries'; - this.summarizeFunction = options.summarizeFunction || 'sum'; - - this.ftGraphiteBaseUrl = 'https://graphitev2-api.ft.com/render/?'; - this.ftGraphiteKey = process.env.FT_GRAPHITE_KEY; - - if (!this.ftGraphiteKey) { - throw new Error('You must set FT_GRAPHITE_KEY environment variable'); - } - - if (!options.numerator) { - throw new Error( - `You must pass in a numerator for the "${options.name}" check - e.g., "next.heroku.article.*.express.start"` - ); - } - - this.sampleUrl = this.generateUrl( - options.numerator, - options.divisor, - this.samplePeriod - ); - this.baselineUrl = this.generateUrl( - options.numerator, - options.divisor, - this.baselinePeriod - ); - - // If there's no divisor specified we probably need to normalize sample and baseline to account for the difference in size between their time ranges - this.shouldNormalize = - typeof options.normalize !== 'undefined' - ? options.normalize - : !options.divisor; - - if (this.shouldNormalize) { - this.sampleMs = ms(this.samplePeriod); - this.baselineMs = ms(this.baselinePeriod); - } - - this.checkOutput = 'Graphite spike check has not yet run'; - } - - generateUrl(numerator, divisor, period) { - const urlBase = - this.ftGraphiteBaseUrl + `from=-${period}&format=json&target=`; - if (divisor) { - return ( - urlBase + - `divideSeries(summarize(${this.seriesFunction}(transformNull(${numerator})),"${period}","${this.summarizeFunction}",true),summarize(${this.seriesFunction}(transformNull(${divisor})),"${period}","${this.summarizeFunction}",true))` - ); - } else { - return ( - urlBase + - `summarize(${this.seriesFunction}(transformNull(${numerator})),"${period}","${this.summarizeFunction}",true)` - ); - } - } - - normalize(data) { - if (this.shouldNormalize) { - data.sample = data.sample / this.sampleMs; - data.baseline = data.baseline / this.baselineMs; - } - - return data; - } - - async tick() { - try { - const [sample, baseline] = await Promise.all([ - fetch(this.sampleUrl, { headers: { key: this.ftGraphiteKey } }).then( - fetchres.json - ), - fetch(this.baselineUrl, { headers: { key: this.ftGraphiteKey } }).then( - fetchres.json - ) - ]); - - const baselineValue = baseline[0] && baseline[0].datapoints[0][0]; - const sampleValue = sample[0] && sample[0].datapoints[0][0]; - - const data = this.normalize({ - sample: sampleValue && !Object.is(sampleValue, null) ? sampleValue : 0, - // baseline should not be allowed to be smaller than one as it is use as a divisor - baseline: - baselineValue && - !Object.is(baselineValue, null) && - !Object.is(baselineValue, 0) - ? baselineValue - : 1 - }); - - const ok = - this.direction === 'up' - ? data.sample / data.baseline < this.threshold - : data.sample / data.baseline > 1 / this.threshold; - - const details = `Direction: ${this.direction} Sample: ${data.sample} Baseline: ${data.baseline} Threshold: ${this.threshold}`; - if (ok) { - this.status = status.PASSED; - this.checkOutput = `No spike detected in graphite data. ${details}`; - } else { - this.status = status.FAILED; - this.checkOutput = `Spike detected in graphite data. ${details}`; - logger.warn({ - event: 'graphiteSpike fail', - checkId: this.id, - checkName: this.name, - threshold: this.threshold, - sampleDetails: { - url: this.sampleUrl, - rawValue: sampleValue, - normalisedValue: data.sample - }, - baselineDetails: { - url: this.baselineUrl, - rawValue: baselineValue, - normalisedValue: data.baseline - } - }); - } - } catch (err) { - logger.error( - { event: `${logEventPrefix}_ERROR`, url: this.sampleUrl }, - err - ); - this.status = status.FAILED; - this.checkOutput = - 'Graphite spike check failed to fetch data: ' + err.message; - } - } -} - -module.exports = GraphiteSpikeCheck; diff --git a/src/checks/graphiteThreshold.check.js b/src/checks/graphiteThreshold.check.js deleted file mode 100644 index 3746d74..0000000 --- a/src/checks/graphiteThreshold.check.js +++ /dev/null @@ -1,144 +0,0 @@ -'use strict'; - -const logger = require('@dotcom-reliability-kit/logger'); -const status = require('./status'); -const Check = require('./check'); -const fetch = require('node-fetch'); -const fetchres = require('fetchres'); - -const logEventPrefix = 'GRAPHITE_THRESHOLD_CHECK'; - -// Detects when the value of a metric climbs above/below a threshold value - -class GraphiteThresholdCheck extends Check { - constructor(options) { - super(options); - this.threshold = options.threshold; - this.direction = options.direction || 'above'; - - this.samplePeriod = options.samplePeriod || '10min'; - this.isConsistentBreach = options.isConsistentBreach || false; - - this.ftGraphiteBaseUrl = 'https://graphitev2-api.ft.com/render/?'; - this.ftGraphiteKey = process.env.FT_GRAPHITE_KEY; - if (!this.ftGraphiteKey) { - throw new Error('You must set FT_GRAPHITE_KEY environment variable'); - } - - if (!options.metric) { - throw new Error( - `You must pass in a metric for the "${options.name}" check - e.g., "next.heroku.article.*.express.start"` - ); - } - - this.metric = options.metric; - this.sampleUrl = this.generateUrl(options.metric, this.samplePeriod); - - this.checkOutput = 'Graphite threshold check has not yet run'; - } - - generateUrl(metric, period) { - return ( - this.ftGraphiteBaseUrl + `format=json&from=-${period}&target=` + metric - ); - } - - async tick() { - try { - const results = await fetch(this.sampleUrl, { - headers: { key: this.ftGraphiteKey } - }).then(fetchres.json); - - const simplifiedResults = results.map((result) => { - const divideSeriesRegex = - /divideSeries\(sumSeries\(.*?\),\s?sumSeries\(.*?\)\)/g; - const asPercentRegex = - /asPercent\(summarize\(sumSeries\(.*?\),.*?,.*?,.*?\),\s?summarize\(sumSeries\(.*?\),.*?,.*?,.*?\)\)/g; - - if ( - (result.target && asPercentRegex.test(result.target)) || - (result.target && divideSeriesRegex.test(result.target)) - ) { - const fetchCountPerTimeUnit = result.datapoints.map((item) => - Number(item[0]) - ); - if (fetchCountPerTimeUnit.length !== 1) { - logger.debug({ - event: 'HEALTHCHECK_LENGTH_NOT_1', - datapoints: result.datapoints - }); - } - const isFailing = - this.direction === 'above' - ? Number(fetchCountPerTimeUnit[0]) > this.threshold - : Number(fetchCountPerTimeUnit[0]) < this.threshold; - return { target: result.target, isFailing }; - } - - if (result.target && result.target.includes('summarize(sumSeries')) { - const fetchCountPerTimeUnit = result.datapoints.map((item) => - Number(item[0]) - ); - const sumUp = (previousValue, currentValue) => - previousValue + currentValue; - const talliedUp = fetchCountPerTimeUnit.reduce(sumUp); - const isFailing = - this.direction === 'above' - ? Number(talliedUp) > this.threshold - : Number(talliedUp) < this.threshold; - return { target: result.target, isFailing }; - } - - const datapointFailureStatuses = result.datapoints.map((value) => { - if (value[0] === null) { - // metric data is unavailable, we don't fail this threshold check if metric data is unavailable - // if you want a failing check for when metric data is unavailable, use graphiteWorking - logger.debug({ - event: `${logEventPrefix}_NULL_DATA`, - url: this.sampleUrl - }); - return false; - } else { - return this.direction === 'above' - ? Number(value[0]) > this.threshold - : Number(value[0]) < this.threshold; - } - }); - - const isFailing = this.isConsistentBreach - ? datapointFailureStatuses.every(Boolean) - : datapointFailureStatuses.some(Boolean); - - return { target: result.target, isFailing }; - }); - - const failed = simplifiedResults.some((result) => result.isFailing); - const failingMetrics = simplifiedResults - .filter((result) => result.isFailing) - .map((result) => result.target); - - this.status = failed ? status.FAILED : status.PASSED; - - // The metric crossed a threshold - this.checkOutput = failed - ? `In the last ${ - this.samplePeriod - }, the following metric(s) have moved ${ - this.direction - } the threshold value of ${this.threshold}: ${failingMetrics.join( - ' ' - )}` - : `No threshold error detected in graphite data for ${this.metric}.`; - } catch (err) { - logger.error( - { event: `${logEventPrefix}_ERROR`, url: this.sampleUrl }, - err - ); - this.status = status.FAILED; - this.checkOutput = - 'Graphite threshold check failed to fetch data: ' + err.message; - } - } -} - -module.exports = GraphiteThresholdCheck; diff --git a/src/checks/graphiteWorking.check.js b/src/checks/graphiteWorking.check.js deleted file mode 100644 index 6bcaa5f..0000000 --- a/src/checks/graphiteWorking.check.js +++ /dev/null @@ -1,120 +0,0 @@ -const status = require('./status'); -const Check = require('./check'); -const fetch = require('node-fetch'); -const logger = require('@dotcom-reliability-kit/logger'); -const fetchres = require('fetchres'); - -const logEventPrefix = 'GRAPHITE_WORKING_CHECK'; - -function badJSON(message) { - const err = new Error(message); - logger.error({ event: `${logEventPrefix}_BAD_JSON` }, err); - throw err; -} - -class GraphiteWorkingCheck extends Check { - constructor(options) { - options.technicalSummary = - options.technicalSummary || - 'There has been no metric data for a sustained period of time'; - options.panicGuide = - options.panicGuide || - 'Check this is up and running. Check this has been able to send metrics to Graphite (see Heroku and Splunk logs). Check Graphite has not been dropping metric data.'; - - super(options); - - this.ftGraphiteKey = process.env.FT_GRAPHITE_KEY; - if (!this.ftGraphiteKey) { - throw new Error('You must set FT_GRAPHITE_KEY environment variable'); - } - - this.metric = options.metric || options.key; - if (!this.metric) { - throw new Error( - `You must pass in a metric for the "${options.name}" check - e.g., "next.heroku.article.*.express.start"` - ); - } - - const fromTime = options.time || '-5minutes'; - this.url = encodeURI( - `https://graphitev2-api.ft.com/render/?target=${this.metric}&from=${fromTime}&format=json` - ); - - this.checkOutput = 'This check has not yet run'; - } - - async tick() { - try { - const json = await fetch(this.url, { - headers: { key: this.ftGraphiteKey } - }).then(fetchres.json); - - if (!json.length) { - badJSON('returned JSON should be an array'); - } - - if (!json[0].datapoints) { - badJSON('No datapoints property'); - } - - if (json[0].datapoints.length < 1) { - badJSON('Expected at least one datapoint'); - } - - const simplifiedResults = json.map((result) => { - let nullsForHowManySeconds; - - if (result.datapoints.every((datapoint) => datapoint[0] === null)) { - nullsForHowManySeconds = Infinity; - } else { - // This sums the number of seconds since the last non-null result at the tail of the list of metrics. - nullsForHowManySeconds = result.datapoints - .map((datapoint, index, array) => [ - datapoint[0], - index > 0 ? datapoint[1] - array[index - 1][1] : 0 - ]) - .reduce( - (xs, datapoint) => - datapoint[0] === null ? xs + datapoint[1] : 0, - 0 - ); - } - - const simplifiedResult = { - target: result.target, - nullsForHowManySeconds - }; - logger.debug( - { event: `${logEventPrefix}_NULLS_FOR_HOW_LONG` }, - simplifiedResult - ); - return simplifiedResult; - }); - - const failedResults = simplifiedResults.filter( - (r) => r.nullsForHowManySeconds >= 180 - ); - - if (failedResults.length === 0) { - this.status = status.PASSED; - this.checkOutput = `${this.metric} has data`; - } else { - this.status = status.FAILED; - this.checkOutput = failedResults - .map( - (r) => - `${r.target} has been null for ${Math.round( - r.nullsForHowManySeconds / 60 - )} minutes.` - ) - .join(' '); - } - } catch (err) { - logger.error({ event: `${logEventPrefix}_ERROR`, url: this.url }, err); - this.status = status.FAILED; - this.checkOutput = err.toString(); - } - } -} - -module.exports = GraphiteWorkingCheck; diff --git a/src/checks/index.js b/src/checks/index.js index c3e6a8d..2f55d5f 100644 --- a/src/checks/index.js +++ b/src/checks/index.js @@ -5,9 +5,6 @@ module.exports = { responseCompare: require('./responseCompare.check'), json: require('./json.check'), string: require('./string.check'), - graphiteSpike: require('./graphiteSpike.check'), - graphiteThreshold: require('./graphiteThreshold.check'), - graphiteWorking: require('./graphiteWorking.check'), cloudWatchAlarm: require('./cloudWatchAlarm.check'), cloudWatchThreshold: require('./cloudWatchThreshold.check'), fastlyKeyExpiration: require('./fastlyKeyExpiration.check') diff --git a/test/fixtures/config/aggregate.js b/test/fixtures/config/aggregate.js index 37d4453..55a19e3 100644 --- a/test/fixtures/config/aggregate.js +++ b/test/fixtures/config/aggregate.js @@ -4,23 +4,33 @@ module.exports = { description: '', checks: [ { - type: 'graphiteThreshold', + type: 'json', name: 'test1', - metric: 'next.fake.metric', + url: 'http://pretendurl.com', severity: 2, businessImpact: 'blah', technicalSummary: 'god knows', panicGuide: "Don't Panic", + checkResult: { + PASSED: 'Text if check passed', + FAILED: 'Text is check failed', + PENDING: 'This check has not yet run' + }, interval: '1s' }, { - type: 'graphiteThreshold', + type: 'json', name: 'test2', - metric: 'next.fake.metric', + url: 'http://pretendurl.com', severity: 2, businessImpact: 'blah', technicalSummary: 'god knows', panicGuide: "Don't Panic", + checkResult: { + PASSED: 'Text if check passed', + FAILED: 'Text is check failed', + PENDING: 'This check has not yet run' + }, interval: '1s' }, { diff --git a/test/fixtures/config/graphiteSpikeFixture.js b/test/fixtures/config/graphiteSpikeFixture.js deleted file mode 100644 index 5f7d508..0000000 --- a/test/fixtures/config/graphiteSpikeFixture.js +++ /dev/null @@ -1,16 +0,0 @@ -'use strict'; -module.exports = { - name: 'graphite', - descriptions: '', - checks: [ - { - type: 'graphiteSpike', - numerator: 'next.metric.200', - name: 'test', - severity: 2, - businessImpact: 'catastrophic', - technicalSummary: 'god knows', - panicGuide: "Don't Panic" - } - ] -}; diff --git a/test/fixtures/config/graphiteThresholdFixture.js b/test/fixtures/config/graphiteThresholdFixture.js deleted file mode 100644 index 60cbbf1..0000000 --- a/test/fixtures/config/graphiteThresholdFixture.js +++ /dev/null @@ -1,16 +0,0 @@ -'use strict'; -module.exports = { - name: 'graphite', - descriptions: '', - checks: [ - { - type: 'graphiteThreshold', - metric: 'next.metric.200', - name: 'test', - severity: 2, - businessImpact: 'catastrophic', - technicalSummary: 'god knows', - panicGuide: "Don't Panic" - } - ] -}; diff --git a/test/fixtures/config/graphiteWorkingFixture.js b/test/fixtures/config/graphiteWorkingFixture.js deleted file mode 100644 index eb62592..0000000 --- a/test/fixtures/config/graphiteWorkingFixture.js +++ /dev/null @@ -1,13 +0,0 @@ -module.exports = { - name: 'graphite check fixture', - description: '', - checks: [ - { - type: 'graphiteWorking', - name: 'test1', - metric: 'next.fastly.f8585BOxnGQDMbnkJoM1e.all.requests', - severity: 2, - businessImpact: 'loss of millions in pounds' - } - ] -}; diff --git a/test/fixtures/config/paywall.js b/test/fixtures/config/paywall.js index edbf424..b914495 100644 --- a/test/fixtures/config/paywall.js +++ b/test/fixtures/config/paywall.js @@ -3,13 +3,18 @@ module.exports = { description: ' ', checks: [ { - type: 'graphiteThreshold', + type: 'json', name: 'test', severity: 2, - metric: 'next.fake.metric', + url: 'http://pretendurl.com', businessImpact: 'blah', technicalSummary: 'god knows', panicGuide: "Don't Panic", + checkResult: { + PASSED: 'Text if check passed', + FAILED: 'Text is check failed', + PENDING: 'This check has not yet run' + }, interval: '1s' } ] diff --git a/test/graphiteSpike.check.spec.js b/test/graphiteSpike.check.spec.js deleted file mode 100644 index a692c12..0000000 --- a/test/graphiteSpike.check.spec.js +++ /dev/null @@ -1,242 +0,0 @@ -'use strict'; - -const expect = require('chai').expect; -const fixture = require('./fixtures/config/graphiteSpikeFixture').checks[0]; -const proxyquire = require('proxyquire').noCallThru().noPreserveCache(); -const sinon = require('sinon'); - -function getCheckConfig(conf) { - return Object.assign({}, fixture, conf || {}); -} - -let mockFetch; -let Check; - -// Mocks a pair of calls to graphite for sample and baseline data -function mockGraphite(results) { - mockFetch = sinon.stub().returns( - Promise.resolve({ - status: 200, - ok: true, - json: () => Promise.resolve([{ datapoints: [[results.shift()]] }]) - }) - ); - - Check = proxyquire('../src/checks/graphiteSpike.check', { - 'node-fetch': mockFetch - }); -} - -describe('Graphite Spike Check', function () { - let check; - - afterEach(function () { - check.stop(); - }); - - describe('service config', function () { - it('Should use the FT graphite', function (done) { - mockGraphite([2, 1]); - check = new Check( - getCheckConfig({ - normalize: false - }) - ); - check.start(); - setTimeout(() => { - expect(mockFetch.firstCall.args[0]).to.match( - /^https:\/\/graphitev2-api\.ft\.com\/render\/\?/ - ); - done(); - }); - }); - }); - - it('Should be able to report a successful check of absolute values', function (done) { - mockGraphite([2, 1]); - check = new Check( - getCheckConfig({ - normalize: false - }) - ); - check.start(); - setTimeout(() => { - expect(mockFetch.firstCall.args[0]).to.contain( - 'from=-10min&format=json&target=summarize(sumSeries(transformNull(next.metric.200)),"10min","sum",true)' - ); - expect(mockFetch.secondCall.args[0]).to.contain( - 'from=-7d&format=json&target=summarize(sumSeries(transformNull(next.metric.200)),"7d","sum",true)' - ); - expect(check.getStatus().ok).to.be.true; - done(); - }); - }); - - it('Should be able to report a successful check of percentage values', function (done) { - mockGraphite([2, 1]); - check = new Check( - getCheckConfig({ - divisor: 'metric.*' - }) - ); - check.start(); - setTimeout(() => { - expect(mockFetch.firstCall.args[0]).to.contain( - 'from=-10min&format=json&target=divideSeries(summarize(sumSeries(transformNull(next.metric.200)),"10min","sum",true),summarize(sumSeries(transformNull(metric.*)),"10min","sum",true))' - ); - expect(mockFetch.secondCall.args[0]).to.contain( - 'from=-7d&format=json&target=divideSeries(summarize(sumSeries(transformNull(next.metric.200)),"7d","sum",true),summarize(sumSeries(transformNull(metric.*)),"7d","sum",true))' - ); - expect(check.getStatus().ok).to.be.true; - done(); - }); - }); - - it('Should be possible to detect spikes', function (done) { - mockGraphite([4, 1]); - check = new Check( - getCheckConfig({ - normalize: false - }) - ); - check.start(); - setTimeout(() => { - expect(check.getStatus().ok).to.be.false; - done(); - }); - }); - - it('Should be possible to detect negative non-spikes', function (done) { - mockGraphite([1, 2]); - check = new Check( - getCheckConfig({ - direction: 'down', - normalize: false - }) - ); - check.start(); - setTimeout(() => { - expect(check.getStatus().ok).to.be.true; - done(); - }); - }); - - it('Should be possible to detect negative spikes', function (done) { - mockGraphite([1, 4]); - check = new Check( - getCheckConfig({ - direction: 'down', - normalize: false - }) - ); - check.start(); - setTimeout(() => { - expect(check.getStatus().ok).to.be.false; - done(); - }); - }); - - it('Should be possible to configure sample and baseline periods', function (done) { - mockGraphite([2, 1]); - check = new Check( - getCheckConfig({ - samplePeriod: '24h', - baselinePeriod: '2d', - divisor: 'metric.*', - normalize: false - }) - ); - check.start(); - setTimeout(() => { - expect(mockFetch.firstCall.args[0]).to.contain( - 'from=-24h&format=json&target=divideSeries(summarize(sumSeries(transformNull(next.metric.200)),"24h","sum",true),summarize(sumSeries(transformNull(metric.*)),"24h","sum",true))' - ); - expect(mockFetch.secondCall.args[0]).to.contain( - 'from=-2d&format=json&target=divideSeries(summarize(sumSeries(transformNull(next.metric.200)),"2d","sum",true),summarize(sumSeries(transformNull(metric.*)),"2d","sum",true))' - ); - done(); - }); - }); - - it('Should normalize by default when no divisor specified', function (done) { - mockGraphite([0.75, 0.5]); - check = new Check( - getCheckConfig({ - samplePeriod: '1h', - baselinePeriod: '2h', - threshold: 2 - }) - ); - check.start(); - setTimeout(() => { - // because the 0.75 should get converted to a 3 once normalized - expect(check.getStatus().ok).to.be.false; - done(); - }); - }); - - it('Should be possible to configure spike threshold', function (done) { - mockGraphite([6, 1]); - check = new Check( - getCheckConfig({ - threshold: 5, - normalize: false - }) - ); - check.start(); - setTimeout(() => { - expect(check.getStatus().ok).to.be.false; - done(); - }); - }); - - it('Should be possible to configure negative spike threshold', function (done) { - mockGraphite([1, 6]); - check = new Check( - getCheckConfig({ - direction: 'down', - threshold: 5, - normalize: false - }) - ); - check.start(); - setTimeout(() => { - expect(check.getStatus().ok).to.be.false; - done(); - }); - }); - - it('Should be possible to handle sample and baseline null values', function (done) { - mockGraphite([null, null]); - check = new Check( - getCheckConfig({ - direction: 'up', - threshold: 5, - divisor: 'metric.*', - normalize: false - }) - ); - check.start(); - setTimeout(() => { - expect(check.getStatus().ok).to.be.true; - done(); - }); - }); - - it('Should be possible to handle sample and baseline 0 values', function (done) { - mockGraphite([0, 0]); - check = new Check( - getCheckConfig({ - direction: 'up', - threshold: 5, - divisor: 'metric.*', - normalize: false - }) - ); - check.start(); - setTimeout(() => { - expect(check.getStatus().ok).to.be.true; - done(); - }); - }); -}); diff --git a/test/graphiteThreshold.check.spec.js b/test/graphiteThreshold.check.spec.js deleted file mode 100644 index da13066..0000000 --- a/test/graphiteThreshold.check.spec.js +++ /dev/null @@ -1,541 +0,0 @@ -'use strict'; - -const expect = require('chai').expect; -const fixture = require('./fixtures/config/graphiteThresholdFixture').checks[0]; -const proxyquire = require('proxyquire').noCallThru().noPreserveCache(); -const sinon = require('sinon'); - -function getCheckConfig(conf) { - return Object.assign({}, fixture, conf || {}); -} - -const mockLogger = { - error: function () {}, - info: function () {}, - debug: function () {} -}; - -let mockFetch; -let Check; - -// Mocks a pair of calls to graphite for sample and baseline data -function mockGraphite(results) { - mockFetch = sinon.stub().returns( - Promise.resolve({ - status: 200, - ok: true, - json: () => Promise.resolve(results) - }) - ); - - Check = proxyquire('../src/checks/graphiteThreshold.check', { - '@dotcom-reliability-kit/logger': mockLogger, - 'node-fetch': mockFetch - }); -} - -describe('Graphite Threshold Check', function () { - let check; - - afterEach(function () { - check.stop(); - }); - - context('Upper threshold enforced', function () { - it('Should be healthy if all datapoints below upper threshold', function (done) { - mockGraphite([ - { - datapoints: [ - [7, 1234567890], - [8, 1234567891] - ] - }, - { - datapoints: [ - [9, 1234567892], - [10, 1234567893] - ] - } - ]); - check = new Check( - getCheckConfig({ - threshold: 11 - }) - ); - check.start(); - setTimeout(() => { - expect(check.getStatus().ok).to.be.true; - done(); - }); - }); - - it('Should be healthy if any datapoints are equal to upper threshold', function (done) { - mockGraphite([ - { - datapoints: [ - [8, 1234567890], - [9, 1234567891] - ] - }, - { - datapoints: [ - [10, 1234567892], - [11, 1234567893] - ] - } - ]); - check = new Check( - getCheckConfig({ - threshold: 11 - }) - ); - check.start(); - setTimeout(() => { - expect(check.getStatus().ok).to.be.true; - done(); - }); - }); - - it('should be unhealthy if any datapoints are above upper threshold', (done) => { - mockGraphite([ - { - datapoints: [ - [8, 1234567890], - [9, 1234567891] - ] - }, - { - datapoints: [ - [10, 1234567892], - [12, 1234567893] - ] - } - ]); - check = new Check( - getCheckConfig({ - threshold: 11 - }) - ); - check.start(); - setTimeout(() => { - expect(check.getStatus().ok).to.be.false; - done(); - }); - }); - }); - - context('Lower threshold enforced', function () { - it('Should be healthy if all datapoints are above lower threshold', function (done) { - mockGraphite([ - { - datapoints: [ - [12, 1234567890], - [13, 1234567891] - ] - }, - { - datapoints: [ - [14, 1234567892], - [15, 1234567893] - ] - } - ]); - check = new Check( - getCheckConfig({ - threshold: 11, - direction: 'below' - }) - ); - check.start(); - setTimeout(() => { - expect(check.getStatus().ok).to.be.true; - done(); - }); - }); - - it('Should be healthy if any datapoints are equal to lower threshold', function (done) { - mockGraphite([ - { - datapoints: [ - [11, 1234567890], - [12, 1234567891] - ] - }, - { - datapoints: [ - [13, 1234567892], - [14, 1234567893] - ] - } - ]); - check = new Check( - getCheckConfig({ - threshold: 11, - direction: 'below' - }) - ); - check.start(); - setTimeout(() => { - expect(check.getStatus().ok).to.be.true; - done(); - }); - }); - - it('should be unhealthy if any datapoints are below lower threshold', (done) => { - mockGraphite([ - { - target: 'next.heroku.cpu.min', - datapoints: [ - [10, 1234567890], - [12, 1234567891] - ] - }, - { - target: 'next.heroku.disk.min', - datapoints: [ - [10, 1234567890], - [12, 1234567891] - ] - }, - { - target: 'next.heroku.memory.min', - datapoints: [ - [13, 1234567892], - [14, 1234567893] - ] - } - ]); - check = new Check( - getCheckConfig({ - threshold: 11, - direction: 'below' - }) - ); - check.start(); - setTimeout(() => { - expect(check.getStatus().ok).to.be.false; - expect(check.getStatus().checkOutput).to.equal( - 'In the last 10min, the following metric(s) have moved below the threshold value of 11: next.heroku.cpu.min next.heroku.disk.min' - ); - done(); - }); - }); - - it('should be healthy if any non-null datapoint is below lower threshold', (done) => { - mockGraphite([ - { - target: - 'next.heroku.es-interface.pushUpdater_1_EU.push_event_received', - datapoints: [ - [2, 1531921740], - [3, 1531921800], - [2, 1531921860], - [null, 1531921920], - [null, 1531921980] - ] - } - ]); - check = new Check( - getCheckConfig({ - threshold: 1, - direction: 'below', - samplePeriod: '5min' - }) - ); - check.start(); - setTimeout(() => { - expect(check.getStatus().ok).to.be.true; - expect(check.getStatus().checkOutput).to.equal( - 'No threshold error detected in graphite data for next.metric.200.' - ); - done(); - }); - }); - }); - - context('It handles summarize(sumSeries)', function () { - it('Should be healthy if sum above threshold', function (done) { - mockGraphite([ - { - datapoints: [ - [null, 1635125640], - [null, 1635129240], - [1, 1635132840], - [1, 1635136440], - [null, 1635140040], - [3, 1635143640] - ], - target: 'summarize(sumSeries)' - } - ]); - check = new Check( - getCheckConfig({ - threshold: 1, - direction: 'below' - }) - ); - check.start(); - setTimeout(() => { - expect(check.getStatus().ok).to.be.true; - done(); - }); - }); - - it('Should be healthy if sum below threshold', function (done) { - mockGraphite([ - { - datapoints: [ - [null, 1635125640], - [null, 1635129240], - [1, 1635132840], - [1, 1635136440], - [null, 1635140040], - [3, 1635143640] - ], - target: 'summarize(sumSeries)' - } - ]); - check = new Check( - getCheckConfig({ - threshold: 8 - }) - ); - check.start(); - setTimeout(() => { - expect(check.getStatus().ok).to.be.true; - done(); - }); - }); - - it('Should be unhealthy if sum below threshold', function (done) { - mockGraphite([ - { - datapoints: [ - [null, 1635125640], - [null, 1635129240], - [1, 1635132840], - [1, 1635136440], - [null, 1635140040], - [3, 1635143640] - ], - target: 'summarize(sumSeries)' - } - ]); - check = new Check( - getCheckConfig({ - threshold: 6, - direction: 'below' - }) - ); - check.start(); - setTimeout(() => { - expect(check.getStatus().ok).to.be.false; - done(); - }); - }); - - it('Should be unhealthy if sum above threshold', function (done) { - mockGraphite([ - { - datapoints: [ - [null, 1635125640], - [null, 1635129240], - [1, 1635132840], - [1, 1635136440], - [null, 1635140040], - [3, 1635143640] - ], - target: 'summarize(sumSeries)' - } - ]); - check = new Check( - getCheckConfig({ - threshold: 4 - }) - ); - check.start(); - setTimeout(() => { - expect(check.getStatus().ok).to.be.false; - done(); - }); - }); - }); - - context('It handles asPercent', function () { - it('Should be healthy if above threshold', function (done) { - mockGraphite([ - { - datapoints: [[8, 1635125640]], - target: `asPercent(summarize(sumSeries(failure.count), '10min', 'sum', true), summarize(sumSeries(success.count), '10min', 'sum', true))` - } - ]); - - check = new Check( - getCheckConfig({ - threshold: 1, - direction: 'below' - }) - ); - check.start(); - setTimeout(() => { - expect(check.getStatus().ok).to.be.true; - done(); - }); - }); - - it('Should be healthy if below threshold', function (done) { - mockGraphite([ - { - datapoints: [[null, 1635125640]], - target: `asPercent(summarize(sumSeries(failure.count), '10min', 'sum', true), summarize(sumSeries(success.count), '10min', 'sum', true))` - } - ]); - check = new Check( - getCheckConfig({ - threshold: 1 - }) - ); - check.start(); - setTimeout(() => { - expect(check.getStatus().ok).to.be.true; - done(); - }); - }); - - it('Should be unhealthy if above threshold', function (done) { - mockGraphite([ - { - datapoints: [[8, 1635125640]], - target: `asPercent(summarize(sumSeries(failure.count), '10min', 'sum', true),summarize(sumSeries(success.count), '10min', 'sum', true))` - } - ]); - check = new Check( - getCheckConfig({ - threshold: 5 - }) - ); - check.start(); - setTimeout(() => { - expect(check.getStatus().ok).to.be.false; - done(); - }); - }); - - it('Should be unhealthy if below threshold', function (done) { - mockGraphite([ - { - datapoints: [[null, 1635125640]], - target: `asPercent(summarize(sumSeries(failure.count), '10min', 'sum', true), summarize(sumSeries(success.count), '10min', 'sum', true))` - } - ]); - check = new Check( - getCheckConfig({ - threshold: 4, - direction: 'below' - }) - ); - check.start(); - setTimeout(() => { - expect(check.getStatus().ok).to.be.false; - done(); - }); - }); - }); - - context('It handles divideSeries', function () { - it('Should be healthy if above threshold', function (done) { - mockGraphite([ - { - datapoints: [[8, 1635125640]], - target: `divideSeries(sumSeries(error.count), sumSeries(status.*.count))` - } - ]); - - check = new Check( - getCheckConfig({ - threshold: 1, - direction: 'below' - }) - ); - check.start(); - setTimeout(() => { - expect(check.getStatus().ok).to.be.true; - done(); - }); - }); - - it('Should be healthy if below threshold', function (done) { - mockGraphite([ - { - datapoints: [[null, 1635125640]], - target: `divideSeries(sumSeries(error.count), sumSeries(status.*.count))` - } - ]); - check = new Check( - getCheckConfig({ - threshold: 1 - }) - ); - check.start(); - setTimeout(() => { - expect(check.getStatus().ok).to.be.true; - done(); - }); - }); - - it('Should be unhealthy if above threshold', function (done) { - mockGraphite([ - { - datapoints: [[8, 1635125640]], - target: `divideSeries(sumSeries(error.count),sumSeries(status.*.count))` - } - ]); - check = new Check( - getCheckConfig({ - threshold: 5 - }) - ); - check.start(); - setTimeout(() => { - expect(check.getStatus().ok).to.be.false; - done(); - }); - }); - - it('Should be unhealthy if below threshold', function (done) { - mockGraphite([ - { - datapoints: [[null, 1635125640]], - target: `divideSeries(sumSeries(error.count), sumSeries(status.*.count))` - } - ]); - check = new Check( - getCheckConfig({ - threshold: 4, - direction: 'below' - }) - ); - check.start(); - setTimeout(() => { - expect(check.getStatus().ok).to.be.false; - done(); - }); - }); - }); - - it('Should be possible to configure sample period', function (done) { - mockGraphite([{ datapoints: [] }]); - check = new Check( - getCheckConfig({ - samplePeriod: '24h' - }) - ); - check.start(); - setTimeout(() => { - expect(mockFetch.firstCall.args[0]).to.contain( - 'from=-24h&target=next.metric.200' - ); - done(); - }); - }); -}); diff --git a/test/graphiteworking.spec.js b/test/graphiteworking.spec.js deleted file mode 100644 index 9c02083..0000000 --- a/test/graphiteworking.spec.js +++ /dev/null @@ -1,137 +0,0 @@ -'use strict'; -const expect = require('chai').expect; -const sinon = require('sinon'); -const proxyquire = require('proxyquire').noCallThru().noPreserveCache(); - -describe('Graphite Working Check', function () { - const fixture = require('./fixtures/config/graphiteWorkingFixture').checks[0]; - - let GraphiteWorkingCheck; - let check; - let mockFetch; - - const goodResponse = [ - { - target: - 'summarize(next.fastly.133g5BGAc00Hv4v8t0dMry.asia.requests, "1h", "sum", true)', - datapoints: [ - [null, 1459333140], - [null, 1459333200], - [1, 1459333260], - [1, 1459333320], - [null, 1459333380], - [null, 1459333420] - ] - } - ]; - - const recentlyBadResponse = [ - { - target: - 'summarize(next.fastly.133g5BGAc00Hv4v8t0dMry.anzac.requests, "1h", "sum", true)', - datapoints: [ - [null, 1459333140], - [1, 1459333200], - [1, 1459333260], - [null, 1459333320], - [null, 1459333380], - [null, 1459333440] - ] - } - ]; - - const completelyBadResponse = [ - { - target: - 'summarize(next.fastly.133g5BGAc00Hv4v8t0dMry.anzac.requests, "1h", "sum", true)', - datapoints: [ - [null, 1459333140], - [null, 1459333200], - [null, 1459333260], - [null, 1459333320], - [null, 1459333380], - [null, 1459333440] - ] - } - ]; - - function waitFor(time) { - return new Promise((resolve) => setTimeout(resolve, time)); - } - - function setup(response) { - mockFetch = sinon.stub().returns( - Promise.resolve({ - ok: true, - json: function () { - return Promise.resolve(response); - } - }) - ); - - GraphiteWorkingCheck = proxyquire( - '../src/checks/graphiteWorking.check.js', - { 'node-fetch': mockFetch } - ); - check = new GraphiteWorkingCheck(fixture); - } - - it('Should call graphite using the given key', () => { - setup(goodResponse); - check.start(); - return waitFor(10).then(() => { - sinon.assert.called(mockFetch); - let url = mockFetch.lastCall.args[0]; - expect(url).to.contain(fixture.metric); - expect(url).to.contain('format=json'); - }); - }); - - it('Should pass if there is some data for the given key', () => { - setup(goodResponse); - check.start(); - return waitFor(10).then(() => { - expect(check.getStatus().checkOutput).to.equal( - 'next.fastly.f8585BOxnGQDMbnkJoM1e.all.requests has data' - ); - expect(check.getStatus().ok).to.be.true; - }); - }); - - it('Should fail if there is has been 2 or more minutes of missing data', () => { - setup(recentlyBadResponse); - check.start(); - return waitFor(10).then(() => { - expect(check.getStatus().ok).to.be.false; - expect(check.getStatus().checkOutput).to.equal( - 'summarize(next.fastly.133g5BGAc00Hv4v8t0dMry.anzac.requests, "1h", "sum", true) has been null for 3 minutes.' - ); - }); - }); - - it('Should fail if there is no data', () => { - setup(completelyBadResponse); - check.start(); - return waitFor(10).then(() => { - expect(check.getStatus().ok).to.be.false; - expect(check.getStatus().checkOutput).to.equal( - 'summarize(next.fastly.133g5BGAc00Hv4v8t0dMry.anzac.requests, "1h", "sum", true) has been null for Infinity minutes.' - ); - }); - }); - - //todo get the graphite api key into the CI config - doesn't seem possible right now... - describe.skip('Integration', function () { - before(() => { - GraphiteWorkingCheck = require('../src/checks/graphiteWorking.check'); - check = new GraphiteWorkingCheck(fixture); - }); - - it('Can actually call graphite', () => { - check.start(); - return waitFor(1000).then(() => { - expect(check.getStatus().ok).to.be.true; - }); - }); - }); -}); diff --git a/test/healthchecks.spec.js b/test/healthchecks.spec.js index 881b98d..aea6b3e 100644 --- a/test/healthchecks.spec.js +++ b/test/healthchecks.spec.js @@ -1,7 +1,7 @@ 'use strict'; const expect = require('chai').expect; -const GraphiteThresholdCheck = require('../src/checks/').graphiteThreshold; +const JsonCheck = require('../src/checks/').json; describe('Healthchecks', function () { let Healthchecks; @@ -29,7 +29,7 @@ describe('Healthchecks', function () { }); it('Should create new checks as described in the config', function () { - expect(healthchecks.checks[0]).to.be.an.instanceOf(GraphiteThresholdCheck); + expect(healthchecks.checks[0]).to.be.an.instanceOf(JsonCheck); }); it('Should report its status correctly', function () { diff --git a/test/startup.spec.js b/test/startup.spec.js index 467bdff..b430ff3 100644 --- a/test/startup.spec.js +++ b/test/startup.spec.js @@ -5,11 +5,6 @@ const path = require('path'); const HealthChecks = require('../src/healthchecks'); const startup = require('../src/startup'); -// We set this so that the tests don't immediately fail due to a missing environment variable. -// n-gage used to provide the environment variables locally via dotenv and the tests were written -// assuming that it'd be present. -process.env.FT_GRAPHITE_KEY = 'mock-graphite-key'; - describe('Startup', function () { it('Should read in the config dir and create new healthcheck objects', function () { const result = startup(path.resolve(__dirname, 'fixtures/config/'));