From 01e714c6e24696bdfdc3d6f6181648ef1d9e3031 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Patrick=20L=C3=BChne?=
Date: Mon, 8 Jan 2018 16:49:32 +0100
Subject: [PATCH] Provide generalized aggregation

This implements a generalized method for aggregating time-series data.
Data can be aggregated over week or month intervals with a variety of
aggregation methods to choose from.

This will be useful for providing chart views at different levels (such
as two-year periods vs. just showing the last month). Additionally, the
generalized form of aggregation can be used to smooth out graphs where
the sampling frequency changed with an update to Hubble Enterprise.

The aggregation is done by splitting the time data into subsequent,
gapless periods of time (weeks starting with Mondays or months), for
each of which the aggregated values are then computed and returned.

Aggregation methods define how to aggregate the values within individual
time periods. The following aggregation methods are supported:

- sum
- mean
- min
- max
- first (the chronologically first available value for that period)
- last
- median

Periods with incomplete data at the beginning or the end of the time
series are excluded from the aggregation.

Finally, the pull request usage chart is changed to make use of the new
aggregation facilities to reduce the granularity from daily to monthly
data for now. This might be changed when we implement detail views.

I also added several unit tests to check the aggregation methods (for
off-by-one errors in particular) as well as a short piece of
documentation on the new configuration options.
---
 docs/README.md           |   4 +-
 docs/assets/js/charts.js | 161 ++++++++++++++++++++++++++++++---------
 docs/pr-total.html       |   5 +-
 docs/pr-usage.html       |   1 +
 docs/spec/.eslintrc.json |   3 +
 docs/spec/charts.js      | 138 +++++++++++++++++++++++++++++++++++-
 6 files changed, 271 insertions(+), 41 deletions(-)

diff --git a/docs/README.md b/docs/README.md
index f4f0576a..05f02cd3 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -49,7 +49,9 @@ For details on how each kind of chart is rendered, take a look at [`charts.js`](
 | `series ` | array of strings | only include these data series and drop all others (referenced by TSV table headings) |
 | `visibleSeries ` | array of strings | only show the listed data series and hide all others initially (referenced by TSV table headings) |
 | `sliceData ` | array `[t0, t1]` | slice the data from the TSV file as if `data.slice(t0, t1)` was called |
-| `aggregate ` | weekly | if set to `weekly`, aggregate the data by week by computing the sum of the values within each week |
+| `aggregate ` | dictionary (see below) | defines how data should be aggregated (default: `undefined`, which leaves the data untouched) |
+| `aggregate.period` | `week`, `month` | specifies the range over which the data shall be aggregated |
+| `aggregate.method` | `sum`, `mean`, `min`, `max`, `first`, `last`, `median` | specifies the aggregation method; `first` and `last` select the chronologically first or last data point present in each period, respectively |
 | `showRawDataLink` | `true`, `false` | show the link to download the chart’s raw data (default: `true`) |
 
 ##### List Charts
diff --git a/docs/assets/js/charts.js b/docs/assets/js/charts.js
index efcb8edc..b351feaa 100644
--- a/docs/assets/js/charts.js
+++ b/docs/assets/js/charts.js
@@ -142,6 +142,126 @@ function createSpinner(canvas)
 	};
 }
 
+/**
+ * Aggregates time-series rows over consecutive, gapless periods of time.
+ *
+ * Every row is an object with a 'date' entry (ISO 8601 string or Date object)
+ * and one entry per data series. Periods that are incomplete at the beginning
+ * or the end of the data are left out. The input array and its rows are not
+ * modified.
+ *
+ * @param {Array} data - rows to aggregate
+ * @param {Object} aggregationConfig - 'period' is 'week' (UTC, starting on
+ *   Mondays) or 'month'; 'method' is one of 'sum', 'mean', 'median', 'first',
+ *   'last', 'min', or 'max'
+ * @returns {Array} one row per complete period, dated with the period's start
+ *   (empty if data is empty)
+ * @throws {string} if data is not an array or the period or method is unknown
+ */
+function aggregateTimeData(data, aggregationConfig)
+{
+	if (!(data instanceof Array))
+		throw 'expected data array as input';
+
+	if (data.length < 1)
+		return [];
+
+	// Turn date strings into proper date objects, working on shallow copies so that the caller's rows stay untouched
+	const rows = data.map(row => Object.assign({}, row, {'date': d3.isoParse(row['date'])}));
+
+	// Sort data, just in case it isn’t already
+	rows.sort((row1, row2) => row1['date'] - row2['date']);
+
+	const dateStart = rows[0]['date'];
+	// Ranges are exclusive, so add one more day to include the last date
+	const dateEnd = d3.utcDay.offset(rows[rows.length - 1]['date'], 1);
+
+	let period;
+
+	switch (aggregationConfig['period'])
+	{
+		case 'week':
+			period = d3.utcMonday;
+			break;
+		case 'month':
+			period = d3.utcMonth;
+			break;
+		default:
+			throw 'unknown aggregation period "' + aggregationConfig['period'] + '"';
+	}
+
+	// Don't use incomplete periods at the beginning and the end of the data
+	const t0 = period.ceil(dateStart);
+	// In d3, ranges include the start value but exclude the end value.
+	// We want to include the last period as well, so add one more period
+	const t1 = period.offset(period.floor(dateEnd), 1);
+	const periods = period.range(t0, t1);
+
+	let aggregatedData = Array();
+
+	for (let i = 0; i < periods.length - 1; i++)
+	{
+		const t0 = periods[i];
+		const t1 = periods[i + 1];
+
+		// Note that this assumes complete data in the period.
+		// Should data points be missing, aggregation methods such as the sum will lead to results that can't be
+		// compared to periods with complete data.
+		// Hence, the maintainers of the data need to ensure that the input is well-formed
+		const dates = rows.filter(row => row['date'] >= t0 && row['date'] < t1);
+
+		let row = Object();
+		row['date'] = t0;
+
+		$.each(Object.keys(rows[0]),
+			function(keyID, key)
+			{
+				// Exclude the date itself from aggregation
+				if (key === 'date')
+					return;
+
+				if (dates.length === 0)
+				{
+					row[key] = undefined;
+					return;
+				}
+
+				const accessor = (entry => entry[key]);
+
+				switch (aggregationConfig['method'])
+				{
+					case 'sum':
+						row[key] = d3.sum(dates, accessor);
+						break;
+					case 'mean':
+						row[key] = d3.mean(dates, accessor);
+						break;
+					case 'median':
+						row[key] = d3.median(dates, accessor);
+						break;
+					case 'first':
+						row[key] = dates[0][key];
+						break;
+					case 'last':
+						row[key] = dates[dates.length - 1][key];
+						break;
+					case 'min':
+						row[key] = d3.min(dates, accessor);
+						break;
+					case 'max':
+						row[key] = d3.max(dates, accessor);
+						break;
+					default:
+						throw 'unknown aggregation method "' + aggregationConfig['method'] + '"';
+				}
+			});
+
+		aggregatedData.push(row);
+	}
+
+	return aggregatedData;
+}
+
 function createHistoryChart(canvas)
 {
 	const url = $(canvas).data('url');
@@ -169,47 +289,12 @@ function createHistoryChart(canvas)
 
 		const context = canvas.getContext('2d');
 
-		if (readConfig($(canvas), 'aggregate') == 'weekly')
-		{
-			let aggregatedData = Array();
-			data.sort(
-				function(row1, row2)
-				{
-					let date1 = new Date(row1['date']);
-					let date2 = new Date(row2['date']);
-					return date1 - date2;
-				});
-
-			let currentRow = Object();
-
-			for (let i = 0; i < data.length; i++)
-			{
-				if (i % 7 == 0)
-					$.each(Object.keys(data[i]).slice(1),
-						function(keyID, key)
-						{
-							currentRow[key] = 0;
-						});
-
-				currentRow['date'] = data[i]['date'];
-
-				$.each(Object.keys(data[i]).slice(1),
-					function(keyID, key)
-					{
-						currentRow[key] += data[i][key];
-					});
-
-				if (i % 7 == 6)
-					// Store a copy of the aggregated data
-					aggregatedData.push($.extend({}, currentRow));
-			}
-
-			data = aggregatedData;
-		}
-
 		if (hasConfig($(canvas), 'sliceData'))
 			data = data.slice(readConfig($(canvas), 'sliceData')[0], readConfig($(canvas), 'sliceData')[1]);
 
+		if (hasConfig($(canvas), 'aggregate'))
+			data = aggregateTimeData(data, readConfig($(canvas), 'aggregate'));
+
 		const originalDataSeries = Object.keys(data[0]).slice(1);
 
 		const dataSeries = hasConfig($(canvas), 'series')
diff --git a/docs/pr-total.html b/docs/pr-total.html
index eb9b44a5..9cdd63fb 100644
--- a/docs/pr-total.html
+++ b/docs/pr-total.html
@@ -17,7 +17,10 @@

Pull Requests (Total, by Week)

"visibleSeries": [ "merged" ], - "aggregate": "weekly" + "aggregate": { + "period": "week", + "method": "sum" + } }'>

diff --git a/docs/pr-usage.html b/docs/pr-usage.html index 551c7bd7..035d2901 100644 --- a/docs/pr-usage.html +++ b/docs/pr-usage.html @@ -9,6 +9,7 @@

Pull Request Usage

diff --git a/docs/spec/.eslintrc.json b/docs/spec/.eslintrc.json
index 1717deec..b2ec1a08 100644
--- a/docs/spec/.eslintrc.json
+++ b/docs/spec/.eslintrc.json
@@ -6,5 +6,8 @@
 		"node": true,
 		"jasmine": true,
 		"jquery": true
+	},
+	"globals": {
+		"d3": false
 	}
 }
diff --git a/docs/spec/charts.js b/docs/spec/charts.js
index 6205cf25..87a02094 100644
--- a/docs/spec/charts.js
+++ b/docs/spec/charts.js
@@ -1,4 +1,11 @@
-/* global createChordChart, createHistoryChart, createList, createTable, createSpinner */
+/* global
+	aggregateTimeData,
+	createChordChart,
+	createHistoryChart,
+	createList,
+	createTable,
+	createSpinner,
+*/
 
 describe('global charts.js', function()
 {
@@ -62,4 +69,133 @@ describe('global charts.js', function()
 			});
 		});
 	});
+	describe('aggregation for time series', function()
+	{
+		// Generate data from startDate to endDate (both inclusive) with a generator functor
+		function generateData(startDate, endDate, generator)
+		{
+			let dates = d3.utcDay.range(d3.isoParse(startDate), d3.utcDay.offset(d3.isoParse(endDate), 1));
+			let data = Array();
+
+			for (let i = 0; i < dates.length; i++)
+				data.push({'date': dates[i], 'value': generator(i)});
+
+			return data;
+		}
+
+		// Integer range generator
+		function integerRangeGenerator(start, modulo)
+		{
+			if (modulo)
+				return (i => (start + i) % modulo);
+
+			return (i => start + i);
+		}
+
+		const dateToString = d3.utcFormat('%Y-%m-%d');
+
+		it('should aggregate over weeks correctly', function()
+		{
+			const aggregationConfig = {'period': 'week', 'method': 'max'};
+			const generator = integerRangeGenerator(0, 28);
+			// 2018-01-01 is a Monday, and 2018-09-30 is a Sunday
+			const data = generateData('2018-01-01', '2018-09-30', generator);
+			const aggregatedData = aggregateTimeData(data, aggregationConfig);
+
+			expect(aggregatedData.length).toEqual(39);
+			expect(dateToString(aggregatedData[0]['date'])).toEqual('2018-01-01');
+			expect(dateToString(aggregatedData[1]['date'])).toEqual('2018-01-08');
+			expect(dateToString(aggregatedData[2]['date'])).toEqual('2018-01-15');
+			expect(dateToString(aggregatedData[37]['date'])).toEqual('2018-09-17');
+			expect(dateToString(aggregatedData[38]['date'])).toEqual('2018-09-24');
+			expect(aggregatedData[0]['value']).toEqual(6);
+			expect(aggregatedData[1]['value']).toEqual(13);
+			expect(aggregatedData[2]['value']).toEqual(20);
+			expect(aggregatedData[4]['value']).toEqual(6);
+			expect(aggregatedData[5]['value']).toEqual(13);
+			expect(aggregatedData[36]['value']).toEqual(6);
+			expect(aggregatedData[37]['value']).toEqual(13);
+			expect(aggregatedData[38]['value']).toEqual(20);
+		});
+
+		it('should not have off-by-one errors (1)', function()
+		{
+			const aggregationConfig = {'period': 'week', 'method': 'max'};
+			const generator = integerRangeGenerator(27, 28);
+			// 2017-12-31 is a Sunday, and 2018-10-01 is a Monday
+			const data = generateData('2017-12-31', '2018-10-01', generator);
+			const aggregatedData = aggregateTimeData(data, aggregationConfig);
+
+			expect(aggregatedData.length).toEqual(39);
+			expect(dateToString(aggregatedData[0]['date'])).toEqual('2018-01-01');
+			expect(dateToString(aggregatedData[1]['date'])).toEqual('2018-01-08');
+			expect(dateToString(aggregatedData[2]['date'])).toEqual('2018-01-15');
+			expect(dateToString(aggregatedData[37]['date'])).toEqual('2018-09-17');
+			expect(dateToString(aggregatedData[38]['date'])).toEqual('2018-09-24');
+			expect(aggregatedData[0]['value']).toEqual(6);
+			expect(aggregatedData[1]['value']).toEqual(13);
+			expect(aggregatedData[2]['value']).toEqual(20);
+			expect(aggregatedData[4]['value']).toEqual(6);
+			expect(aggregatedData[5]['value']).toEqual(13);
+			expect(aggregatedData[36]['value']).toEqual(6);
+			expect(aggregatedData[37]['value']).toEqual(13);
+			expect(aggregatedData[38]['value']).toEqual(20);
+		});
+
+		it('should not have off-by-one errors (2)', function()
+		{
+			const aggregationConfig = {'period': 'week', 'method': 'max'};
+			const generator = integerRangeGenerator(1, 28);
+			// 2018-01-02 is a Tuesday, and 2018-09-29 is a Saturday
+			const data = generateData('2018-01-02', '2018-09-29', generator);
+			const aggregatedData = aggregateTimeData(data, aggregationConfig);
+
+			expect(aggregatedData.length).toEqual(37);
+			expect(dateToString(aggregatedData[0]['date'])).toEqual('2018-01-08');
+			expect(dateToString(aggregatedData[1]['date'])).toEqual('2018-01-15');
+			expect(dateToString(aggregatedData[35]['date'])).toEqual('2018-09-10');
+			expect(dateToString(aggregatedData[36]['date'])).toEqual('2018-09-17');
+			expect(aggregatedData[0]['value']).toEqual(13);
+			expect(aggregatedData[1]['value']).toEqual(20);
+			expect(aggregatedData[3]['value']).toEqual(6);
+			expect(aggregatedData[4]['value']).toEqual(13);
+			expect(aggregatedData[35]['value']).toEqual(6);
+			expect(aggregatedData[36]['value']).toEqual(13);
+		});
+
+		it('should aggregate sums correctly', function()
+		{
+			const aggregationConfig = {'period': 'week', 'method': 'sum'};
+			const generator = integerRangeGenerator(0, 10);
+			// 2018-01-01 is a Monday, and 2018-09-30 is a Sunday
+			const data = generateData('2018-01-01', '2018-09-30', generator);
+			const aggregatedData = aggregateTimeData(data, aggregationConfig);
+
+			expect(aggregatedData.length).toEqual(39);
+			expect(aggregatedData[0]['value']).toEqual(21);
+			expect(aggregatedData[1]['value']).toEqual(30);
+			expect(aggregatedData[2]['value']).toEqual(39);
+			expect(aggregatedData[36]['value']).toEqual(35);
+			expect(aggregatedData[37]['value']).toEqual(24);
+			expect(aggregatedData[38]['value']).toEqual(33);
+		});
+
+		it('should aggregate over months correctly', function()
+		{
+			const aggregationConfig = {'period': 'month', 'method': 'first'};
+			const generator = integerRangeGenerator(9, 10);
+			const data = generateData('2017-12-31', '2019-01-01', generator);
+			const aggregatedData = aggregateTimeData(data, aggregationConfig);
+
+			expect(aggregatedData.length).toEqual(12);
+			expect(dateToString(aggregatedData[0]['date'])).toEqual('2018-01-01');
+			expect(dateToString(aggregatedData[1]['date'])).toEqual('2018-02-01');
+			expect(dateToString(aggregatedData[10]['date'])).toEqual('2018-11-01');
+			expect(dateToString(aggregatedData[11]['date'])).toEqual('2018-12-01');
+			expect(aggregatedData[0]['value']).toEqual(0);
+			expect(aggregatedData[1]['value']).toEqual(1);
+			expect(aggregatedData[10]['value']).toEqual(4);
+			expect(aggregatedData[11]['value']).toEqual(4);
+		});
+	});
 });