Skip to content

Commit

Permalink
feat: use check statuses 1h/1d summary to get the best window for
Browse files Browse the repository at this point in the history
check_statuses

[skip ci]
  • Loading branch information
adityathebe committed Jan 4, 2024
1 parent 73363e0 commit d282be7
Show file tree
Hide file tree
Showing 4 changed files with 124 additions and 11 deletions.
109 changes: 107 additions & 2 deletions query/check_details.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,32 @@ import (
"github.com/samber/lo"
)

var (
const (
// Maximum number of past checks in the in-memory cache
DefaultCacheCount = 5

// Default search window
DefaultCheckQueryWindow = "1h"

// The number of data points that should be strived for
// when aggregating check statuses.
desiredNumOfCheckStatuses = 100
)

var (
// allowed list of window durations that are used when aggregating check statuses.
allowedWindows = []time.Duration{
time.Minute, // 1m
time.Minute * 5, // 5m
time.Minute * 15, // 15m
time.Minute * 30, // 30m
time.Hour, // 1h
time.Hour * 3, // 3h
time.Hour * 6, // 6h
time.Hour * 12, // 12h
time.Hour * 24, // 24h
time.Hour * 24 * 7, // 1w
}
)

type Timeseries struct {
Expand Down Expand Up @@ -138,10 +158,84 @@ func (q CheckQueryParams) GetWhereClause() (string, map[string]interface{}, erro
return strings.TrimSpace(clause), args, nil
}

func (q CheckQueryParams) ExecuteDetails(ctx context.Context) ([]Timeseries, types.Uptime, types.Latency, error) {
func getBestPartitioner(totalChecks int, rangeDuration time.Duration) time.Duration {
if totalChecks <= desiredNumOfCheckStatuses {
return 0 // No need to perform window aggregation
}

bestDelta := 100000000 // sufficiently large delta to begin with
bestWindow := allowedWindows[0]

for _, wp := range allowedWindows {
numWindows := int(rangeDuration / wp)
delta := abs(desiredNumOfCheckStatuses - numWindows)

if delta < bestDelta {
bestDelta = delta
bestWindow = wp
} else {
// as soon as we notice that the delta gets worse, we break the loop
break
}
}

numWindows := int(rangeDuration / bestWindow)
if abs(desiredNumOfCheckStatuses-totalChecks) <= abs(desiredNumOfCheckStatuses-numWindows) {
// If this best partition creates windows such that the resulting number of data points deviate more
// from the desired data points than the actual data points, then we do not aggregate.
// Example: if there are 144 checks for the duration of 6 days,
// then the best partition, 1 hour, would generate 144 data points.
// But the original data points (120) are closer to 100, so we do not aggregate.
return 0
}

return bestWindow
}

func optimalWindow(ctx context.Context, from, to time.Time) (time.Duration, error) {
var view string
timeRange := to.Sub(from)
if timeRange > time.Hour*24*21 {
view = "check_statuses_1d"
} else if timeRange > time.Hour*48 {
view = "check_statuses_1h"
} else {
return -1, nil //
}

q := fmt.Sprintf(`
SELECT
SUM(total) AS total,
MAX(created_at) AS latest,
MIN(created_at) AS earliest
FROM
%s
WHERE
created_at >= ? AND created_at <= ?;`, view)
var total *int
var latest, earliest *time.Time
if err := ctx.DB().Raw(q, from, to).Row().Scan(&total, &latest, &earliest); err != nil {
return 0, err
}
if total == nil {
return -1, nil //
}

return getBestPartitioner(*total, earliest.Sub(*latest)), nil
}

func CheckStatuses(ctx context.Context, q CheckQueryParams) ([]Timeseries, types.Uptime, types.Latency, error) {
start := q.GetStartTime().Format(time.RFC3339)
end := q.GetEndTime().Format(time.RFC3339)

// For the given ranges try to find the best window using check statuses summary
window, err := optimalWindow(ctx, *q.GetStartTime(), *q.GetEndTime())
if err != nil {
return nil, types.Uptime{}, types.Latency{}, err
} else if window >= 0 {
q.WindowDuration = window
}

query := `
With grouped_by_window AS (
SELECT
Expand Down Expand Up @@ -309,3 +403,14 @@ func parseDuration(d string, name string) (clause string, arg interface{}, err e
}
return "", nil, fmt.Errorf("start time must be a duration or RFC3339 timestamp")
}

// abs returns the absolute value of i.
// math.Abs only supports float64 and this avoids the needless type conversions
// and ugly expression.
func abs(n int) int {
if n > 0 {
return n
}

return -n
}
14 changes: 7 additions & 7 deletions tests/fixtures/dummy/check_statuses.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import (
"github.com/flanksource/duty/models"
)

func generateStatus(check models.Check, t time.Time, count int, passingMod int) []models.CheckStatus {
func generateStatus(check models.Check, t time.Time, schedule time.Duration, count int, passingMod int) []models.CheckStatus {
var statuses = []models.CheckStatus{}

for i := 0; i < count; i++ {
Expand All @@ -27,9 +27,9 @@ func generateStatus(check models.Check, t time.Time, count int, passingMod int)

func AllDummyCheckStatuses() []models.CheckStatus {
statuses := append(
generateStatus(LogisticsAPIHealthHTTPCheck, CurrentTime, 70, 5),
generateStatus(DeletedCheck, CurrentTime, 1, 1)[0],
generateStatus(DeletedCheckOld, *DeletedCheckOld.CreatedAt, 1, 1)[0],
generateStatus(LogisticsAPIHealthHTTPCheck, CurrentTime, time.Minute, 70, 5),
generateStatus(DeletedCheck, CurrentTime, time.Minute, 1, 1)[0],
generateStatus(DeletedCheckOld, *DeletedCheckOld.CreatedAt, time.Minute, 1, 1)[0],
models.CheckStatus{
CheckID: LogisticsAPIHomeHTTPCheck.ID,
Duration: 100,
Expand All @@ -46,12 +46,12 @@ func AllDummyCheckStatuses() []models.CheckStatus {
},
)

statuses = append(statuses, generateStatus(DeletedCheck1h, CurrentTime.Add(-15*time.Minute), 1, 1)[0])
statuses = append(statuses, generateStatus(DeletedCheck1h, CurrentTime.Add(-2*time.Hour), 10, 2)...)
statuses = append(statuses, generateStatus(DeletedCheck1h, CurrentTime.Add(-15*time.Minute), time.Minute, 1, 1)[0])
statuses = append(statuses, generateStatus(DeletedCheck1h, CurrentTime.Add(-2*time.Hour), time.Minute, 10, 2)...)

// Check statuses from 2022-01-01
// not dervied from current time for consistency
statuses = append(statuses, generateStatus(CartAPIHeathCheckAgent, time.Date(2022, 1, 1, 0, 0, 0, 0, time.UTC), 70, 5)...)
statuses = append(statuses, generateStatus(CartAPIHeathCheckAgent, time.Date(2022, 1, 1, 0, 0, 0, 0, time.UTC), time.Minute*5, 1440, 5)...) // 1440 check statuses spanning 5 days

return statuses
}
4 changes: 2 additions & 2 deletions tests/query_check_details_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ import (
. "github.com/onsi/gomega"
)

var _ = ginkgo.Describe("CheckDetails", ginkgo.Ordered, func() {
var _ = ginkgo.Describe("CheckDetails", ginkgo.Ordered, ginkgo.Focus, func() {
type testRecord struct {
since string
statuses int
Expand Down Expand Up @@ -48,7 +48,7 @@ var _ = ginkgo.Describe("CheckDetails", ginkgo.Ordered, func() {
err = q.Init(urlParam)
Expect(err).To(BeNil())

ts, uptime, latency, err := q.ExecuteDetails(DefaultContext)
ts, uptime, latency, err := query.CheckStatuses(DefaultContext, q)
Expect(err).To(BeNil())

Expect(len(ts)).To(Equal(td.statuses), "unexpected number of results")
Expand Down
8 changes: 8 additions & 0 deletions tests/setup/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,14 @@ func BeforeSuiteFn(args ...interface{}) context.Context {
logger.Infof("Created dummy data %v", len(dummyData.Checks))
}

if err, _ := job.AggregateCheckStatus1d(DefaultContext); err != nil {
panic(err.Error())
}

if err, _ := job.AggregateCheckStatus1h(DefaultContext); err != nil {
panic(err.Error())
}

DefaultContext := DefaultContext.WithKubernetes(fake.NewSimpleClientset(&v1.ConfigMap{
ObjectMeta: metav1.ObjectMeta{
Name: "test-cm",
Expand Down

0 comments on commit d282be7

Please sign in to comment.