
Merge #115241
115241: server: deflake TestHealthCheck r=herkolategan a=stevendanna

This test manually updates metrics and then makes assertions about what health alerts are returned.

If the test server is slow to start up, then it is possible that the node's own metric update kicks in (after 10s) and interferes with these assertions.

Here we deflake the test by (1) overriding the system zone configuration to require only 1 replica, so that there are no under-replicated ranges, and (2) retrying our metric update and assertion in case a periodic stats update resets the gauge to 0 at the wrong time.
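
The retry shape is roughly the following self-contained sketch. This is illustrative only: retrySoon and fakeGauge are hypothetical stand-ins for testutils.SucceedsSoon and the store's gauge, not CockroachDB APIs.

```go
package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

// fakeGauge stands in for a store metric such as ranges.unavailable.
type fakeGauge struct{ v atomic.Int64 }

func (g *fakeGauge) Update(v int64) { g.v.Store(v) }
func (g *fakeGauge) Value() int64   { return g.v.Load() }

// retrySoon keeps calling fn until it returns nil or the timeout expires,
// mirroring the shape of testutils.SucceedsSoon.
func retrySoon(timeout time.Duration, fn func() error) error {
	deadline := time.Now().Add(timeout)
	for {
		err := fn()
		if err == nil {
			return nil
		}
		if time.Now().After(deadline) {
			return err
		}
		time.Sleep(10 * time.Millisecond)
	}
}

func main() {
	g := &fakeGauge{}

	// A background "stats update" can reset the gauge at any moment; this is
	// the race that made the original single-shot assertion flaky.
	go func() {
		for i := 0; i < 3; i++ {
			time.Sleep(25 * time.Millisecond)
			g.Update(0)
		}
	}()

	// Re-apply the update inside the retry loop, so a badly timed reset only
	// forces another attempt instead of failing the test outright.
	err := retrySoon(5*time.Second, func() error {
		g.Update(100)
		if got := g.Value(); got != 100 {
			return fmt.Errorf("expected 100, got %d", got)
		}
		return nil
	})
	fmt.Println("assertion settled without error:", err == nil)
}
```

In the real test the closure re-applies the metric update, regenerates the node status, and re-checks the resulting health alerts on every attempt.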

Fixes #115077

Release note: None

Co-authored-by: Steven Danna <[email protected]>
craig[bot] and stevendanna committed Dec 5, 2023
2 parents dde1c32 + c3a8dce commit 3771f53
Showing 1 changed file with 17 additions and 20 deletions.
37 changes: 17 additions & 20 deletions pkg/server/server_test.go
@@ -84,46 +84,43 @@ func TestHealthCheck(t *testing.T) {
     defer leaktest.AfterTest(t)()
     defer log.Scope(t).Close(t)
 
+    ctx := context.Background()
+
     cfg := zonepb.DefaultZoneConfig()
     cfg.NumReplicas = proto.Int32(1)
+    sCfg := zonepb.DefaultSystemZoneConfig()
+    sCfg.NumReplicas = proto.Int32(1)
     s := serverutils.StartServerOnly(t, base.TestServerArgs{
         Knobs: base.TestingKnobs{
             Server: &TestingKnobs{
-                DefaultZoneConfigOverride: &cfg,
+                DefaultZoneConfigOverride:       &cfg,
+                DefaultSystemZoneConfigOverride: &sCfg,
             },
         },
     })
-    defer s.Stopper().Stop(context.Background())
-
-    ctx := context.Background()
-
-    recorder := s.MetricsRecorder()
+    defer s.Stopper().Stop(ctx)
+    recorder := s.StorageLayer().MetricsRecorder()
 
     {
         summary := *recorder.GenerateNodeStatus(ctx)
         result := recorder.CheckHealth(ctx, summary)
-        if len(result.Alerts) != 0 {
-            t.Fatal(result)
-        }
-    }
-
-    store, err := s.GetStores().(*kvserver.Stores).GetStore(1)
-    if err != nil {
-        t.Fatal(err)
+        require.Equal(t, 0, len(result.Alerts), "unexpected health alerts: %v", result)
     }
 
-    store.Metrics().UnavailableRangeCount.Inc(100)
+    store, err := s.StorageLayer().GetStores().(*kvserver.Stores).GetStore(1)
+    require.NoError(t, err)
 
-    {
-        summary := *recorder.GenerateNodeStatus(ctx)
-        result := recorder.CheckHealth(ctx, summary)
+    testutils.SucceedsSoon(t, func() error {
+        store.Metrics().UnavailableRangeCount.Update(100)
+        result := recorder.CheckHealth(ctx, *recorder.GenerateNodeStatus(ctx))
         expAlerts := []statuspb.HealthAlert{
             {StoreID: 1, Category: statuspb.HealthAlert_METRICS, Description: "ranges.unavailable", Value: 100.0},
         }
         if !reflect.DeepEqual(expAlerts, result.Alerts) {
-            t.Fatalf("expected %+v, got %+v", expAlerts, result.Alerts)
+            return errors.Newf("expected %+v, got %+v", expAlerts, result.Alerts)
         }
-    }
+        return nil
+    })
 }
 
 // TestEngineTelemetry tests that the server increments a telemetry counter on
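
One design choice worth noting in the retry closure above: the metric write switches from Inc(100) to Update(100). An increment would accumulate across attempts if the closure ran more than once, breaking the exact match against Value: 100.0, whereas an absolute update pins the gauge to the expected value on every retry (and any reset by the periodic stats update is simply re-applied on the next attempt). A minimal sketch of that difference, using a hypothetical gauge type rather than the real metric implementation:

```go
package main

import "fmt"

// gauge is a hypothetical stand-in for a store metric gauge.
type gauge struct{ v int64 }

func (g *gauge) Inc(d int64)    { g.v += d }
func (g *gauge) Update(v int64) { g.v = v }

func main() {
	inc, upd := &gauge{}, &gauge{}
	for attempt := 0; attempt < 3; attempt++ {
		inc.Inc(100)    // accumulates: 100, 200, 300 across retries
		upd.Update(100) // stays at the expected value of 100
	}
	fmt.Println(inc.v, upd.v) // prints: 300 100
}
```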
