Skip to content

Commit

Permalink
[PLAT-15341][PLAT-15667] clock drift alerts and warnings not triggering
Browse files Browse the repository at this point in the history
Summary:
1. On-prem universes were not getting the warn threshold value. Improved
how node drift health check values are set.
2. The clock drift metric found via chronyc was returned in the wrong unit.
It is now correctly returned in milliseconds instead of seconds.

Test Plan:
Validated that alerts are triggered and that the correct threshold is used for
on-prem warnings.

Reviewers: muthu, nsingh

Reviewed By: muthu

Subscribers: yugaware

Differential Revision: https://phorge.dev.yugabyte.com/D38910
  • Loading branch information
shubin-yb committed Oct 10, 2024
1 parent 017e2a3 commit 4b20a35
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -765,18 +765,21 @@ public void checkSingleUniverse(CheckSingleUniverseParams params) {
if (!provider.getCode().equals(CloudType.onprem.toString())
&& !provider.getCode().equals(CloudType.kubernetes.toString())) {
nodeInfo.setCheckClock(true);
if (confGetter.getConfForScope(params.universe, UniverseConfKeys.healthCheckTimeDrift)) {
nodeInfo.setCheckTimeDrift(true);
nodeInfo.setTimeDriftWrnThreshold(
confGetter.getConfForScope(
params.universe, UniverseConfKeys.healthCheckTimeDriftWrnThreshold));
nodeInfo.setTimeDriftErrThreshold(
confGetter.getConfForScope(
params.universe, UniverseConfKeys.healthCheckTimeDriftErrThreshold));
nodeInfo.setClockSyncServiceRequired(
confGetter.getConfForScope(
params.universe, UniverseConfKeys.healthCheckClockSyncServiceRequired));
}
}
// Clock drift config values. Clock drift health checks are only run for non-k8s universes
// and if they are enabled.
if (!provider.getCode().equals(CloudType.kubernetes.toString())
&& confGetter.getConfForScope(params.universe, UniverseConfKeys.healthCheckTimeDrift)) {
nodeInfo.setCheckTimeDrift(true);
nodeInfo.setTimeDriftWrnThreshold(
confGetter.getConfForScope(
params.universe, UniverseConfKeys.healthCheckTimeDriftWrnThreshold));
nodeInfo.setTimeDriftErrThreshold(
confGetter.getConfForScope(
params.universe, UniverseConfKeys.healthCheckTimeDriftErrThreshold));
nodeInfo.setClockSyncServiceRequired(
confGetter.getConfForScope(
params.universe, UniverseConfKeys.healthCheckClockSyncServiceRequired));
} else {
nodeInfo.setClockSyncServiceRequired(false);
}
Expand Down Expand Up @@ -1166,10 +1169,13 @@ public static class NodeInfo {
private int masterRpcPort = 7100;
private int tserverRpcPort = 9100;
private int ysqlServerHttpPort = 13000;

// Clock and drift check values will get overridden.
private boolean checkClock = false;
private boolean checkTimeDrift = true;
private int timeDriftWrnThreshold = 250;
private int timeDriftWrnThreshold = 200;
private int timeDriftErrThreshold = 400;

private Long nodeStartTime = null;
private boolean testReadWrite = true;
private boolean testYsqlshConnectivity = true;
Expand Down
3 changes: 2 additions & 1 deletion managed/src/main/resources/health/node_health.py.template
Original file line number Diff line number Diff line change
Expand Up @@ -2126,7 +2126,8 @@ def _chrony_get_clock_drift_ms():
skew = float(skew_match.group(1))
delay = float(delay_match.group(1))
dispersion = float(dispersion_match.group(1))
return skew + dispersion + (.5 * delay)
# Main algorithm is (skew + dispersion + (.5 * delay))
return (skew + dispersion + (.5 * delay)) * 1000 # Convert seconds to milliseconds
return "Failed to get clock drift from chrony"

def _ntp_get_clock_drift_ms():
Expand Down

0 comments on commit 4b20a35

Please sign in to comment.