Skip to content

Commit

Permalink
[PLAT-15864] provisioning preflight checks validate clock skew
Browse files Browse the repository at this point in the history
Summary:
Preflight checks for manually provisioned nodes now also verify that the node's clock
is actually synchronized (skew within the accepted limit), instead of only checking that a time sync service is running.

Also fixed a small bug in the clock sync script's ntpd check: the awk program was double-quoted, so the shell expanded `$1` before awk could see it.

Test Plan:
Ran preflight checks on a node with each time sync service, validating
that the check now also evaluates clock skew.

Reviewers: muthu, nsingh

Reviewed By: muthu, nsingh

Subscribers: yugaware

Differential Revision: https://phorge.dev.yugabyte.com/D41105
  • Loading branch information
shubin-yb committed Jan 31, 2025
1 parent 199d59e commit 7c858c4
Show file tree
Hide file tree
Showing 8 changed files with 76 additions and 6 deletions.
28 changes: 28 additions & 0 deletions managed/devops/opscli/ybops/data/preflight_checks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,18 @@ PROMETHEUS_FREE_SPACE_MB=100
HOME_FREE_SPACE_MB=2048
VM_MAX_MAP_COUNT=262144
PYTHON_EXECUTABLES=('python3.6' 'python3' 'python3.7' 'python3.8' 'python')
PYTHON_EXECUTABLE=""
# Pick the first available Python interpreter from PYTHON_EXECUTABLES.
# Globals:
#   PYTHON_EXECUTABLES (read)  - candidate interpreter names, in priority order
#   PYTHON_EXECUTABLE  (written, exported) - set to the first candidate that
#                      resolves on this host; left untouched if none does
# Returns:
#   0 in all cases (callers must check PYTHON_EXECUTABLE for emptiness).
set_python_executable() {
  local candidate
  for candidate in "${PYTHON_EXECUTABLES[@]}"; do
    # Use the shell builtin `command -v` rather than the external which(1):
    # which is not installed on every minimal image, and when it is missing
    # every probe fails and PYTHON_EXECUTABLE silently stays empty.
    if command -v "$candidate" > /dev/null 2>&1; then
      PYTHON_EXECUTABLE="$candidate"
      export PYTHON_EXECUTABLE
      return 0
    fi
  done
  return 0
}
set_python_executable
LINUX_OS_NAME=""

set_linux_os_name() {
Expand Down Expand Up @@ -262,10 +274,20 @@ check_ntp_service() {
# Check if one of chronyd, ntp, ntpd or systemd-timesyncd is running on the node
service_regex="Active: active \(running\)"
service_check=false
skew_ms=500
for ntp_service in chronyd ntp ntpd systemd-timesyncd; do
service_status=$(systemctl status $ntp_service)
if [[ $service_status =~ $service_regex ]]; then
service_check=true
if [[ $ntp_service == "chronyd" ]]; then
chrony_tracking="$(chronyc tracking)"
skew=$(echo "${chrony_tracking}" | awk "/System time/ {print \$4}")
skew_ms=$("${PYTHON_EXECUTABLE}" -c "print(int(${skew} * 1000))")
elif [[ $ntp_service == "ntp" || $ntp_service == "ntpd" ]]; then
skew_ms=$(ntpq -p | awk '$1 ~ "^*" {print $9}')
elif [[ $ntp_service == "systemd-timesyncd" ]]; then
skew_ms=0
fi
break
fi
done
Expand All @@ -274,6 +296,12 @@ check_ntp_service() {
else
update_result_json "NTP time synchronization set up" false
fi
echo "Skew: $skew_ms ms"
if awk "BEGIN{exit !(${skew_ms} < 400)}"; then
update_result_json "ntp_skew" true
else
update_result_json "ntp_skew" false
fi
fi
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ check_clock_sync_chrony() {
}

check_clock_sync_ntpd() {
local skew=$(ntpq -p | awk "$1 ~ \"^*\" {print \$9}")
local skew=$(ntpq -p | awk '$1 ~ "^*" {print $9}')
local acceptable_skew_ms=$(${PYTHON_EXECUTABLE} -c "print(${acceptable_clock_skew_sec} * 1000)")

if [[ -z "$skew" ]]; then
Expand Down
29 changes: 29 additions & 0 deletions managed/node-agent/resources/preflight_check.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,20 @@ ysql_server_http_port="13000"
ysql_server_rpc_port="5433"
node_exporter_port="9300"

PYTHON_EXECUTABLES=('python3.6' 'python3' 'python3.7' 'python3.8' 'python')
PYTHON_EXECUTABLE=""
# Pick the first available Python interpreter from PYTHON_EXECUTABLES.
# Globals:
#   PYTHON_EXECUTABLES (read)  - candidate interpreter names, in priority order
#   PYTHON_EXECUTABLE  (written, exported) - set to the first candidate that
#                      resolves on this host; left untouched if none does
# Returns:
#   0 in all cases (callers must check PYTHON_EXECUTABLE for emptiness).
set_python_executable() {
  local candidate
  for candidate in "${PYTHON_EXECUTABLES[@]}"; do
    # Use the shell builtin `command -v` rather than the external which(1):
    # which is not installed on every minimal image, and when it is missing
    # every probe fails and PYTHON_EXECUTABLE silently stays empty.
    if command -v "$candidate" > /dev/null 2>&1; then
      PYTHON_EXECUTABLE="$candidate"
      export PYTHON_EXECUTABLE
      return 0
    fi
  done
  return 0
}
set_python_executable

preflight_provision_check() {
# Check for internet access.
if [[ "$airgap" = false ]]; then
Expand Down Expand Up @@ -147,10 +161,20 @@ check_ntp_synchronization() {
# Check if one of chronyd, ntp, ntpd or systemd-timesyncd is running on the node
service_regex="Active: active \(running\)"
service_check=false
skew_ms=500
for ntp_service in chronyd ntp ntpd systemd-timesyncd; do
service_status=$(systemctl status $ntp_service)
if [[ $service_status =~ $service_regex ]]; then
service_check=true
if [[ $ntp_service == "chronyd" ]]; then
chrony_tracking="$(chronyc tracking)"
skew=$(echo "${chrony_tracking}" | awk "/System time/ {print \$4}")
skew_ms=$("${PYTHON_EXECUTABLE}" -c "print(int(${skew} * 1000))")
elif [[ $ntp_service == "ntp" || $ntp_service == "ntpd" ]]; then
skew_ms=$(ntpq -p | awk '$1 ~ "^*" {print $9}')
elif [[ $ntp_service == "systemd-timesyncd" ]]; then
skew_ms=0
fi
break
fi
done
Expand All @@ -159,6 +183,11 @@ check_ntp_synchronization() {
else
update_result_json "ntp_service_status" false
fi
if [[ $skew_ms -lt 400 ]]; then
update_result_json "ntp_skew" true
else
update_result_json "ntp_skew" false
fi
fi
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,11 @@
import java.util.UUID;
import java.util.stream.Collectors;
import javax.inject.Inject;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import play.libs.Json;

@Slf4j
public class PrecheckNodeDetached extends AbstractTaskBase {

private final NodeManager nodeManager;
Expand Down Expand Up @@ -71,12 +73,16 @@ public static void processPreflightResponse(
.collect(Collectors.toList());
if (failedChecks.size() > 0) {
response.code = 1;
List<String> failedCheckNames =
failedChecks.stream().map(v -> v.getType().toString()).collect(Collectors.toList());
log.error("Node {} has failed preflight checks: {}", nodeName, failedCheckNames);
response.message = Json.toJson(failedChecks).toPrettyString();
}
} catch (IllegalArgumentException e) {
for (JsonNode node : responseJson) {
if (!node.isBoolean() || !node.asBoolean()) {
// If a check failed, change the return code so processShellResponse errors.
log.error("Node {} has failed preflight checks", nodeName, e);
response.code = 1;
break;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,9 @@ public enum Type {

YSQL_SERVER_RPC_PORT("YSQL server rpc port is open"),

VM_MAX_MAP_COUNT("VM max memory map count");
VM_MAX_MAP_COUNT("VM max memory map count"),

NTP_SKEW("NTP time skew in acceptable range");

private final String description;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,7 @@ private boolean isNodeConfigValid(ValidationData input) {
case GSUTIL:
case S3CMD:
case YB_HOME_DIR_CLEAN:
case NTP_SKEW:
case DATA_DIR_CLEAN:
{
return Boolean.parseBoolean(nodeConfig.getValue());
Expand Down
6 changes: 4 additions & 2 deletions managed/src/main/resources/swagger-strict.json
Original file line number Diff line number Diff line change
Expand Up @@ -11962,7 +11962,8 @@
"YCQL_SERVER_RPC_PORT",
"YSQL_SERVER_HTTP_PORT",
"YSQL_SERVER_RPC_PORT",
"VM_MAX_MAP_COUNT"
"VM_MAX_MAP_COUNT",
"NTP_SKEW"
],
"type" : "string"
},
Expand Down Expand Up @@ -26720,7 +26721,8 @@
"YCQL_SERVER_RPC_PORT",
"YSQL_SERVER_HTTP_PORT",
"YSQL_SERVER_RPC_PORT",
"VM_MAX_MAP_COUNT"
"VM_MAX_MAP_COUNT",
"NTP_SKEW"
],
"type" : "string"
},
Expand Down
6 changes: 4 additions & 2 deletions managed/src/main/resources/swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -12012,7 +12012,8 @@
"YCQL_SERVER_RPC_PORT",
"YSQL_SERVER_HTTP_PORT",
"YSQL_SERVER_RPC_PORT",
"VM_MAX_MAP_COUNT"
"VM_MAX_MAP_COUNT",
"NTP_SKEW"
],
"type" : "string"
},
Expand Down Expand Up @@ -26887,7 +26888,8 @@
"YCQL_SERVER_RPC_PORT",
"YSQL_SERVER_HTTP_PORT",
"YSQL_SERVER_RPC_PORT",
"VM_MAX_MAP_COUNT"
"VM_MAX_MAP_COUNT",
"NTP_SKEW"
],
"type" : "string"
},
Expand Down

0 comments on commit 7c858c4

Please sign in to comment.