Skip to content

Commit

Permalink
added check for node age and basic unit test for the same
Browse files Browse the repository at this point in the history
  • Loading branch information
leonkuperman committed Jun 20, 2024
1 parent 7232a99 commit 4c7963f
Show file tree
Hide file tree
Showing 5 changed files with 64 additions and 13 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
venv
__pycache__/
8 changes: 8 additions & 0 deletions src/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,17 @@
"app.kubernetes.io/name=castai-agent,app.kubernetes.io/name=castai-cluster-controller"
).split(",")

# check an environment variable for startup sleep time, default 20 seconds
STARTUP_SLEEP_TIME = int(os.getenv("STARTUP_SLEEP_TIME", 20))
DELAY_WAIT_PENDING_PODS = int(os.getenv("DELAY_WAIT_PENDING_PODS", 20))

# Define the minimum number of ready nodes required before draining critical nodes
MIN_READY_NODES: int = int(os.getenv("MIN_READY_NODES", 1))

CRON_JOB_POD_SUBSTRING = os.getenv("CRON_JOB_PREFIX", "castai-node-drainer")

MIN_NODE_AGE_DAYS: int = int(os.getenv("MIN_NODE_AGE_DAYS", 7))

def load_config() -> None:
logging.info("Loading Kubernetes configuration...")
try:
Expand Down
24 changes: 11 additions & 13 deletions src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,8 @@ def process_node(v1: CoreV1Api, node_name: str) -> None:
k8s_events.create_kubernetes_event(v1, "Node", node_name, "default", "CastNodeRotation", "Node cordon init", "castai-agent")
node_utils.cordon_node(v1, node_name)

# The check_controller_replicas function is used to identify if all replicas of a controller
# are running on a single node. It returns the controller’s kind, name, namespace, and the
# list of pods if such a controller is found. If no such controller is found, it returns None values.
# This function is useful for scenarios where you need to ensure that controller replicas are
# distributed across different nodes to avoid single points of failure.
while True:
kind, name, namespace, controller_pods = pod_utils.check_controller_replicas(v1, node_name)
kind, name, namespace, controller_pods = pod_utils.check_controller_replicas(v1, node_name) #check if any controller is "all in" on the node
if kind and name and namespace and controller_pods:
# we want to evict the first pod in the list of controller_pods (not all of them)
pod = controller_pods[0]
Expand All @@ -53,9 +48,9 @@ def main() -> None:
logging.info("Starting node rotator...")
logging.info("************************************************")

# check an environment variable for startup sleep time, default 20 seconds
startup_sleep_time = int(os.getenv("STARTUP_SLEEP_TIME", 20))
delay_wait_pending_pods = int(os.getenv("DELAY_WAIT_PENDING_PODS", 20))
startup_sleep_time = config.STARTUP_SLEEP_TIME
delay_wait_pending_pods = config.DELAY_WAIT_PENDING_PODS
cron_job_pod_substring = config.CRON_JOB_POD_SUBSTRING

logging.info(f"Sleeping for {startup_sleep_time} seconds before starting node rotation.")
time.sleep(startup_sleep_time)
Expand All @@ -64,7 +59,6 @@ def main() -> None:
v1 = CoreV1Api()

# Get the node name for the running cron job pod
cron_job_pod_substring = "castai-node-drainer" # Replace with the desired substring
cron_job_node_name = node_utils.get_node_for_running_pod(v1, cron_job_pod_substring)
logging.info(f" cronjob node {cron_job_node_name}")

Expand All @@ -76,10 +70,14 @@ def main() -> None:

# Separate critical and non-critical nodes
for node_name in original_nodes:
if node_utils.is_node_running_critical_pods(v1, node_name):
critical_nodes.append(node_name)
node = v1.read_node(node_name)
if node_utils.is_node_older_than(node, config.MIN_NODE_AGE_DAYS):
if node_utils.is_node_running_critical_pods(v1, node_name):
critical_nodes.append(node_name)
else:
non_critical_nodes.append(node_name)
else:
non_critical_nodes.append(node_name)
logging.info(f"Node {node_name} is not older then {config.MIN_NODE_AGE_DAYS}. Skipping.")

# Remove the cron job node from critical and non-critical node lists, as the cron job node should be processed last
critical_nodes, non_critical_nodes = node_utils.remove_cron_job_node(cron_job_node_name, critical_nodes, non_critical_nodes)
Expand Down
9 changes: 9 additions & 0 deletions src/node_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import time
from datetime import datetime, timezone, timedelta
import os
import logging
import subprocess
Expand Down Expand Up @@ -81,3 +82,11 @@ def wait_for_new_nodes(v1: CoreV1Api, original_nodes: List[str]) -> List[str]:
logging.info(f"Currently {len(ready_new_nodes)} new ready nodes. Waiting for new nodes to be ready...")
total_wait_cycles -= 1 # decrement the total_wait_cycles
time.sleep(10)

def is_node_older_than(node: V1Node, days: int) -> bool:
creation_timestamp = node.metadata.creation_timestamp
age = datetime.now(timezone.utc) - creation_timestamp
logging.error(f"Node age: {age}")
logging.error(f"delta days: {timedelta(days=days)}")
logging.error(f"age > timedelta(days=days + 1): {age > timedelta(days=days +1)}")
return age > timedelta(days=days + 1)
35 changes: 35 additions & 0 deletions tests/test_node_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import sys, os
import unittest
from datetime import datetime, timedelta, timezone
from kubernetes.client import V1Node, V1ObjectMeta

# Add src directory to the system path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../src')))

from node_utils import is_node_older_than

class TestNodeUtils(unittest.TestCase):

def test_is_node_older_than_true(self):
# Create a node with a creation timestamp older than 7 days
creation_timestamp = datetime.now(timezone.utc) - timedelta(days=8)
node = V1Node(metadata=V1ObjectMeta(creation_timestamp=creation_timestamp))
result = is_node_older_than(node, 7)
self.assertTrue(result, "Node older than 7 days should return True")

def test_is_node_older_than_false(self):
# Create a node with a creation timestamp less than 7 days
creation_timestamp = datetime.now(timezone.utc) - timedelta(days=6)
node = V1Node(metadata=V1ObjectMeta(creation_timestamp=creation_timestamp))
result = is_node_older_than(node, 7)
self.assertFalse(result, "Node younger than 7 days should return False")

def test_is_node_older_than_exact(self):
# Create a node with a creation timestamp exactly 7 days old
creation_timestamp = datetime.now(timezone.utc) - timedelta(days=7)
node = V1Node(metadata=V1ObjectMeta(creation_timestamp=creation_timestamp))
result = is_node_older_than(node, 7)
self.assertFalse(result, "Node exactly 7 days old should return False")

if __name__ == '__main__':
unittest.main()

0 comments on commit 4c7963f

Please sign in to comment.