allow for fine-tuned timing and customization of node group #13

Merged: 3 commits, Oct 15, 2023
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -14,6 +14,7 @@ and **Merged pull requests**. Critical items to know are:
The versions coincide with releases on pip. Only major versions will be released as tags on Github.

## [0.0.x](https://github.com/converged-computing/kubescaler/tree/main) (0.0.x)
- allow manual customization and timing of nodegroup (e.g., for spot) (0.0.16)
- extensive changes to aws client (thanks to @rajibhossen!) (0.0.15)
- use api client with consistent token to associate nodes to cluster (0.0.14)
- remove dependency on subprocess and kubectl (0.0.13)
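The changelog entry above corresponds to the split creation flow added in this pull request: create_cluster can now skip node creation, and create_cluster_nodes can be called (and timed) on its own with a custom list of instance types. A minimal sketch of that flow follows; the import path, class name, and constructor arguments are assumptions for illustration, not confirmed by this diff.

```python
# Sketch only: import path, class name, and constructor arguments are assumed.
from kubescaler.scaler.aws import EKSCluster

scaler = EKSCluster(name="spot-experiment", node_count=2)

# Create only the control plane (VPC stack, cluster, kube config, keypair);
# with create_nodes=False no node group is created yet.
cluster = scaler.create_cluster(create_nodes=False)

# Create the node group separately so it can be timed on its own, with
# several instance types (e.g., for a spot-friendly node group).
scaler.create_cluster_nodes(machine_types=["m5.large", "m5a.large", "m4.large"])
```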
59 changes: 46 additions & 13 deletions kubescaler/scaler/aws/cluster.py
@@ -144,7 +144,7 @@ def set_stack_failure(self, on_stack_failure):
)

@timed
def create_cluster(self):
def create_cluster(self, machine_types=None, create_nodes=True):
"""
Create a cluster.

@@ -157,6 +157,10 @@ def create_cluster(self):
that the AWS client discovers here (in token.py) to generate a token.
It's best to be consistent and use an environment set (that ideally
has a long enough expiration) OR just the $HOME/.aws/config.

machine_types is exposed to allow a custom instance type request (e.g., for spot)!
But you can also set create_nodes to False and call create_cluster_nodes yourself.
If create_nodes is False, the node group/nodes will not be created.
"""
print("🥞️ Creating VPC stack and subnets...")
self.set_vpc_stack()
@@ -184,32 +188,45 @@
self.ensure_kube_config()
self.get_keypair()

# Cut out early if we are not creating nodes
if not create_nodes:
print(
"Not creating nodes! Ensure to call create_cluster_nodes to do so and generate kubectl config."
)
return self.cluster
return self.create_cluster_nodes(machine_types)

@timed
def create_cluster_nodes(self, machine_types=None):
"""
Create cluster nodes! This is done separately in case you are doing experiments.
"""
# The cluster is actually created with no nodes - just the control plane!
# Here is where we create the workers, via a stack. Because apparently
# AWS really likes their pancakes. 🥞️
if self.eks_nodegroup:
self.set_or_create_nodegroup()
self.set_or_create_nodegroup(machine_types=machine_types)
else:
self.set_workers_stack()

# Enable the cluster autoscaler: create an OIDC provider and a cluster autoscaler role to be used by the service account
if self.enable_cluster_autoscaler:
self.set_oidc_provider()

self.create_autoscaler_role()

self.create_auth_config()

# We can only wait for the node group after we set the auth config!
# I was surprised this is expecting the workers name and not the node
# group name.
self.wait_for_nodes()

print(f"🦊️ Writing config file to {self.kube_config_file}")
print(f" Usage: kubectl --kubeconfig={self.kube_config_file} get nodes")
print(f" Usage: kubectl --kubeconfig={self.kube_config_file} get nodes")
return self.cluster

def load_cluster_info(self):
"""
Load information for a cluster with eks describe cluster.
"""
self.set_vpc_stack()
self.set_subnets()

@@ -298,7 +315,7 @@ def wait_for_nodes(self):
start = time.time()
kubectl = self.get_k8s_client()
while True:
print(f"⏱️ Waiting for {self.node_count} nodes to be Ready...")
print(f"⏱️ Waiting for {self.node_count} nodes to be Ready...")
time.sleep(5)
nodes = kubectl.list_node()
ready_count = 0
@@ -381,6 +398,9 @@ def create_auth_config(self):
try:
k8sutils.create_from_yaml(kubectl.api_client, self.auth_config_file)
except Exception as e:
# Don't print the "already exists" error - we might be re-using the config
if "already exists" not in str(e):
    print(f"😭️ Kubectl create from yaml returned an error: {e}")

def ensure_kube_config(self):
@@ -470,19 +490,24 @@ def set_workers_stack(self):
if output["OutputKey"] == "NodeAutoScalingGroup":
self.node_autoscaling_group_name = output["OutputValue"]

def set_or_create_nodegroup(self):
def set_or_create_nodegroup(self, machine_types=None):
"""
Get or create the workers stack, or the nodes for the cluster.

If the nodegroup is not created yet, you can pass a custom set of machine_types.
This is intended for the spot instance creation case.
"""
try:
self.nodegroup = self.eks.describe_nodegroup(
clusterName=self.cluster_name, nodegroupName=self.node_group_name
)
except Exception:
self.nodegroup = self.create_nodegroup()
self.nodegroup = self.create_nodegroup(machine_types=machine_types)

def set_oidc_provider(self):
"Get or Create an OIDC provider for the cluster. this will be used by cluster autoscaler Role."
"""
Get or create an OIDC provider for the cluster. This will be used by the cluster autoscaler Role.
"""
print("Setting Up the cluster OIDC Provider")
try:
self.oidc_provider_stack = self.cf.describe_stacks(
@@ -609,10 +634,17 @@ def create_workers_stack(self):
return self._create_stack(stack, self.workers_name)

@timed
def create_nodegroup(self):
def create_nodegroup(self, machine_types=None):
"""
Create the EKS Managed Node Group (the nodes for the EKS cluster)

Add additional machine types with machine_types.
"""
# Allow a custom set of 'on the fly' machine types for spot experiments
machine_types = machine_types or []
if not machine_types:
machine_types = [self.machine_type]

node_group = self.eks.create_nodegroup(
clusterName=self.cluster_name,
nodegroupName=self.node_group_name,
@@ -622,7 +654,7 @@ def create_nodegroup(self):
"desiredSize": self.node_count,
},
subnets=[str(subnet) for subnet in self.vpc_subnet_ids],
instanceTypes=[self.machine_type],
instanceTypes=machine_types,
amiType=self.ami_type,
remoteAccess={
"ec2SshKey": self.keypair_name,
@@ -1005,6 +1037,7 @@ def workers_name(self):
def node_group_name(self):
return self.cluster_name + "-worker-group"

@timed
def delete_nodegroup(self, nodegroup_name):
"""
Delete a node group and wait for it to be deleted
@@ -1016,7 +1049,7 @@ def delete_nodegroup(self, nodegroup_name):
clusterName=self.cluster_name, nodegroupName=self.node_group_name
)
except Exception:
logger.warning(f"Node Group {nodegroup_name} does not exist.")
logger.warning(f"✖️ Node Group {nodegroup_name} does not exist.")
return

try:
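For the single-call path, the same machine_types list can be passed straight to create_cluster, which forwards it through set_or_create_nodegroup to create_nodegroup, where it becomes the node group's instanceTypes (falling back to [self.machine_type] when omitted). Because delete_nodegroup is now also decorated with @timed, teardown can be measured the same way as creation. A short sketch under the same assumptions as above (class name and constructor arguments are illustrative):

```python
# Sketch only: class name and constructor arguments are assumed.
from kubescaler.scaler.aws import EKSCluster

scaler = EKSCluster(name="spot-experiment", node_count=2)

# One-shot creation: machine_types becomes the node group's instanceTypes;
# omitting it falls back to the client's configured machine_type.
scaler.create_cluster(machine_types=["m5.large", "m5a.large"])

# delete_nodegroup is now @timed as well, so creation and teardown of the
# node group can be measured with the same mechanism.
scaler.delete_nodegroup(scaler.node_group_name)
```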
2 changes: 1 addition & 1 deletion kubescaler/version.py
@@ -3,7 +3,7 @@
#
# SPDX-License-Identifier: (MIT)

__version__ = "0.0.15"
__version__ = "0.0.16"
AUTHOR = "Vanessa Sochat"
EMAIL = "[email protected]"
NAME = "kubescaler"