Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: each node pool can now have different init configs #6184

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions cluster-autoscaler/cloudprovider/hetzner/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,38 @@ The cluster autoscaler for Hetzner Cloud scales worker nodes.

`HCLOUD_IMAGE` Defaults to `ubuntu-20.04`, @see https://docs.hetzner.cloud/#images. You can also use an image ID here (e.g. `15512617`), or a label selector associated with a custom snapshot (e.g. `customized_ubuntu=true`). The most recent snapshot will be used in the latter case.

`HCLOUD_CLUSTER_CONFIG` This is the new format. It replaces the following variables, which are ignored when `HCLOUD_CLUSTER_CONFIG` is set:
* `HCLOUD_CLOUD_INIT`
* `HCLOUD_IMAGE`

Base64 encoded JSON according to the following structure:

```json
{
    "imagesForArch": { // These should be the same format as HCLOUD_IMAGE
        "arm64": "",
        "amd64": ""
    },
    "nodeConfigs": {
        "pool1": { // This equals the pool name. Required for each pool that you have
            "cloudInit": "", // Same content as HCLOUD_CLOUD_INIT, but as plain text: the whole JSON document is base64 encoded once, so do not base64 encode this value again
            "labels": {
                "node.kubernetes.io/role": "autoscaler-node"
            },
            "taints": [
                {
                    "key": "node.kubernetes.io/role",
                    "value": "autoscaler-node",
                    "effect": "NoExecute"
                }
            ]
        }
    }
}
```


`HCLOUD_NETWORK` Default empty. The name of the network that is used in the cluster, @see https://docs.hetzner.cloud/#networks

`HCLOUD_FIREWALL` Default empty. The name of the firewall that is used in the cluster, @see https://docs.hetzner.cloud/#firewalls
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -191,9 +191,12 @@ func BuildHetzner(_ config.AutoscalingOptions, do cloudprovider.NodeGroupDiscove
klog.Fatalf("Failed to create Hetzner cloud provider: %v", err)
}

if manager.clusterConfig.IsUsingNewFormat && len(manager.clusterConfig.NodeConfigs) == 0 {
klog.Fatalf("No cluster config present provider: %v", err)
}

validNodePoolName := regexp.MustCompile(`^[a-z0-9A-Z]+[a-z0-9A-Z\-\.\_]*[a-z0-9A-Z]+$|^[a-z0-9A-Z]{1}$`)
clusterUpdateLock := sync.Mutex{}

for _, nodegroupSpec := range do.NodeGroupSpecs {
spec, err := createNodePoolSpec(nodegroupSpec)
if err != nil {
Expand All @@ -206,6 +209,13 @@ func BuildHetzner(_ config.AutoscalingOptions, do cloudprovider.NodeGroupDiscove
klog.Fatalf("Failed to get servers for for node pool %s error: %v", nodegroupSpec, err)
}

if manager.clusterConfig.IsUsingNewFormat {
_, ok := manager.clusterConfig.NodeConfigs[spec.name]
if !ok {
klog.Fatalf("No node config present for node group id `%s` error: %v", spec.name, err)
}
}

manager.nodeGroups[spec.name] = &hetznerNodeGroup{
manager: manager,
id: spec.name,
Expand Down
75 changes: 63 additions & 12 deletions cluster-autoscaler/cloudprovider/hetzner/hetzner_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package hetzner
import (
"context"
"encoding/base64"
"encoding/json"
"errors"
"fmt"
"net/http"
Expand All @@ -45,8 +46,7 @@ type hetznerManager struct {
client *hcloud.Client
nodeGroups map[string]*hetznerNodeGroup
apiCallContext context.Context
cloudInit string
image string
clusterConfig *ClusterConfig
sshKey *hcloud.SSHKey
network *hcloud.Network
firewall *hcloud.Firewall
Expand All @@ -57,6 +57,33 @@ type hetznerManager struct {
cachedServers *serversCache
}

// ClusterConfig holds the configuration for all the nodepools.
//
// Only ImagesForArch and NodeConfigs are read from the decoded
// HCLOUD_CLUSTER_CONFIG JSON document. IsUsingNewFormat and LegacyConfig are
// internal state that the manager fills in from the environment, so they are
// excluded from JSON (de)serialization: otherwise a stray
// "IsUsingNewFormat": false key in the user-supplied document would silently
// switch the provider back to the legacy code paths after the new format had
// already been selected.
type ClusterConfig struct {
	// ImagesForArch selects the image to boot per server architecture.
	ImagesForArch ImageList `json:"imagesForArch"`
	// NodeConfigs maps a node pool name to the configuration of that pool.
	NodeConfigs map[string]*NodeConfig `json:"nodeConfigs"`
	// IsUsingNewFormat is true when HCLOUD_CLUSTER_CONFIG was provided.
	IsUsingNewFormat bool `json:"-"`
	// LegacyConfig carries the HCLOUD_CLOUD_INIT / HCLOUD_IMAGE settings that
	// are used when the new format is not in use.
	LegacyConfig LegacyConfig `json:"-"`
}

// ImageList holds the image id/names for the different architectures.
//
// The JSON keys are spelled out explicitly so that they match the documented
// HCLOUD_CLUSTER_CONFIG format ("arm64" / "amd64") when encoding as well as
// when decoding, instead of relying on encoding/json's case-insensitive
// field-name fallback.
type ImageList struct {
	// Arm64 is the image used for server types with an ARM architecture.
	Arm64 string `json:"arm64"`
	// Amd64 is the image used for server types with an x86 architecture.
	Amd64 string `json:"amd64"`
}

// NodeConfig holds the configuration for a single nodepool.
//
// The JSON keys are spelled out explicitly so that they match the documented
// HCLOUD_CLUSTER_CONFIG format instead of relying on encoding/json's
// case-insensitive field-name fallback.
type NodeConfig struct {
	// CloudInit is passed as user data when a server of this pool is created.
	CloudInit string `json:"cloudInit"`
	// Taints are added to the template node of this pool.
	Taints []apiv1.Taint `json:"taints"`
	// Labels are merged into the node group labels of this pool.
	Labels map[string]string `json:"labels"`
}

// LegacyConfig holds the configuration in the legacy format, i.e. the
// settings that are used when HCLOUD_CLUSTER_CONFIG is not provided.
//
// CloudInit is the decoded content of HCLOUD_CLOUD_INIT and ImageName is the
// value of HCLOUD_IMAGE (or its default).
type LegacyConfig struct {
	CloudInit, ImageName string
}

func newManager() (*hetznerManager, error) {
token := os.Getenv("HCLOUD_TOKEN")
if token == "" {
Expand All @@ -71,19 +98,44 @@ func newManager() (*hetznerManager, error) {
)

ctx := context.Background()
var err error

clusterConfigBase64 := os.Getenv("HCLOUD_CLUSTER_CONFIG")
cloudInitBase64 := os.Getenv("HCLOUD_CLOUD_INIT")
if cloudInitBase64 == "" {
return nil, errors.New("`HCLOUD_CLOUD_INIT` is not specified")

if clusterConfigBase64 == "" && cloudInitBase64 == "" {
return nil, errors.New("`HCLOUD_CLUSTER_CONFIG` or `HCLOUD_CLOUD_INIT` is not specified")
}
cloudInit, err := base64.StdEncoding.DecodeString(cloudInitBase64)
if err != nil {
return nil, fmt.Errorf("failed to parse cloud init error: %s", err)
var clusterConfig *ClusterConfig = &ClusterConfig{}

if clusterConfigBase64 != "" {
clusterConfig.IsUsingNewFormat = true
}

imageName := os.Getenv("HCLOUD_IMAGE")
if imageName == "" {
imageName = "ubuntu-20.04"
if clusterConfig.IsUsingNewFormat {
clusterConfigEnv, err := base64.StdEncoding.DecodeString(clusterConfigBase64)
if err != nil {
return nil, fmt.Errorf("failed to parse cluster config error: %s", err)
}
err = json.Unmarshal(clusterConfigEnv, &clusterConfig)
if err != nil {
return nil, fmt.Errorf("failed to unmarshal cluster config JSON: %s", err)
}
}

if !clusterConfig.IsUsingNewFormat {
cloudInit, err := base64.StdEncoding.DecodeString(cloudInitBase64)
if err != nil {
return nil, fmt.Errorf("failed to parse cloud init error: %s", err)
}

imageName := os.Getenv("HCLOUD_IMAGE")
if imageName == "" {
imageName = "ubuntu-20.04"
}

clusterConfig.LegacyConfig.CloudInit = string(cloudInit)
clusterConfig.LegacyConfig.ImageName = imageName
}

publicIPv4 := true
Expand Down Expand Up @@ -141,15 +193,14 @@ func newManager() (*hetznerManager, error) {
m := &hetznerManager{
client: client,
nodeGroups: make(map[string]*hetznerNodeGroup),
cloudInit: string(cloudInit),
image: imageName,
sshKey: sshKey,
network: network,
firewall: firewall,
createTimeout: createTimeout,
apiCallContext: ctx,
publicIPv4: publicIPv4,
publicIPv6: publicIPv6,
clusterConfig: clusterConfig,
cachedServerType: newServerTypeCache(ctx, client),
cachedServers: newServersCache(ctx, client),
}
Expand Down
49 changes: 43 additions & 6 deletions cluster-autoscaler/cloudprovider/hetzner/hetzner_node_group.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package hetzner
import (
"context"
"fmt"
"maps"
"math/rand"
"strings"
"sync"
Expand Down Expand Up @@ -241,6 +242,16 @@ func (n *hetznerNodeGroup) TemplateNodeInfo() (*schedulerframework.NodeInfo, err
}
node.Labels = cloudprovider.JoinStringMaps(node.Labels, nodeGroupLabels)

if n.manager.clusterConfig.IsUsingNewFormat && n.id != drainingNodePoolId {
for _, taint := range n.manager.clusterConfig.NodeConfigs[n.id].Taints {
node.Spec.Taints = append(node.Spec.Taints, apiv1.Taint{
Key: taint.Key,
Value: taint.Value,
Effect: taint.Effect,
})
}
}

nodeInfo := schedulerframework.NewNodeInfo(cloudprovider.BuildKubeProxy(n.id))
nodeInfo.SetNode(&node)

Expand Down Expand Up @@ -325,14 +336,23 @@ func buildNodeGroupLabels(n *hetznerNodeGroup) (map[string]string, error) {
if err != nil {
return nil, err
}
klog.V(4).Infof("Build node group label for %s", n.id)

return map[string]string{
labels := map[string]string{
apiv1.LabelInstanceType: n.instanceType,
apiv1.LabelTopologyRegion: n.region,
apiv1.LabelArchStable: archLabel,
"csi.hetzner.cloud/location": n.region,
nodeGroupLabel: n.id,
}, nil
}

if n.manager.clusterConfig.IsUsingNewFormat && n.id != drainingNodePoolId {
maps.Copy(labels, n.manager.clusterConfig.NodeConfigs[n.id].Labels)
}

klog.V(4).Infof("%s nodegroup labels: %s", n.id, labels)

return labels, nil
}

func getMachineTypeResourceList(m *hetznerManager, instanceType string) (apiv1.ResourceList, error) {
Expand Down Expand Up @@ -392,10 +412,16 @@ func createServer(n *hetznerNodeGroup) error {
return err
}

cloudInit := n.manager.clusterConfig.LegacyConfig.CloudInit

if n.manager.clusterConfig.IsUsingNewFormat {
cloudInit = n.manager.clusterConfig.NodeConfigs[n.id].CloudInit
}

StartAfterCreate := true
opts := hcloud.ServerCreateOpts{
Name: newNodeName(n),
UserData: n.manager.cloudInit,
UserData: cloudInit,
Location: &hcloud.Location{Name: n.region},
ServerType: serverType,
Image: image,
Expand Down Expand Up @@ -443,7 +469,18 @@ func createServer(n *hetznerNodeGroup) error {
// server.
func findImage(n *hetznerNodeGroup, serverType *hcloud.ServerType) (*hcloud.Image, error) {
// Select correct image based on server type architecture
image, _, err := n.manager.client.Image.GetForArchitecture(context.TODO(), n.manager.image, serverType.Architecture)
imageName := n.manager.clusterConfig.LegacyConfig.ImageName
if n.manager.clusterConfig.IsUsingNewFormat {
if serverType.Architecture == hcloud.ArchitectureARM {
imageName = n.manager.clusterConfig.ImagesForArch.Arm64
}

if serverType.Architecture == hcloud.ArchitectureX86 {
imageName = n.manager.clusterConfig.ImagesForArch.Amd64
}
}

image, _, err := n.manager.client.Image.GetForArchitecture(context.TODO(), imageName, serverType.Architecture)
if err != nil {
// Keep looking for label if image was not found by id or name
if !strings.HasPrefix(err.Error(), "image not found") {
Expand All @@ -462,12 +499,12 @@ func findImage(n *hetznerNodeGroup, serverType *hcloud.ServerType) (*hcloud.Imag
Sort: []string{"created:desc"},
Architecture: []hcloud.Architecture{serverType.Architecture},
ListOpts: hcloud.ListOpts{
LabelSelector: n.manager.image,
LabelSelector: imageName,
},
})

if err != nil || len(images) == 0 {
return nil, fmt.Errorf("unable to find image %s with architecture %s: %v", n.manager.image, serverType.Architecture, err)
return nil, fmt.Errorf("unable to find image %s with architecture %s: %v", imageName, serverType.Architecture, err)
}

return images[0], nil
Expand Down