Skip to content

Commit

Permalink
Extend node optimizations with kernel parameter tuning
Browse files Browse the repository at this point in the history
Additional kernel parameter tuning is added to our tuning suite.
It allows users to provide their own custom values, to apply the
recommended settings directly from the Scylla image, and to scale
some of the values by the number of Scylla instances expected to run
on the same host.

The NodeConfig API was extended with a KernelParameters field that allows
configuring or disabling the above logic.

Fixes scylladb#868
  • Loading branch information
zimnx committed Jan 24, 2022
1 parent a510330 commit fed874f
Show file tree
Hide file tree
Showing 8 changed files with 323 additions and 12 deletions.
25 changes: 25 additions & 0 deletions pkg/api/scylla/v1alpha1/types_nodeconfig.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,28 @@ type NodeConfigPlacement struct {
NodeSelector map[string]string `json:"nodeSelector"`
}

// KernelParametersSpec describes how kernel parameters (sysctls) are tuned on the Nodes selected by a NodeConfig.
type KernelParametersSpec struct {
// disableScyllaImageSettings disables applying the recommended kernel parameter settings shipped in the Scylla image
// to tuned hosts. Setting it on an already optimized host does not revert the values to their initial settings.
DisableScyllaImageSettings bool `json:"disableScyllaImageSettings"`

// nodeMultitenancy is the number of Scylla Pods each Node is expected to host.
// It is the multiplier applied to the kernel parameters listed in tenantScalableKeys.
// +kubebuilder:default:=1
NodeMultitenancy int64 `json:"nodeMultitenancy"`

// tenantScalableKeys is a list of kernel parameter names whose values are multiplied by nodeMultitenancy.
// Each key must have a value coming from either customKeyValues or the Scylla image settings.
// +optional
TenantScalableKeys []string `json:"tenantScalableKeys"`

// customKeyValues is a list of additional, user-provided kernel parameters.
// Every item must use the "key=value" format, where 'key' is the kernel parameter name and
// 'value' is the value the parameter is going to be set to.
// +optional
CustomKeyValues []string `json:"customKeyValues"`
}

type NodeConfigSpec struct {
// placement contains scheduling rules for NodeConfig Pods.
// +kubebuilder:validation:Required
Expand All @@ -85,6 +107,9 @@ type NodeConfigSpec struct {
// are going to be optimized. Turning off optimizations on already optimized
// Nodes does not revert changes.
DisableOptimizations bool `json:"disableOptimizations"`

// kernelParameters contains settings of kernel parameter tuning.
KernelParameters KernelParametersSpec `json:"kernelParameters"`
}

// +kubebuilder:object:root=true
Expand Down
13 changes: 13 additions & 0 deletions pkg/cmd/operator/nodeconfigdaemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,11 @@ type NodeConfigDaemonOptions struct {
ScyllaImage string
DisableOptimizations bool

DisableScyllaImageSettings bool
NodeMultitenancy int64
TenantScalableKeys []string
CustomKeyValues []string

CRIEndpoints []string

kubeClient kubernetes.Interface
Expand Down Expand Up @@ -102,6 +107,10 @@ func NewNodeConfigCmd(streams genericclioptions.IOStreams) *cobra.Command {
cmd.Flags().StringVarP(&o.ScyllaImage, "scylla-image", "", o.ScyllaImage, "Scylla image used for running perftune.")
cmd.Flags().BoolVarP(&o.DisableOptimizations, "disable-optimizations", "", o.DisableOptimizations, "Controls if optimizations are disabled")
cmd.Flags().StringArrayVarP(&o.CRIEndpoints, "cri-endpoint", "", o.CRIEndpoints, "CRI endpoint to connect to. It will try to connect to any of them, in the given order.")
cmd.Flags().BoolVarP(&o.DisableScyllaImageSettings, "disable-scylla-image-settings", "", o.DisableScyllaImageSettings, "Controls if recommended settings available in Scylla image are disabled.")
cmd.Flags().Int64VarP(&o.NodeMultitenancy, "node-multitenancy", "", o.NodeMultitenancy, "Controls how many Scylla Pods are expected to be hosted on this Node. Use together with tenant-scalable-keys.")
cmd.Flags().StringSliceVarP(&o.TenantScalableKeys, "tenant-scalable-keys", "", o.TenantScalableKeys, "List of kernel parameter keys scalable with provided multitenancy.")
cmd.Flags().StringSliceVarP(&o.CustomKeyValues, "custom-key-values", "", o.CustomKeyValues, "List of custom kernel parameters key-values.")

return cmd
}
Expand Down Expand Up @@ -220,6 +229,10 @@ func (o *NodeConfigDaemonOptions) Run(streams genericclioptions.IOStreams, cmd *
o.NodeConfigName,
types.UID(o.NodeConfigUID),
o.ScyllaImage,
o.DisableScyllaImageSettings,
o.NodeMultitenancy,
o.TenantScalableKeys,
o.CustomKeyValues,
)
if err != nil {
return fmt.Errorf("can't create node config instance controller: %w", err)
Expand Down
69 changes: 59 additions & 10 deletions pkg/controller/nodeconfig/resource.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ package nodeconfig

import (
"fmt"
"strings"

scyllav1alpha1 "github.com/scylladb/scylla-operator/pkg/api/scylla/v1alpha1"
"github.com/scylladb/scylla-operator/pkg/naming"
Expand Down Expand Up @@ -126,6 +127,48 @@ func makeNodeConfigDaemonSet(nc *scyllav1alpha1.NodeConfig, operatorImage, scyll
naming.NodeConfigNameLabel: nc.Name,
}

var initContainers []corev1.Container
volumes := []corev1.Volume{
makeHostDirVolume("hostfs", "/"),
}
volumeMounts := []corev1.VolumeMount{
makeVolumeMount("hostfs", "/host", false),
}

if !nc.Spec.KernelParameters.DisableScyllaImageSettings {
initContainers = []corev1.Container{
{
Name: naming.SysctlInitContainerName,
Image: scyllaImage,
ImagePullPolicy: corev1.PullIfNotPresent,
Command: []string{
"/usr/bin/bash",
"-euExo",
"pipefail",
"-c",
},
Args: []string{
fmt.Sprintf("cp /usr/lib/sysctl.d/*.conf %s", naming.ScyllaSysctlsDirName),
},
VolumeMounts: []corev1.VolumeMount{{
Name: "scylla-sysctls",
MountPath: naming.ScyllaSysctlsDirName,
}},
},
}

volumes = append(volumes, corev1.Volume{
Name: "scylla-sysctls",
VolumeSource: corev1.VolumeSource{
EmptyDir: &corev1.EmptyDirVolumeSource{},
},
})
volumeMounts = append(volumeMounts, corev1.VolumeMount{
Name: "scylla-sysctls",
MountPath: naming.ScyllaSysctlsDirName,
})
}

return &appsv1.DaemonSet{
ObjectMeta: metav1.ObjectMeta{
Name: nc.Name,
Expand All @@ -146,13 +189,12 @@ func makeNodeConfigDaemonSet(nc *scyllav1alpha1.NodeConfig, operatorImage, scyll
Spec: corev1.PodSpec{
ServiceAccountName: naming.NodeConfigAppName,
// Required for getting the right iface name to tune
HostNetwork: true,
NodeSelector: nc.Spec.Placement.NodeSelector,
Affinity: &nc.Spec.Placement.Affinity,
Tolerations: nc.Spec.Placement.Tolerations,
Volumes: []corev1.Volume{
makeHostDirVolume("hostfs", "/"),
},
HostNetwork: true,
NodeSelector: nc.Spec.Placement.NodeSelector,
Affinity: &nc.Spec.Placement.Affinity,
Tolerations: nc.Spec.Placement.Tolerations,
Volumes: volumes,
InitContainers: initContainers,
Containers: []corev1.Container{
{
Name: naming.NodeConfigAppName,
Expand All @@ -175,6 +217,11 @@ for f in $( find /host -mindepth 1 -maxdepth 1 -type d -printf '%f\n' ); do
mount --rbind "/host/${f}" "./${f}"
done
for f in $( find /mnt -mindepth 1 -maxdepth 1 -type d -printf '%f\n' ); do
mkdir -p "./mnt/${f}"
mount --rbind "/mnt/${f}" "./mnt/${f}"
done
for f in $( find /host -mindepth 1 -maxdepth 1 -type f -printf '%f\n' ); do
touch "./${f}"
mount --bind "/host/${f}" "./${f}"
Expand Down Expand Up @@ -224,6 +271,10 @@ exec chroot ./ /scylla-operator/scylla-operator node-config-daemon \
--node-config-uid=` + fmt.Sprintf("%q", nc.UID) + ` \
--scylla-image=` + fmt.Sprintf("%q", scyllaImage) + ` \
--disable-optimizations=` + fmt.Sprintf("%t", nc.Spec.DisableOptimizations) + ` \
--disable-scylla-image-settings=` + fmt.Sprintf("%t", nc.Spec.KernelParameters.DisableScyllaImageSettings) + ` \
--node-multitenancy=` + fmt.Sprintf("%d", nc.Spec.KernelParameters.NodeMultitenancy) + ` \
--tenant-scalable-keys=` + fmt.Sprintf("%q", strings.Join(nc.Spec.KernelParameters.TenantScalableKeys, ",")) + ` \
--custom-key-values=` + fmt.Sprintf("%q", strings.Join(nc.Spec.KernelParameters.CustomKeyValues, ",")) + ` \
--loglevel=` + fmt.Sprintf("%d", 4) + `
`,
},
Expand Down Expand Up @@ -265,9 +316,7 @@ exec chroot ./ /scylla-operator/scylla-operator node-config-daemon \
SecurityContext: &corev1.SecurityContext{
Privileged: pointer.BoolPtr(true),
},
VolumeMounts: []corev1.VolumeMount{
makeVolumeMount("hostfs", "/host", false),
},
VolumeMounts: volumeMounts,
},
},
},
Expand Down
14 changes: 14 additions & 0 deletions pkg/controller/nodeconfigdaemon/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,11 @@ type Controller struct {
nodeConfigUID types.UID
scyllaImage string

disableScyllaImageSettings bool
nodeMultitenancy int64
tenantScalableKeys []string
customKeyValues []string

cachesToSync []cache.InformerSynced

eventRecorder record.EventRecorder
Expand All @@ -98,6 +103,10 @@ func NewController(
nodeConfigName string,
nodeConfigUID types.UID,
scyllaImage string,
disableScyllaImageSettings bool,
nodeMultitenancy int64,
tenantScalableKeys []string,
customKeyValues []string,
) (*Controller, error) {
eventBroadcaster := record.NewBroadcaster()
eventBroadcaster.StartStructuredLogging(0)
Expand Down Expand Up @@ -132,6 +141,11 @@ func NewController(
nodeConfigUID: nodeConfigUID,
scyllaImage: scyllaImage,

disableScyllaImageSettings: disableScyllaImageSettings,
nodeMultitenancy: nodeMultitenancy,
tenantScalableKeys: tenantScalableKeys,
customKeyValues: customKeyValues,

cachesToSync: []cache.InformerSynced{
nodeConfigInformer.Informer().HasSynced,
localScyllaPodsInformer.Informer().HasSynced,
Expand Down
24 changes: 23 additions & 1 deletion pkg/controller/nodeconfigdaemon/resource.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"math"
"os"
"path"
"strings"

"github.com/scylladb/scylla-operator/pkg/naming"
batchv1 "k8s.io/api/batch/v1"
Expand All @@ -18,7 +19,7 @@ import (

// TODO: set anti affinities so config jobs don't run on the same node at the same time

func makePerftuneJobForNode(controllerRef *metav1.OwnerReference, namespace, nodeConfigName, nodeName string, nodeUID types.UID, image string, podSpec *corev1.PodSpec) *batchv1.Job {
func makePerftuneJobForNode(controllerRef *metav1.OwnerReference, namespace, nodeConfigName, nodeName string, nodeUID types.UID, image string, podSpec *corev1.PodSpec, sysctls []string) *batchv1.Job {
podSpec = podSpec.DeepCopy()

args := []string{
Expand Down Expand Up @@ -97,6 +98,27 @@ func makePerftuneJobForNode(controllerRef *metav1.OwnerReference, namespace, nod
},
}

if len(sysctls) != 0 {
job.Spec.Template.Spec.Containers = append(job.Spec.Template.Spec.Containers, corev1.Container{
Name: naming.SysctlContainerName,
Image: image,
ImagePullPolicy: corev1.PullIfNotPresent,
Command: []string{"/bin/sh",
"-c",
fmt.Sprintf("sysctl -e %s", strings.Join(sysctls, " ")),
},
SecurityContext: &corev1.SecurityContext{
Privileged: pointer.BoolPtr(true),
},
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("10m"),
corev1.ResourceMemory: resource.MustParse("50Mi"),
},
},
})
}

return job
}

Expand Down
92 changes: 92 additions & 0 deletions pkg/controller/nodeconfigdaemon/sync_jobs.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,12 @@ import (
"context"
"encoding/json"
"fmt"
"io/fs"
"os"
"path/filepath"
"sort"
"strconv"
"strings"

"github.com/c9s/goprocinfo/linux"
scyllav1alpha1 "github.com/scylladb/scylla-operator/pkg/api/scylla/v1alpha1"
Expand All @@ -17,6 +22,7 @@ import (
"github.com/scylladb/scylla-operator/pkg/util/cloud"
"github.com/scylladb/scylla-operator/pkg/util/cpuset"
"github.com/scylladb/scylla-operator/pkg/util/network"
"github.com/scylladb/scylla-operator/pkg/util/sysctl"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand All @@ -37,6 +43,11 @@ func (ncdc *Controller) makeJobsForNode(ctx context.Context) ([]*batchv1.Job, er
return nil, fmt.Errorf("can't get controller ref: %w", err)
}

sysctls, err := makeSysctls(ncdc.disableScyllaImageSettings, ncdc.nodeMultitenancy, ncdc.tenantScalableKeys, ncdc.customKeyValues)
if err != nil {
return nil, fmt.Errorf("can't make sysctls: %w", err)
}

jobs = append(jobs, makePerftuneJobForNode(
cr,
ncdc.namespace,
Expand All @@ -45,6 +56,7 @@ func (ncdc *Controller) makeJobsForNode(ctx context.Context) ([]*batchv1.Job, er
ncdc.nodeUID,
ncdc.scyllaImage,
&pod.Spec,
sysctls,
))

return jobs, nil
Expand Down Expand Up @@ -279,3 +291,83 @@ func (ncdc *Controller) pruneJobs(ctx context.Context, jobs map[string]*batchv1.
}
return utilerrors.NewAggregate(errs)
}

// makeSysctls builds the sorted list of "key=value" kernel parameters that should be applied on the node.
//
// Parameters are collected in the following order, later sources overriding earlier ones:
//  1. unless disableScyllaImageSettings is set, every file located directly in
//     naming.ScyllaSysctlsDirName is parsed with sysctl.ParseConfig,
//  2. every non-empty customKeyValues item, which has to use the "key=value" format
//     (the item is split on the first '=', so the value itself may contain '='),
//  3. every key listed in tenantScalableKeys which has a known integer value gets
//     that value multiplied by nodeMultitenancy.
//
// Scalable keys which are unknown or non-numerical are only logged and left untouched.
// An error is returned when the image settings can't be read or parsed, or when a custom item is malformed.
//
// NOTE(review): values are returned verbatim; the Job builder in this package joins them into
// a `/bin/sh -c "sysctl -e ..."` command line, so escaping has to be taken care of there.
func makeSysctls(disableScyllaImageSettings bool, nodeMultitenancy int64, tenantScalableKeys []string, customKeyValues []string) ([]string, error) {
	kv := map[string]string{}

	if !disableScyllaImageSettings {
		err := filepath.WalkDir(naming.ScyllaSysctlsDirName, func(path string, d fs.DirEntry, err error) error {
			if err != nil {
				return err
			}

			if d.IsDir() {
				if path == naming.ScyllaSysctlsDirName {
					// The walk root is only descended into, it isn't a config file to be parsed.
					return nil
				}
				// Only files placed directly in the root directory are taken into account.
				return filepath.SkipDir
			}

			f, err := os.Open(path)
			if err != nil {
				return fmt.Errorf("can't open file %q: %w", path, err)
			}
			// The file is opened read-only, so a failed Close can't lose any data.
			defer f.Close()

			kvs, err := sysctl.ParseConfig(f)
			if err != nil {
				return fmt.Errorf("parse scylla sysctl config %q: %w", d.Name(), err)
			}

			klog.V(4).InfoS("Parsed scylla sysctls", "name", d.Name(), "parameters", len(kvs))
			for k, v := range kvs {
				kv[k] = v
			}

			return nil
		})
		if err != nil {
			return nil, fmt.Errorf("read scylla image settings: %w", err)
		}
	}

	for _, v := range customKeyValues {
		if len(v) == 0 {
			continue
		}
		// Split on the first '=' only, everything after it belongs to the value.
		parts := strings.SplitN(v, "=", 2)
		if len(parts) != 2 || len(parts[0]) == 0 {
			return nil, fmt.Errorf("invalid format of custom kernel parameter: %q", v)
		}
		kv[parts[0]] = parts[1]
	}

	// Tracks keys which were already processed, so a key listed more than once is scaled only once.
	scaledKeys := make(map[string]struct{}, len(tenantScalableKeys))
	for _, scalableKey := range tenantScalableKeys {
		if len(scalableKey) == 0 {
			continue
		}
		if _, done := scaledKeys[scalableKey]; done {
			continue
		}
		scaledKeys[scalableKey] = struct{}{}

		value, ok := kv[scalableKey]
		if !ok {
			// TODO: set degraded status
			// klog.Warning has fmt.Print semantics, so the key has to be formatted into the message.
			klog.Warningf("Key %q provided in tenantScalableKeys has unknown initial value, it's not going to be applied", scalableKey)
			continue
		}

		base, err := strconv.ParseInt(value, 10, 64)
		if err != nil {
			// TODO: set degraded status
			klog.Warningf("Non-numerical key %q cannot be scaled, it's not going to be multiplied", scalableKey)
			continue
		}

		// Stay in int64 so the multiplier isn't truncated on platforms where int is 32 bits wide.
		kv[scalableKey] = strconv.FormatInt(nodeMultitenancy*base, 10)
	}

	klog.V(4).InfoS("Tuning kernel parameters", "parameters", len(kv))
	sysctls := make([]string, 0, len(kv))
	for k, v := range kv {
		klog.V(4).InfoS("Setting kernel parameter", "key", k, "value", v)
		sysctls = append(sysctls, fmt.Sprintf("%s=%s", k, v))
	}

	// Map iteration order is random, sort to keep the result deterministic.
	sort.Strings(sysctls)

	return sysctls, nil
}
Loading

0 comments on commit fed874f

Please sign in to comment.