diff --git a/pkg/daemon/controlplane.go b/pkg/daemon/controlplane.go new file mode 100644 index 0000000000..d32c650a5b --- /dev/null +++ b/pkg/daemon/controlplane.go @@ -0,0 +1,79 @@ +package daemon + +// This file provides changes that we make to the control plane +// only. + +import ( + "fmt" + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "strings" + + "github.com/golang/glog" + "github.com/pkg/errors" +) + +// setRootDeviceSchedulerBFQ switches to the `bfq` I/O scheduler +// for the root block device to better share I/O between etcd +// and other processes. See +// https://github.com/openshift/machine-config-operator/issues/1897 +// Note this is the current systemd default in Fedora, but not RHEL8, +// except for NVMe devices. +func setRootDeviceSchedulerBFQ() error { + sched := "bfq" + + rootDevSysfs, err := getRootBlockDeviceSysfs() + if err != nil { + return err + } + + schedulerPath := filepath.Join(rootDevSysfs, "/queue/scheduler") + schedulerContentsBuf, err := ioutil.ReadFile(schedulerPath) + if err != nil { + return err + } + schedulerContents := string(schedulerContentsBuf) + if strings.Contains(schedulerContents, fmt.Sprintf("[%s]", sched)) { + glog.Infof("Device %s already uses scheduler %s", rootDevSysfs, sched) + return nil + } + + f, err := os.OpenFile(schedulerPath, os.O_WRONLY|os.O_TRUNC, 0644) + if err != nil { + return err + } + defer f.Close() + _, err = f.Write([]byte(sched)) + if err != nil { + return err + } + glog.Infof("Set root blockdev %s to use scheduler %v", rootDevSysfs, sched) + + return nil +} + +// updateOstreeObjectSync enables "per-object-fsync" which helps avoid +// latency spikes for etcd; see https://github.com/ostreedev/ostree/pull/2152 +func updateOstreeObjectSync() error { + if err := exec.Command("ostree", "--repo=/sysroot/ostree/repo", "config", "set", "core.per-object-fsync", "true").Run(); err != nil { + return errors.Wrapf(err, "Failed to set per-object-fsync for ostree") + } + return nil +} + +// initializeControlPlane performs setup for the node that should +// only occur on the control plane. Currently this switches the IO +// scheduler and starts a goroutine acting as a small controller +// for reflecting the etcd leader status in the node object to help +// the MCC coordinate control plane updates. +func (dn *Daemon) initializeControlPlane() error { + if err := setRootDeviceSchedulerBFQ(); err != nil { + return err + } + if err := updateOstreeObjectSync(); err != nil { + return err + } + return nil +} diff --git a/pkg/daemon/coreos.go b/pkg/daemon/coreos.go new file mode 100644 index 0000000000..a477ed6ce7 --- /dev/null +++ b/pkg/daemon/coreos.go @@ -0,0 +1,52 @@ +package daemon + +// This file provides routines that apply on Fedora CoreOS style systems, +// including deriviatives like RHEL CoreOS. + +import ( + "fmt" + "os" + "path/filepath" + + "github.com/pkg/errors" +) + +// byLabel returns the udev generated symlink to the block device with the given label +func byLabel(label string) string { + return fmt.Sprintf("/dev/disk/by-label/%s", label) +} + +// getParentDeviceSysfs returns e.g. /sys/devices/pci0000:00/0000:00:05.0/virtio2/block/vda from /dev/vda4, though +// it can be more complex than that with e.g. NVMe. +func getParentDeviceSysfs(device string) (string, error) { + target, err := os.Readlink(device) + if err != nil { + return "", errors.Wrapf(err, "reading %s", device) + } + sysfsDevLink := fmt.Sprintf("/sys/class/block/%s", filepath.Base(target)) + sysfsDev, err := filepath.EvalSymlinks(sysfsDevLink) + if err != nil { + return "", errors.Wrapf(err, "parsing %s", sysfsDevLink) + } + if _, err := os.Stat(filepath.Join(sysfsDev, "partition")); err == nil { + sysfsDev = filepath.Dir(sysfsDev) + } + return sysfsDev, nil +} + +// getRootBlockDeviceSysfs returns the path to the block +// device backing the root partition on a FCOS system +func getRootBlockDeviceSysfs() (string, error) { + // Check for the `crypt_rootfs` label; this exists for RHCOS >= 4.3 but <= 4.6. + // See https://github.com/openshift/enhancements/blob/master/enhancements/rhcos/automated-policy-based-disk-encryption.md + luksRoot := byLabel("crypt_rootfs") + if _, err := os.Stat(luksRoot); err == nil { + return getParentDeviceSysfs(luksRoot) + } + // This is what we expect on FCOS and RHCOS <= 4.2 + root := byLabel("root") + if _, err := os.Stat(root); err == nil { + return getParentDeviceSysfs(root) + } + return "", fmt.Errorf("Failed to find %s", root) +} diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go index 22b2b6fe82..87842dcb8a 100644 --- a/pkg/daemon/daemon.go +++ b/pkg/daemon/daemon.go @@ -113,7 +113,12 @@ type Daemon struct { // isControlPlane is true if this node is a control plane (master). // The machine may also be a worker (with schedulable masters). isControlPlane bool - booting bool + // nodeInitialized is true when we've performed one-time initialization + // after having updated the node object + nodeInitialized bool + // booting is true when all initial synchronization to the target + // machineconfig is done + booting bool currentConfigPath string @@ -382,15 +387,24 @@ func (dn *Daemon) updateErrorState(err error) { } } -// initializeNode is called the first time we get our node object -func (dn *Daemon) initializeNode() { +// initializeNode is called the first time we get our node object; however to +// ensure we handle failures: everything called from here should be idempotent. +func (dn *Daemon) initializeNode() error { + if dn.nodeInitialized { + return nil + } // Some parts of the MCO dispatch on whether or not we're managing a control plane node if _, isControlPlane := dn.node.Labels[ctrlcommon.MasterLabel]; isControlPlane { glog.Infof("Node %s is part of the control plane", dn.node.Name) + if err := dn.initializeControlPlane(); err != nil { + return err + } dn.isControlPlane = true } else { glog.Infof("Node %s is not labeled %s", dn.node.Name, ctrlcommon.MasterLabel) } + dn.nodeInitialized = true + return nil } func (dn *Daemon) syncNode(key string) error { @@ -428,7 +442,9 @@ func (dn *Daemon) syncNode(key string) error { node = node.DeepCopy() if dn.node == nil { dn.node = node - dn.initializeNode() + if err := dn.initializeNode(); err != nil { + return err + } } else { dn.node = node } diff --git a/templates/master/00-master/_base/units/rpm-ostreed.service.yaml b/templates/master/00-master/_base/units/rpm-ostreed.service.yaml new file mode 100644 index 0000000000..93a7e3e6d8 --- /dev/null +++ b/templates/master/00-master/_base/units/rpm-ostreed.service.yaml @@ -0,0 +1,8 @@ +name: rpm-ostreed.service +dropins: +- name: mco-controlplane-nice.conf + contents: | + # See https://github.com/openshift/machine-config-operator/issues/1897 + [Service] + Nice=10 + IOSchedulingClass=idle