Add docker storage check #3787

Merged
roles/openshift_health_checker/openshift_checks/docker_image_availability.py
@@ -1,8 +1,9 @@
 # pylint: disable=missing-docstring
 from openshift_checks import OpenShiftCheck, get_var
+from openshift_checks.mixins import DockerHostMixin


-class DockerImageAvailability(OpenShiftCheck):
+class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
     """Check that required Docker images are available.

     This check attempts to ensure that required docker images are
@@ -36,19 +37,11 @@ def is_active(cls, task_vars):

     def run(self, tmp, task_vars):
         msg, failed, changed = self.ensure_dependencies(task_vars)
-
-        # exit early if Skopeo update fails
         if failed:
-            if "No package matching" in msg:
-                msg = "Ensure that all required dependencies can be installed via `yum`.\n"
             return {
                 "failed": True,
                 "changed": changed,
-                "msg": (
-                    "Unable to update or install required dependency packages on this host;\n"
-                    "These are required in order to check Docker image availability:"
-                    "\n    {deps}\n{msg}"
-                ).format(deps=',\n    '.join(self.dependencies), msg=msg),
+                "msg": "Some dependencies are required in order to check Docker image availability.\n" + msg
             }

         required_images = self.required_images(task_vars)
@@ -168,12 +161,3 @@ def is_available_skopeo_image(self, image, registry, task_vars):
         args = {"_raw_params": cmd_str}
         result = self.module_executor("command", args, task_vars)
         return not result.get("failed", False) and result.get("rc", 0) == 0
-
-    # ensures that the skopeo and python-docker-py packages exist
-    # check is skipped on atomic installations
-    def ensure_dependencies(self, task_vars):
-        if get_var(task_vars, "openshift", "common", "is_atomic"):
-            return "", False, False
-
-        result = self.module_executor("yum", {"name": self.dependencies, "state": "latest"}, task_vars)
-        return result.get("msg", ""), result.get("failed", False) or result.get("rc", 0) != 0, result.get("changed")
roles/openshift_health_checker/openshift_checks/docker_storage.py (new file, 185 additions)
@@ -0,0 +1,185 @@
"""Check Docker storage driver and usage."""
import json
import re
from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
from openshift_checks.mixins import DockerHostMixin


class DockerStorage(DockerHostMixin, OpenShiftCheck):
"""Check Docker storage driver compatibility.

This check ensures that Docker is using a supported storage driver,
and that loopback is not being used (if using devicemapper).
Also that storage usage is not above threshold.
"""

name = "docker_storage"
tags = ["pre-install", "health", "preflight"]

dependencies = ["python-docker-py"]
storage_drivers = ["devicemapper", "overlay2"]
max_thinpool_data_usage_percent = 90.0
max_thinpool_meta_usage_percent = 90.0

# pylint: disable=too-many-return-statements
# Reason: permanent stylistic exception;
# it is clearer to return on failures and there are just many ways to fail here.
def run(self, tmp, task_vars):
msg, failed, changed = self.ensure_dependencies(task_vars)
if failed:
return {
"failed": True,
"changed": changed,
"msg": "Some dependencies are required in order to query docker storage on host:\n" + msg
}

# attempt to get the docker info hash from the API
info = self.execute_module("docker_info", {}, task_vars)
if info.get("failed"):
return {"failed": True, "changed": changed,
"msg": "Failed to query Docker API. Is docker running on this host?"}
if not info.get("info"): # this would be very strange
return {"failed": True, "changed": changed,
"msg": "Docker API query missing info:\n{}".format(json.dumps(info))}
info = info["info"]

# check if the storage driver we saw is valid
driver = info.get("Driver", "[NONE]")
Contributor: Why "[NONE]"?

Member: if the docker info came back without a driver (which seems unlikely, but you know, code defensively) then we're going to say to the user "Docker is using the X driver which isn't supported". What should X be? I don't really care, just thought it would be good to report something distinctly bogus in this unlikely event.

Contributor: Got it.
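For reference, the DriverStatus structure consumed a few lines below is a list of [key, value] pairs; a hypothetical shape for a devicemapper host (field names taken from the check, values invented):

    # hypothetical info["DriverStatus"] as the docker_info module reports it
    [["Pool Name", "vg--docker-docker--pool"],
     ["Data Space Used", "10.5 GB"],
     ["Data Space Total", "107.4 GB"],
     ["Metadata Space Used", "2.5 MB"],
     ["Metadata Space Total", "109 MB"]]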

        if driver not in self.storage_drivers:
            msg = (
                "Detected unsupported Docker storage driver '{driver}'.\n"
                "Supported storage drivers are: {drivers}"
            ).format(driver=driver, drivers=', '.join(self.storage_drivers))
            return {"failed": True, "changed": changed, "msg": msg}

        # driver status info is a list of tuples; convert to dict and validate based on driver
        driver_status = {item[0]: item[1] for item in info.get("DriverStatus", [])}
        if driver == "devicemapper":
            if driver_status.get("Data loop file"):
                msg = (
                    "Use of loopback devices with the Docker devicemapper storage driver\n"
                    "(the default storage configuration) is unsupported in production.\n"
                    "Please use docker-storage-setup to configure a backing storage volume.\n"
                    "See http://red.ht/2rNperO for further information."
                )
                return {"failed": True, "changed": changed, "msg": msg}
            result = self._check_dm_usage(driver_status, task_vars)
            result["changed"] = changed
Contributor: Shouldn't this be something like:

    result['changed'] = result.get('changed', False) or changed

?

Member: only if _check_dm_usage ever made changes. it doesn't so... keep it simple?

Contributor: I get we (don't / might not) even use the value of 'changed' for checks right now, but we "code defensively" in lots of places, why wouldn't we do it here? If that assumption changes (that _check_dm_usage never sets the 'changed' key), it is unlikely that we'll go look at every caller to stop overriding the value.
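For illustration, the defensive merge suggested above would be a one-line change (a sketch; the PR as merged keeps the plain assignment because _check_dm_usage never sets 'changed'):

    # merge rather than overwrite, in case the callee ever sets 'changed'
    result["changed"] = result.get("changed", False) or changed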

            return result

        # TODO(lmeyer): determine how to check usage for overlay2

        return {"changed": changed}

    def _check_dm_usage(self, driver_status, task_vars):
        """
        Backing assumptions: We expect devicemapper to be backed by an auto-expanding thin pool
Contributor: Describing the assumptions is great, but what I'm missing here is some more standard info we normally have in a docstring:

https://www.python.org/dev/peps/pep-0257/#one-line-docstrings

    The docstring is a phrase ending in a period. It prescribes the function or method's effect as a command ("Do this", "Return that"), not as a description; e.g. don't write "Returns the pathname ...".
    ...
    The docstring for a function or method should summarize its behavior and document its arguments, return value(s), side effects, exceptions raised, and restrictions on when it can be called (all if applicable). Optional arguments should be indicated. It should be documented whether keyword arguments are part of the interface.

Looking at the code I see we construct a dict, modify it and return it... maybe we're just missing a line in the beginning, something like:

    """Check whether data and metadata volume usages are below a given threshold and return ...

    [Continue with the assumptions...]
    """

        implemented as an LV in an LVM2 VG. This is how docker-storage-setup currently configures
        devicemapper storage. The LV is "thin" because it does not use all available storage
        from its VG, instead expanding as needed; so to determine available space, we gather
        current usage as the Docker API reports for the driver as well as space available for
        expansion in the pool's VG.
        Usage within the LV is divided into pools allocated to data and metadata, either of which
        could run out of space first; so we check both.
        """
        vals = dict(
            vg_free=self._get_vg_free(driver_status.get("Pool Name"), task_vars),
            data_used=driver_status.get("Data Space Used"),
            data_total=driver_status.get("Data Space Total"),
            metadata_used=driver_status.get("Metadata Space Used"),
            metadata_total=driver_status.get("Metadata Space Total"),
        )

        # convert all human-readable strings to bytes
        for key, value in vals.copy().items():
Contributor: Why .copy()?

Member: because we're modifying vals in the loop and I heard it was a bad idea to iterate while changing

Contributor: That's correct...

On a second look, I wonder why we mutate vals, and why do we add a key with the _bytes prefix? I couldn't find anywhere that actually reads back that key?!

Contributor: Ah, I see I see... we later read "data_used_bytes" and so on.

Member: yes, and vals is merged into the result, so that the details are all in the ansible spew if you want to debug what you got
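As a standalone sketch (not part of the check) of why the loop iterates over a copy: in Python 3, adding keys to a dict while iterating over it raises RuntimeError ("dictionary changed size during iteration").

    vals = {"data_used": "10.5 GB"}          # hypothetical value
    for key, value in vals.copy().items():   # iterate a snapshot...
        vals[key + "_bytes"] = 1             # ...so mutating the original is safe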

            try:
                vals[key + "_bytes"] = self._convert_to_bytes(value)
            except ValueError as err:  # unlikely to hit this from API info, but just to be safe
                return {
                    "failed": True,
                    "values": vals,
                    "msg": "Could not interpret {} value '{}' as bytes: {}".format(key, value, str(err))
                }

        # determine the threshold percentages which usage should not exceed
        for name, default in [("data", self.max_thinpool_data_usage_percent),
                              ("metadata", self.max_thinpool_meta_usage_percent)]:
            percent = get_var(task_vars, "max_thinpool_" + name + "_usage_percent", default=default)
            try:
                vals[name + "_threshold"] = float(percent)
            except ValueError:
                return {
                    "failed": True,
                    "msg": "Specified thinpool {} usage limit '{}' is not a percentage".format(name, percent)
                }

        # test whether the thresholds are exceeded
        messages = []
        for name in ["data", "metadata"]:
            vals[name + "_pct_used"] = 100 * vals[name + "_used_bytes"] / (
                vals[name + "_total_bytes"] + vals["vg_free_bytes"])
            if vals[name + "_pct_used"] > vals[name + "_threshold"]:
                messages.append(
                    "Docker thinpool {name} usage percentage {pct:.1f} "
                    "is higher than threshold {thresh:.1f}.".format(
                        name=name,
                        pct=vals[name + "_pct_used"],
                        thresh=vals[name + "_threshold"],
                    ))
                vals["failed"] = True

        vals["msg"] = "\n".join(messages or ["Thinpool usage is within thresholds."])
        return vals
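A worked example of the usage percentage computed above, with hypothetical numbers (9 GiB used of a 12 GiB data pool, 8 GiB still free in the VG for the thin pool to expand into):

    data_used_bytes = 9 * 1024**3
    data_total_bytes = 12 * 1024**3
    vg_free_bytes = 8 * 1024**3
    pct_used = 100.0 * data_used_bytes / (data_total_bytes + vg_free_bytes)
    print(round(pct_used, 1))  # 45.0 -- well under the default 90.0 threshold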

    def _get_vg_free(self, pool, task_vars):
        # Determine which VG to examine according to the pool name, the only indicator currently
        # available from the Docker API driver info. We assume a name that looks like
        # "vg--name-docker--pool"; vg and lv names with inner hyphens doubled, joined by a hyphen.
        match = re.match(r'((?:[^-]|--)+)-(?!-)', pool)  # matches up to the first single hyphen
        if not match:  # unlikely, but... be clear if we assumed wrong
            raise OpenShiftCheckException(
                "This host's Docker reports it is using a storage pool named '{}'.\n"
                "However this name does not have the expected format of 'vgname-lvname',\n"
                "so the available storage in the VG cannot be determined.".format(pool)
            )
        vg_name = match.groups()[0].replace("--", "-")
        vgs_cmd = "/sbin/vgs --noheadings -o vg_free --select vg_name=" + vg_name
        # should return free space like " 12.00g" if the VG exists; empty if it does not

        ret = self.execute_module("command", {"_raw_params": vgs_cmd}, task_vars)
        if ret.get("failed") or ret.get("rc", 0) != 0:
            raise OpenShiftCheckException(
                "Is LVM installed? Failed to run /sbin/vgs "
                "to determine docker storage usage:\n" + ret.get("msg", "")
            )
        size = ret.get("stdout", "").strip()
        if not size:
            raise OpenShiftCheckException(
                "This host's Docker reports it is using a storage pool named '{pool}',\n"
                "which we expect to come from local VG '{vg}'.\n"
                "However, /sbin/vgs did not find this VG. Is Docker for this host\n"
                "running and using the storage on the host?".format(pool=pool, vg=vg_name)
            )
        return size
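The pool-name parsing can be sanity-checked in isolation; a hypothetical pool name as docker-storage-setup would produce for VG "my-vg" with LV "docker-pool":

    import re

    pool = "my--vg-docker--pool"                     # inner hyphens doubled
    match = re.match(r'((?:[^-]|--)+)-(?!-)', pool)  # up to the first single hyphen
    print(match.groups()[0].replace("--", "-"))      # my-vg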

    @staticmethod
    def _convert_to_bytes(string):
        units = dict(
            b=1,
            k=1024,
            m=1024**2,
            g=1024**3,
            t=1024**4,
            p=1024**5,
        )
        string = string or ""
        match = re.match(r'(\d+(?:\.\d+)?)\s*(\w)?', string)  # float followed by optional unit
        if not match:
            raise ValueError("Cannot convert to a byte size: " + string)

        number, unit = match.groups()
        multiplier = 1 if not unit else units.get(unit.lower())
        if not multiplier:
            raise ValueError("Cannot convert to a byte size: " + string)

        return float(number) * multiplier
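A few examples of inputs this converter handles, in the shapes Docker and vgs report (values hypothetical):

    DockerStorage._convert_to_bytes("12.00g")   # 12 * 1024**3 = 12884901888.0
    DockerStorage._convert_to_bytes("10.5 GB")  # only the first unit letter is read
    DockerStorage._convert_to_bytes("1048576")  # no unit: taken as bytes -> 1048576.0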
roles/openshift_health_checker/openshift_checks/mixins.py (41 additions, 1 deletion)
@@ -1,4 +1,3 @@
-# pylint: disable=missing-docstring,too-few-public-methods
 """
 Mixin classes meant to be used with subclasses of OpenShiftCheck.
 """
@@ -8,8 +7,49 @@

 class NotContainerizedMixin(object):
     """Mixin for checks that are only active when not in containerized mode."""
+    # permanent # pylint: disable=too-few-public-methods
+    # Reason: The mixin is not intended to stand on its own as a class.

     @classmethod
     def is_active(cls, task_vars):
+        """Only run on non-containerized hosts."""
         is_containerized = get_var(task_vars, "openshift", "common", "is_containerized")
         return super(NotContainerizedMixin, cls).is_active(task_vars) and not is_containerized
+
+
+class DockerHostMixin(object):
+    """Mixin for checks that are only active on hosts that require Docker."""
+
+    dependencies = []
+
+    @classmethod
+    def is_active(cls, task_vars):
+        """Only run on hosts that depend on Docker."""
+        is_containerized = get_var(task_vars, "openshift", "common", "is_containerized")
+        is_node = "nodes" in get_var(task_vars, "group_names", default=[])
+        return super(DockerHostMixin, cls).is_active(task_vars) and (is_containerized or is_node)
+
+    def ensure_dependencies(self, task_vars):
+        """
+        Ensure that docker-related packages exist, but not on atomic hosts
+        (which would not be able to install but should already have them).
+        Returns: msg, failed, changed
+        """
+        if get_var(task_vars, "openshift", "common", "is_atomic"):
+            return "", False, False
+
+        # NOTE: we would use the "package" module but it's actually an action plugin
+        # and it's not clear how to invoke one of those. This is about the same anyway:
+        pkg_manager = get_var(task_vars, "ansible_pkg_mgr", default="yum")
+        result = self.module_executor(pkg_manager, {"name": self.dependencies, "state": "present"}, task_vars)
+        msg = result.get("msg", "")
+        if result.get("failed"):
+            if "No package matching" in msg:
+                msg = "Ensure that all required dependencies can be installed via `yum`.\n"
+            msg = (
+                "Unable to install required packages on this host:\n"
+                "    {deps}\n{msg}"
+            ).format(deps=',\n    '.join(self.dependencies), msg=msg)
+        failed = result.get("failed", False) or result.get("rc", 0) != 0
+        changed = result.get("changed", False)
+        return msg, failed, changed
Member: sorry, i see i missed doing this one. thanks.
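To show how the mixin composes, here is a minimal hypothetical check (a sketch, not part of the PR) that inherits DockerHostMixin and consumes the (msg, failed, changed) triple from ensure_dependencies:

    from openshift_checks import OpenShiftCheck
    from openshift_checks.mixins import DockerHostMixin

    class DockerPing(DockerHostMixin, OpenShiftCheck):
        """Fail if docker-related dependencies cannot be installed."""
        name = "docker_ping"                 # hypothetical check name
        dependencies = ["python-docker-py"]  # packages ensure_dependencies installs

        def run(self, tmp, task_vars):
            msg, failed, changed = self.ensure_dependencies(task_vars)
            return {"failed": failed, "changed": changed, "msg": msg}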

roles/openshift_health_checker/test/docker_image_availability_test.py
@@ -3,19 +3,25 @@
 from openshift_checks.docker_image_availability import DockerImageAvailability


-@pytest.mark.parametrize('deployment_type,is_active', [
-    ("origin", True),
-    ("openshift-enterprise", True),
-    ("enterprise", False),
-    ("online", False),
-    ("invalid", False),
-    ("", False),
+@pytest.mark.parametrize('deployment_type, is_containerized, group_names, expect_active', [
+    ("origin", True, [], True),
+    ("openshift-enterprise", True, [], True),
+    ("enterprise", True, [], False),
+    ("online", True, [], False),
+    ("invalid", True, [], False),
+    ("", True, [], False),
+    ("origin", False, [], False),
+    ("openshift-enterprise", False, [], False),
+    ("origin", False, ["nodes", "masters"], True),
+    ("openshift-enterprise", False, ["etcd"], False),
 ])
-def test_is_active(deployment_type, is_active):
+def test_is_active(deployment_type, is_containerized, group_names, expect_active):
     task_vars = dict(
+        openshift=dict(common=dict(is_containerized=is_containerized)),
         openshift_deployment_type=deployment_type,
+        group_names=group_names,
     )
-    assert DockerImageAvailability.is_active(task_vars=task_vars) == is_active
+    assert DockerImageAvailability.is_active(task_vars=task_vars) == expect_active


 @pytest.mark.parametrize("is_containerized,is_atomic", [