docker checks: finish and refactor
Incorporated docker_storage_driver into docker_storage as both need
driver info.
Corrected storage calculation to include VG free space, not just the
current amount in the LV pool. Now makes no assumptions about pool name.
Improved user messaging.
Factored out some methods that can be shared with docker_image_availability.
sosiouxme committed Jun 7, 2017
1 parent 055082c commit bace709
Showing 7 changed files with 393 additions and 444 deletions.
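The heart of the storage fix is the denominator of the usage calculation: a thin pool auto-expands into free space in its volume group (VG), so usage is now measured against the pool's current size plus the VG's remaining free space. A minimal before/after sketch with hypothetical numbers (the actual check reads these values from the Docker API and /sbin/vgs, as the diff below shows):

    # Hypothetical values, for illustration only
    data_used  = 15 * 1024**3   # 15 GiB used in the thin pool's data area
    data_total = 20 * 1024**3   # 20 GiB currently allocated to the pool
    vg_free    = 80 * 1024**3   # 80 GiB still free in the backing VG

    old_pct = 100.0 * data_used / data_total              # 75.0 -- looks nearly full
    new_pct = 100.0 * data_used / (data_total + vg_free)  # 15.0 -- the pool can still grow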
roles/openshift_health_checker/openshift_checks/docker_image_availability.py
@@ -1,8 +1,9 @@
# pylint: disable=missing-docstring
from openshift_checks import OpenShiftCheck, get_var
from openshift_checks.mixins import DockerHostMixin


class DockerImageAvailability(OpenShiftCheck):
class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
"""Check that required Docker images are available.
This check attempts to ensure that required docker images are
@@ -36,19 +37,11 @@ def is_active(cls, task_vars):

def run(self, tmp, task_vars):
msg, failed, changed = self.ensure_dependencies(task_vars)

# exit early if Skopeo update fails
if failed:
return {
"failed": True,
"changed": changed,
"msg": (
"Unable to update or install required dependency packages on this host;\n"
"These are required in order to check Docker image availability:"
"\n {deps}\n{msg}"
).format(deps=',\n '.join(self.dependencies), msg=msg),
}
if failed:
if "No package matching" in msg:
msg = "Ensure that all required dependencies can be installed via `yum`.\n"
return {
"failed": True,
"changed": changed,
"msg": "Some dependencies are required in order to check Docker image availability.\n" + msg
}

required_images = self.required_images(task_vars)
@@ -168,12 +161,3 @@ def is_available_skopeo_image(self, image, registry, task_vars):
args = {"_raw_params": cmd_str}
result = self.module_executor("command", args, task_vars)
return not result.get("failed", False) and result.get("rc", 0) == 0

# ensures that the skopeo and python-docker-py packages exist
# check is skipped on atomic installations
def ensure_dependencies(self, task_vars):
if get_var(task_vars, "openshift", "common", "is_atomic"):
return "", False, False

result = self.module_executor("yum", {"name": self.dependencies, "state": "latest"}, task_vars)
return result.get("msg", ""), result.get("failed", False) or result.get("rc", 0) != 0, result.get("changed")
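For context, here is a rough sketch of the shared mixin this refactor relies on. The mixin lives in openshift_checks/mixins.py, which is not shown in this excerpt, so the body below is inferred from the removed method above; details may differ from the actual commit:

    # Sketch only -- inferred from the removed ensure_dependencies() above.
    class DockerHostMixin(object):
        """Mixin for checks that depend on a host running Docker."""

        dependencies = []  # packages the check needs; each check overrides this

        def ensure_dependencies(self, task_vars):
            """Ensure the check's package dependencies are installed via yum.

            Returns (msg, failed, changed). Skipped on Atomic hosts,
            which cannot install RPMs at runtime.
            """
            if get_var(task_vars, "openshift", "common", "is_atomic"):
                return "", False, False
            result = self.execute_module("yum", {"name": self.dependencies, "state": "latest"}, task_vars)
            return (
                result.get("msg", ""),
                result.get("failed", False) or result.get("rc", 0) != 0,
                result.get("changed", False),
            )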
253 changes: 162 additions & 91 deletions roles/openshift_health_checker/openshift_checks/docker_storage.py
@@ -1,110 +1,181 @@
# pylint: disable=missing-docstring
"""Check Docker storage driver and usage."""
import json

import re
from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
from openshift_checks.mixins import DockerHostMixin


class DockerStorage(OpenShiftCheck):
"""Check Docker storage sanity.
class DockerStorage(DockerHostMixin, OpenShiftCheck):
"""Check Docker storage driver compatibility.
Check for thinpool usage during a containerized installation
This check ensures that Docker is using a supported storage driver,
and that loopback is not being used (if using devicemapper).
Also that storage usage is not above threshold.
"""

name = "docker_storage"
tags = ["preflight"]
tags = ["pre-install", "health", "preflight"]

dependencies = ["python-docker-py"]
storage_drivers = ["devicemapper", "overlay2"]
max_thinpool_data_usage_percent = 90.0
max_thinpool_meta_usage_percent = 90.0

@classmethod
def is_active(cls, task_vars):
"""Only run on hosts that depend on Docker."""
is_containerized = get_var(task_vars, "openshift", "common", "is_containerized")
is_node = "nodes" in get_var(task_vars, "group_names", default=[])
return (super(DockerStorage, cls).is_active(task_vars) and is_containerized) or is_node
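# Illustrative example (not part of the diff): given task_vars such as
#   {"openshift": {"common": {"is_containerized": False}}, "group_names": ["nodes"]}
# the check is still active, because the host is a node even though the
# installation is not containerized; containerized non-node hosts also qualify.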

# pylint: disable=too-many-return-statements
# Reason: permanent stylistic exception;
# it is clearer to return on failures and there are just many ways to fail here.
def run(self, tmp, task_vars):
try:
self.max_thinpool_data_usage_percent = float(get_var(task_vars, "max_thinpool_data_usage_percent",
default=self.max_thinpool_data_usage_percent))
self.max_thinpool_meta_usage_percent = float(get_var(task_vars, "max_thinpool_metadata_usage_percent",
default=self.max_thinpool_meta_usage_percent))
except ValueError as err:
return {
"failed": True,
"msg": "Unable to convert thinpool data usage limit to float: {}".format(str(err))
}

err_msg = self.check_thinpool_usage(task_vars)
if err_msg:
return {"failed": True, "msg": err_msg}

return {}

msg, failed, changed = self.ensure_dependencies(task_vars)
if failed:
return {
"failed": True,
"changed": changed,
"msg": "Some dependencies are required in order to query docker storage on host:\n" + msg
}

def check_thinpool_usage(self, task_vars):
lvs = self.get_lvs_data(task_vars)
lv_data = self.extract_thinpool_obj(lvs)

data_percent = self.get_thinpool_data_usage(lv_data)
metadata_percent = self.get_thinpool_metadata_usage(lv_data)

if data_percent > self.max_thinpool_data_usage_percent:
msg = "thinpool data usage above maximum threshold of {threshold}%"
return msg.format(threshold=self.max_thinpool_data_usage_percent)

if metadata_percent > self.max_thinpool_meta_usage_percent:
msg = "thinpool metadata usage above maximum threshold of {threshold}%"
return msg.format(threshold=self.max_thinpool_meta_usage_percent)

return ""

def get_lvs_data(self, task_vars):
lvs_cmd = "/sbin/lvs --select vg_name=docker --select lv_name=docker-pool --report-format json"
result = self.exec_cmd(lvs_cmd, task_vars)

if result.get("failed", False):
msg = "no thinpool usage data returned by the host: {}"
raise OpenShiftCheckException(msg.format(result.get("msg", "")))

try:
data_json = json.loads(result.get("stdout", ""))
except ValueError as err:
raise OpenShiftCheckException("Invalid JSON value returned by lvs command: {}".format(str(err)))

data = data_json.get("report")
if not data:
raise OpenShiftCheckException("no thinpool usage data returned by the host.")

return data

@staticmethod
def get_thinpool_data_usage(thinpool_lv_data):
data = thinpool_lv_data.get("data_percent")
if not data:
raise OpenShiftCheckException("no thinpool usage data returned by the host.")

return float(data)
# attempt to get the docker info hash from the API
info = self.execute_module("docker_info", {}, task_vars)
if info.get("failed"):
return {"failed": True, "changed": changed,
"msg": "Failed to query Docker API. Is docker running on this host?"}
if not info.get("info"): # this would be very strange
return {"failed": True, "changed": changed,
"msg": "Docker API query missing info:\n{}".format(json.dumps(info))}
info = info["info"]

# check if the storage driver we saw is valid
driver = info.get("Driver", "[NONE]")
if driver not in self.storage_drivers:
msg = (
"Detected unsupported Docker storage driver '{driver}'.\n"
"Supported storage drivers are: {drivers}"
).format(driver=driver, drivers=', '.join(self.storage_drivers))
return {"failed": True, "changed": changed, "msg": msg}

# driver status info is a list of tuples; convert to dict and validate based on driver
driver_status = {item[0]: item[1] for item in info.get("DriverStatus", [])}
if driver == "devicemapper":
if driver_status.get("Data loop file"):
msg = (
"Use of loopback devices with the Docker devicemapper storage driver\n"
"(the default storage configuration) is unsupported in production.\n"
"Please use docker-storage-setup to configure a backing storage volume.\n"
"See http://red.ht/2rNperO for further information."
)
return {"failed": True, "changed": changed, "msg": msg}
result = self._check_dm_usage(driver_status, task_vars)
result["changed"] = changed
return result

# TODO(lmeyer): determine how to check usage for overlay2

return {"changed": changed}

def _check_dm_usage(self, driver_status, task_vars):
"""
Backing assumptions: We expect devicemapper to be backed by an auto-expanding thin pool
implemented as an LV in an LVM2 VG. This is how docker-storage-setup currently configures
devicemapper storage. The LV is "thin" because it does not use all available storage
from its VG, instead expanding as needed; so to determine available space, we gather
current usage as the Docker API reports for the driver as well as space available for
expansion in the pool's VG.
Usage within the LV is divided into pools allocated to data and metadata, either of which
could run out of space first; so we check both.
"""
vals = dict(
vg_free=self._get_vg_free(driver_status.get("Pool Name"), task_vars),
data_used=driver_status.get("Data Space Used"),
data_total=driver_status.get("Data Space Total"),
metadata_used=driver_status.get("Metadata Space Used"),
metadata_total=driver_status.get("Metadata Space Total"),
)

# convert all human-readable strings to bytes
for key, value in vals.copy().items():
try:
vals[key + "_bytes"] = self._convert_to_bytes(value)
except ValueError as err: # unlikely to hit this from API info, but just to be safe
return {
"failed": True,
"values": vals,
"msg": "Could not interpret {} value '{}' as bytes: {}".format(key, value, str(err))
}

# determine the threshold percentages usage should not exceed
for name, default in [("data", self.max_thinpool_data_usage_percent),
("metadata", self.max_thinpool_meta_usage_percent)]:
percent = get_var(task_vars, "max_thinpool_" + name + "_usage_percent", default=default)
try:
vals[name + "_threshold"] = float(percent)
except ValueError:
return {
"failed": True,
"msg": "Specified thinpool {} usage limit '{}' is not a percentage".format(name, percent)
}

messages = []
for name in ["data", "metadata"]:
vals[name + "_pct_used"] = 100 * vals[name + "_used_bytes"] / (
vals[name + "_total_bytes"] + vals["vg_free_bytes"])
if vals[name + "_threshold"] < vals[name + "_pct_used"]:
vals["failed"] = True
messages.append((
"Docker thinpool {} usage percentage {:.1f} "
"is higher than threshold {:.1f}."
).format(name, vals[name + "_pct_used"], vals[name + "_threshold"]))
vals["msg"] = "\n".join(messages or ["Thinpool usage is within thresholds."])

return vals
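# Worked example with hypothetical values (not part of the diff):
#   data_used_bytes = 15 GiB, data_total_bytes = 20 GiB, vg_free_bytes = 80 GiB
#   data_pct_used   = 100 * 15 / (20 + 80) = 15.0
# 15.0 is below the default 90.0 data threshold, so (metadata permitting) the
# check returns msg = "Thinpool usage is within thresholds."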

def _get_vg_free(self, pool, task_vars):
# Determine which VG to examine according to the pool name, the only indicator currently
# available from the Docker API driver info. We assume a name that looks like
# "vg--name-docker--pool"; vg and lv names with inner hyphens doubled, joined by a hyphen.
match = re.match(r'((?:[^-]|--)+)-(?!-)', pool) # matches up to the first single hyphen
if not match: # unlikely, but... be clear if we assumed wrong
raise OpenShiftCheckException(
"This host's Docker reports it is using a storage pool named '{}'.\n"
"However this name does not have the expected format of 'vgname-lvname'\n"
"so the available storage in the VG cannot be determined.".format(pool)
)
vg_name = match.groups()[0].replace("--", "-")
vgs_cmd = "/sbin/vgs --noheadings -o vg_free --select vg_name=" + vg_name
# should return free space like " 12.00g" if the VG exists; empty if it does not

ret = self.execute_module("command", {"_raw_params": vgs_cmd}, task_vars)
if ret.get("failed") or ret.get("rc", 0) != 0:
raise OpenShiftCheckException(
"Is LVM installed? Failed to run /sbin/vgs "
"to determine docker storage usage:\n" + ret.get("msg", "")
)
size = ret.get("stdout", "").strip()
if not size:
raise OpenShiftCheckException(
"This host's Docker reports it is using a storage pool named '{pool}'.\n"
"which we expect to come from local VG '{vg}'.\n"
"However, /sbin/vgs did not find this VG. Is Docker for this host"
"running and using the storage on the host?".format(pool=pool, vg=vg_name)
)
return size
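# Example of the pool-name parsing above, with a hypothetical pool name
# (LVM mangles names by doubling any hyphens they contain):
#   pool = "docker--vg-docker--pool"
#   re.match(r'((?:[^-]|--)+)-(?!-)', pool).groups()[0]   # -> "docker--vg"
#   "docker--vg".replace("--", "-")                       # -> "docker-vg"
# so the check would run:
#   /sbin/vgs --noheadings -o vg_free --select vg_name=docker-vg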

@staticmethod
def get_thinpool_metadata_usage(thinpool_lv_data):
data = thinpool_lv_data.get("metadata_percent")
if not data:
raise OpenShiftCheckException("no thinpool usage data returned by the host.")

return float(data)

@staticmethod
def extract_thinpool_obj(thinpool_data):
if not thinpool_data or not thinpool_data[0]:
raise OpenShiftCheckException("no thinpool usage data returned by the host.")

lv_data = thinpool_data[0].get("lv")
if not lv_data or not lv_data[0]:
raise OpenShiftCheckException("no thinpool usage data returned by the host.")

return lv_data[0]

def exec_cmd(self, cmd_str, task_vars):
return self.execute_module("command", {
"_raw_params": cmd_str,
}, task_vars)

@staticmethod
def _convert_to_bytes(string):
units = dict(
b=1,
k=1024,
m=1024**2,
g=1024**3,
t=1024**4,
p=1024**5,
)
string = string or ""
match = re.match(r'(\d+(?:\.\d+)?)\s*(\w)?', string) # float followed by optional unit
if not match:
raise ValueError("Cannot convert to a byte size: " + string)

number, unit = match.groups()
multiplier = 1 if not unit else units.get(unit.lower())
if not multiplier:
raise ValueError("Cannot convert to a byte size: " + string)

return float(number) * multiplier
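# Usage examples (illustrative): the unit letter is matched case-insensitively
# and only the first letter is significant, so Docker's "MB"/"GB" strings work:
#   _convert_to_bytes("12.00g")    # -> 12 * 1024**3 = 12884901888.0
#   _convert_to_bytes("40.96 kB")  # -> 40.96 * 1024 = 41943.04
#   _convert_to_bytes("oops")      # raises ValueError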

This file was deleted.
