-
Notifications
You must be signed in to change notification settings - Fork 2.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Incorporated docker_storage_driver into docker_storage as both need driver info. Corrected storage calculation to include VG free space, not just the current amount in the LV pool. Now makes no assumptions about pool name. Improved user messaging. Factored out some methods that can be shared with docker_image_availability.
- Loading branch information
Showing
7 changed files
with
377 additions
and
441 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
243 changes: 157 additions & 86 deletions
243
roles/openshift_health_checker/openshift_checks/docker_storage.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,110 +1,181 @@ | ||
# pylint: disable=missing-docstring | ||
"""Check Docker storage driver and usage.""" | ||
import json | ||
|
||
import re | ||
from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var | ||
from openshift_checks.mixins import DockerHostMixin | ||
|
||
|
||
class DockerStorage(OpenShiftCheck): | ||
"""Check Docker storage sanity. | ||
class DockerStorage(DockerHostMixin, OpenShiftCheck): | ||
"""Check Docker storage driver compatibility. | ||
Check for thinpool usage during a containerized installation | ||
This check ensures that Docker is using a supported storage driver, | ||
and that loopback is not being used (if using devicemapper). | ||
Also that storage usage is not above threshold. | ||
""" | ||
|
||
name = "docker_storage" | ||
tags = ["preflight"] | ||
tags = ["pre-install", "health", "preflight"] | ||
|
||
dependencies = ["python-docker-py"] | ||
storage_drivers = ["devicemapper", "overlay2"] | ||
max_thinpool_data_usage_percent = 90.0 | ||
max_thinpool_meta_usage_percent = 90.0 | ||
|
||
@classmethod | ||
def is_active(cls, task_vars):
    """Only run on hosts that depend on Docker.

    A host is considered to depend on Docker when it is a containerized
    install or when it belongs to the "nodes" group.

    Args:
        task_vars: Ansible task variables for the host being checked.

    Returns:
        A truthy value when the check should run on this host.
    """
    is_containerized = get_var(task_vars, "openshift", "common", "is_containerized")
    is_node = "nodes" in get_var(task_vars, "group_names", default=[])
    # Parenthesised on purpose: `A and B or C` parses as `(A and B) or C`, which
    # let every node activate this check even when the parent classes' is_active
    # said no. NOTE(review): presumably the parent verdict covers things like
    # user-disabled checks -- confirm against OpenShiftCheck / DockerHostMixin.
    return super(DockerStorage, cls).is_active(task_vars) and (is_containerized or is_node)
|
||
# pylint: disable=too-many-return-statements | ||
# Reason: permanent stylistic exception; | ||
# it is clearer to return on failures and there are just many ways to fail here. | ||
def run(self, tmp, task_vars): | ||
try: | ||
self.max_thinpool_data_usage_percent = float(get_var(task_vars, "max_thinpool_data_usage_percent", | ||
default=self.max_thinpool_data_usage_percent)) | ||
self.max_thinpool_meta_usage_percent = float(get_var(task_vars, "max_thinpool_metadata_usage_percent", | ||
default=self.max_thinpool_meta_usage_percent)) | ||
except ValueError as err: | ||
msg, failed, changed = self.ensure_dependencies(task_vars) | ||
if failed: | ||
return { | ||
"failed": True, | ||
"msg": "Unable to convert thinpool data usage limit to float: {}".format(str(err)) | ||
"changed": changed, | ||
"msg": "Some dependencies are required in order to query docker storage on host:\n" + msg | ||
} | ||
|
||
err_msg = self.check_thinpool_usage(task_vars) | ||
if err_msg: | ||
return {"failed": True, "msg": err_msg} | ||
|
||
return {} | ||
|
||
def check_thinpool_usage(self, task_vars): | ||
lvs = self.get_lvs_data(task_vars) | ||
lv_data = self.extract_thinpool_obj(lvs) | ||
# attempt to get the docker info hash from the API | ||
info = self.execute_module("docker_info", {}, task_vars) | ||
if info.get("failed"): | ||
return {"failed": True, "changed": changed, | ||
"msg": "Failed to query Docker API. Is docker running on this host?"} | ||
if not info.get("info"): # this would be very strange | ||
return {"failed": True, "changed": changed, | ||
"msg": "Docker API query missing info:\n{}".format(json.dumps(info))} | ||
info = info["info"] | ||
|
||
# check if the storage driver we saw is valid | ||
driver = info.get("Driver", "[NONE]") | ||
if driver not in self.storage_drivers: | ||
msg = ( | ||
"Detected unsupported Docker storage driver '{driver}'.\n" | ||
"Supported storage drivers are: {drivers}" | ||
).format(driver=driver, drivers=', '.join(self.storage_drivers)) | ||
return {"failed": True, "changed": changed, "msg": msg} | ||
|
||
# driver status info is a list of tuples; convert to dict and validate based on driver | ||
driver_status = {item[0]: item[1] for item in info.get("DriverStatus", [])} | ||
if driver == "devicemapper": | ||
if self._is_dm_using_loopback_device(driver_status): | ||
msg = ( | ||
"Use of loopback devices with the Docker devicemapper storage driver\n" | ||
"(the default storage configuration) is unsupported in production.\n" | ||
"Please use docker-storage-setup to configure a backing storage volume.\n" | ||
"See http://red.ht/2rNperO for further information." | ||
) | ||
return {"failed": True, "changed": changed, "msg": msg} | ||
result = self._check_dm_usage(driver_status, task_vars) | ||
result["changed"] = changed | ||
return result | ||
|
||
# TODO(lmeyer): determine how to check usage for overlay2 | ||
|
||
return {"changed": changed} | ||
|
||
data_percent = self.get_thinpool_data_usage(lv_data) | ||
metadata_percent = self.get_thinpool_metadata_usage(lv_data) | ||
|
||
if data_percent > self.max_thinpool_data_usage_percent: | ||
msg = "thinpool data usage above maximum threshold of {threshold}%" | ||
return msg.format(threshold=self.max_thinpool_data_usage_percent) | ||
|
||
if metadata_percent > self.max_thinpool_meta_usage_percent: | ||
msg = "thinpool metadata usage above maximum threshold of {threshold}%" | ||
return msg.format(threshold=self.max_thinpool_meta_usage_percent) | ||
|
||
return "" | ||
|
||
def get_lvs_data(self, task_vars): | ||
lvs_cmd = "/sbin/lvs --select vg_name=docker --select lv_name=docker-pool --report-format json" | ||
result = self.exec_cmd(lvs_cmd, task_vars) | ||
|
||
if result.get("failed", False): | ||
msg = "no thinpool usage data returned by the host: {}" | ||
raise OpenShiftCheckException(msg.format(result.get("msg", ""))) | ||
@staticmethod | ||
def _is_dm_using_loopback_device(driver_status):
    """Tell whether the devicemapper driver status names a data loop file.

    Args:
        driver_status: dict built from the "DriverStatus" pairs of docker info.

    Returns:
        True when the "Data loop file" entry is present and non-empty,
        False otherwise.
    """
    loop_file = driver_status.get("Data loop file")
    return True if loop_file else False
|
||
def _check_dm_usage(self, driver_status, task_vars): | ||
""" | ||
For background: under thinpool-backed devicemapper, docker-storage-setup creates | ||
an LVM2 LV as a backing storage pool. The LV is "thin" because it does not use | ||
all available storage from its VG, instead expanding as needed; so in considering | ||
available space, it is necessary to include space remaining in the VG (which is | ||
unfortunately not reported by the storage driver in docker info). | ||
Usage of the LV is divided into space allocated to data and metadata (which also | ||
expand automatically), either of which could run out of space while the other is | ||
fine; so both must be checked. | ||
""" | ||
vals = dict( | ||
vg_free=self._get_vg_free(driver_status.get("Pool Name"), task_vars), | ||
data_used=driver_status.get("Data Space Used"), | ||
data_total=driver_status.get("Data Space Total"), | ||
metadata_used=driver_status.get("Metadata Space Used"), | ||
metadata_total=driver_status.get("Metadata Space Total"), | ||
) | ||
|
||
for key, value in vals.copy().items(): | ||
try: | ||
vals[key + "_bytes"] = self._convert_to_bytes(value) | ||
except ValueError as err: # unlikely to see this from our inputs | ||
return { | ||
"failed": True, | ||
"values": vals, | ||
"msg": "Could not interpret {} value '{}' as bytes: {}".format(key, value, str(err)) | ||
} | ||
|
||
try: | ||
data_json = json.loads(result.get("stdout", "")) | ||
vals["data_threshold"] = float(get_var(task_vars, "max_thinpool_data_usage_percent", | ||
default=self.max_thinpool_data_usage_percent)) | ||
vals["metadata_threshold"] = float(get_var(task_vars, "max_thinpool_metadata_usage_percent", | ||
default=self.max_thinpool_meta_usage_percent)) | ||
except ValueError as err: | ||
raise OpenShiftCheckException("Invalid JSON value returned by lvs command: {}".format(str(err))) | ||
|
||
data = data_json.get("report") | ||
if not data: | ||
raise OpenShiftCheckException("no thinpool usage data returned by the host.") | ||
|
||
return data | ||
|
||
@staticmethod | ||
def get_thinpool_data_usage(thinpool_lv_data): | ||
data = thinpool_lv_data.get("data_percent") | ||
if not data: | ||
raise OpenShiftCheckException("no thinpool usage data returned by the host.") | ||
|
||
return float(data) | ||
|
||
@staticmethod | ||
def get_thinpool_metadata_usage(thinpool_lv_data): | ||
data = thinpool_lv_data.get("metadata_percent") | ||
if not data: | ||
raise OpenShiftCheckException("no thinpool usage data returned by the host.") | ||
return { | ||
"failed": True, | ||
"msg": "Unable to convert thinpool data usage limit to float: {}".format(str(err)) | ||
} | ||
|
||
return float(data) | ||
vals["data_pct_used"] = 100 * vals["data_used_bytes"] / ( | ||
vals["data_total_bytes"] + vals["vg_free_bytes"]) | ||
vals["metadata_pct_used"] = 100 * vals["metadata_used_bytes"] / ( | ||
vals["metadata_total_bytes"] + vals["vg_free_bytes"]) | ||
msg = "" | ||
if vals["data_threshold"] < vals["data_pct_used"]: | ||
vals["failed"] = True | ||
msg = ( | ||
"Docker thinpool data usage percentage {:.1f} " | ||
"is higher than threshold {:.1f}." | ||
).format(vals["data_pct_used"], vals["data_threshold"]) | ||
if vals["metadata_threshold"] < vals["metadata_pct_used"]: | ||
vals["failed"] = True | ||
msg = msg + "\n" if msg else "" | ||
msg += ( | ||
"\nDocker thinpool metadata usage percentage {:.1f} " | ||
"is higher than threshold {:.1f}." | ||
).format(vals["metadata_pct_used"], vals["metadata_threshold"]) | ||
|
||
vals["msg"] = msg or "Thinpool usage is below thresholds." | ||
return vals | ||
|
||
def _get_vg_free(self, pool, task_vars):
    """Return the free space of the LVM volume group behind a docker thinpool.

    Args:
        pool: the device-mapper pool name (the caller passes the driver
            status "Pool Name" value), e.g. "foo--vg-docker--pool".
            May be None when that value is absent.
        task_vars: Ansible task variables, passed through to execute_module.

    Returns:
        The stripped stdout of /sbin/vgs as a string, e.g. "12.00g".

    Raises:
        OpenShiftCheckException: if the pool name cannot be parsed, if
            running /sbin/vgs fails, or if it reports nothing for the VG.
    """
    # pool looks like "foo--vg-docker--pool"; vg and lv joined by hyphen, inner hyphens doubled!
    # `pool or ""` makes a missing pool name fail as an invalid name below
    # instead of blowing up inside re.match with a TypeError.
    match = re.match(r'((?:[^-]|--)+)-(?!-)', pool or "")  # matches up to the first single hyphen
    if not match:
        raise OpenShiftCheckException(
            "Host's docker says it is using LVM storage pool with invalid name '{}'".format(pool)
        )
    vg_name = match.group(1).replace("--", "-")
    vgs_cmd = "/sbin/vgs --noheadings -o vg_free --select vg_name=" + vg_name
    # should return free space like "  12.00g" if the VG exists; empty if it does not

    ret = self.execute_module("command", {"_raw_params": vgs_cmd}, task_vars)
    if ret.get("failed") or ret.get("rc", 0) != 0:
        raise OpenShiftCheckException(
            "Is LVM installed? Failed to run /sbin/vgs "
            "to determine docker storage usage:\n" + ret.get("msg", "")
        )
    size = ret.get("stdout", "").strip()
    if not size:
        # Note the trailing space after "host": the adjacent literals are
        # concatenated verbatim and previously produced "hostrunning".
        raise OpenShiftCheckException(
            "Host's docker says it is using LVM storage pool '{pool}'\n"
            "which we expect to come from local VG '{vg}'.\n"
            "However, /sbin/vgs did not find this VG. Is docker for this host "
            "running and using the storage on the host?".format(pool=pool, vg=vg_name)
        )
    return size
|
||
@staticmethod | ||
def extract_thinpool_obj(thinpool_data):
    """Pull the first logical-volume record out of parsed lvs report data.

    Args:
        thinpool_data: sequence of report dicts; the first one is expected
            to hold a non-empty list under the "lv" key.

    Returns:
        The first element of the first report's "lv" list.

    Raises:
        OpenShiftCheckException: if the report list, its first entry, the
            "lv" list, or that list's first entry is missing or empty.
    """
    no_data_msg = "no thinpool usage data returned by the host."

    first_report = thinpool_data[0] if thinpool_data else None
    if not first_report:
        raise OpenShiftCheckException(no_data_msg)

    lv_list = first_report.get("lv")
    first_lv = lv_list[0] if lv_list else None
    if not first_lv:
        raise OpenShiftCheckException(no_data_msg)

    return first_lv
|
||
def exec_cmd(self, cmd_str, task_vars):
    """Run a raw command line through the "command" module.

    Args:
        cmd_str: the command line, passed as the module's "_raw_params".
        task_vars: Ansible task variables, passed through to execute_module.

    Returns:
        Whatever self.execute_module returns for the invocation.
    """
    module_args = {"_raw_params": cmd_str}
    return self.execute_module("command", module_args, task_vars)
def _convert_to_bytes(string):
    """Convert a human-readable size such as "10.3 G" or "12.00g" to bytes.

    Units are matched on their first letter, case-insensitively, and are
    treated as binary multiples (k = 1024, m = 1024**2, ...). A missing or
    unrecognised unit letter means the number is taken as plain bytes.

    Args:
        string: the size text; None or "" is treated as empty input.

    Returns:
        The size in bytes as a float.

    Raises:
        ValueError: if the text does not start with a number.
    """
    units = dict(
        k=1024,
        m=1024**2,
        g=1024**3,
        t=1024**4,
        p=1024**5,
    )
    string = string or ""
    # Optional leading whitespace and "<", then a float, then an optional unit.
    # The "<" is tolerated because newer LVM releases print it in front of
    # sizes that were rounded for display (e.g. "<12.00g").
    match = re.match(r'\s*<?\s*(\d+(?:\.\d+)?)\s*(\w)?', string)
    if not match:
        raise ValueError("Cannot convert to a byte size: " + string)
    number, unit = match.groups()
    # `unit` is None when no unit letter follows the number.
    return float(number) * units.get((unit or "").lower(), 1)
50 changes: 0 additions & 50 deletions
50
roles/openshift_health_checker/openshift_checks/docker_storage_driver.py
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.