diff --git a/etc/kayobe/ansible/scripts/generate_fixtures.py b/etc/kayobe/ansible/scripts/generate_fixtures.py new file mode 100644 index 000000000..5f8f7cc64 --- /dev/null +++ b/etc/kayobe/ansible/scripts/generate_fixtures.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python3 +import json +import re +from pySMART import DeviceList + +SMARTMON_ATTRS = { + "airflow_temperature_cel", + "command_timeout", + "current_pending_sector", + "end_to_end_error", + "erase_fail_count", + "g_sense_error_rate", + "hardware_ecc_recovered", + "host_reads_32mib", + "host_reads_mib", + "host_writes_32mib", + "host_writes_mib", + "load_cycle_count", + "media_wearout_indicator", + "nand_writes_1gib", + "offline_uncorrectable", + "power_cycle_count", + "power_on_hours", + "program_fail_cnt_total", + "program_fail_count", + "raw_read_error_rate", + "reallocated_event_count", + "reallocated_sector_ct", + "reported_uncorrect", + "runtime_bad_block", + "sata_downshift_count", + "seek_error_rate", + "spin_retry_count", + "spin_up_time", + "start_stop_count", + "temperature_case", + "temperature_celsius", + "temperature_internal", + "total_lbas_read", + "total_lbas_written", + "udma_crc_error_count", + "unsafe_shutdown_count", + "unused_rsvd_blk_cnt_tot", + "wear_leveling_count", + "workld_host_reads_perc", + "workld_media_wear_indic", + "workload_minutes", + "critical_warning", + "temperature", + "available_spare", + "available_spare_threshold", + "percentage_used", + "data_units_read", + "data_units_written", + "host_reads", + "host_writes", + "controller_busy_time", + "power_cycles", + "unsafe_shutdowns", + "media_errors", + "num_err_log_entries", + "warning_temp_time", + "critical_comp_time", +} + +DISK_INFO = { + "name", + "interface", + "vendor", + "family", + "model", + "serial", + "firmware", + "smart_capable", + "smart_enabled", + "assessment", +} + +def camel_to_snake(name): + """ + Convert a CamelCase string to snake_case. 
+ + Reference: https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case + """ + return re.sub(r'(? (c) 2012 -# source at: http://devel.dob.sk/collectd-scripts/ - -# TODO: This probably needs to be a little more complex. The raw numbers can have more -# data in them than you'd think. -# http://arstechnica.com/civis/viewtopic.php?p=22062211 - -# Formatting done via shfmt -i 2 -# https://github.com/mvdan/sh - -parse_smartctl_attributes_awk="$( - cat <<'SMARTCTLAWK' -$1 ~ /^ *[0-9]+$/ && $2 ~ /^[a-zA-Z0-9_-]+$/ { - gsub(/-/, "_"); - printf "%s_value{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $4 - printf "%s_worst{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $5 - printf "%s_threshold{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $6 - printf "%s_raw_value{%s,smart_id=\"%s\"} %e\n", $2, labels, $1, $10 -} -SMARTCTLAWK -)" - -smartmon_attrs="$( - cat <<'SMARTMONATTRS' -airflow_temperature_cel -command_timeout -current_pending_sector -end_to_end_error -erase_fail_count -g_sense_error_rate -hardware_ecc_recovered -host_reads_32mib -host_reads_mib -host_writes_32mib -host_writes_mib -load_cycle_count -media_wearout_indicator -nand_writes_1gib -offline_uncorrectable -power_cycle_count -power_on_hours -program_fail_cnt_total -program_fail_count -raw_read_error_rate -reallocated_event_count -reallocated_sector_ct -reported_uncorrect -runtime_bad_block -sata_downshift_count -seek_error_rate -spin_retry_count -spin_up_time -start_stop_count -temperature_case -temperature_celsius -temperature_internal -total_lbas_read -total_lbas_written -udma_crc_error_count -unsafe_shutdown_count -unused_rsvd_blk_cnt_tot -wear_leveling_count -workld_host_reads_perc -workld_media_wear_indic -workload_minutes -SMARTMONATTRS -)" -smartmon_attrs="$(echo "${smartmon_attrs}" | xargs | tr ' ' '|')" - -parse_smartctl_attributes() { - local disk="$1" - local disk_type="$2" - local serial="$3" - local 
labels="disk=\"${disk}\",type=\"${disk_type}\",serial_number=\"${serial}\"" - sed 's/^ \+//g' | - awk -v labels="${labels}" "${parse_smartctl_attributes_awk}" 2>/dev/null | - tr '[:upper:]' '[:lower:]' | - grep -E "(${smartmon_attrs})" -} - -parse_smartctl_scsi_attributes() { - local disk="$1" - local disk_type="$2" - local serial="$3" - local labels="disk=\"${disk}\",type=\"${disk_type}\",serial_number=\"${serial}\"" - while read -r line; do - attr_type="$(echo "${line}" | tr '=' ':' | cut -f1 -d: | sed 's/^ \+//g' | tr ' ' '_')" - attr_value="$(echo "${line}" | tr '=' ':' | cut -f2 -d: | sed 's/^ \+//g')" - case "${attr_type}" in - number_of_hours_powered_up_) power_on="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;; - Current_Drive_Temperature) temp_cel="$(echo "${attr_value}" | cut -f1 -d' ' | awk '{ printf "%e\n", $1 }')" ;; - Blocks_sent_to_initiator_) lbas_read="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;; - Blocks_received_from_initiator_) lbas_written="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;; - Accumulated_start-stop_cycles) power_cycle="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;; - Elements_in_grown_defect_list) grown_defects="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;; - esac - done - [ -n "$power_on" ] && echo "power_on_hours_raw_value{${labels},smart_id=\"9\"} ${power_on}" - [ -n "$temp_cel" ] && echo "temperature_celsius_raw_value{${labels},smart_id=\"194\"} ${temp_cel}" - [ -n "$lbas_read" ] && echo "total_lbas_read_raw_value{${labels},smart_id=\"242\"} ${lbas_read}" - [ -n "$lbas_written" ] && echo "total_lbas_written_raw_value{${labels},smart_id=\"241\"} ${lbas_written}" - [ -n "$power_cycle" ] && echo "power_cycle_count_raw_value{${labels},smart_id=\"12\"} ${power_cycle}" - [ -n "$grown_defects" ] && echo "grown_defects_count_raw_value{${labels},smart_id=\"-1\"} ${grown_defects}" -} - -parse_smartctl_info() { - shopt -s nocasematch - local -i smart_available=0 smart_enabled=0 
smart_healthy= - local disk="$1" disk_type="$2" - local model_family='' device_model='' serial_number='' fw_version='' vendor='' product='' revision='' lun_id='' - while read -r line; do - info_type="$(echo "${line}" | cut -f1 -d: | tr ' ' '_')" - info_value="$(echo "${line}" | cut -f2- -d: | sed 's/^ \+//g' | sed 's/"/\\"/')" - case "${info_type}" in - Model_Family) model_family="${info_value}" ;; - Device_Model) device_model="${info_value}" ;; - Serial_Number) serial_number="$(echo ${info_value} | tr '[:upper:]' '[:lower:]')" ;; - Firmware_Version) fw_version="${info_value}" ;; - Vendor) vendor="${info_value}" ;; - Product) product="${info_value}" ;; - Revision) revision="${info_value}" ;; - Logical_Unit_id) lun_id="${info_value}" ;; - esac - if [[ "${info_type}" == 'SMART_support_is' ]]; then - case "${info_value:0:7}" in - Enabled) smart_available=1; smart_enabled=1 ;; - Availab) smart_available=1; smart_enabled=0 ;; - Unavail) smart_available=0; smart_enabled=0 ;; - esac - fi - if [[ "${info_type}" == 'SMART_overall-health_self-assessment_test_result' ]]; then - case "${info_value:0:6}" in - PASSED) smart_healthy=1 ;; - *) smart_healthy=0 ;; - esac - elif [[ "${info_type}" == 'SMART_Health_Status' ]]; then - case "${info_value:0:2}" in - OK) smart_healthy=1 ;; - *) smart_healthy=0 ;; - esac - fi - done - echo "device_info{disk=\"${disk}\",type=\"${disk_type}\",vendor=\"${vendor}\",product=\"${product}\",revision=\"${revision}\",lun_id=\"${lun_id}\",model_family=\"${model_family}\",device_model=\"${device_model}\",serial_number=\"${serial_number}\",firmware_version=\"${fw_version}\"} 1" - echo "device_smart_available{disk=\"${disk}\",type=\"${disk_type}\",serial_number=\"${serial_number}\"} ${smart_available}" - [[ "${smart_available}" == "1" ]] && echo "device_smart_enabled{disk=\"${disk}\",type=\"${disk_type}\",serial_number=\"${serial_number}\"} ${smart_enabled}" - [[ "${smart_available}" == "1" ]] && [[ "${smart_healthy}" != "" ]] && echo 
"device_smart_healthy{disk=\"${disk}\",type=\"${disk_type}\",serial_number=\"${serial_number}\"} ${smart_healthy}" -} - -output_format_awk="$( - cat <<'OUTPUTAWK' -BEGIN { v = "" } -v != $1 { - print "# HELP smartmon_" $1 " SMART metric " $1; - print "# TYPE smartmon_" $1 " gauge"; - v = $1 -} -{print "smartmon_" $0} -OUTPUTAWK -)" - -format_output() { - sort | - awk -F'{' "${output_format_awk}" -} - -smartctl_version="$(/usr/sbin/smartctl -V | head -n1 | awk '$1 == "smartctl" {print $2}')" - -echo "smartctl_version{version=\"${smartctl_version}\"} 1" | format_output - -if [[ "$(expr "${smartctl_version}" : '\([0-9]*\)\..*')" -lt 6 ]]; then - exit -fi - -device_list="$(/usr/sbin/smartctl --scan-open | awk '/^\/dev/{print $1 "|" $3}')" - -for device in ${device_list}; do - disk="$(echo "${device}" | cut -f1 -d'|')" - type="$(echo "${device}" | cut -f2 -d'|')" - # Use REGEX to extract the serial number from the parsed information and save that to a variable - serial_number="$(/usr/sbin/smartctl -i -H -d "${type}" "${disk}" | parse_smartctl_info "${disk}" "${type}"| sed -E ':a;N;$!ba;s/.*serial_number=\"([^"]+)\".*/\1/g' | sed -E 's/^device_info\{.*//g')" - active=1 - echo "smartctl_run{disk=\"${disk}\",type=\"${type}\"}" "$(TZ=UTC date '+%s')" - # Check if the device is in a low-power mode - /usr/sbin/smartctl -n standby -d "${type}" "${disk}" > /dev/null || active=0 - echo "device_active{disk=\"${disk}\",type=\"${type}\"}" "${active}" - # Skip further metrics to prevent the disk from spinning up - test ${active} -eq 0 && continue - # Get the SMART information and health - /usr/sbin/smartctl -i -H -d "${type}" "${disk}" | parse_smartctl_info "${disk}" "${type}" - # Get the SMART attributes - case ${type} in - sat) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_attributes "${disk}" "${type}" "${serial_number}" ;; - sat+megaraid*) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_attributes "${disk}" "${type}" "${serial_number}" ;; - scsi) 
import glob
import json
import os
import unittest
import tempfile
import math
from time import sleep

from unittest.mock import patch, MagicMock
from smartmon import (
    parse_device_info,
    parse_if_attributes,
    main,
    SMARTMON_ATTRS,
    camel_to_snake,
    write_metrics_to_textfile,
)


def load_json_fixture(filename):
    """
    Load a JSON fixture from the 'tests' subfolder next to this file.

    Args:
        filename: Base name of the fixture (e.g. "nvme.json").

    Returns:
        The decoded JSON document.
    """
    path = os.path.join(os.path.dirname(__file__), "tests", filename)
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


class TestSmartMon(unittest.TestCase):
    """
    Fixture-driven tests for the smartmon metric helpers and main().

    Every *.json file under ./tests/ is treated as one device description
    with a "device_info" object and an optional "if_attributes" object.
    """

    @classmethod
    def setUpClass(cls):
        """
        Collect all *.json fixture files from ./tests/.

        Raises:
            FileNotFoundError: If no fixture is found. Without this check
                every fixture loop below would run zero iterations and the
                suite would pass without testing anything.
        """
        data_folder = os.path.join(os.path.dirname(__file__), "tests")
        # Sorted so sub-tests run in a stable, reproducible order.
        cls.fixture_files = sorted(glob.glob(os.path.join(data_folder, "*.json")))
        if not cls.fixture_files:
            raise FileNotFoundError(f"No JSON fixtures found in {data_folder}")

    @staticmethod
    def _has_metric(metrics, name, fragments, suffix=None):
        """
        Return True if some line in 'metrics' starts with '<name>{', contains
        every string in 'fragments' and, when 'suffix' is given, ends with it.
        """
        prefix = f"{name}{{"
        return any(
            line.startswith(prefix)
            and all(fragment in line for fragment in fragments)
            and (suffix is None or line.endswith(suffix))
            for line in metrics
        )

    def create_mock_device_from_json(self, device_info, if_attributes=None):
        """
        Build a MagicMock that mimics a pySMART Device object.

        Args:
            device_info: Dict of device fields; missing keys fall back to
                "" (or False for the two smart_* flags).
            if_attributes: Optional dict of interface attributes. Each key
                becomes an attribute on a plain object. When empty or None,
                device.if_attributes is set to None.

        Returns:
            The configured MagicMock.
        """
        device = MagicMock()
        device.name = device_info.get("name", "")
        device.interface = device_info.get("interface", "")
        device.vendor = device_info.get("vendor", "")
        device.family = device_info.get("family", "")
        device.model = device_info.get("model", "")
        device.serial = device_info.get("serial", "")
        device.firmware = device_info.get("firmware", "")
        device.smart_capable = device_info.get("smart_capable", False)
        device.smart_enabled = device_info.get("smart_enabled", False)
        device.assessment = device_info.get("assessment", "")

        if if_attributes:
            # A plain class (not MagicMock) so that only the fixture's keys
            # exist as instance attributes.
            class IfAttributesMock:
                pass

            if_mock = IfAttributesMock()
            for key, val in if_attributes.items():
                setattr(if_mock, key, val)
            device.if_attributes = if_mock
        else:
            device.if_attributes = None

        return device

    def _test_parse_device_info(self, fixture_name):
        """
        Check parse_device_info() output for a single JSON fixture.
        """
        data = load_json_fixture(fixture_name)
        device_info = data["device_info"]

        device = self.create_mock_device_from_json(device_info)
        metrics = parse_device_info(device)

        dev_name = device_info["name"]
        dev_iface = device_info["interface"]
        dev_serial = device_info["serial"].lower()

        disk_label = f'disk="{dev_name}"'
        type_label = f'type="{dev_iface}"'
        serial_label = f'serial_number="{dev_serial}"'

        # The device_info line should exist for every device
        self.assertTrue(
            self._has_metric(
                metrics,
                "smartmon_device_info",
                (disk_label, type_label, serial_label),
            ),
            f"Expected a smartmon_device_info metric line for {dev_name} but didn't find it."
        )

        # If smart_capable is true, we expect device_smart_available = 1
        if device_info.get("smart_capable"):
            self.assertTrue(
                self._has_metric(
                    metrics,
                    "smartmon_device_smart_available",
                    (disk_label, serial_label),
                    suffix=" 1.0",
                ),
                f"Expected smartmon_device_smart_available=1.0 for {dev_name}, not found."
            )

        # If smart_enabled is true, we expect device_smart_enabled = 1
        if device_info.get("smart_enabled"):
            self.assertTrue(
                self._has_metric(
                    metrics,
                    "smartmon_device_smart_enabled",
                    (disk_label,),
                    suffix=" 1.0",
                ),
                f"Expected smartmon_device_smart_enabled=1.0 for {dev_name}, not found."
            )

        # device_smart_healthy if assessment in [PASS, WARN, FAIL]
        # PASS => 1, otherwise => 0
        # "or ''" tolerates a fixture whose assessment is an explicit null.
        assessment = (device_info.get("assessment") or "").upper()
        if assessment in ["PASS", "WARN", "FAIL"]:
            expected_val = float(1) if assessment == "PASS" else float(0)
            self.assertTrue(
                self._has_metric(
                    metrics,
                    "smartmon_device_smart_healthy",
                    (disk_label,),
                    suffix=f" {expected_val}",
                ),
                f"Expected smartmon_device_smart_healthy={expected_val} for {dev_name}, not found."
            )

    def test_parse_device_info(self):
        """
        Test parse_device_info() for every JSON fixture in ./tests/.
        Each fixture runs as its own sub-test for clear error reporting.
        """
        for fixture_path in self.fixture_files:
            fixture_name = os.path.basename(fixture_path)
            with self.subTest(fixture=fixture_name):
                self._test_parse_device_info(fixture_name)

    def _test_parse_if_attributes(self, fixture_name):
        """
        Check parse_if_attributes() output for a single JSON fixture.
        """
        data = load_json_fixture(fixture_name)
        device_info = data["device_info"]
        if_attrs = data.get("if_attributes", {})

        device = self.create_mock_device_from_json(device_info, if_attrs)
        metrics = parse_if_attributes(device)

        dev_name = device_info["name"]
        dev_iface = device_info["interface"]
        dev_serial = device_info["serial"].lower()
        labels = f"{{disk=\"{dev_name}\",serial_number=\"{dev_serial}\",type=\"{dev_iface}\"}}"

        for attr_key, attr_val in if_attrs.items():
            snake_key = camel_to_snake(attr_key)
            series = f"smartmon_{snake_key}{labels}"
            is_numeric = isinstance(attr_val, (int, float))

            if is_numeric and snake_key in SMARTMON_ATTRS:
                # Numeric and allowed: the exact sample line must be present.
                expected_line = f"{series} {float(attr_val)}"
                self.assertIn(
                    expected_line,
                    metrics,
                    f"Expected metric '{expected_line}' for attribute '{attr_key}' not found."
                )
            elif is_numeric:
                # Numeric but not in SMARTMON_ATTRS: that name+value line
                # must not be present.
                unexpected_line = f"{series} {float(attr_val)}"
                self.assertNotIn(
                    unexpected_line,
                    metrics,
                    f"Unexpected metric '{unexpected_line}' found for {attr_key}."
                )
            else:
                # Non-numeric values cannot go through float() (it would
                # raise), so check that no sample exists for this series.
                self.assertFalse(
                    any(line.startswith(f"{series} ") for line in metrics),
                    f"Unexpected metric for non-numeric attribute {attr_key}."
                )

        # Also ensure that disallowed attributes do not appear.
        # For instance "notInSmartmonAttrs" should never appear.
        for line in metrics:
            self.assertNotIn(
                "not_in_smartmon_attrs",
                line,
                f"'notInSmartmonAttrs' attribute unexpectedly found in metric line: {line}"
            )

    def test_parse_if_attributes(self):
        """
        Test parse_if_attributes() for every JSON fixture in ./tests/.
        Each fixture runs as its own sub-test for clear error reporting.
        """
        for fixture_path in self.fixture_files:
            fixture_name = os.path.basename(fixture_path)
            with self.subTest(fixture=fixture_name):
                self._test_parse_if_attributes(fixture_name)

    @patch("smartmon.run_command")
    @patch("smartmon.DeviceList")
    @patch("smartmon.write_metrics_to_textfile", wraps=write_metrics_to_textfile)
    def test_main(self, mock_write_metrics, mock_devicelist_class, mock_run_cmd):
        """
        End-to-end test of main() for every JSON fixture in ./tests/.

        For each fixture a single mocked device is returned by DeviceList().
        main() writes to a file in a temporary directory. The test checks
        that write_metrics_to_textfile is called once and that the file
        holds every metric produced by the parse functions plus the
        smartctl version metric.
        """

        # Patch run_command to return a version & "active" power_mode
        def run_command_side_effect(cmd, parse_json=False):
            if "--version" in cmd:
                return "smartctl 7.3 5422 [x86_64-linux-5.15.0]\n..."
            if "-n" in cmd and "standby" in cmd and parse_json:
                return {"power_mode": "active"}
            return ""

        mock_run_cmd.side_effect = run_command_side_effect

        for fixture_path in self.fixture_files:
            fixture_name = os.path.basename(fixture_path)
            with self.subTest(msg=f"Testing main() with {fixture_name}"):
                mock_write_metrics.reset_mock()
                data = load_json_fixture(fixture_name)
                device_info = data["device_info"]
                if_attrs = data.get("if_attributes", {})

                # Mock a single device from the fixture
                device_mock = self.create_mock_device_from_json(device_info, if_attrs)

                # Make DeviceList() return our single mock device
                mock_dev_list = MagicMock()
                mock_dev_list.devices = [device_mock]
                mock_devicelist_class.return_value = mock_dev_list

                # A temporary directory works on every supported Python;
                # NamedTemporaryFile(delete_on_close=...) needs Python 3.12+.
                with tempfile.TemporaryDirectory() as tmpdir:
                    path = os.path.join(tmpdir, "smartmon.prom")
                    main(output_path=path)

                    # Ensure write_metrics_to_textfile was called once
                    self.assertEqual(mock_write_metrics.call_count, 1)

                    # Keep only sample lines; drop blanks and '#' comments.
                    with open(path, "r", encoding="utf-8") as f:
                        metrics_lines = [
                            line.strip()
                            for line in f
                            if line.strip() and not line.startswith("#")
                        ]

                # Check that the output file is not empty
                self.assertTrue(metrics_lines, "Metrics output file is empty.")

                # Generate expected metrics using the parse functions
                expected_metrics = []
                expected_metrics.extend(parse_device_info(device_mock))
                expected_metrics.extend(parse_if_attributes(device_mock))

                # Check that all expected metrics are present in the file.
                # Values are compared numerically, not as text.
                for expected in expected_metrics:
                    exp_metric, exp_val_str = expected.rsplit(" ", 1)
                    exp_val = float(exp_val_str)
                    found = any(
                        (exp_metric in line) and
                        math.isclose(float(line.rsplit(" ", 1)[1]), exp_val)
                        for line in metrics_lines
                    )
                    self.assertTrue(found, f"Expected metric '{expected}' not found")

                # Check that smartctl_version metric is present
                version_found = any(
                    line.startswith("smartmon_smartctl_version{") for line in metrics_lines
                )
                self.assertTrue(
                    version_found,
                    "Expected 'smartmon_smartctl_version' metric not found in output file."
                )


if __name__ == "__main__":
    unittest.main()
b/etc/kayobe/ansible/scripts/tests/nvme.json new file mode 100644 index 000000000..bbff19ec0 --- /dev/null +++ b/etc/kayobe/ansible/scripts/tests/nvme.json @@ -0,0 +1,24 @@ +{ + "device_info": { + "name": "/dev/nvme0", + "interface": "nvme", + "vendor": "AcmeCorp", + "family": "Acme NVMe Family", + "model": "Acme NVMe 1TB", + "serial": "ABCD1234", + "firmware": "3.0.1", + "smart_capable": true, + "smart_enabled": true, + "assessment": "PASS" + }, + "if_attributes": { + "criticalWarning": 0, + "temperature": 36, + "availableSpare": 100, + "availableSpareThreshold": 10, + "percentageUsed": 0, + "dataUnitsRead": 117446405, + "dataUnitsWritten": 84630284, + "notInSmartmonAttrs": 999 + } +} diff --git a/etc/kayobe/ansible/smartmon-tools.yml b/etc/kayobe/ansible/smartmon-tools.yml index 00cdfa495..351ce0325 100644 --- a/etc/kayobe/ansible/smartmon-tools.yml +++ b/etc/kayobe/ansible/smartmon-tools.yml @@ -13,6 +13,30 @@ state: present become: true + - name: Ensure Python 3, venv, and pip are installed + ansible.builtin.package: + name: + - python3 + - python3-venv + - python3-pip + state: present + become: true + + - name: Create smartmon Python virtual environment + ansible.builtin.command: + cmd: python3 -m venv /opt/smartmon-venv + creates: /opt/smartmon-venv/bin/activate + become: true + + - name: Install prometheus_client and pySMART in venv + ansible.builtin.pip: + name: + - prometheus_client + - pySMART + virtualenv: /opt/smartmon-venv + virtualenv_python: python3 + become: true + - name: Ensure the cron/crond service is running ansible.builtin.service: name: "{{ 'cron' if ansible_facts['distribution'] == 'Ubuntu' else 'crond' }}" @@ -20,15 +44,15 @@ enabled: true become: true - - name: Copy smartmon.sh and nvmemon.sh from scripts folder + - name: Copy smartmon.py and nvmemon.sh from scripts folder ansible.builtin.copy: src: scripts/{{ item }} - dest: /usr/local/bin/ + dest: /usr/local/bin/{{ item }} owner: root group: root mode: "0700" loop: - - smartmon.sh + - 
smartmon.py - nvmemon.sh become: true @@ -40,16 +64,39 @@ job: /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin become: true - - name: Schedule cronjob to run both scripts every 5 minutes and save output to file + - name: Schedule cronjob to run smartmon.py every 5 minutes and save output to file ansible.builtin.cron: - name: SMART metrics for drive monitoring using {{ item }} + name: SMART metrics for drive monitoring using smartmon.py + user: root + minute: "*/5" + job: >- + umask 0022 && /opt/smartmon-venv/bin/python /usr/local/bin/smartmon.py --output /var/lib/docker/volumes/textfile/_data/smartmon.prom.temp && + mv -f /var/lib/docker/volumes/textfile/_data/smartmon.prom.temp /var/lib/docker/volumes/textfile/_data/smartmon.prom + become: true + + - name: Schedule cronjob to run nvmemon.sh every 5 minutes and save output to file + ansible.builtin.cron: + name: SMART metrics for drive monitoring using nvmemon.sh user: root minute: "*/5" job: >- - umask 0022 && /usr/local/bin/{{ item }}.sh > - /var/lib/docker/volumes/textfile/_data/{{ item }}.prom.temp && - mv -f /var/lib/docker/volumes/textfile/_data/{{ item }}.prom.temp /var/lib/docker/volumes/textfile/_data/{{ item }}.prom + umask 0022 && /usr/local/bin/nvmemon.sh > + /var/lib/docker/volumes/textfile/_data/nvmemon.prom.temp && + mv -f /var/lib/docker/volumes/textfile/_data/nvmemon.prom.temp /var/lib/docker/volumes/textfile/_data/nvmemon.prom + become: true + + - name: Remove old cronjobs if present + ansible.builtin.cron: + name: SMART metrics for drive monitoring using {{ item }} + user: root + state: absent + become: true loop: - smartmon - nvmemon + + - name: Remove old smartmon.sh if present + ansible.builtin.file: + path: /usr/local/bin/smartmon.sh + state: absent become: true