From d0063e7d05747e21e6c4d14aad5f461709f15b57 Mon Sep 17 00:00:00 2001 From: technowhizz <7688823+technowhizz@users.noreply.github.com> Date: Fri, 11 Oct 2024 11:11:53 +0100 Subject: [PATCH 01/16] Convert smartmon script to python --- etc/kayobe/ansible/scripts/smartmon.py | 156 +++++++++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 etc/kayobe/ansible/scripts/smartmon.py diff --git a/etc/kayobe/ansible/scripts/smartmon.py b/etc/kayobe/ansible/scripts/smartmon.py new file mode 100644 index 000000000..2a50c9187 --- /dev/null +++ b/etc/kayobe/ansible/scripts/smartmon.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python3 + +import subprocess +import json +from datetime import datetime + +SMARTCTL_PATH = "/usr/sbin/smartctl" + +def run_command(command, parse_json=False): + result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + if parse_json: + return json.loads(result.stdout) + else: + return result.stdout.strip() + +def parse_smartctl_attributes(disk, disk_type, serial, json_data): + labels = f'disk="{disk}",type="{disk_type}",serial_number="{serial}"' + metrics = [] + smartmon_attrs = set([ + "airflow_temperature_cel", "command_timeout", "current_pending_sector", "end_to_end_error", "erase_fail_count", + "g_sense_error_rate", "hardware_ecc_recovered", "host_reads_32mib", "host_reads_mib", "host_writes_32mib", + "host_writes_mib", "load_cycle_count", "media_wearout_indicator", "nand_writes_1gib", "offline_uncorrectable", + "power_cycle_count", "power_on_hours", "program_fail_cnt_total", "program_fail_count", "raw_read_error_rate", + "reallocated_event_count", "reallocated_sector_ct", "reported_uncorrect", "runtime_bad_block", "sata_downshift_count", + "seek_error_rate", "spin_retry_count", "spin_up_time", "start_stop_count", "temperature_case", "temperature_celsius", + "temperature_internal", "total_lbas_read", "total_lbas_written", "udma_crc_error_count", "unsafe_shutdown_count", + "unused_rsvd_blk_cnt_tot", "wear_leveling_count", "workld_host_reads_perc", "workld_media_wear_indic", "workload_minutes", + "critical_warning", "temperature", "available_spare", "available_spare_threshold", "percentage_used", + "data_units_read", "data_units_written", "host_reads", "host_writes", "controller_busy_time", + "power_cycles", "unsafe_shutdowns", "media_errors", "num_err_log_entries", + "warning_temp_time", "critical_comp_time" + ]) + if 'nvme_smart_health_information_log' in json_data: + smart_log = json_data['nvme_smart_health_information_log'] + for attr_name, value in smart_log.items(): + attr_name = attr_name.replace(' ', '_').lower() + if attr_name in smartmon_attrs: + metrics.append(f"{attr_name}{{{labels}}} {value}") + elif 'scsi_grown_defect_list' in json_data: + scsi_attrs = json_data.get('scsi_grown_defect_list', {}) + for attr_name, value in scsi_attrs.items(): + attr_name = attr_name.replace(' ', '_').lower() + if attr_name in smartmon_attrs: + metrics.append(f"{attr_name}{{{labels}}} {value}") + elif 'ata_smart_attributes' in json_data and 'table' in json_data['ata_smart_attributes']: + for attr in json_data['ata_smart_attributes']['table']: + attr_name = attr['name'].replace('-', '_').lower() + if attr_name in smartmon_attrs: + attr_id = attr.get('id', '') + value = attr.get('value', '') + worst = attr.get('worst', '') + threshold = attr.get('thresh', '') + raw_value = attr.get('raw', {}).get('value', '') + metrics.append(f"{attr_name}_value{{{labels},smart_id=\"{attr_id}\"}} {value}") + metrics.append(f"{attr_name}_worst{{{labels},smart_id=\"{attr_id}\"}} {worst}") + metrics.append(f"{attr_name}_threshold{{{labels},smart_id=\"{attr_id}\"}} {threshold}") + metrics.append(f"{attr_name}_raw_value{{{labels},smart_id=\"{attr_id}\"}} {raw_value}") + return metrics + +def parse_smartctl_info(disk, disk_type, json_data): + info = json_data.get('device', {}) + smart_status = json_data.get('smart_status', {}) + labels = { + 'disk': disk, + 'type': disk_type, + 'vendor': info.get('vendor', ''), + 'product': info.get('product', ''), + 'revision': info.get('revision', ''), + 'lun_id': info.get('lun_id', ''), + 'model_family': json_data.get('model_family', ''), + 'device_model': json_data.get('model_name', ''), + 'serial_number': json_data.get('serial_number', '').lower(), + 'firmware_version': json_data.get('firmware_version', '') + } + label_str = ','.join(f'{k}="{v}"' for k, v in labels.items()) + metrics = [ + f'device_info{{{label_str}}} 1', + f'device_smart_available{{disk="{disk}",type="{disk_type}",serial_number="{labels["serial_number"]}"}} {1 if smart_status.get("available", False) else 0}', + ] + if smart_status.get("available", False): + metrics.append(f'device_smart_enabled{{disk="{disk}",type="{disk_type}",serial_number="{labels["serial_number"]}"}} {1 if smart_status.get("enabled", False) else 0}') + if 'passed' in smart_status: + metrics.append(f'device_smart_healthy{{disk="{disk}",type="{disk_type}",serial_number="{labels["serial_number"]}"}} {1 if smart_status.get("passed", False) else 0}') + return metrics + +def format_output(metrics): + output = [] + last_metric = "" + for metric in sorted(metrics): + metric_name = metric.split('{')[0] + if metric_name != last_metric: + output.append(f"# HELP smartmon_{metric_name} SMART metric {metric_name}") + output.append(f"# TYPE smartmon_{metric_name} gauge") + last_metric = metric_name + output.append(f"smartmon_{metric}") + return '\n'.join(output) + +def main(): + try: + version_output = run_command([SMARTCTL_PATH, '-j'], parse_json=True) + smartctl_version_list = version_output.get('smartctl', {}).get('version', []) + if smartctl_version_list: + smartctl_version_str = '.'.join(map(str, smartctl_version_list)) + else: + smartctl_version_str = "unknown" + except json.JSONDecodeError: + smartctl_version_str = "unknown" + metrics = [f'smartctl_version{{version="{smartctl_version_str}"}} 1'] + + try: + device_list_output = run_command([SMARTCTL_PATH, '--scan-open', '-j'], parse_json=True) + devices = [] + for device in device_list_output.get('devices', []): + disk = device.get('name', '') + disk_type = device.get('type', 'auto') + if disk: + devices.append((disk, disk_type)) + except json.JSONDecodeError: + devices = [] + + for disk, disk_type in devices: + serial_number = '' + active = 1 + metrics.append(f'smartctl_run{{disk="{disk}",type="{disk_type}"}} {int(datetime.utcnow().timestamp())}') + + try: + standby_output = run_command([SMARTCTL_PATH, '-n', 'standby', '-d', disk_type, '-j', disk], parse_json=True) + power_mode = standby_output.get('power_mode', '') + if power_mode == 'standby': + active = 0 + except json.JSONDecodeError: + active = 0 # Assume device is inactive if we can't parse the output + + metrics.append(f'device_active{{disk="{disk}",type="{disk_type}"}} {active}') + + if active == 0: + continue + + try: + info_output = run_command([SMARTCTL_PATH, '-i', '-H', '-d', disk_type, '-j', disk], parse_json=True) + except json.JSONDecodeError: + continue + metrics.extend(parse_smartctl_info(disk, disk_type, info_output)) + serial_number = info_output.get('serial_number', '').lower() + + try: + attributes_output = run_command([SMARTCTL_PATH, '-A', '-d', disk_type, '-j', disk], parse_json=True) + except json.JSONDecodeError: + continue + metrics.extend(parse_smartctl_attributes(disk, disk_type, serial_number, attributes_output)) + + formatted_output = format_output(metrics) + print(formatted_output) + +if __name__ == "__main__": + main() From 46216b54c41ea412a8d5c9e1d13d057b3e12cdd4 Mon Sep 17 00:00:00 2001 From: technowhizz <7688823+technowhizz@users.noreply.github.com> Date: Fri, 11 Oct 2024 11:12:32 +0100 Subject: [PATCH 02/16] Create tests for smartmon --- etc/kayobe/ansible/scripts/test_smartmon.py | 265 ++++++++++++++++++++ 1 file changed, 265 insertions(+) create mode 100644 etc/kayobe/ansible/scripts/test_smartmon.py diff --git a/etc/kayobe/ansible/scripts/test_smartmon.py b/etc/kayobe/ansible/scripts/test_smartmon.py new file mode 100644 index 000000000..a771a7ee6 --- /dev/null +++ b/etc/kayobe/ansible/scripts/test_smartmon.py @@ -0,0 +1,265 @@ +import unittest +from unittest.mock import patch +from smartmon import ( + parse_smartctl_info, + parse_smartctl_attributes, + main, +) + +class TestSmartMon(unittest.TestCase): + @patch('smartmon.run_command') + def test_parse_smartctl_info(self, mock_run_command): + devices_info = [ + { + 'disk': '/dev/nvme0', + 'disk_type': 'nvme', + 'json_output': { + 'device': { + 'name': '/dev/nvme0', + 'info_name': '/dev/nvme0', + 'type': 'nvme', + 'protocol': 'NVMe', + }, + 'model_name': 'Dell Ent NVMe CM6 RI 7.68TB', + 'serial_number': 'Y2Q0A0BGTCF8', + 'firmware_version': '2.2.0', + 'smart_status': { + 'passed': True, + 'available': True, + 'enabled': True + }, + } + }, + { + 'disk': '/dev/nvme1', + 'disk_type': 'nvme', + 'json_output': { + 'device': { + 'name': '/dev/nvme1', + 'info_name': '/dev/nvme1', + 'type': 'nvme', + 'protocol': 'NVMe', + }, + 'model_name': 'Dell Ent NVMe CM6 RI 7.68TB', + 'serial_number': 'Y2Q0A09PTCF8', + 'firmware_version': '2.2.0', + 'smart_status': { + 'passed': True, + 'available': True, + 'enabled': True + }, + } + }, + ] + + for device_info in devices_info: + disk = device_info['disk'] + disk_type = device_info['disk_type'] + json_output = device_info['json_output'] + serial_number = json_output.get('serial_number', '').lower() + + expected_metrics = [ + f'device_info{{disk="{disk}",type="{disk_type}",vendor="",product="",revision="",lun_id="",model_family="",device_model="{json_output.get("model_name", "")}",serial_number="{serial_number}",firmware_version="{json_output.get("firmware_version", "")}"}} 1', + f'device_smart_available{{disk="{disk}",type="{disk_type}",serial_number="{serial_number}"}} 1', + f'device_smart_enabled{{disk="{disk}",type="{disk_type}",serial_number="{serial_number}"}} 1', + f'device_smart_healthy{{disk="{disk}",type="{disk_type}",serial_number="{serial_number}"}} 1', + ] + + metrics = parse_smartctl_info(disk, disk_type, json_output) + for expected_metric in expected_metrics: + self.assertIn(expected_metric, metrics) + + @patch('smartmon.run_command') + def test_parse_smartctl_attributes(self, mock_run_command): + devices_attributes = [ + { + 'disk': '/dev/nvme0', + 'disk_type': 'nvme', + 'serial': 'y2q0a0bgtcf8', + 'json_output': { + 'nvme_smart_health_information_log': { + 'critical_warning': 0, + 'temperature': 36, + 'available_spare': 100, + 'available_spare_threshold': 10, + 'percentage_used': 0, + 'data_units_read': 117446405, + 'data_units_written': 84630284, + 'host_reads': 634894145, + 'host_writes': 4502620984, + 'controller_busy_time': 92090, + 'power_cycles': 746, + 'power_on_hours': 12494, + 'unsafe_shutdowns': 35, + 'media_errors': 0, + 'num_err_log_entries': 827, + 'warning_temp_time': 0, + 'critical_comp_time': 0 + } + } + }, + { + 'disk': '/dev/nvme1', + 'disk_type': 'nvme', + 'serial': 'y2q0a09ptcf8', + 'json_output': { + 'nvme_smart_health_information_log': { + 'critical_warning': 0, + 'temperature': 35, + 'available_spare': 99, + 'available_spare_threshold': 10, + 'percentage_used': 1, + 'data_units_read': 50000000, + 'data_units_written': 40000000, + 'host_reads': 300000000, + 'host_writes': 2000000000, + 'controller_busy_time': 80000, + 'power_cycles': 700, + 'power_on_hours': 12000, + 'unsafe_shutdowns': 30, + 'media_errors': 0, + 'num_err_log_entries': 800, + 'warning_temp_time': 0, + 'critical_comp_time': 0 + } + } + }, + ] + + for device_attr in devices_attributes: + disk = device_attr['disk'] + disk_type = device_attr['disk_type'] + serial = device_attr['serial'] + json_output = device_attr['json_output'] + + metrics = parse_smartctl_attributes(disk, disk_type, serial, json_output) + + expected_metrics = [ + f'temperature{{disk="{disk}",type="{disk_type}",serial_number="{serial}"}} {json_output["nvme_smart_health_information_log"]["temperature"]}', + f'available_spare{{disk="{disk}",type="{disk_type}",serial_number="{serial}"}} {json_output["nvme_smart_health_information_log"]["available_spare"]}', + ] + + for expected_metric in expected_metrics: + self.assertIn(expected_metric, metrics) + + @patch('smartmon.run_command') + def test_main(self, mock_run_command): + def side_effect(command, parse_json=False): + if '--scan-open' in command: + return { + 'devices': [ + {'name': '/dev/nvme0', 'info_name': '/dev/nvme0', 'type': 'nvme'}, + {'name': '/dev/nvme1', 'info_name': '/dev/nvme1', 'type': 'nvme'}, + ] + } if parse_json else '' + elif '-n' in command: + return {'power_mode': 'active'} if parse_json else '' + elif '-i' in command: + if '/dev/nvme0' in command: + return { + 'device': { + 'name': '/dev/nvme0', + 'info_name': '/dev/nvme0', + 'type': 'nvme', + 'protocol': 'NVMe', + }, + 'model_name': 'Dell Ent NVMe CM6 RI 7.68TB', + 'serial_number': 'Y2Q0A0BGTCF8', + 'firmware_version': '2.2.0', + 'smart_status': { + 'passed': True, + 'available': True, + 'enabled': True + }, + } if parse_json else '' + elif '/dev/nvme1' in command: + return { + 'device': { + 'name': '/dev/nvme1', + 'info_name': '/dev/nvme1', + 'type': 'nvme', + 'protocol': 'NVMe', + }, + 'model_name': 'Dell Ent NVMe CM6 RI 7.68TB', + 'serial_number': 'Y2Q0A09PTCF8', + 'firmware_version': '2.2.0', + 'smart_status': { + 'passed': True, + 'available': True, + 'enabled': True + }, + } if parse_json else '' + elif '-A' in command: + if '/dev/nvme0' in command: + return { + 'nvme_smart_health_information_log': { + 'critical_warning': 0, + 'temperature': 36, + 'available_spare': 100, + 'available_spare_threshold': 10, + 'percentage_used': 0, + 'data_units_read': 117446405, + 'data_units_written': 84630284, + 'host_reads': 634894145, + 'host_writes': 4502620984, + 'controller_busy_time': 92090, + 'power_cycles': 746, + 'power_on_hours': 12494, + 'unsafe_shutdowns': 35, + 'media_errors': 0, + 'num_err_log_entries': 827, + 'warning_temp_time': 0, + 'critical_comp_time': 0 + } + } if parse_json else '' + elif '/dev/nvme1' in command: + return { + 'nvme_smart_health_information_log': { + 'critical_warning': 0, + 'temperature': 35, + 'available_spare': 99, + 'available_spare_threshold': 10, + 'percentage_used': 1, + 'data_units_read': 50000000, + 'data_units_written': 40000000, + 'host_reads': 300000000, + 'host_writes': 2000000000, + 'controller_busy_time': 80000, + 'power_cycles': 700, + 'power_on_hours': 12000, + 'unsafe_shutdowns': 30, + 'media_errors': 0, + 'num_err_log_entries': 800, + 'warning_temp_time': 0, + 'critical_comp_time': 0 + } + } if parse_json else '' + elif '-j' in command and len(command) == 2: + return { + 'smartctl': { + 'version': [7, 2], + 'svn_revision': '5155', + 'platform_info': 'x86_64-linux-5.15.0-122-generic', + 'build_info': '(local build)', + } + } if parse_json else '' + else: + return {} if parse_json else '' + + mock_run_command.side_effect = side_effect + + with patch('builtins.print') as mock_print: + main() + output_lines = [] + for call in mock_print.call_args_list: + output_lines.extend(call[0][0].split('\n')) + expected_metrics = [ + 'smartmon_device_info{disk="/dev/nvme0",type="nvme",vendor="",product="",revision="",lun_id="",model_family="",device_model="Dell Ent NVMe CM6 RI 7.68TB",serial_number="y2q0a0bgtcf8",firmware_version="2.2.0"} 1', + 'smartmon_device_info{disk="/dev/nvme1",type="nvme",vendor="",product="",revision="",lun_id="",model_family="",device_model="Dell Ent NVMe CM6 RI 7.68TB",serial_number="y2q0a09ptcf8",firmware_version="2.2.0"} 1', + ] + for expected_metric in expected_metrics: + self.assertIn(expected_metric, output_lines) + + +if __name__ == '__main__': + unittest.main() From 23fc74779411246c89e502ff2675eab506c69a35 Mon Sep 17 00:00:00 2001 From: technowhizz <7688823+technowhizz@users.noreply.github.com> Date: Fri, 17 Jan 2025 16:08:49 +0000 Subject: [PATCH 03/16] Use pySMART --- etc/kayobe/ansible/scripts/smartmon.py | 279 +++++++++++++++---------- 1 file changed, 168 insertions(+), 111 deletions(-) diff --git a/etc/kayobe/ansible/scripts/smartmon.py b/etc/kayobe/ansible/scripts/smartmon.py index 2a50c9187..bd4bb36bc 100644 --- a/etc/kayobe/ansible/scripts/smartmon.py +++ b/etc/kayobe/ansible/scripts/smartmon.py @@ -2,155 +2,212 @@ import subprocess import json +import re from datetime import datetime +from pySMART import DeviceList + SMARTCTL_PATH = "/usr/sbin/smartctl" +SMARTMON_ATTRS = { + "airflow_temperature_cel", + "command_timeout", + "current_pending_sector", + "end_to_end_error", + "erase_fail_count", + "g_sense_error_rate", + "hardware_ecc_recovered", + "host_reads_32mib", + "host_reads_mib", + "host_writes_32mib", + "host_writes_mib", + "load_cycle_count", + "media_wearout_indicator", + "nand_writes_1gib", + "offline_uncorrectable", + "power_cycle_count", + "power_on_hours", + "program_fail_cnt_total", + "program_fail_count", + "raw_read_error_rate", + "reallocated_event_count", + "reallocated_sector_ct", + "reported_uncorrect", + "runtime_bad_block", + "sata_downshift_count", + "seek_error_rate", + "spin_retry_count", + "spin_up_time", + "start_stop_count", + "temperature_case", + "temperature_celsius", + "temperature_internal", + "total_lbas_read", + "total_lbas_written", + "udma_crc_error_count", + "unsafe_shutdown_count", + "unused_rsvd_blk_cnt_tot", + "wear_leveling_count", + "workld_host_reads_perc", + "workld_media_wear_indic", + "workload_minutes", + "critical_warning", + "temperature", + "available_spare", + "available_spare_threshold", + "percentage_used", + "data_units_read", + "data_units_written", + "host_reads", + "host_writes", + "controller_busy_time", + "power_cycles", + "unsafe_shutdowns", + "media_errors", + "num_err_log_entries", + "warning_temp_time", + "critical_comp_time", +} + def run_command(command, parse_json=False): + """ + Helper to run a subprocess command and optionally parse JSON output. + """ result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) if parse_json: return json.loads(result.stdout) - else: - return result.stdout.strip() - -def parse_smartctl_attributes(disk, disk_type, serial, json_data): - labels = f'disk="{disk}",type="{disk_type}",serial_number="{serial}"' - metrics = [] - smartmon_attrs = set([ - "airflow_temperature_cel", "command_timeout", "current_pending_sector", "end_to_end_error", "erase_fail_count", - "g_sense_error_rate", "hardware_ecc_recovered", "host_reads_32mib", "host_reads_mib", "host_writes_32mib", - "host_writes_mib", "load_cycle_count", "media_wearout_indicator", "nand_writes_1gib", "offline_uncorrectable", - "power_cycle_count", "power_on_hours", "program_fail_cnt_total", "program_fail_count", "raw_read_error_rate", - "reallocated_event_count", "reallocated_sector_ct", "reported_uncorrect", "runtime_bad_block", "sata_downshift_count", - "seek_error_rate", "spin_retry_count", "spin_up_time", "start_stop_count", "temperature_case", "temperature_celsius", - "temperature_internal", "total_lbas_read", "total_lbas_written", "udma_crc_error_count", "unsafe_shutdown_count", - "unused_rsvd_blk_cnt_tot", "wear_leveling_count", "workld_host_reads_perc", "workld_media_wear_indic", "workload_minutes", - "critical_warning", "temperature", "available_spare", "available_spare_threshold", "percentage_used", - "data_units_read", "data_units_written", "host_reads", "host_writes", "controller_busy_time", - "power_cycles", "unsafe_shutdowns", "media_errors", "num_err_log_entries", - "warning_temp_time", "critical_comp_time" - ]) - if 'nvme_smart_health_information_log' in json_data: - smart_log = json_data['nvme_smart_health_information_log'] - for attr_name, value in smart_log.items(): - attr_name = attr_name.replace(' ', '_').lower() - if attr_name in smartmon_attrs: - metrics.append(f"{attr_name}{{{labels}}} {value}") - elif 'scsi_grown_defect_list' in json_data: - scsi_attrs = json_data.get('scsi_grown_defect_list', {}) - for attr_name, value in scsi_attrs.items(): - attr_name = attr_name.replace(' ', '_').lower() - if attr_name in smartmon_attrs: - metrics.append(f"{attr_name}{{{labels}}} {value}") - elif 'ata_smart_attributes' in json_data and 'table' in json_data['ata_smart_attributes']: - for attr in json_data['ata_smart_attributes']['table']: - attr_name = attr['name'].replace('-', '_').lower() - if attr_name in smartmon_attrs: - attr_id = attr.get('id', '') - value = attr.get('value', '') - worst = attr.get('worst', '') - threshold = attr.get('thresh', '') - raw_value = attr.get('raw', {}).get('value', '') - metrics.append(f"{attr_name}_value{{{labels},smart_id=\"{attr_id}\"}} {value}") - metrics.append(f"{attr_name}_worst{{{labels},smart_id=\"{attr_id}\"}} {worst}") - metrics.append(f"{attr_name}_threshold{{{labels},smart_id=\"{attr_id}\"}} {threshold}") - metrics.append(f"{attr_name}_raw_value{{{labels},smart_id=\"{attr_id}\"}} {raw_value}") - return metrics - -def parse_smartctl_info(disk, disk_type, json_data): - info = json_data.get('device', {}) - smart_status = json_data.get('smart_status', {}) + return result.stdout.strip() + +def parse_device_info(device): + """ + Produce Prometheus lines describing the device's identity and SMART status: + - device_info + - device_smart_available + - device_smart_enabled + - device_smart_healthy + """ + serial_number = (device.serial or "").lower() labels = { - 'disk': disk, - 'type': disk_type, - 'vendor': info.get('vendor', ''), - 'product': info.get('product', ''), - 'revision': info.get('revision', ''), - 'lun_id': info.get('lun_id', ''), - 'model_family': json_data.get('model_family', ''), - 'device_model': json_data.get('model_name', ''), - 'serial_number': json_data.get('serial_number', '').lower(), - 'firmware_version': json_data.get('firmware_version', '') + "disk": device.name, + "type": device.interface or "", + "vendor": device.vendor or "", + "model_family": device.family or "", + "device_model": device.model or "", + "serial_number": serial_number, + "firmware_version": device.firmware or "", } - label_str = ','.join(f'{k}="{v}"' for k, v in labels.items()) + label_str = ",".join(f'{k}="{v}"' for k, v in labels.items()) + metrics = [ f'device_info{{{label_str}}} 1', - f'device_smart_available{{disk="{disk}",type="{disk_type}",serial_number="{labels["serial_number"]}"}} {1 if smart_status.get("available", False) else 0}', + f'device_smart_available{{disk="{device.name}",type="{device.interface}",serial_number="{serial_number}"}} {1 if device.smart_capable else 0}', ] - if smart_status.get("available", False): - metrics.append(f'device_smart_enabled{{disk="{disk}",type="{disk_type}",serial_number="{labels["serial_number"]}"}} {1 if smart_status.get("enabled", False) else 0}') - if 'passed' in smart_status: - metrics.append(f'device_smart_healthy{{disk="{disk}",type="{disk_type}",serial_number="{labels["serial_number"]}"}} {1 if smart_status.get("passed", False) else 0}') + + if device.smart_capable: + metrics.append( + f'device_smart_enabled{{disk="{device.name}",type="{device.interface}",serial_number="{serial_number}"}} {1 if device.smart_enabled else 0}' + ) + if device.assessment: + is_healthy = 1 if device.assessment.upper() == "PASS" else 0 + metrics.append( + f'device_smart_healthy{{disk="{device.name}",type="{device.interface}",serial_number="{serial_number}"}} {is_healthy}' + ) + + return metrics + +def parse_if_attributes(device): + """ + For any device type (ATA, NVMe, SCSI, etc.), we read device.if_attributes. + We'll iterate over its public fields, convert them to snake_case, + and if it's in SMARTMON_ATTRS and numeric, we produce metrics. + """ + metrics = [] + + if not device.if_attributes: + return metrics + + disk = device.name + disk_type = device.interface or "" + serial_number = (device.serial or "").lower() + labels = f'disk="{disk}",type="{disk_type}",serial_number="{serial_number}"' + + # Inspect all public attributes on device.if_attributes + for attr_name in dir(device.if_attributes): + if attr_name.startswith("_"): + continue # skip private / special methods + val = getattr(device.if_attributes, attr_name, None) + if callable(val): + continue # skip methods + + # Convert CamelCase or PascalCase -> snake_case, e.g. dataUnitsRead -> data_units_read + snake_name = re.sub(r'(? Date: Fri, 17 Jan 2025 16:13:05 +0000 Subject: [PATCH 04/16] Add tests for pysmart --- etc/kayobe/ansible/scripts/drives/nvme.json | 24 + etc/kayobe/ansible/scripts/test_smartmon.py | 513 ++++++++++---------- 2 files changed, 279 insertions(+), 258 deletions(-) create mode 100644 etc/kayobe/ansible/scripts/drives/nvme.json diff --git a/etc/kayobe/ansible/scripts/drives/nvme.json b/etc/kayobe/ansible/scripts/drives/nvme.json new file mode 100644 index 000000000..bbff19ec0 --- /dev/null +++ b/etc/kayobe/ansible/scripts/drives/nvme.json @@ -0,0 +1,24 @@ +{ + "device_info": { + "name": "/dev/nvme0", + "interface": "nvme", + "vendor": "AcmeCorp", + "family": "Acme NVMe Family", + "model": "Acme NVMe 1TB", + "serial": "ABCD1234", + "firmware": "3.0.1", + "smart_capable": true, + "smart_enabled": true, + "assessment": "PASS" + }, + "if_attributes": { + "criticalWarning": 0, + "temperature": 36, + "availableSpare": 100, + "availableSpareThreshold": 10, + "percentageUsed": 0, + "dataUnitsRead": 117446405, + "dataUnitsWritten": 84630284, + "notInSmartmonAttrs": 999 + } +} diff --git a/etc/kayobe/ansible/scripts/test_smartmon.py b/etc/kayobe/ansible/scripts/test_smartmon.py index a771a7ee6..a22df8ee1 100644 --- a/etc/kayobe/ansible/scripts/test_smartmon.py +++ b/etc/kayobe/ansible/scripts/test_smartmon.py @@ -1,265 +1,262 @@ +import glob +import json +import os +import re import unittest -from unittest.mock import patch + +from unittest.mock import patch, MagicMock + from smartmon import ( - parse_smartctl_info, - parse_smartctl_attributes, + parse_device_info, + parse_if_attributes, main, + SMARTMON_ATTRS ) +def load_json_fixture(filename): + """ + Load a JSON file from the 'drives' subfolder. + """ + path = os.path.join(os.path.dirname(__file__), "drives", filename) + with open(path, "r", encoding="utf-8") as f: + return json.load(f) + + class TestSmartMon(unittest.TestCase): - @patch('smartmon.run_command') - def test_parse_smartctl_info(self, mock_run_command): - devices_info = [ - { - 'disk': '/dev/nvme0', - 'disk_type': 'nvme', - 'json_output': { - 'device': { - 'name': '/dev/nvme0', - 'info_name': '/dev/nvme0', - 'type': 'nvme', - 'protocol': 'NVMe', - }, - 'model_name': 'Dell Ent NVMe CM6 RI 7.68TB', - 'serial_number': 'Y2Q0A0BGTCF8', - 'firmware_version': '2.2.0', - 'smart_status': { - 'passed': True, - 'available': True, - 'enabled': True - }, - } - }, - { - 'disk': '/dev/nvme1', - 'disk_type': 'nvme', - 'json_output': { - 'device': { - 'name': '/dev/nvme1', - 'info_name': '/dev/nvme1', - 'type': 'nvme', - 'protocol': 'NVMe', - }, - 'model_name': 'Dell Ent NVMe CM6 RI 7.68TB', - 'serial_number': 'Y2Q0A09PTCF8', - 'firmware_version': '2.2.0', - 'smart_status': { - 'passed': True, - 'available': True, - 'enabled': True - }, - } - }, - ] - - for device_info in devices_info: - disk = device_info['disk'] - disk_type = device_info['disk_type'] - json_output = device_info['json_output'] - serial_number = json_output.get('serial_number', '').lower() - - expected_metrics = [ - f'device_info{{disk="{disk}",type="{disk_type}",vendor="",product="",revision="",lun_id="",model_family="",device_model="{json_output.get("model_name", "")}",serial_number="{serial_number}",firmware_version="{json_output.get("firmware_version", "")}"}} 1', - f'device_smart_available{{disk="{disk}",type="{disk_type}",serial_number="{serial_number}"}} 1', - f'device_smart_enabled{{disk="{disk}",type="{disk_type}",serial_number="{serial_number}"}} 1', - f'device_smart_healthy{{disk="{disk}",type="{disk_type}",serial_number="{serial_number}"}} 1', - ] - - metrics = parse_smartctl_info(disk, disk_type, json_output) - for expected_metric in expected_metrics: - self.assertIn(expected_metric, metrics) - - @patch('smartmon.run_command') - def test_parse_smartctl_attributes(self, mock_run_command): - devices_attributes = [ - { - 'disk': '/dev/nvme0', - 'disk_type': 'nvme', - 'serial': 'y2q0a0bgtcf8', - 'json_output': { - 'nvme_smart_health_information_log': { - 'critical_warning': 0, - 'temperature': 36, - 'available_spare': 100, - 'available_spare_threshold': 10, - 'percentage_used': 0, - 'data_units_read': 117446405, - 'data_units_written': 84630284, - 'host_reads': 634894145, - 'host_writes': 4502620984, - 'controller_busy_time': 92090, - 'power_cycles': 746, - 'power_on_hours': 12494, - 'unsafe_shutdowns': 35, - 'media_errors': 0, - 'num_err_log_entries': 827, - 'warning_temp_time': 0, - 'critical_comp_time': 0 - } - } - }, - { - 'disk': '/dev/nvme1', - 'disk_type': 'nvme', - 'serial': 'y2q0a09ptcf8', - 'json_output': { - 'nvme_smart_health_information_log': { - 'critical_warning': 0, - 'temperature': 35, - 'available_spare': 99, - 'available_spare_threshold': 10, - 'percentage_used': 1, - 'data_units_read': 50000000, - 'data_units_written': 40000000, - 'host_reads': 300000000, - 'host_writes': 2000000000, - 'controller_busy_time': 80000, - 'power_cycles': 700, - 'power_on_hours': 12000, - 'unsafe_shutdowns': 30, - 'media_errors': 0, - 'num_err_log_entries': 800, - 'warning_temp_time': 0, - 'critical_comp_time': 0 - } - } - }, - ] - - for device_attr in devices_attributes: - disk = device_attr['disk'] - disk_type = device_attr['disk_type'] - serial = device_attr['serial'] - json_output = device_attr['json_output'] - - metrics = parse_smartctl_attributes(disk, disk_type, serial, json_output) - - expected_metrics = [ - f'temperature{{disk="{disk}",type="{disk_type}",serial_number="{serial}"}} {json_output["nvme_smart_health_information_log"]["temperature"]}', - f'available_spare{{disk="{disk}",type="{disk_type}",serial_number="{serial}"}} {json_output["nvme_smart_health_information_log"]["available_spare"]}', - ] - - for expected_metric in expected_metrics: - self.assertIn(expected_metric, metrics) - - @patch('smartmon.run_command') - def test_main(self, mock_run_command): - def side_effect(command, parse_json=False): - if '--scan-open' in command: - return { - 'devices': [ - {'name': '/dev/nvme0', 'info_name': '/dev/nvme0', 'type': 'nvme'}, - {'name': '/dev/nvme1', 'info_name': '/dev/nvme1', 'type': 'nvme'}, - ] - } if parse_json else '' - elif '-n' in command: - return {'power_mode': 'active'} if parse_json else '' - elif '-i' in command: - if '/dev/nvme0' in command: - return { - 'device': { - 'name': '/dev/nvme0', - 'info_name': '/dev/nvme0', - 'type': 'nvme', - 'protocol': 'NVMe', - }, - 'model_name': 'Dell Ent NVMe CM6 RI 7.68TB', - 'serial_number': 'Y2Q0A0BGTCF8', - 'firmware_version': '2.2.0', - 'smart_status': { - 'passed': True, - 'available': True, - 'enabled': True - }, - } if parse_json else '' - elif '/dev/nvme1' in command: - return { - 'device': { - 'name': '/dev/nvme1', - 'info_name': '/dev/nvme1', - 'type': 'nvme', - 'protocol': 'NVMe', - }, - 'model_name': 'Dell Ent NVMe CM6 RI 7.68TB', - 'serial_number': 'Y2Q0A09PTCF8', - 'firmware_version': '2.2.0', - 'smart_status': { - 'passed': True, - 'available': True, - 'enabled': True - }, - } if parse_json else '' - elif '-A' in command: - if '/dev/nvme0' in command: - return { - 'nvme_smart_health_information_log': { - 'critical_warning': 0, - 'temperature': 36, - 'available_spare': 100, - 'available_spare_threshold': 10, - 'percentage_used': 0, - 'data_units_read': 117446405, - 'data_units_written': 84630284, - 'host_reads': 634894145, - 'host_writes': 4502620984, - 'controller_busy_time': 92090, - 'power_cycles': 746, - 'power_on_hours': 12494, - 'unsafe_shutdowns': 35, - 'media_errors': 0, - 'num_err_log_entries': 827, - 'warning_temp_time': 0, - 'critical_comp_time': 0 - } - } if parse_json else '' - elif '/dev/nvme1' in command: - return { - 'nvme_smart_health_information_log': { - 'critical_warning': 0, - 'temperature': 35, - 'available_spare': 99, - 'available_spare_threshold': 10, - 'percentage_used': 1, - 'data_units_read': 50000000, - 'data_units_written': 40000000, - 'host_reads': 300000000, - 'host_writes': 2000000000, - 'controller_busy_time': 80000, - 'power_cycles': 700, - 'power_on_hours': 12000, - 'unsafe_shutdowns': 30, - 'media_errors': 0, - 'num_err_log_entries': 800, - 'warning_temp_time': 0, - 'critical_comp_time': 0 - } - } if parse_json else '' - elif '-j' in command and len(command) == 2: - return { - 'smartctl': { - 'version': [7, 2], - 'svn_revision': '5155', - 'platform_info': 'x86_64-linux-5.15.0-122-generic', - 'build_info': '(local build)', - } - } if parse_json else '' - else: - return {} if parse_json else '' - - mock_run_command.side_effect = side_effect - - with patch('builtins.print') as mock_print: - main() - output_lines = [] - for call in mock_print.call_args_list: - output_lines.extend(call[0][0].split('\n')) - expected_metrics = [ - 'smartmon_device_info{disk="/dev/nvme0",type="nvme",vendor="",product="",revision="",lun_id="",model_family="",device_model="Dell Ent NVMe CM6 RI 7.68TB",serial_number="y2q0a0bgtcf8",firmware_version="2.2.0"} 1', - 'smartmon_device_info{disk="/dev/nvme1",type="nvme",vendor="",product="",revision="",lun_id="",model_family="",device_model="Dell Ent NVMe CM6 RI 7.68TB",serial_number="y2q0a09ptcf8",firmware_version="2.2.0"} 1', - ] - for expected_metric in expected_metrics: - self.assertIn(expected_metric, output_lines) - - -if __name__ == '__main__': + @classmethod + def setUpClass(cls): + # Collect all *.json files from ./drives/ + data_folder = os.path.join(os.path.dirname(__file__), "drives") + cls.fixture_files = glob.glob(os.path.join(data_folder, "*.json")) + + def create_mock_device_from_json(self, device_info, if_attributes=None): + """ + Given a 'device_info' dict and optional 'if_attributes', build + a MagicMock that mimics a pySMART Device object. + """ + device = MagicMock() + device.name = device_info.get("name", "") + device.interface = device_info.get("interface", "") + device.vendor = device_info.get("vendor", "") + device.family = device_info.get("family", "") + device.model = device_info.get("model", "") + device.serial = device_info.get("serial", "") + device.firmware = device_info.get("firmware", "") + device.smart_capable = device_info.get("smart_capable", False) + device.smart_enabled = device_info.get("smart_enabled", False) + device.assessment = device_info.get("assessment", "") + + if if_attributes: + class IfAttributesMock: + pass + + if_mock = IfAttributesMock() + for key, val in if_attributes.items(): + setattr(if_mock, key, val) + device.if_attributes = if_mock + else: + device.if_attributes = None + + return device + + def test_parse_device_info(self): + """ + Test parse_device_info() for every JSON fixture in ./drives/. + We do subTest() so each fixture is tested individually. + """ + for fixture_path in self.fixture_files: + fixture_name = os.path.basename(fixture_path) + with self.subTest(msg=f"Testing device_info with {fixture_name}"): + data = load_json_fixture(fixture_name) + device_info = data["device_info"] + + device = self.create_mock_device_from_json(device_info) + metrics = parse_device_info(device) + + dev_name = device_info["name"] + dev_iface = device_info["interface"] + dev_serial = device_info["serial"].lower() + + # The device_info line should exist for every device + # e.g. device_info{disk="/dev/...",type="...",serial_number="..."} 1 + device_info_found = any( + line.startswith("device_info{") and + f'disk="{dev_name}"' in line and + f'type="{dev_iface}"' in line and + f'serial_number="{dev_serial}"' in line + for line in metrics + ) + self.assertTrue( + device_info_found, + f"Expected a device_info metric line for {dev_name} but didn't find it." + ) + + # If smart_capable is true, we expect device_smart_available = 1 + if device_info.get("smart_capable"): + smart_available_found = any( + line.startswith("device_smart_available{") and + f'disk="{dev_name}"' in line and + f'serial_number="{dev_serial}"' in line and + line.endswith(" 1") + for line in metrics + ) + self.assertTrue( + smart_available_found, + f"Expected device_smart_available=1 for {dev_name}, not found." + ) + + # If smart_enabled is true, we expect device_smart_enabled = 1 + if device_info.get("smart_enabled"): + smart_enabled_found = any( + line.startswith("device_smart_enabled{") and + f'disk="{dev_name}"' in line and + line.endswith(" 1") + for line in metrics + ) + self.assertTrue( + smart_enabled_found, + f"Expected device_smart_enabled=1 for {dev_name}, not found." + ) + + # device_smart_healthy if assessment in [PASS, WARN, FAIL] + # PASS => 1, otherwise => 0 + assessment = device_info.get("assessment", "").upper() + if assessment in ["PASS", "WARN", "FAIL"]: + expected_val = 1 if assessment == "PASS" else 0 + smart_healthy_found = any( + line.startswith("device_smart_healthy{") and + f'disk="{dev_name}"' in line and + line.endswith(f" {expected_val}") + for line in metrics + ) + self.assertTrue( + smart_healthy_found, + f"Expected device_smart_healthy={expected_val} for {dev_name}, not found." + ) + + def test_parse_if_attributes(self): + """ + Test parse_if_attributes() for every JSON fixture in ./drives/. + We do subTest() so each fixture is tested individually. + """ + for fixture_path in self.fixture_files: + fixture_name = os.path.basename(fixture_path) + with self.subTest(msg=f"Testing if_attributes with {fixture_name}"): + data = load_json_fixture(fixture_name) + device_info = data["device_info"] + if_attrs = data.get("if_attributes", {}) + + device = self.create_mock_device_from_json(device_info, if_attrs) + metrics = parse_if_attributes(device) + + dev_name = device_info["name"] + dev_iface = device_info["interface"] + dev_serial = device_info["serial"].lower() + + # For each numeric attribute in JSON, if it's in SMARTMON_ATTRS, + # we expect a line in the script's output. + for attr_key, attr_val in if_attrs.items(): + # Convert from e.g. "criticalWarning" -> "critical_warning" + snake_key = re.sub(r'(? + expected_line = ( + f"{snake_key}{{disk=\"{dev_name}\",type=\"{dev_iface}\",serial_number=\"{dev_serial}\"}} {attr_val}" + ) + self.assertIn( + expected_line, + metrics, + f"Expected metric '{expected_line}' for attribute '{attr_key}' not found." + ) + else: + # If it's not in SMARTMON_ATTRS or not numeric, + # we do NOT expect a line with that name+value + unexpected_line = ( + f"{snake_key}{{disk=\"{dev_name}\",type=\"{dev_iface}\",serial_number=\"{dev_serial}\"}} {attr_val}" + ) + self.assertNotIn( + unexpected_line, + metrics, + f"Unexpected metric '{unexpected_line}' found for {attr_key}." + ) + + # Also ensure that non-numeric or disallowed attributes do not appear + # For instance "notInSmartmonAttrs" should never appear. + for line in metrics: + self.assertNotIn( + "not_in_smartmon_attrs", + line, + f"'notInSmartmonAttrs' attribute unexpectedly found in metric line: {line}" + ) + + @patch("smartmon.run_command") + @patch("smartmon.DeviceList") + def test_main(self, mock_devicelist_class, mock_run_cmd): + """ + End-to-end test of main() for every JSON fixture in ./drives/. + This ensures we can handle multiple disks (multiple fixture files). + """ + for fixture_path in self.fixture_files: + fixture_name = os.path.basename(fixture_path) + with self.subTest(msg=f"Testing main() with {fixture_name}"): + data = load_json_fixture(fixture_name) + device_info = data["device_info"] + if_attrs = data.get("if_attributes", {}) + + # Patch run_command to return a version & "active" power_mode + def run_command_side_effect(cmd, parse_json=False): + if "--version" in cmd: + return "smartctl 7.3 5422 [x86_64-linux-5.15.0]\n..." + if "-n" in cmd and "standby" in cmd and parse_json: + return {"power_mode": "active"} + return "" + + mock_run_cmd.side_effect = run_command_side_effect + + # Mock a single device from the fixture + device_mock = self.create_mock_device_from_json(device_info, if_attrs) + + # Make DeviceList() return our single mock device + mock_dev_list = MagicMock() + mock_dev_list.devices = [device_mock] + mock_devicelist_class.return_value = mock_dev_list + + with patch("builtins.print") as mock_print: + main() + + printed_lines = [] + for call_args in mock_print.call_args_list: + printed_lines.extend(call_args[0][0].split("\n")) + dev_name = device_info["name"] + dev_iface = device_info["interface"] + dev_serial = device_info["serial"].lower() + + # We expect a line for the run timestamp, e.g.: + # smartmon_smartctl_run{disk="/dev/...",type="..."} 1671234567 + run_line_found = any( + line.startswith("smartmon_smartctl_run{") and + f'disk="{dev_name}"' in line and + f'type="{dev_iface}"' in line + for line in printed_lines + ) + self.assertTrue( + run_line_found, + f"Expected 'smartmon_smartctl_run' metric line for {dev_name} not found." + ) + + # Because we mocked "power_mode": "active", we expect device_active=1 + active_line_found = any( + line.startswith("smartmon_device_active{") and + f'disk="{dev_name}"' in line and + f'serial_number="{dev_serial}"' in line and + line.endswith(" 1") + for line in printed_lines + ) + self.assertTrue( + active_line_found, + f"Expected 'device_active{{...}} 1' line for {dev_name} not found." + ) + +if __name__ == "__main__": unittest.main() From b694537509084051566348a3009f8e7ef1f09af4 Mon Sep 17 00:00:00 2001 From: technowhizz <7688823+technowhizz@users.noreply.github.com> Date: Mon, 17 Mar 2025 23:45:02 +0000 Subject: [PATCH 05/16] Update docstring to state expected Device --- etc/kayobe/ansible/scripts/smartmon.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/etc/kayobe/ansible/scripts/smartmon.py b/etc/kayobe/ansible/scripts/smartmon.py index bd4bb36bc..e52b6332b 100644 --- a/etc/kayobe/ansible/scripts/smartmon.py +++ b/etc/kayobe/ansible/scripts/smartmon.py @@ -85,6 +85,12 @@ def parse_device_info(device): - device_smart_available - device_smart_enabled - device_smart_healthy + + Args: + device (Device): A pySMART Device object with attributes such as name, interface, etc. + + Returns: + List[str]: A list of Prometheus formatted metric strings. """ serial_number = (device.serial or "").lower() labels = { From 2d25e92f9fc2460ea2f70f7d6f5e9720936b7f89 Mon Sep 17 00:00:00 2001 From: technowhizz <7688823+technowhizz@users.noreply.github.com> Date: Mon, 17 Mar 2025 23:46:29 +0000 Subject: [PATCH 06/16] Add a function for converting to camelcase --- etc/kayobe/ansible/scripts/smartmon.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/etc/kayobe/ansible/scripts/smartmon.py b/etc/kayobe/ansible/scripts/smartmon.py index e52b6332b..7ec922b03 100644 --- a/etc/kayobe/ansible/scripts/smartmon.py +++ b/etc/kayobe/ansible/scripts/smartmon.py @@ -78,6 +78,14 @@ def run_command(command, parse_json=False): return json.loads(result.stdout) return result.stdout.strip() +def camel_to_snake(name): + """ + Convert a CamelCase string to snake_case. + + Reference: https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case + """ + return re.sub(r'(? snake_case, e.g. dataUnitsRead -> data_units_read - snake_name = re.sub(r'(? Date: Mon, 24 Mar 2025 00:54:49 +0000 Subject: [PATCH 07/16] Split device and attribute tests into individual test cases per fixture for better error reporting --- etc/kayobe/ansible/scripts/smartmon.py | 4 +- etc/kayobe/ansible/scripts/test_smartmon.py | 238 ++++++++++---------- 2 files changed, 127 insertions(+), 115 deletions(-) diff --git a/etc/kayobe/ansible/scripts/smartmon.py b/etc/kayobe/ansible/scripts/smartmon.py index 7ec922b03..202e6981c 100644 --- a/etc/kayobe/ansible/scripts/smartmon.py +++ b/etc/kayobe/ansible/scripts/smartmon.py @@ -3,7 +3,7 @@ import subprocess import json import re -from datetime import datetime +import datetime from pySMART import DeviceList @@ -197,7 +197,7 @@ def main(): disk_type = dev.interface or "" serial_number = (dev.serial or "").lower() - run_timestamp = int(datetime.utcnow().timestamp()) + run_timestamp = int(datetime.datetime.now(datetime.UTC).timestamp()) all_metrics.append(f'smartctl_run{{disk="{disk_name}",type="{disk_type}"}} {run_timestamp}') active = 1 diff --git a/etc/kayobe/ansible/scripts/test_smartmon.py b/etc/kayobe/ansible/scripts/test_smartmon.py index a22df8ee1..212e5f063 100644 --- a/etc/kayobe/ansible/scripts/test_smartmon.py +++ b/etc/kayobe/ansible/scripts/test_smartmon.py @@ -59,136 +59,148 @@ class IfAttributesMock: return device + def _test_parse_device_info(self, fixture_name): + """ + Helper method to test parse_device_info() for a single JSON fixture. + """ + data = load_json_fixture(fixture_name) + device_info = data["device_info"] + + device = self.create_mock_device_from_json(device_info) + metrics = parse_device_info(device) + + dev_name = device_info["name"] + dev_iface = device_info["interface"] + dev_serial = device_info["serial"].lower() + + # The device_info line should exist for every device + # e.g. device_info{disk="/dev/...",type="...",serial_number="..."} 1 + device_info_found = any( + line.startswith("device_info{") and + f'disk="{dev_name}"' in line and + f'type="{dev_iface}"' in line and + f'serial_number="{dev_serial}"' in line + for line in metrics + ) + self.assertTrue( + device_info_found, + f"Expected a device_info metric line for {dev_name} but didn't find it." + ) + + # If smart_capable is true, we expect device_smart_available = 1 + if device_info.get("smart_capable"): + smart_available_found = any( + line.startswith("device_smart_available{") and + f'disk="{dev_name}"' in line and + f'serial_number="{dev_serial}"' in line and + line.endswith(" 1") + for line in metrics + ) + self.assertTrue( + smart_available_found, + f"Expected device_smart_available=1 for {dev_name}, not found." + ) + + # If smart_enabled is true, we expect device_smart_enabled = 1 + if device_info.get("smart_enabled"): + smart_enabled_found = any( + line.startswith("device_smart_enabled{") and + f'disk="{dev_name}"' in line and + line.endswith(" 1") + for line in metrics + ) + self.assertTrue( + smart_enabled_found, + f"Expected device_smart_enabled=1 for {dev_name}, not found." + ) + + # device_smart_healthy if assessment in [PASS, WARN, FAIL] + # PASS => 1, otherwise => 0 + assessment = device_info.get("assessment", "").upper() + if assessment in ["PASS", "WARN", "FAIL"]: + expected_val = 1 if assessment == "PASS" else 0 + smart_healthy_found = any( + line.startswith("device_smart_healthy{") and + f'disk="{dev_name}"' in line and + line.endswith(f" {expected_val}") + for line in metrics + ) + self.assertTrue( + smart_healthy_found, + f"Expected device_smart_healthy={expected_val} for {dev_name}, not found." + ) + def test_parse_device_info(self): """ Test parse_device_info() for every JSON fixture in ./drives/. - We do subTest() so each fixture is tested individually. + Each fixture is tested individually with clear error reporting. """ for fixture_path in self.fixture_files: fixture_name = os.path.basename(fixture_path) - with self.subTest(msg=f"Testing device_info with {fixture_name}"): - data = load_json_fixture(fixture_name) - device_info = data["device_info"] - - device = self.create_mock_device_from_json(device_info) - metrics = parse_device_info(device) + with self.subTest(fixture=fixture_name): + self._test_parse_device_info(fixture_name) - dev_name = device_info["name"] - dev_iface = device_info["interface"] - dev_serial = device_info["serial"].lower() - - # The device_info line should exist for every device - # e.g. device_info{disk="/dev/...",type="...",serial_number="..."} 1 - device_info_found = any( - line.startswith("device_info{") and - f'disk="{dev_name}"' in line and - f'type="{dev_iface}"' in line and - f'serial_number="{dev_serial}"' in line - for line in metrics + def _test_parse_if_attributes(self, fixture_name): + """ + Helper method to test parse_if_attributes() for a single JSON fixture. + """ + data = load_json_fixture(fixture_name) + device_info = data["device_info"] + if_attrs = data.get("if_attributes", {}) + + device = self.create_mock_device_from_json(device_info, if_attrs) + metrics = parse_if_attributes(device) + + dev_name = device_info["name"] + dev_iface = device_info["interface"] + dev_serial = device_info["serial"].lower() + + # For each numeric attribute in JSON, if it's in SMARTMON_ATTRS, + # we expect a line in the script's output. + for attr_key, attr_val in if_attrs.items(): + # Convert from e.g. "criticalWarning" -> "critical_warning" + snake_key = re.sub(r'(? + expected_line = ( + f"{snake_key}{{disk=\"{dev_name}\",type=\"{dev_iface}\",serial_number=\"{dev_serial}\"}} {attr_val}" ) - self.assertTrue( - device_info_found, - f"Expected a device_info metric line for {dev_name} but didn't find it." + self.assertIn( + expected_line, + metrics, + f"Expected metric '{expected_line}' for attribute '{attr_key}' not found." + ) + else: + # If it's not in SMARTMON_ATTRS or not numeric, + # we do NOT expect a line with that name+value + unexpected_line = ( + f"{snake_key}{{disk=\"{dev_name}\",type=\"{dev_iface}\",serial_number=\"{dev_serial}\"}} {attr_val}" + ) + self.assertNotIn( + unexpected_line, + metrics, + f"Unexpected metric '{unexpected_line}' found for {attr_key}." ) - # If smart_capable is true, we expect device_smart_available = 1 - if device_info.get("smart_capable"): - smart_available_found = any( - line.startswith("device_smart_available{") and - f'disk="{dev_name}"' in line and - f'serial_number="{dev_serial}"' in line and - line.endswith(" 1") - for line in metrics - ) - self.assertTrue( - smart_available_found, - f"Expected device_smart_available=1 for {dev_name}, not found." - ) - - # If smart_enabled is true, we expect device_smart_enabled = 1 - if device_info.get("smart_enabled"): - smart_enabled_found = any( - line.startswith("device_smart_enabled{") and - f'disk="{dev_name}"' in line and - line.endswith(" 1") - for line in metrics - ) - self.assertTrue( - smart_enabled_found, - f"Expected device_smart_enabled=1 for {dev_name}, not found." - ) - - # device_smart_healthy if assessment in [PASS, WARN, FAIL] - # PASS => 1, otherwise => 0 - assessment = device_info.get("assessment", "").upper() - if assessment in ["PASS", "WARN", "FAIL"]: - expected_val = 1 if assessment == "PASS" else 0 - smart_healthy_found = any( - line.startswith("device_smart_healthy{") and - f'disk="{dev_name}"' in line and - line.endswith(f" {expected_val}") - for line in metrics - ) - self.assertTrue( - smart_healthy_found, - f"Expected device_smart_healthy={expected_val} for {dev_name}, not found." - ) + # Also ensure that non-numeric or disallowed attributes do not appear + # For instance "notInSmartmonAttrs" should never appear. + for line in metrics: + self.assertNotIn( + "not_in_smartmon_attrs", + line, + f"'notInSmartmonAttrs' attribute unexpectedly found in metric line: {line}" + ) def test_parse_if_attributes(self): """ Test parse_if_attributes() for every JSON fixture in ./drives/. - We do subTest() so each fixture is tested individually. + Each fixture is tested individually with clear error reporting. """ for fixture_path in self.fixture_files: fixture_name = os.path.basename(fixture_path) - with self.subTest(msg=f"Testing if_attributes with {fixture_name}"): - data = load_json_fixture(fixture_name) - device_info = data["device_info"] - if_attrs = data.get("if_attributes", {}) - - device = self.create_mock_device_from_json(device_info, if_attrs) - metrics = parse_if_attributes(device) - - dev_name = device_info["name"] - dev_iface = device_info["interface"] - dev_serial = device_info["serial"].lower() - - # For each numeric attribute in JSON, if it's in SMARTMON_ATTRS, - # we expect a line in the script's output. - for attr_key, attr_val in if_attrs.items(): - # Convert from e.g. "criticalWarning" -> "critical_warning" - snake_key = re.sub(r'(? - expected_line = ( - f"{snake_key}{{disk=\"{dev_name}\",type=\"{dev_iface}\",serial_number=\"{dev_serial}\"}} {attr_val}" - ) - self.assertIn( - expected_line, - metrics, - f"Expected metric '{expected_line}' for attribute '{attr_key}' not found." - ) - else: - # If it's not in SMARTMON_ATTRS or not numeric, - # we do NOT expect a line with that name+value - unexpected_line = ( - f"{snake_key}{{disk=\"{dev_name}\",type=\"{dev_iface}\",serial_number=\"{dev_serial}\"}} {attr_val}" - ) - self.assertNotIn( - unexpected_line, - metrics, - f"Unexpected metric '{unexpected_line}' found for {attr_key}." - ) - - # Also ensure that non-numeric or disallowed attributes do not appear - # For instance "notInSmartmonAttrs" should never appear. - for line in metrics: - self.assertNotIn( - "not_in_smartmon_attrs", - line, - f"'notInSmartmonAttrs' attribute unexpectedly found in metric line: {line}" - ) + with self.subTest(fixture=fixture_name): + self._test_parse_if_attributes(fixture_name) @patch("smartmon.run_command") @patch("smartmon.DeviceList") From 172e0340ba34ef2647e5d634ec4954b2ceeab78a Mon Sep 17 00:00:00 2001 From: technowhizz <7688823+technowhizz@users.noreply.github.com> Date: Mon, 24 Mar 2025 01:06:07 +0000 Subject: [PATCH 08/16] Use function for snake case conversion --- etc/kayobe/ansible/scripts/test_smartmon.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/etc/kayobe/ansible/scripts/test_smartmon.py b/etc/kayobe/ansible/scripts/test_smartmon.py index 212e5f063..5727f6a47 100644 --- a/etc/kayobe/ansible/scripts/test_smartmon.py +++ b/etc/kayobe/ansible/scripts/test_smartmon.py @@ -10,7 +10,8 @@ parse_device_info, parse_if_attributes, main, - SMARTMON_ATTRS + SMARTMON_ATTRS, + camel_to_snake ) def load_json_fixture(filename): @@ -158,8 +159,7 @@ def _test_parse_if_attributes(self, fixture_name): # For each numeric attribute in JSON, if it's in SMARTMON_ATTRS, # we expect a line in the script's output. for attr_key, attr_val in if_attrs.items(): - # Convert from e.g. "criticalWarning" -> "critical_warning" - snake_key = re.sub(r'(? From aad34c174429f1c8513289a8e1f524d3de269251 Mon Sep 17 00:00:00 2001 From: technowhizz <7688823+technowhizz@users.noreply.github.com> Date: Thu, 10 Apr 2025 13:17:58 +0100 Subject: [PATCH 09/16] Move fixtures to tests folder --- etc/kayobe/ansible/scripts/test_smartmon.py | 14 +++++++------- .../ansible/scripts/{drives => tests}/nvme.json | 0 2 files changed, 7 insertions(+), 7 deletions(-) rename etc/kayobe/ansible/scripts/{drives => tests}/nvme.json (100%) diff --git a/etc/kayobe/ansible/scripts/test_smartmon.py b/etc/kayobe/ansible/scripts/test_smartmon.py index 5727f6a47..e131846e0 100644 --- a/etc/kayobe/ansible/scripts/test_smartmon.py +++ b/etc/kayobe/ansible/scripts/test_smartmon.py @@ -16,9 +16,9 @@ def load_json_fixture(filename): """ - Load a JSON file from the 'drives' subfolder. + Load a JSON file from the 'tests' subfolder. """ - path = os.path.join(os.path.dirname(__file__), "drives", filename) + path = os.path.join(os.path.dirname(__file__), "tests", filename) with open(path, "r", encoding="utf-8") as f: return json.load(f) @@ -26,8 +26,8 @@ def load_json_fixture(filename): class TestSmartMon(unittest.TestCase): @classmethod def setUpClass(cls): - # Collect all *.json files from ./drives/ - data_folder = os.path.join(os.path.dirname(__file__), "drives") + # Collect all *.json files from ./tests/ + data_folder = os.path.join(os.path.dirname(__file__), "tests") cls.fixture_files = glob.glob(os.path.join(data_folder, "*.json")) def create_mock_device_from_json(self, device_info, if_attributes=None): @@ -133,7 +133,7 @@ def _test_parse_device_info(self, fixture_name): def test_parse_device_info(self): """ - Test parse_device_info() for every JSON fixture in ./drives/. + Test parse_device_info() for every JSON fixture in ./tests/. Each fixture is tested individually with clear error reporting. """ for fixture_path in self.fixture_files: @@ -194,7 +194,7 @@ def _test_parse_if_attributes(self, fixture_name): def test_parse_if_attributes(self): """ - Test parse_if_attributes() for every JSON fixture in ./drives/. + Test parse_if_attributes() for every JSON fixture in ./tests/. Each fixture is tested individually with clear error reporting. """ for fixture_path in self.fixture_files: @@ -206,7 +206,7 @@ def test_parse_if_attributes(self): @patch("smartmon.DeviceList") def test_main(self, mock_devicelist_class, mock_run_cmd): """ - End-to-end test of main() for every JSON fixture in ./drives/. + End-to-end test of main() for every JSON fixture in ./tests/. This ensures we can handle multiple disks (multiple fixture files). """ for fixture_path in self.fixture_files: diff --git a/etc/kayobe/ansible/scripts/drives/nvme.json b/etc/kayobe/ansible/scripts/tests/nvme.json similarity index 100% rename from etc/kayobe/ansible/scripts/drives/nvme.json rename to etc/kayobe/ansible/scripts/tests/nvme.json From 44601cabc58a8f38eb48f4ae49297aec9fa70e38 Mon Sep 17 00:00:00 2001 From: technowhizz <7688823+technowhizz@users.noreply.github.com> Date: Thu, 1 May 2025 14:20:56 +0100 Subject: [PATCH 10/16] Use prometheus_client for writing out metrics --- etc/kayobe/ansible/scripts/smartmon.py | 80 +++++++++---- etc/kayobe/ansible/scripts/test_smartmon.py | 120 ++++++++++---------- 2 files changed, 117 insertions(+), 83 deletions(-) diff --git a/etc/kayobe/ansible/scripts/smartmon.py b/etc/kayobe/ansible/scripts/smartmon.py index 202e6981c..f2dbdde13 100644 --- a/etc/kayobe/ansible/scripts/smartmon.py +++ b/etc/kayobe/ansible/scripts/smartmon.py @@ -4,7 +4,9 @@ import json import re import datetime +import os +from prometheus_client import CollectorRegistry, Gauge, write_to_textfile from pySMART import DeviceList SMARTCTL_PATH = "/usr/sbin/smartctl" @@ -110,21 +112,24 @@ def parse_device_info(device): "serial_number": serial_number, "firmware_version": device.firmware or "", } - label_str = ",".join(f'{k}="{v}"' for k, v in labels.items()) + sorted_labels = sorted(labels.items()) + label_str = ",".join(f'{k}="{v}"' for k, v in sorted_labels) + + metric_labels = f'disk="{device.name}",serial_number="{serial_number}",type="{device.interface}"' metrics = [ - f'device_info{{{label_str}}} 1', - f'device_smart_available{{disk="{device.name}",type="{device.interface}",serial_number="{serial_number}"}} {1 if device.smart_capable else 0}', + f'device_info{{{label_str}}} 1.0', + f'device_smart_available{{{metric_labels}}} {float(1) if device.smart_capable else float(0)}', ] if device.smart_capable: metrics.append( - f'device_smart_enabled{{disk="{device.name}",type="{device.interface}",serial_number="{serial_number}"}} {1 if device.smart_enabled else 0}' + f'device_smart_enabled{{{metric_labels}}} {float(1) if device.smart_enabled else float(0)}' ) if device.assessment: is_healthy = 1 if device.assessment.upper() == "PASS" else 0 metrics.append( - f'device_smart_healthy{{disk="{device.name}",type="{device.interface}",serial_number="{serial_number}"}} {is_healthy}' + f'device_smart_healthy{{{metric_labels}}} {float(is_healthy)}' ) return metrics @@ -143,7 +148,7 @@ def parse_if_attributes(device): disk = device.name disk_type = device.interface or "" serial_number = (device.serial or "").lower() - labels = f'disk="{disk}",type="{disk_type}",serial_number="{serial_number}"' + labels = f'disk="{disk}",serial_number="{serial_number}",type="{disk_type}"' # Inspect all public attributes on device.if_attributes for attr_name in dir(device.if_attributes): @@ -156,27 +161,48 @@ def parse_if_attributes(device): snake_name = camel_to_snake(attr_name) if snake_name in SMARTMON_ATTRS and isinstance(val, (int, float)): - metrics.append(f"{snake_name}{{{labels}}} {val}") + metrics.append(f"{snake_name}{{{labels}}} {float(val)}") return metrics -def format_output(metrics): +def write_metrics_to_textfile(metrics, output_path=None): """ - Convert a list of lines like "some_metric{...} value" - into a Prometheus text output with # HELP / # TYPE lines. + Write metrics to a Prometheus textfile using prometheus_client. + Args: + metrics (List[str]): List of metric strings in 'name{labels} value' format. + output_path (str): Path to write the metrics file. Defaults to node_exporter textfile collector path. """ - output = [] - last_metric = "" - for metric in sorted(metrics): - metric_name = metric.split("{")[0] - if metric_name != last_metric: - output.append(f"# HELP smartmon_{metric_name} SMART metric {metric_name}") - output.append(f"# TYPE smartmon_{metric_name} gauge") - last_metric = metric_name - output.append(f"smartmon_{metric}") - return "\n".join(output) - -def main(): + registry = CollectorRegistry() + metric_gauges = {} + for metric in metrics: + # Split metric into name, labels, and value + metric_name, rest = metric.split('{', 1) + label_str, value = rest.split('}', 1) + value = value.strip() + # Parse labels into a dictionary + labels = {} + label_keys = [] + label_values = [] + for label in label_str.split(','): + if '=' in label: + k, v = label.split('=', 1) + k = k.strip() + v = v.strip('"') + labels[k] = v + label_keys.append(k) + label_values.append(v) + help_str = f"SMART metric {metric_name}" + # Create Gauge if not already present + if metric_name not in metric_gauges: + metric_gauges[metric_name] = Gauge(metric_name, help_str, label_keys, registry=registry) + # Set metric value + gauge = metric_gauges[metric_name] + gauge.labels(*label_values).set(float(value)) + if output_path is None: + output_path = '/var/lib/node_exporter/textfile_collector/smartmon.prom' + write_to_textfile(output_path, registry) # Write all metrics to file + +def main(output_path=None): all_metrics = [] try: @@ -197,7 +223,7 @@ def main(): disk_type = dev.interface or "" serial_number = (dev.serial or "").lower() - run_timestamp = int(datetime.datetime.now(datetime.UTC).timestamp()) + run_timestamp = int(datetime.datetime.now(datetime.timezone.utc).timestamp()) all_metrics.append(f'smartctl_run{{disk="{disk_name}",type="{disk_type}"}} {run_timestamp}') active = 1 @@ -220,7 +246,11 @@ def main(): all_metrics.extend(parse_device_info(dev)) all_metrics.extend(parse_if_attributes(dev)) - print(format_output(all_metrics)) + write_metrics_to_textfile(all_metrics, output_path) if __name__ == "__main__": - main() + import argparse + parser = argparse.ArgumentParser(description="Export SMART metrics to Prometheus textfile format.") + parser.add_argument('--output', type=str, default=None, help='Output path for Prometheus textfile (default: /var/lib/node_exporter/textfile_collector/smartmon.prom)') + args = parser.parse_args() + main(args.output) diff --git a/etc/kayobe/ansible/scripts/test_smartmon.py b/etc/kayobe/ansible/scripts/test_smartmon.py index e131846e0..38bc66e52 100644 --- a/etc/kayobe/ansible/scripts/test_smartmon.py +++ b/etc/kayobe/ansible/scripts/test_smartmon.py @@ -1,17 +1,19 @@ import glob import json import os -import re import unittest +import tempfile +import math +from time import sleep from unittest.mock import patch, MagicMock - from smartmon import ( parse_device_info, parse_if_attributes, main, SMARTMON_ATTRS, - camel_to_snake + camel_to_snake, + write_metrics_to_textfile, ) def load_json_fixture(filename): @@ -75,7 +77,6 @@ def _test_parse_device_info(self, fixture_name): dev_serial = device_info["serial"].lower() # The device_info line should exist for every device - # e.g. device_info{disk="/dev/...",type="...",serial_number="..."} 1 device_info_found = any( line.startswith("device_info{") and f'disk="{dev_name}"' in line and @@ -94,12 +95,12 @@ def _test_parse_device_info(self, fixture_name): line.startswith("device_smart_available{") and f'disk="{dev_name}"' in line and f'serial_number="{dev_serial}"' in line and - line.endswith(" 1") + line.endswith(" 1.0") for line in metrics ) self.assertTrue( smart_available_found, - f"Expected device_smart_available=1 for {dev_name}, not found." + f"Expected device_smart_available=1.0 for {dev_name}, not found." ) # If smart_enabled is true, we expect device_smart_enabled = 1 @@ -107,19 +108,19 @@ def _test_parse_device_info(self, fixture_name): smart_enabled_found = any( line.startswith("device_smart_enabled{") and f'disk="{dev_name}"' in line and - line.endswith(" 1") + line.endswith(" 1.0") for line in metrics ) self.assertTrue( smart_enabled_found, - f"Expected device_smart_enabled=1 for {dev_name}, not found." + f"Expected device_smart_enabled=1.0 for {dev_name}, not found." ) # device_smart_healthy if assessment in [PASS, WARN, FAIL] # PASS => 1, otherwise => 0 assessment = device_info.get("assessment", "").upper() if assessment in ["PASS", "WARN", "FAIL"]: - expected_val = 1 if assessment == "PASS" else 0 + expected_val = float(1) if assessment == "PASS" else float(0) smart_healthy_found = any( line.startswith("device_smart_healthy{") and f'disk="{dev_name}"' in line and @@ -162,9 +163,8 @@ def _test_parse_if_attributes(self, fixture_name): snake_key = camel_to_snake(attr_key) if isinstance(attr_val, (int, float)) and snake_key in SMARTMON_ATTRS: - # We expect e.g. critical_warning{disk="/dev/..."} expected_line = ( - f"{snake_key}{{disk=\"{dev_name}\",type=\"{dev_iface}\",serial_number=\"{dev_serial}\"}} {attr_val}" + f"{snake_key}{{disk=\"{dev_name}\",serial_number=\"{dev_serial}\",type=\"{dev_iface}\"}} {float(attr_val)}" ) self.assertIn( expected_line, @@ -175,7 +175,7 @@ def _test_parse_if_attributes(self, fixture_name): # If it's not in SMARTMON_ATTRS or not numeric, # we do NOT expect a line with that name+value unexpected_line = ( - f"{snake_key}{{disk=\"{dev_name}\",type=\"{dev_iface}\",serial_number=\"{dev_serial}\"}} {attr_val}" + f"{snake_key}{{disk=\"{dev_name}\",serial_number=\"{dev_serial}\",type=\"{dev_iface}\"}} {float(attr_val)}" ) self.assertNotIn( unexpected_line, @@ -204,28 +204,32 @@ def test_parse_if_attributes(self): @patch("smartmon.run_command") @patch("smartmon.DeviceList") - def test_main(self, mock_devicelist_class, mock_run_cmd): + @patch("smartmon.write_metrics_to_textfile", wraps=write_metrics_to_textfile) + def test_main(self, mock_write_metrics, mock_devicelist_class, mock_run_cmd): """ End-to-end test of main() for every JSON fixture in ./tests/. This ensures we can handle multiple disks (multiple fixture files). + Checks metrics written to a temp file, and that write_metrics_to_textfile is called once. """ + + # Patch run_command to return a version & "active" power_mode + def run_command_side_effect(cmd, parse_json=False): + if "--version" in cmd: + return "smartctl 7.3 5422 [x86_64-linux-5.15.0]\n..." + if "-n" in cmd and "standby" in cmd and parse_json: + return {"power_mode": "active"} + return "" + + mock_run_cmd.side_effect = run_command_side_effect + for fixture_path in self.fixture_files: fixture_name = os.path.basename(fixture_path) with self.subTest(msg=f"Testing main() with {fixture_name}"): + mock_write_metrics.reset_mock() data = load_json_fixture(fixture_name) device_info = data["device_info"] if_attrs = data.get("if_attributes", {}) - # Patch run_command to return a version & "active" power_mode - def run_command_side_effect(cmd, parse_json=False): - if "--version" in cmd: - return "smartctl 7.3 5422 [x86_64-linux-5.15.0]\n..." - if "-n" in cmd and "standby" in cmd and parse_json: - return {"power_mode": "active"} - return "" - - mock_run_cmd.side_effect = run_command_side_effect - # Mock a single device from the fixture device_mock = self.create_mock_device_from_json(device_info, if_attrs) @@ -234,41 +238,41 @@ def run_command_side_effect(cmd, parse_json=False): mock_dev_list.devices = [device_mock] mock_devicelist_class.return_value = mock_dev_list - with patch("builtins.print") as mock_print: - main() - - printed_lines = [] - for call_args in mock_print.call_args_list: - printed_lines.extend(call_args[0][0].split("\n")) - dev_name = device_info["name"] - dev_iface = device_info["interface"] - dev_serial = device_info["serial"].lower() - - # We expect a line for the run timestamp, e.g.: - # smartmon_smartctl_run{disk="/dev/...",type="..."} 1671234567 - run_line_found = any( - line.startswith("smartmon_smartctl_run{") and - f'disk="{dev_name}"' in line and - f'type="{dev_iface}"' in line - for line in printed_lines - ) - self.assertTrue( - run_line_found, - f"Expected 'smartmon_smartctl_run' metric line for {dev_name} not found." - ) - - # Because we mocked "power_mode": "active", we expect device_active=1 - active_line_found = any( - line.startswith("smartmon_device_active{") and - f'disk="{dev_name}"' in line and - f'serial_number="{dev_serial}"' in line and - line.endswith(" 1") - for line in printed_lines - ) - self.assertTrue( - active_line_found, - f"Expected 'device_active{{...}} 1' line for {dev_name} not found." - ) + with tempfile.NamedTemporaryFile(mode="r+", delete_on_close=False) as tmpfile: + path= tmpfile.name + main(output_path=path) + tmpfile.close() + + # Ensure write_metrics_to_textfile was called once + self.assertEqual(mock_write_metrics.call_count, 1) + + with open(path, "r") as f: + # Read the metrics from the file + metrics_lines = [line.strip() for line in f.readlines() if line.strip() and not line.startswith('#')] + print(f"Metrics lines: {metrics_lines}") + + # Generate expected metrics using the parse functions + expected_metrics = [] + expected_metrics.extend(parse_device_info(device_mock)) + expected_metrics.extend(parse_if_attributes(device_mock)) + + # Check that all expected metrics are present in the file + for expected in expected_metrics: + exp_metric, exp_val_str = expected.rsplit(" ", 1) + exp_val = float(exp_val_str) + found = any( + (exp_metric in line) and + math.isclose(float(line.rsplit(" ", 1)[1]), exp_val) + for line in metrics_lines + ) + self.assertTrue(found, f"Expected metric '{expected}' not found") + + # Check that smartctl_version metric is present + version_found = any(line.startswith("smartctl_version{") for line in metrics_lines) + self.assertTrue(version_found, "Expected 'smartctl_version' metric not found in output file.") + + # Check that the output file is not empty + self.assertTrue(metrics_lines, "Metrics output file is empty.") if __name__ == "__main__": unittest.main() From 783a68c19bbf2d43e4339e139d965136603e8563 Mon Sep 17 00:00:00 2001 From: technowhizz <7688823+technowhizz@users.noreply.github.com> Date: Thu, 1 May 2025 14:25:40 +0100 Subject: [PATCH 11/16] Add args and returns to doc string for parse_ifattributes --- etc/kayobe/ansible/scripts/smartmon.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/etc/kayobe/ansible/scripts/smartmon.py b/etc/kayobe/ansible/scripts/smartmon.py index f2dbdde13..27f735bcc 100644 --- a/etc/kayobe/ansible/scripts/smartmon.py +++ b/etc/kayobe/ansible/scripts/smartmon.py @@ -139,6 +139,11 @@ def parse_if_attributes(device): For any device type (ATA, NVMe, SCSI, etc.), we read device.if_attributes. We'll iterate over its public fields, convert them to snake_case, and if it's in SMARTMON_ATTRS and numeric, we produce metrics. + + Args: + device (Device): A pySMART Device object with attributes such as name, interface, etc. + Returns: + List[str]: A list of Prometheus formatted metric strings. """ metrics = [] From 4b1fc1f2598245b98530847e635d3a325da9919d Mon Sep 17 00:00:00 2001 From: technowhizz <7688823+technowhizz@users.noreply.github.com> Date: Thu, 1 May 2025 19:11:36 +0100 Subject: [PATCH 12/16] Ensure metric names start with smartmon --- etc/kayobe/ansible/scripts/smartmon.py | 16 +++++++------- etc/kayobe/ansible/scripts/test_smartmon.py | 24 ++++++++++----------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/etc/kayobe/ansible/scripts/smartmon.py b/etc/kayobe/ansible/scripts/smartmon.py index 27f735bcc..033ddbb94 100644 --- a/etc/kayobe/ansible/scripts/smartmon.py +++ b/etc/kayobe/ansible/scripts/smartmon.py @@ -118,18 +118,18 @@ def parse_device_info(device): metric_labels = f'disk="{device.name}",serial_number="{serial_number}",type="{device.interface}"' metrics = [ - f'device_info{{{label_str}}} 1.0', - f'device_smart_available{{{metric_labels}}} {float(1) if device.smart_capable else float(0)}', + f'smartmon_device_info{{{label_str}}} 1.0', + f'smartmon_device_smart_available{{{metric_labels}}} {float(1) if device.smart_capable else float(0)}', ] if device.smart_capable: metrics.append( - f'device_smart_enabled{{{metric_labels}}} {float(1) if device.smart_enabled else float(0)}' + f'smartmon_device_smart_enabled{{{metric_labels}}} {float(1) if device.smart_enabled else float(0)}' ) if device.assessment: is_healthy = 1 if device.assessment.upper() == "PASS" else 0 metrics.append( - f'device_smart_healthy{{{metric_labels}}} {float(is_healthy)}' + f'smartmon_device_smart_healthy{{{metric_labels}}} {float(is_healthy)}' ) return metrics @@ -166,7 +166,7 @@ def parse_if_attributes(device): snake_name = camel_to_snake(attr_name) if snake_name in SMARTMON_ATTRS and isinstance(val, (int, float)): - metrics.append(f"{snake_name}{{{labels}}} {float(val)}") + metrics.append(f"smartmon_{snake_name}{{{labels}}} {float(val)}") return metrics @@ -219,7 +219,7 @@ def main(output_path=None): version_num = "unknown" except Exception: version_num = "unknown" - all_metrics.append(f'smartctl_version{{version="{version_num}"}} 1') + all_metrics.append(f'smartmon_smartctl_version{{version="{version_num}"}} 1') dev_list = DeviceList() @@ -229,7 +229,7 @@ def main(output_path=None): serial_number = (dev.serial or "").lower() run_timestamp = int(datetime.datetime.now(datetime.timezone.utc).timestamp()) - all_metrics.append(f'smartctl_run{{disk="{disk_name}",type="{disk_type}"}} {run_timestamp}') + all_metrics.append(f'smartmon_smartctl_run{{disk="{disk_name}",type="{disk_type}"}} {run_timestamp}') active = 1 try: @@ -243,7 +243,7 @@ def main(output_path=None): active = 0 all_metrics.append( - f'device_active{{disk="{disk_name}",type="{disk_type}",serial_number="{serial_number}"}} {active}' + f'smartmon_device_active{{disk="{disk_name}",type="{disk_type}",serial_number="{serial_number}"}} {active}' ) if active == 0: continue diff --git a/etc/kayobe/ansible/scripts/test_smartmon.py b/etc/kayobe/ansible/scripts/test_smartmon.py index 38bc66e52..4749808a5 100644 --- a/etc/kayobe/ansible/scripts/test_smartmon.py +++ b/etc/kayobe/ansible/scripts/test_smartmon.py @@ -78,7 +78,7 @@ def _test_parse_device_info(self, fixture_name): # The device_info line should exist for every device device_info_found = any( - line.startswith("device_info{") and + line.startswith("smartmon_device_info{") and f'disk="{dev_name}"' in line and f'type="{dev_iface}"' in line and f'serial_number="{dev_serial}"' in line @@ -86,13 +86,13 @@ def _test_parse_device_info(self, fixture_name): ) self.assertTrue( device_info_found, - f"Expected a device_info metric line for {dev_name} but didn't find it." + f"Expected a smartmon_device_info metric line for {dev_name} but didn't find it." ) # If smart_capable is true, we expect device_smart_available = 1 if device_info.get("smart_capable"): smart_available_found = any( - line.startswith("device_smart_available{") and + line.startswith("smartmon_device_smart_available{") and f'disk="{dev_name}"' in line and f'serial_number="{dev_serial}"' in line and line.endswith(" 1.0") @@ -100,20 +100,20 @@ def _test_parse_device_info(self, fixture_name): ) self.assertTrue( smart_available_found, - f"Expected device_smart_available=1.0 for {dev_name}, not found." + f"Expected smartmon_device_smart_available=1.0 for {dev_name}, not found." ) # If smart_enabled is true, we expect device_smart_enabled = 1 if device_info.get("smart_enabled"): smart_enabled_found = any( - line.startswith("device_smart_enabled{") and + line.startswith("smartmon_device_smart_enabled{") and f'disk="{dev_name}"' in line and line.endswith(" 1.0") for line in metrics ) self.assertTrue( smart_enabled_found, - f"Expected device_smart_enabled=1.0 for {dev_name}, not found." + f"Expected smartmon_device_smart_enabled=1.0 for {dev_name}, not found." ) # device_smart_healthy if assessment in [PASS, WARN, FAIL] @@ -122,14 +122,14 @@ def _test_parse_device_info(self, fixture_name): if assessment in ["PASS", "WARN", "FAIL"]: expected_val = float(1) if assessment == "PASS" else float(0) smart_healthy_found = any( - line.startswith("device_smart_healthy{") and + line.startswith("smartmon_device_smart_healthy{") and f'disk="{dev_name}"' in line and line.endswith(f" {expected_val}") for line in metrics ) self.assertTrue( smart_healthy_found, - f"Expected device_smart_healthy={expected_val} for {dev_name}, not found." + f"Expected smartmon_device_smart_healthy={expected_val} for {dev_name}, not found." ) def test_parse_device_info(self): @@ -164,7 +164,7 @@ def _test_parse_if_attributes(self, fixture_name): if isinstance(attr_val, (int, float)) and snake_key in SMARTMON_ATTRS: expected_line = ( - f"{snake_key}{{disk=\"{dev_name}\",serial_number=\"{dev_serial}\",type=\"{dev_iface}\"}} {float(attr_val)}" + f"smartmon_{snake_key}{{disk=\"{dev_name}\",serial_number=\"{dev_serial}\",type=\"{dev_iface}\"}} {float(attr_val)}" ) self.assertIn( expected_line, @@ -175,7 +175,7 @@ def _test_parse_if_attributes(self, fixture_name): # If it's not in SMARTMON_ATTRS or not numeric, # we do NOT expect a line with that name+value unexpected_line = ( - f"{snake_key}{{disk=\"{dev_name}\",serial_number=\"{dev_serial}\",type=\"{dev_iface}\"}} {float(attr_val)}" + f"smartmon_{snake_key}{{disk=\"{dev_name}\",serial_number=\"{dev_serial}\",type=\"{dev_iface}\"}} {float(attr_val)}" ) self.assertNotIn( unexpected_line, @@ -268,8 +268,8 @@ def run_command_side_effect(cmd, parse_json=False): self.assertTrue(found, f"Expected metric '{expected}' not found") # Check that smartctl_version metric is present - version_found = any(line.startswith("smartctl_version{") for line in metrics_lines) - self.assertTrue(version_found, "Expected 'smartctl_version' metric not found in output file.") + version_found = any(line.startswith("smartmon_smartctl_version{") for line in metrics_lines) + self.assertTrue(version_found, "Expected 'smartmon_smartctl_version' metric not found in output file.") # Check that the output file is not empty self.assertTrue(metrics_lines, "Metrics output file is empty.") From 1bc57332572f78b6f9f5a19bc4574976c2af9069 Mon Sep 17 00:00:00 2001 From: technowhizz <7688823+technowhizz@users.noreply.github.com> Date: Thu, 1 May 2025 19:17:03 +0100 Subject: [PATCH 13/16] Update smartmon playbook for smartmon.py --- etc/kayobe/ansible/smartmon-tools.yml | 63 +++++++++++++++++++++++---- 1 file changed, 55 insertions(+), 8 deletions(-) diff --git a/etc/kayobe/ansible/smartmon-tools.yml b/etc/kayobe/ansible/smartmon-tools.yml index 00cdfa495..351ce0325 100644 --- a/etc/kayobe/ansible/smartmon-tools.yml +++ b/etc/kayobe/ansible/smartmon-tools.yml @@ -13,6 +13,30 @@ state: present become: true + - name: Ensure Python 3, venv, and pip are installed + ansible.builtin.package: + name: + - python3 + - python3-venv + - python3-pip + state: present + become: true + + - name: Create smartmon Python virtual environment + ansible.builtin.command: + cmd: python3 -m venv /opt/smartmon-venv + creates: /opt/smartmon-venv/bin/activate + become: true + + - name: Install prometheus_client and pySMART in venv + ansible.builtin.pip: + name: + - prometheus_client + - pySMART + virtualenv: /opt/smartmon-venv + virtualenv_python: python3 + become: true + - name: Ensure the cron/crond service is running ansible.builtin.service: name: "{{ 'cron' if ansible_facts['distribution'] == 'Ubuntu' else 'crond' }}" @@ -20,15 +44,15 @@ enabled: true become: true - - name: Copy smartmon.sh and nvmemon.sh from scripts folder + - name: Copy smartmon.py and nvmemon.sh from scripts folder ansible.builtin.copy: src: scripts/{{ item }} - dest: /usr/local/bin/ + dest: /usr/local/bin/{{ item }} owner: root group: root mode: "0700" loop: - - smartmon.sh + - smartmon.py - nvmemon.sh become: true @@ -40,16 +64,39 @@ job: /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin become: true - - name: Schedule cronjob to run both scripts every 5 minutes and save output to file + - name: Schedule cronjob to run smartmon.py every 5 minutes and save output to file ansible.builtin.cron: - name: SMART metrics for drive monitoring using {{ item }} + name: SMART metrics for drive monitoring using smartmon.py + user: root + minute: "*/5" + job: >- + umask 0022 && /opt/smartmon-venv/bin/python /usr/local/bin/smartmon.py --output /var/lib/docker/volumes/textfile/_data/smartmon.prom.temp && + mv -f /var/lib/docker/volumes/textfile/_data/smartmon.prom.temp /var/lib/docker/volumes/textfile/_data/smartmon.prom + become: true + + - name: Schedule cronjob to run nvmemon.sh every 5 minutes and save output to file + ansible.builtin.cron: + name: SMART metrics for drive monitoring using nvmemon.sh user: root minute: "*/5" job: >- - umask 0022 && /usr/local/bin/{{ item }}.sh > - /var/lib/docker/volumes/textfile/_data/{{ item }}.prom.temp && - mv -f /var/lib/docker/volumes/textfile/_data/{{ item }}.prom.temp /var/lib/docker/volumes/textfile/_data/{{ item }}.prom + umask 0022 && /usr/local/bin/nvmemon.sh > + /var/lib/docker/volumes/textfile/_data/nvmemon.prom.temp && + mv -f /var/lib/docker/volumes/textfile/_data/nvmemon.prom.temp /var/lib/docker/volumes/textfile/_data/nvmemon.prom + become: true + + - name: Remove old cronjobs if present + ansible.builtin.cron: + name: SMART metrics for drive monitoring using {{ item }} + user: root + state: absent + become: true loop: - smartmon - nvmemon + + - name: Remove old smartmon.sh if present + ansible.builtin.file: + path: /usr/local/bin/smartmon.sh + state: absent become: true From df8c944eed7305c62e5afb373a51a43865b0a161 Mon Sep 17 00:00:00 2001 From: technowhizz <7688823+technowhizz@users.noreply.github.com> Date: Thu, 1 May 2025 19:17:36 +0100 Subject: [PATCH 14/16] Add python script to generate test fixtures from real drives --- .../ansible/scripts/generate_fixtures.py | 118 ++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 etc/kayobe/ansible/scripts/generate_fixtures.py diff --git a/etc/kayobe/ansible/scripts/generate_fixtures.py b/etc/kayobe/ansible/scripts/generate_fixtures.py new file mode 100644 index 000000000..5f8f7cc64 --- /dev/null +++ b/etc/kayobe/ansible/scripts/generate_fixtures.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python3 +import json +import re +from pySMART import DeviceList + +SMARTMON_ATTRS = { + "airflow_temperature_cel", + "command_timeout", + "current_pending_sector", + "end_to_end_error", + "erase_fail_count", + "g_sense_error_rate", + "hardware_ecc_recovered", + "host_reads_32mib", + "host_reads_mib", + "host_writes_32mib", + "host_writes_mib", + "load_cycle_count", + "media_wearout_indicator", + "nand_writes_1gib", + "offline_uncorrectable", + "power_cycle_count", + "power_on_hours", + "program_fail_cnt_total", + "program_fail_count", + "raw_read_error_rate", + "reallocated_event_count", + "reallocated_sector_ct", + "reported_uncorrect", + "runtime_bad_block", + "sata_downshift_count", + "seek_error_rate", + "spin_retry_count", + "spin_up_time", + "start_stop_count", + "temperature_case", + "temperature_celsius", + "temperature_internal", + "total_lbas_read", + "total_lbas_written", + "udma_crc_error_count", + "unsafe_shutdown_count", + "unused_rsvd_blk_cnt_tot", + "wear_leveling_count", + "workld_host_reads_perc", + "workld_media_wear_indic", + "workload_minutes", + "critical_warning", + "temperature", + "available_spare", + "available_spare_threshold", + "percentage_used", + "data_units_read", + "data_units_written", + "host_reads", + "host_writes", + "controller_busy_time", + "power_cycles", + "unsafe_shutdowns", + "media_errors", + "num_err_log_entries", + "warning_temp_time", + "critical_comp_time", +} + +DISK_INFO = { + "name", + "interface", + "vendor", + "family", + "model", + "serial", + "firmware", + "smart_capable", + "smart_enabled", + "assessment", +} + +def camel_to_snake(name): + """ + Convert a CamelCase string to snake_case. + + Reference: https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case + """ + return re.sub(r'(? Date: Thu, 1 May 2025 19:20:22 +0100 Subject: [PATCH 15/16] Remove smartmon.sh --- etc/kayobe/ansible/scripts/smartmon.sh | 203 ------------------------- 1 file changed, 203 deletions(-) delete mode 100644 etc/kayobe/ansible/scripts/smartmon.sh diff --git a/etc/kayobe/ansible/scripts/smartmon.sh b/etc/kayobe/ansible/scripts/smartmon.sh deleted file mode 100644 index c08c46e60..000000000 --- a/etc/kayobe/ansible/scripts/smartmon.sh +++ /dev/null @@ -1,203 +0,0 @@ -#!/bin/bash -# Script informed by the collectd monitoring script for smartmontools (using smartctl) -# by Samuel B. (c) 2012 -# source at: http://devel.dob.sk/collectd-scripts/ - -# TODO: This probably needs to be a little more complex. The raw numbers can have more -# data in them than you'd think. -# http://arstechnica.com/civis/viewtopic.php?p=22062211 - -# Formatting done via shfmt -i 2 -# https://github.com/mvdan/sh - -parse_smartctl_attributes_awk="$( - cat <<'SMARTCTLAWK' -$1 ~ /^ *[0-9]+$/ && $2 ~ /^[a-zA-Z0-9_-]+$/ { - gsub(/-/, "_"); - printf "%s_value{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $4 - printf "%s_worst{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $5 - printf "%s_threshold{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $6 - printf "%s_raw_value{%s,smart_id=\"%s\"} %e\n", $2, labels, $1, $10 -} -SMARTCTLAWK -)" - -smartmon_attrs="$( - cat <<'SMARTMONATTRS' -airflow_temperature_cel -command_timeout -current_pending_sector -end_to_end_error -erase_fail_count -g_sense_error_rate -hardware_ecc_recovered -host_reads_32mib -host_reads_mib -host_writes_32mib -host_writes_mib -load_cycle_count -media_wearout_indicator -nand_writes_1gib -offline_uncorrectable -power_cycle_count -power_on_hours -program_fail_cnt_total -program_fail_count -raw_read_error_rate -reallocated_event_count -reallocated_sector_ct -reported_uncorrect -runtime_bad_block -sata_downshift_count -seek_error_rate -spin_retry_count -spin_up_time -start_stop_count -temperature_case -temperature_celsius -temperature_internal -total_lbas_read -total_lbas_written -udma_crc_error_count -unsafe_shutdown_count -unused_rsvd_blk_cnt_tot -wear_leveling_count -workld_host_reads_perc -workld_media_wear_indic -workload_minutes -SMARTMONATTRS -)" -smartmon_attrs="$(echo "${smartmon_attrs}" | xargs | tr ' ' '|')" - -parse_smartctl_attributes() { - local disk="$1" - local disk_type="$2" - local serial="$3" - local labels="disk=\"${disk}\",type=\"${disk_type}\",serial_number=\"${serial}\"" - sed 's/^ \+//g' | - awk -v labels="${labels}" "${parse_smartctl_attributes_awk}" 2>/dev/null | - tr '[:upper:]' '[:lower:]' | - grep -E "(${smartmon_attrs})" -} - -parse_smartctl_scsi_attributes() { - local disk="$1" - local disk_type="$2" - local serial="$3" - local labels="disk=\"${disk}\",type=\"${disk_type}\",serial_number=\"${serial}\"" - while read -r line; do - attr_type="$(echo "${line}" | tr '=' ':' | cut -f1 -d: | sed 's/^ \+//g' | tr ' ' '_')" - attr_value="$(echo "${line}" | tr '=' ':' | cut -f2 -d: | sed 's/^ \+//g')" - case "${attr_type}" in - number_of_hours_powered_up_) power_on="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;; - Current_Drive_Temperature) temp_cel="$(echo "${attr_value}" | cut -f1 -d' ' | awk '{ printf "%e\n", $1 }')" ;; - Blocks_sent_to_initiator_) lbas_read="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;; - Blocks_received_from_initiator_) lbas_written="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;; - Accumulated_start-stop_cycles) power_cycle="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;; - Elements_in_grown_defect_list) grown_defects="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;; - esac - done - [ -n "$power_on" ] && echo "power_on_hours_raw_value{${labels},smart_id=\"9\"} ${power_on}" - [ -n "$temp_cel" ] && echo "temperature_celsius_raw_value{${labels},smart_id=\"194\"} ${temp_cel}" - [ -n "$lbas_read" ] && echo "total_lbas_read_raw_value{${labels},smart_id=\"242\"} ${lbas_read}" - [ -n "$lbas_written" ] && echo "total_lbas_written_raw_value{${labels},smart_id=\"241\"} ${lbas_written}" - [ -n "$power_cycle" ] && echo "power_cycle_count_raw_value{${labels},smart_id=\"12\"} ${power_cycle}" - [ -n "$grown_defects" ] && echo "grown_defects_count_raw_value{${labels},smart_id=\"-1\"} ${grown_defects}" -} - -parse_smartctl_info() { - shopt -s nocasematch - local -i smart_available=0 smart_enabled=0 smart_healthy= - local disk="$1" disk_type="$2" - local model_family='' device_model='' serial_number='' fw_version='' vendor='' product='' revision='' lun_id='' - while read -r line; do - info_type="$(echo "${line}" | cut -f1 -d: | tr ' ' '_')" - info_value="$(echo "${line}" | cut -f2- -d: | sed 's/^ \+//g' | sed 's/"/\\"/')" - case "${info_type}" in - Model_Family) model_family="${info_value}" ;; - Device_Model) device_model="${info_value}" ;; - Serial_Number) serial_number="$(echo ${info_value} | tr '[:upper:]' '[:lower:]')" ;; - Firmware_Version) fw_version="${info_value}" ;; - Vendor) vendor="${info_value}" ;; - Product) product="${info_value}" ;; - Revision) revision="${info_value}" ;; - Logical_Unit_id) lun_id="${info_value}" ;; - esac - if [[ "${info_type}" == 'SMART_support_is' ]]; then - case "${info_value:0:7}" in - Enabled) smart_available=1; smart_enabled=1 ;; - Availab) smart_available=1; smart_enabled=0 ;; - Unavail) smart_available=0; smart_enabled=0 ;; - esac - fi - if [[ "${info_type}" == 'SMART_overall-health_self-assessment_test_result' ]]; then - case "${info_value:0:6}" in - PASSED) smart_healthy=1 ;; - *) smart_healthy=0 ;; - esac - elif [[ "${info_type}" == 'SMART_Health_Status' ]]; then - case "${info_value:0:2}" in - OK) smart_healthy=1 ;; - *) smart_healthy=0 ;; - esac - fi - done - echo "device_info{disk=\"${disk}\",type=\"${disk_type}\",vendor=\"${vendor}\",product=\"${product}\",revision=\"${revision}\",lun_id=\"${lun_id}\",model_family=\"${model_family}\",device_model=\"${device_model}\",serial_number=\"${serial_number}\",firmware_version=\"${fw_version}\"} 1" - echo "device_smart_available{disk=\"${disk}\",type=\"${disk_type}\",serial_number=\"${serial_number}\"} ${smart_available}" - [[ "${smart_available}" == "1" ]] && echo "device_smart_enabled{disk=\"${disk}\",type=\"${disk_type}\",serial_number=\"${serial_number}\"} ${smart_enabled}" - [[ "${smart_available}" == "1" ]] && [[ "${smart_healthy}" != "" ]] && echo "device_smart_healthy{disk=\"${disk}\",type=\"${disk_type}\",serial_number=\"${serial_number}\"} ${smart_healthy}" -} - -output_format_awk="$( - cat <<'OUTPUTAWK' -BEGIN { v = "" } -v != $1 { - print "# HELP smartmon_" $1 " SMART metric " $1; - print "# TYPE smartmon_" $1 " gauge"; - v = $1 -} -{print "smartmon_" $0} -OUTPUTAWK -)" - -format_output() { - sort | - awk -F'{' "${output_format_awk}" -} - -smartctl_version="$(/usr/sbin/smartctl -V | head -n1 | awk '$1 == "smartctl" {print $2}')" - -echo "smartctl_version{version=\"${smartctl_version}\"} 1" | format_output - -if [[ "$(expr "${smartctl_version}" : '\([0-9]*\)\..*')" -lt 6 ]]; then - exit -fi - -device_list="$(/usr/sbin/smartctl --scan-open | awk '/^\/dev/{print $1 "|" $3}')" - -for device in ${device_list}; do - disk="$(echo "${device}" | cut -f1 -d'|')" - type="$(echo "${device}" | cut -f2 -d'|')" - # Use REGEX to extract the serial number from the parsed information and save that to a variable - serial_number="$(/usr/sbin/smartctl -i -H -d "${type}" "${disk}" | parse_smartctl_info "${disk}" "${type}"| sed -E ':a;N;$!ba;s/.*serial_number=\"([^"]+)\".*/\1/g' | sed -E 's/^device_info\{.*//g')" - active=1 - echo "smartctl_run{disk=\"${disk}\",type=\"${type}\"}" "$(TZ=UTC date '+%s')" - # Check if the device is in a low-power mode - /usr/sbin/smartctl -n standby -d "${type}" "${disk}" > /dev/null || active=0 - echo "device_active{disk=\"${disk}\",type=\"${type}\"}" "${active}" - # Skip further metrics to prevent the disk from spinning up - test ${active} -eq 0 && continue - # Get the SMART information and health - /usr/sbin/smartctl -i -H -d "${type}" "${disk}" | parse_smartctl_info "${disk}" "${type}" - # Get the SMART attributes - case ${type} in - sat) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_attributes "${disk}" "${type}" "${serial_number}" ;; - sat+megaraid*) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_attributes "${disk}" "${type}" "${serial_number}" ;; - scsi) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_scsi_attributes "${disk}" "${type}" "${serial_number}" ;; - megaraid*) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_scsi_attributes "${disk}" "${type}" "${serial_number}" ;; - *) - (>&2 echo "disk type is not sat, scsi or megaraid but ${type}") - exit - ;; - esac -done | format_output From 5142d7905d27ff356b824fb19d6f1090e1c63ca1 Mon Sep 17 00:00:00 2001 From: technowhizz <7688823+technowhizz@users.noreply.github.com> Date: Thu, 1 May 2025 19:20:57 +0100 Subject: [PATCH 16/16] Add test fixture for Dell ENT NVMe --- .../scripts/tests/Dell_ENT_NVMe_CM6.json | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 etc/kayobe/ansible/scripts/tests/Dell_ENT_NVMe_CM6.json diff --git a/etc/kayobe/ansible/scripts/tests/Dell_ENT_NVMe_CM6.json b/etc/kayobe/ansible/scripts/tests/Dell_ENT_NVMe_CM6.json new file mode 100644 index 000000000..d867910ae --- /dev/null +++ b/etc/kayobe/ansible/scripts/tests/Dell_ENT_NVMe_CM6.json @@ -0,0 +1,26 @@ +{ + "device_info": { + "assessment": "PASS", + "firmware": "2.1.8", + "interface": "nvme", + "model": "Dell Ent NVMe CM6 RI 7.68TB", + "name": "nvme8", + "serial": "Y2Q0A0BPTCF8", + "smart_capable": true, + "smart_enabled": true, + "vendor": "Dell" + }, + "if_attributes": { + "availableSpare": 100, + "availableSpareThreshold": 10, + "controllerBusyTime": 2478, + "criticalWarning": 0, + "dataUnitsRead": 177817765, + "dataUnitsWritten": 127992843, + "percentageUsed": 1, + "powerCycles": 750, + "powerOnHours": 17427, + "temperature": 36, + "unsafeShutdowns": 37 + } +}