From 878d19ecc0ee423e4887d26104417acd75ddb2d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Sun, 2 Feb 2020 18:03:31 +0100 Subject: [PATCH] Add microstate accounting collector (#98) * Add microstate accounting collector * Add _microseconds suffix as per prometheus requirements here https://prometheus.io/docs/practices/naming/#metric-and-label-naming * Use seconds to represent total time Also convert msacc statistics to `native` time, then divide to get seconds. Part of deadtrickster/prometheus.erl#98 * microstate stats use perf_counter time unit, so no need to do multiple conversions Co-authored-by: Luke Bakken --- .../vm/prometheus_vm_msacc_collector.erl | 276 ++++++++++++++++++ src/prometheus_collector.erl | 1 + .../prometheus_vm_msacc_collector_tests.erl | 135 +++++++++ test/eunit/prometheus_collector_tests.erl | 1 + test/eunit/prometheus_registry_tests.erl | 1 + 5 files changed, 414 insertions(+) create mode 100644 src/collectors/vm/prometheus_vm_msacc_collector.erl create mode 100644 test/eunit/collectors/vm/prometheus_vm_msacc_collector_tests.erl diff --git a/src/collectors/vm/prometheus_vm_msacc_collector.erl b/src/collectors/vm/prometheus_vm_msacc_collector.erl new file mode 100644 index 00000000..f9bf1a20 --- /dev/null +++ b/src/collectors/vm/prometheus_vm_msacc_collector.erl @@ -0,0 +1,276 @@ +%% @doc +%% Collects microstate accounting metrics using +%% +%% erlang:statistics(microstate_accounting) +%% . +%% +%% In order for values to increase, microstate +%% accounting must be enabled. This is done by +%% calling erlang:system_flag(microstate_accounting, true). +%% +%% ==Exported metrics== +%% Some metrics are not available by default. They require a VM +%% configured with ./configure --with-microstate-accounting=extra. +%% +%% +%% +%% ==Configuration== +%% +%% Metrics exported by this collector can be configured via +%% `vm_msacc_collector_metrics' key of `prometheus' app environment. +%% +%% Options are the same as MSAcc_Thread_State return type from +%% +%% erlang:statistics(microstate_accounting) +%% with `_seconds_total' as the suffix: +%% +%% +%% By default all metrics are enabled as far as Prometheus is concerned, +%% although some metrics could not be enabled by the VM itself. +%% @end +-module(prometheus_vm_msacc_collector). + +-export([deregister_cleanup/1, + collect_mf/2]). + +-import(prometheus_model_helpers, [create_mf/4]). + +-include("prometheus.hrl"). + +-behaviour(prometheus_collector). + +%%==================================================================== +%% Macros +%%==================================================================== + +-define(METRIC_NAME_PREFIX, "erlang_vm_msacc_"). + +%%==================================================================== +%% Collector API +%%==================================================================== + +%% @private +deregister_cleanup(_) -> ok. + +-spec collect_mf(_Registry, Callback) -> ok when + _Registry :: prometheus_registry:registry(), + Callback :: prometheus_collector:callback(). +%% @private +collect_mf(_Registry, Callback) -> + Metrics = metrics(), + EnabledMetrics = enabled_metrics(), + [add_metric_family(Metric, Callback) + || {Name, _, _, _}=Metric <- Metrics, metric_enabled(Name, EnabledMetrics)], + ok. + +add_metric_family({Name, Type, Help, Metrics}, Callback) -> + Callback(create_mf(?METRIC_NAME(Name), Help, Type, Metrics)). + +%%==================================================================== +%% Private Parts +%%==================================================================== + +metrics() -> + Data = erlang:statistics(microstate_accounting), + SecondAsPerfCounter = erlang:convert_time_unit(1, second, perf_counter), + [ + %% Base states. + {aux_seconds_total, counter, + "Total time in seconds spent handling auxiliary jobs.", + metric(aux, Data, SecondAsPerfCounter)}, + {check_io_seconds_total, counter, + "Total time in seconds spent checking for new I/O events.", + metric(check_io, Data, SecondAsPerfCounter)}, + {emulator_seconds_total, counter, + "Total time in seconds spent executing Erlang processes.", + metric(emulator, Data, SecondAsPerfCounter)}, + {gc_seconds_total, counter, + "Total time in seconds spent doing garbage collection. " + "When extra states are enabled this is the time spent " + "doing non-fullsweep garbage collections.", + metric(gc, Data, SecondAsPerfCounter)}, + {other_seconds_total, counter, + "Total time in seconds spent doing unaccounted things.", + metric(other, Data, SecondAsPerfCounter)}, + {port_seconds_total, counter, + "Total time in seconds spent executing ports.", + metric(port, Data, SecondAsPerfCounter)}, + {sleep_seconds_total, counter, + "Total time in seconds spent sleeping.", + metric(sleep, Data, SecondAsPerfCounter)}, + %% Extra states. + {alloc_seconds_total, counter, + "Total time in seconds spent managing memory. " + "Without extra states this time is spread out over all other states.", + metric(alloc, Data, SecondAsPerfCounter)}, + {bif_seconds_total, counter, + "Total time in seconds spent in BIFs. " + "Without extra states this time is part of the 'emulator' state.", + metric(bif, Data, SecondAsPerfCounter)}, + {busy_wait_seconds_total, counter, + "Total time in seconds spent busy waiting. " + "Without extra states this time is part of the 'other' state.", + metric(busy_wait, Data, SecondAsPerfCounter)}, + {ets_seconds_total, counter, + "Total time in seconds spent executing ETS BIFs. " + "Without extra states this time is part of the 'emulator' state.", + metric(ets, Data, SecondAsPerfCounter)}, + {gc_full_seconds_total, counter, + "Total time in seconds spent doing fullsweep garbage collection. " + "Without extra states this time is part of the 'gc' state.", + metric(gc_full, Data, SecondAsPerfCounter)}, + {nif_seconds_total, counter, + "Total time in seconds spent in NIFs. " + "Without extra states this time is part of the 'emulator' state.", + metric(nif, Data, SecondAsPerfCounter)}, + {send_seconds_total, counter, + "Total time in seconds spent sending messages (processes only). " + "Without extra states this time is part of the 'emulator' state.", + metric(send, Data, SecondAsPerfCounter)}, + {timers_seconds_total, counter, + "Total time in seconds spent managing timers. " + "Without extra states this time is part of the 'other' state.", + metric(timers, Data, SecondAsPerfCounter)} + ]. + +metric(Counter, Data, SecondAsPerfCounter) -> + [{[{type, Type}, {id, ID}], Value / SecondAsPerfCounter} + || #{type := Type, id := ID, counters := #{Counter := Value}} <- Data]. + +enabled_metrics() -> + application:get_env(prometheus, vm_msacc_collector_metrics, all). + +metric_enabled(Name, Metrics) -> + Metrics =:= all orelse lists:member(Name, Metrics). diff --git a/src/prometheus_collector.erl b/src/prometheus_collector.erl index 6202f767..33ab3021 100644 --- a/src/prometheus_collector.erl +++ b/src/prometheus_collector.erl @@ -86,6 +86,7 @@ prometheus_summary, prometheus_vm_dist_collector, prometheus_vm_memory_collector, + prometheus_vm_msacc_collector, prometheus_vm_statistics_collector, prometheus_vm_system_info_collector]). diff --git a/test/eunit/collectors/vm/prometheus_vm_msacc_collector_tests.erl b/test/eunit/collectors/vm/prometheus_vm_msacc_collector_tests.erl new file mode 100644 index 00000000..efb5183b --- /dev/null +++ b/test/eunit/collectors/vm/prometheus_vm_msacc_collector_tests.erl @@ -0,0 +1,135 @@ +-module(prometheus_vm_msacc_collector_tests). + +-include_lib("eunit/include/eunit.hrl"). + +prometheus_format_test_() -> + {foreach, + fun prometheus_eunit_common:start/0, + fun prometheus_eunit_common:stop/1, + [fun test_default_metrics/1, + fun test_all_metrics/1, + fun test_custom_metrics/1, + fun test_global_labels/1]}. + +test_default_metrics(_) -> + prometheus_registry:register_collector(prometheus_vm_msacc_collector), + Metrics = prometheus_text_format:format(), + [ + %% Base. + ?_assertMatch({match, _}, re:run(Metrics, "erlang_vm_msacc_aux_seconds_total")), + ?_assertMatch({match, _}, re:run(Metrics, "erlang_vm_msacc_check_io_seconds_total")), + ?_assertMatch({match, _}, re:run(Metrics, "erlang_vm_msacc_emulator_seconds_total")), + ?_assertMatch({match, _}, re:run(Metrics, "erlang_vm_msacc_gc_seconds_total")), + ?_assertMatch({match, _}, re:run(Metrics, "erlang_vm_msacc_other_seconds_total")), + ?_assertMatch({match, _}, re:run(Metrics, "erlang_vm_msacc_port_seconds_total")), + ?_assertMatch({match, _}, re:run(Metrics, "erlang_vm_msacc_sleep_seconds_total")), + %% Extra. + ?_assertMatch({match, _}, re:run(Metrics, "erlang_vm_msacc_alloc_seconds_total")), + ?_assertMatch({match, _}, re:run(Metrics, "erlang_vm_msacc_bif_seconds_total")), + ?_assertMatch({match, _}, re:run(Metrics, "erlang_vm_msacc_busy_wait_seconds_total")), + ?_assertMatch({match, _}, re:run(Metrics, "erlang_vm_msacc_ets_seconds_total")), + ?_assertMatch({match, _}, re:run(Metrics, "erlang_vm_msacc_gc_full_seconds_total")), + ?_assertMatch({match, _}, re:run(Metrics, "erlang_vm_msacc_nif_seconds_total")), + ?_assertMatch({match, _}, re:run(Metrics, "erlang_vm_msacc_send_seconds_total")), + ?_assertMatch({match, _}, re:run(Metrics, "erlang_vm_msacc_timers_seconds_total")) + ]. + + +test_all_metrics(_) -> + try + application:set_env(prometheus, vm_msacc_collector_metrics, + [ + %% Base. + aux_seconds_total, + check_io_seconds_total, + emulator_seconds_total, + gc_seconds_total, + other_seconds_total, + port_seconds_total, + sleep_seconds_total, + %% Extra. + alloc_seconds_total, + bif_seconds_total, + busy_wait_seconds_total, + ets_seconds_total, + gc_full_seconds_total, + nif_seconds_total, + send_seconds_total, + timers_seconds_total + ]), + prometheus_registry:register_collector(prometheus_vm_msacc_collector), + Metrics = prometheus_text_format:format(), + [ + %% Base. + ?_assertMatch({match, _}, re:run(Metrics, "erlang_vm_msacc_aux_seconds_total")), + ?_assertMatch({match, _}, re:run(Metrics, "erlang_vm_msacc_check_io_seconds_total")), + ?_assertMatch({match, _}, re:run(Metrics, "erlang_vm_msacc_emulator_seconds_total")), + ?_assertMatch({match, _}, re:run(Metrics, "erlang_vm_msacc_gc_seconds_total")), + ?_assertMatch({match, _}, re:run(Metrics, "erlang_vm_msacc_other_seconds_total")), + ?_assertMatch({match, _}, re:run(Metrics, "erlang_vm_msacc_port_seconds_total")), + ?_assertMatch({match, _}, re:run(Metrics, "erlang_vm_msacc_sleep_seconds_total")), + %% Extra. + ?_assertMatch({match, _}, re:run(Metrics, "erlang_vm_msacc_alloc_seconds_total")), + ?_assertMatch({match, _}, re:run(Metrics, "erlang_vm_msacc_bif_seconds_total")), + ?_assertMatch({match, _}, re:run(Metrics, "erlang_vm_msacc_busy_wait_seconds_total")), + ?_assertMatch({match, _}, re:run(Metrics, "erlang_vm_msacc_ets_seconds_total")), + ?_assertMatch({match, _}, re:run(Metrics, "erlang_vm_msacc_gc_full_seconds_total")), + ?_assertMatch({match, _}, re:run(Metrics, "erlang_vm_msacc_nif_seconds_total")), + ?_assertMatch({match, _}, re:run(Metrics, "erlang_vm_msacc_send_seconds_total")), + ?_assertMatch({match, _}, re:run(Metrics, "erlang_vm_msacc_timers_seconds_total")) + ] + + after + application:unset_env(prometheus, vm_msacc_collector_metrics) + end. + + +test_custom_metrics(_) -> + try + application:set_env(prometheus, vm_msacc_collector_metrics, + [ + aux_seconds_total, + emulator_seconds_total, + gc_seconds_total, + gc_full_seconds_total, + nif_seconds_total + ]), + prometheus_registry:register_collector(prometheus_vm_msacc_collector), + Metrics = prometheus_text_format:format(), + [ + %% Base. + ?_assertMatch({match, _}, re:run(Metrics, "erlang_vm_msacc_aux_seconds_total")), + ?_assertMatch(nomatch, re:run(Metrics, "erlang_vm_msacc_check_io_seconds_total")), + ?_assertMatch({match, _}, re:run(Metrics, "erlang_vm_msacc_emulator_seconds_total")), + ?_assertMatch({match, _}, re:run(Metrics, "erlang_vm_msacc_gc_seconds_total")), + ?_assertMatch(nomatch, re:run(Metrics, "erlang_vm_msacc_other_seconds_total")), + ?_assertMatch(nomatch, re:run(Metrics, "erlang_vm_msacc_port_seconds_total")), + ?_assertMatch(nomatch, re:run(Metrics, "erlang_vm_msacc_sleep_seconds_total")), + %% Extra. + ?_assertMatch(nomatch, re:run(Metrics, "erlang_vm_msacc_alloc_seconds_total")), + ?_assertMatch(nomatch, re:run(Metrics, "erlang_vm_msacc_bif_seconds_total")), + ?_assertMatch(nomatch, re:run(Metrics, "erlang_vm_msacc_busy_wait_seconds_total")), + ?_assertMatch(nomatch, re:run(Metrics, "erlang_vm_msacc_ets_seconds_total")), + ?_assertMatch({match, _}, re:run(Metrics, "erlang_vm_msacc_gc_full_seconds_total")), + ?_assertMatch({match, _}, re:run(Metrics, "erlang_vm_msacc_nif_seconds_total")), + ?_assertMatch(nomatch, re:run(Metrics, "erlang_vm_msacc_send_seconds_total")), + ?_assertMatch(nomatch, re:run(Metrics, "erlang_vm_msacc_timers_seconds_total")) + ] + + after + application:unset_env(prometheus, vm_msacc_collector_metrics) + end. + + +test_global_labels(_) -> + Metrics = try + prometheus:start(), + application:set_env(prometheus, global_labels, [{node, node()}]), + prometheus_registry:register_collector(prometheus_vm_msacc_collector), + prometheus_text_format:format() + after + application:unset_env(prometheus, global_labels) + end, + [ + ?_assertMatch({match, _}, re:run(Metrics, "erlang_vm_msacc_aux_seconds_total{node=")) + ]. diff --git a/test/eunit/prometheus_collector_tests.erl b/test/eunit/prometheus_collector_tests.erl index 8649e20b..d8d7052c 100644 --- a/test/eunit/prometheus_collector_tests.erl +++ b/test/eunit/prometheus_collector_tests.erl @@ -10,6 +10,7 @@ prometheus_summary, prometheus_vm_dist_collector, prometheus_vm_memory_collector, + prometheus_vm_msacc_collector, prometheus_vm_statistics_collector, prometheus_vm_system_info_collector]). diff --git a/test/eunit/prometheus_registry_tests.erl b/test/eunit/prometheus_registry_tests.erl index 866ff0e0..2b445dca 100644 --- a/test/eunit/prometheus_registry_tests.erl +++ b/test/eunit/prometheus_registry_tests.erl @@ -32,6 +32,7 @@ default_registry() -> prometheus_summary, prometheus_vm_dist_collector, prometheus_vm_memory_collector, + prometheus_vm_msacc_collector, prometheus_vm_statistics_collector, prometheus_vm_system_info_collector], prometheus_registry:collectors(default)).