diff --git a/Makefile b/Makefile index a4896b2f7..bc988e9e2 100644 --- a/Makefile +++ b/Makefile @@ -34,6 +34,7 @@ BIN_PLATFORM ?= linux BRANCH := $(shell git rev-parse --abbrev-ref HEAD) LINT_EXISTS := $(shell which golangci-lint) GOVULNCHECK_EXISTS := $(shell which govulncheck) +MKDOCS_EXISTS := $(shell which mkdocs) help: ## Display this help @@ -103,6 +104,15 @@ ifeq (${GOVULNCHECK_EXISTS}, ) endif govulncheck ./... +mkdocs: +ifeq (${MKDOCS_EXISTS}, ) + @echo + @echo "The mkdocs task requires mkdocs-material. See https://squidfunk.github.io/mkdocs-material/getting-started/ for installation instructions." + @echo + @exit 1 +endif + mkdocs serve + build: clean deps fmt harvest fetch-asup ## Build the project package: clean deps build test dist-tar ## Package Harvest binary @@ -171,5 +181,6 @@ dev: build lint govulncheck fetch-asup: @./.github/fetch-asup ${ASUP_BIN} ${ASUP_BIN_VERSION} ${BIN_PLATFORM} 2>/dev/null #Suppress error in case of no internet connectivity +docs: mkdocs ## Serve docs for local dev diff --git a/README.md b/README.md index 8f6aca348..0bec5533c 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ We provide pre-compiled binaries for Linux, RPMs, and Debs. ## Pre-compiled Binaries -### Installation +### Installation Visit the [Releases page](https://github.com/NetApp/harvest/releases) and copy the `tar.gz` link you want to download. For example, to download the `v21.08.0` release: ``` wget https://github.com/NetApp/harvest/releases/download/v21.08.0/harvest-21.08.0-6_linux_amd64.tar.gz ``` @@ -47,8 +47,8 @@ curl -L -O https://github.com/NetApp/harvest/releases/download/v21.08.0/harvest- It's best to run Harvest as a non-root user. Make sure the user running Harvest can write to `/var/log/harvest/` or tell Harvest to write the logs somewhere else with the `HARVEST_LOGS` environment variable. -If something goes wrong, examine the logs files in `/var/log/harvest`, check out -the [troubleshooting](https://github.com/NetApp/harvest/wiki/Troubleshooting-Harvest) section of the wiki and jump +If something goes wrong, examine the log files in `/var/log/harvest`, check out +the [troubleshooting](https://github.com/NetApp/harvest/wiki/Troubleshooting-Harvest) section of the wiki and jump onto [Discord](https://github.com/NetApp/harvest/blob/main/SUPPORT.md#getting-help) and ask for help. ### Upgrade @@ -87,7 +87,7 @@ Download the latest rpm of [Harvest](https://github.com/NetApp/harvest/releases/ Once the installation has finished, edit the [harvest.yml configuration](#harvest-configuration) file located in `/opt/harvest/harvest.yml` -After editing `/opt/harvest/harvest.yml`, manage Harvest with `systemctl start|stop|restart harvest`. +After editing `/opt/harvest/harvest.yml`, manage Harvest with `systemctl start|stop|restart harvest`. After an upgrade, you should re-import all dashboards (via either the Grafana import CLI or the Grafana UI) to pick up any new dashboard enhancements. @@ -113,7 +113,7 @@ Download the latest deb of [Harvest](https://github.com/NetApp/harvest/releases/ Once the installation has finished, edit the [harvest.yml configuration](#harvest-configuration) file located in `/opt/harvest/harvest.yml` -After editing `/opt/harvest/harvest.yml`, manage Harvest with `systemctl start|stop|restart harvest`. +After editing `/opt/harvest/harvest.yml`, manage Harvest with `systemctl start|stop|restart harvest`. After an upgrade, you should re-import all dashboards (via either the Grafana import CLI or the Grafana UI) to pick up any new dashboard enhancements.
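For example, after editing the configuration or upgrading the package, a typical sequence might be (standard systemd commands; the `harvest` unit name is the one installed by the rpm/deb packages):

```
sudo systemctl restart harvest
sudo systemctl status harvest
```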
@@ -187,7 +187,7 @@ Note: the current dashboards specify Prometheus as the datasource. If you use th ## 4. Verify the metrics -If you use a Prometheus Exporter, open a browser and navigate to [http://0.0.0.0:12990/](http://0.0.0.0:12990/) (replace `12990` with the port number of your poller). This is the Harvest created HTTP end-point for your Prometheus exporter. This page provides a real-time generated list of running collectors and names of exported metrics. +If you use a Prometheus Exporter, open a browser and navigate to [http://0.0.0.0:12990/](http://0.0.0.0:12990/) (replace `12990` with the port number of your poller). This is the Harvest-created HTTP endpoint for your Prometheus exporter. This page provides a real-time generated list of running collectors and names of exported metrics. The metric data that's exposed for Prometheus to scrape is available at [http://0.0.0.0:12990/metrics/](http://0.0.0.0:12990/metrics/). For more help on how to configure Prometheus DB, see the [Prometheus exporter](cmd/exporters/prometheus/README.md) documentation. @@ -195,32 +195,32 @@ If you can't access the URL, check the logs of your pollers. These are located i ## 5. (Optional) Setup Systemd service files -If you're running Harvest on a system with Systemd, you may want to [take advantage of systemd instantiated units](https://github.com/NetApp/harvest/tree/main/service/contrib) to manage your pollers. +If you're running Harvest on a system with Systemd, you may want to [take advantage of systemd instantiated units](https://github.com/NetApp/harvest/tree/main/service/contrib) to manage your pollers. -# Harvest Configuration +# Harvest Configuration The main configuration file, `harvest.yml`, consists of the following sections, described below: ## Pollers -All pollers are defined in `harvest.yml`, the main configuration file of Harvest, under the section `Pollers`. - -| parameter | type | description | default | -|------------------------|------------------------------------------------|-----------------------------------------------------------------------------------|--------------------| -| Poller name (header) | **required** | Poller name, user-defined value | | -| `datacenter` | **required** | Datacenter name, user-defined value | | -| `addr` | required by some collectors | IPv4 or FQDN of the target system | | -| `collectors` | **required** | List of collectors to run for this poller | | -| `exporters` | **required** | List of exporter names from the `Exporters` section. Note: this should be the name of the exporter (e.g. `prometheus1`), not the value of the `exporter` key (e.g. `Prometheus`) | | -| `auth_style` | required by Zapi* collectors | Either `basic_auth` or `certificate_auth` | `basic_auth` | -| `username`, `password` | required if `auth_style` is `basic_auth` | | | -| `ssl_cert`, `ssl_key` | optional if `auth_style` is `certificate_auth` | Absolute paths to SSL (client) certificate and key used to authenticate with the target system.

If not provided, the poller will look for `.key` and `.pem` in `$HARVEST_HOME/cert/`.

To create certificates for ONTAP systems, see [using certificate authentication](docs/AuthAndPermissions.md#using-certificate-authentication) | | -| `use_insecure_tls` | optional, bool | If true, disable TLS verification when connecting to ONTAP cluster | false | -| `credentials_file` | optional, string | Path to a yaml file that contains cluster credentials. The file should have the same shape as `harvest.yml`. See [here](#credentials file) for examples. Path can be relative to `harvest.yml` or absolute | | - | `tls_min_version` | optional, string | Minimum TLS version to use when connecting to ONTAP cluster: One of tls10, tls11, tls12 or tls13 | Platform decides | -| `labels` | optional, list of key-value pairs | Each of the key-value pairs will be added to a poller's metrics. Details [below](#labels) | | -| `log_max_bytes` | | Maximum size of the log file before it will be rotated | `5_242_880` (5 MB) | -| `log_max_files` | | Number of rotated log files to keep | `5` | -| `log` | optional, list of collector names | Matching collectors log their ZAPI request/response | | +All pollers are defined in `harvest.yml`, the main configuration file of Harvest, under the section `Pollers`. + +| parameter | type | description | default | +|------------------------|------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------| +| Poller name (header) | **required** | Poller name, user-defined value | | +| `datacenter` | **required** | Datacenter name, user-defined value | | +| `addr` | required by some collectors | IPv4 or FQDN of the target system | | +| `collectors` | **required** | List of collectors to run for this poller | | +| `exporters` | **required** | List of exporter names from the `Exporters` section. Note: this should be the name of the exporter (e.g. `prometheus1`), not the value of the `exporter` key (e.g. `Prometheus`) | | +| `auth_style` | required by Zapi* collectors | Either `basic_auth` or `certificate_auth` | `basic_auth` | +| `username`, `password` | required if `auth_style` is `basic_auth` | | | +| `ssl_cert`, `ssl_key` | optional if `auth_style` is `certificate_auth` | Absolute paths to SSL (client) certificate and key used to authenticate with the target system.

If not provided, the poller will look for `.key` and `.pem` in `$HARVEST_HOME/cert/`.

To create certificates for ONTAP systems, see [using certificate authentication](pkg/docs/AuthAndPermissions.md#using-certificate-authentication) | | +| `use_insecure_tls` | optional, bool | If true, disable TLS verification when connecting to ONTAP cluster | false | +| `credentials_file` | optional, string | Path to a yaml file that contains cluster credentials. The file should have the same shape as `harvest.yml`. See [here](#credentials-file) for examples. Path can be relative to `harvest.yml` or absolute | | +| `tls_min_version` | optional, string | Minimum TLS version to use when connecting to ONTAP cluster: One of tls10, tls11, tls12 or tls13 | Platform decides | +| `labels` | optional, list of key-value pairs | Each of the key-value pairs will be added to a poller's metrics. Details [below](#labels) | | +| `log_max_bytes` | | Maximum size of the log file before it will be rotated | `5_242_880` (5 MB) | +| `log_max_files` | | Number of rotated log files to keep | `5` | +| `log` | optional, list of collector names | Matching collectors log their ZAPI request/response | | ## Defaults This section is optional. If there are parameters identical for all your pollers (e.g. datacenter, authentication method, login preferences), they can be grouped under this section. The poller section will be checked first and if the values aren't found there, the defaults will be consulted. @@ -258,7 +258,7 @@ Tools: Collectors are configured by their own configuration files (templates), which are stored in subdirectories in [conf/](conf/). Most collectors run concurrently and collect a subset of related metrics. For example, node related metrics are grouped together and run independently of the disk related metrics. Below is a snippet from `conf/zapi/default.yaml` -In this example, the `default.yaml` template contains a list of objects (e.g. Node) that reference sub-templates (e.g. node.yaml). This decomposition groups related metrics together and at runtime, a `Zapi` collector per object will be created and each of these collectors will run concurrently. +In this example, the `default.yaml` template contains a list of objects (e.g. Node) that reference sub-templates (e.g. node.yaml). This decomposition groups related metrics together and at runtime, a `Zapi` collector per object will be created and each of these collectors will run concurrently. Using the snippet below, we expect there to be four `Zapi` collectors running, each with a different subtemplate and object. @@ -309,11 +309,11 @@ Keep in mind that each unique combination of key-value pairs increases the amoun ## Credentials File If you would rather not list cluster credentials in your `harvest.yml`, you can use the `credentials_file` section -in your `harvest.yml` to point to a file that contains the credentials. +in your `harvest.yml` to point to a file that contains the credentials. At runtime, the `credentials_file` will be read and the included credentials will be used to authenticate with the matching cluster(s). -This is handy when integrating with 3rd party credential stores. -See #884 for examples. +This is handy when integrating with third-party credential stores. +See #884 for examples. The format of the `credentials_file` is similar to `harvest.yml` and can contain multiple cluster credentials.
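A minimal sketch of this layout (it mirrors the example shown later in `configure-harvest-basic.md`; the cluster name, file path, and credentials are placeholders):

```yaml
# snippet from harvest.yml
Pollers:
  cluster1:
    addr: 10.193.48.11
    credentials_file: secrets/cluster1.yml
    exporters:
      - prom1
```

```yaml
# contents of secrets/cluster1.yml
Pollers:
  cluster1:
    username: harvest
    password: foo
```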
diff --git a/docs/assets/grafana/grafana_api.png b/docs/assets/grafana/grafana_api.png new file mode 100644 index 000000000..8a8929828 Binary files /dev/null and b/docs/assets/grafana/grafana_api.png differ diff --git a/docs/assets/harvest.svg b/docs/assets/harvest.svg new file mode 100644 index 000000000..ff0a223ed --- /dev/null +++ b/docs/assets/harvest.svg @@ -0,0 +1,154 @@ [154 lines of SVG markup] \ No newline at end of file diff --git a/docs/assets/matrix.png b/docs/assets/matrix.png new file mode 100644 index 000000000..d42ff0faf Binary files /dev/null and b/docs/assets/matrix.png differ diff --git a/docs/assets/prepare-ontap/dashboard_cluster.png b/docs/assets/prepare-ontap/dashboard_cluster.png new file mode 100644 index 000000000..99b0acbf9 Binary files /dev/null and b/docs/assets/prepare-ontap/dashboard_cluster.png differ diff --git a/docs/assets/prepare-ontap/dashboard_shelf.png b/docs/assets/prepare-ontap/dashboard_shelf.png new file mode 100644 index 000000000..8b634fd01 Binary files /dev/null and b/docs/assets/prepare-ontap/dashboard_shelf.png differ diff --git a/docs/assets/prepare-ontap/ontap_user_sm_0.png b/docs/assets/prepare-ontap/ontap_user_sm_0.png new file mode 100644 index 000000000..a8d00bdc2 Binary files /dev/null and b/docs/assets/prepare-ontap/ontap_user_sm_0.png differ diff --git a/docs/assets/prepare-ontap/ontap_user_sm_1.png b/docs/assets/prepare-ontap/ontap_user_sm_1.png new file mode 100644 index 000000000..bd9eb5719 Binary files /dev/null and b/docs/assets/prepare-ontap/ontap_user_sm_1.png differ diff --git a/docs/assets/prepare-ontap/ontap_user_sm_2.png b/docs/assets/prepare-ontap/ontap_user_sm_2.png new file mode 100644 index 000000000..66699244c Binary files /dev/null and b/docs/assets/prepare-ontap/ontap_user_sm_2.png differ diff --git a/docs/assets/prepare-ontap/ontap_user_sm_3.png b/docs/assets/prepare-ontap/ontap_user_sm_3.png new file mode 100644 index 000000000..bbaf882a0 Binary files /dev/null and b/docs/assets/prepare-ontap/ontap_user_sm_3.png differ diff --git a/docs/assets/prepare-ontap/ontap_user_smc_0.png b/docs/assets/prepare-ontap/ontap_user_smc_0.png new file mode 100644 index 000000000..2310ed3a6 Binary files /dev/null and b/docs/assets/prepare-ontap/ontap_user_smc_0.png differ diff --git a/docs/assets/prepare-ontap/ontap_user_smc_1.png b/docs/assets/prepare-ontap/ontap_user_smc_1.png new file mode 100644 index 000000000..5a806d065 Binary files /dev/null and b/docs/assets/prepare-ontap/ontap_user_smc_1.png differ diff --git a/docs/assets/prepare-ontap/ontap_user_smc_2.png b/docs/assets/prepare-ontap/ontap_user_smc_2.png new file mode 100644 index 000000000..3836d2224 Binary files /dev/null and b/docs/assets/prepare-ontap/ontap_user_smc_2.png differ diff --git a/docs/assets/prepare-ontap/ontap_user_smc_3.png b/docs/assets/prepare-ontap/ontap_user_smc_3.png new file mode 100644 index 000000000..5d8004d11 Binary files /dev/null and b/docs/assets/prepare-ontap/ontap_user_smc_3.png differ diff --git a/docs/configure-ems.md b/docs/configure-ems.md new file mode 100644 index 000000000..a5e0e6bfe --- /dev/null +++ b/docs/configure-ems.md @@ -0,0 +1,182 @@ ## EMS collector

The `EMS` collector collects [ONTAP event management system](https://mysupport.netapp.com/documentation/productlibrary/index.html?productID=62286) (EMS) events via the ONTAP REST API.
This collector uses a YAML template file to define which events to collect, which properties to export, and what labels to attach to each metric. This means you can collect new EMS events or attach new labels by editing the [default template](https://github.com/NetApp/harvest/blob/main/conf/ems/default.yaml) file or by [extending existing templates](configure-templates.md#how-to-extend-a-restrestperfems-collectors-existing-object-template).

The [default template](https://github.com/NetApp/harvest/blob/main/conf/ems/default.yaml) file contains 60+ EMS events.

### Supported ONTAP Systems

Any cDOT ONTAP system using 9.6 or higher.

### Requirements

It is recommended to create a read-only user on the ONTAP system. See [prepare an ONTAP cDOT cluster](prepare-cdot-clusters.md) for details.

### Metrics

This collector collects EMS events from ONTAP and, for each received EMS event, creates new metrics prefixed with `ems_events`.

Harvest supports two types of ONTAP EMS events:

1. **Normal EMS events**

Single-shot events. When ONTAP detects a problem, an event is raised. When the issue is addressed, ONTAP does **not** raise another event reflecting that the problem was resolved.

2. **Bookend EMS events**

ONTAP creates bookend events in matching pairs. ONTAP creates an event when an issue is detected and another paired event when the event is resolved. Typically, these events share a common set of properties.

### Collector Configuration

The parameters of the collector are distributed across three files:

- [Harvest configuration file](configure-harvest-basic.md#pollers) (default: `harvest.yml`)
- [EMS collector configuration](#ems-collector-configuration-file) file (default: `conf/ems/default.yaml`)
- [EMS template file](#ems-template-file) (located in `conf/ems/9.6.0/ems.yaml`)

Except for `addr`, `datacenter`, and `auth_style`, all other parameters of the EMS collector can be defined in any of these three files. Parameters defined in the lower-level files override parameters in the higher-level files. This allows you to configure each EMS event individually, or use the same parameters for all events.

#### EMS Collector Configuration File

This configuration file contains the parameters that are used to configure the EMS collector. These parameters can be defined in your `harvest.yml` or `conf/ems/default.yaml` file.

| parameter | type | description | default |
|------------------|----------------|---------------------------------------------------------------------------------------------------------------------------|---------|
| `client_timeout` | Go duration | how long to wait for server responses | 1m |
| `schedule` | list, required | the polling frequency of the collector/object. Should include exactly the following two elements in the order specified: | |
| - `instance` | Go duration | polling frequency for updating the instance cache (example value: `24h` = `1440m`) | |
| - `data` | Go duration | polling frequency for updating the data cache (example value: `3m`)

**Note** Harvest allows defining poll intervals at the sub-second level (e.g. `1ms`); however, keep in mind the following:
  • API response of an ONTAP system can take several seconds, so the collector is likely to enter a failed state if the poll interval is shorter than `client_timeout`.
  • Small poll intervals will create significant workload on the ONTAP system.
| |

The EMS configuration file should contain the following section mapping the `Ems` object to the corresponding template file.

```yaml
objects:
  Ems: ems.yaml
```

Even though the EMS mapping shown above references a single file named `ems.yaml`, there may be multiple versions of that file across subdirectories named after ONTAP releases. See [cDOT](https://github.com/NetApp/harvest/tree/main/conf/zapiperf/cdot) for examples.

At runtime, the EMS collector will select the appropriate object configuration file that most closely matches the targeted ONTAP system.

#### EMS Template File

The EMS template file should contain the following parameters:

| parameter | type | description | default |
|-----------|--------|-----------------------------------------------------------------------------------------------------|--------------------------|
| `name` | string | display name of the collector. This matches the name defined in your `conf/ems/default.yaml` file | EMS |
| `object` | string | short name of the object, used to prefix metrics | ems |
| `query` | string | REST API endpoint used to query EMS events | `api/support/ems/events` |
| `exports` | list | list of default labels attached to each exported metric | |
| `events` | list | list of EMS events to collect. See [Event Parameters](#event-parameters) | |

##### Event Parameters

This section defines the list of EMS events you want to collect, which properties to export, what labels to attach, and how to handle bookend pairs. The EMS event template parameters are explained below along with an example for reference.

- `name` is the ONTAP EMS event name (e.g., collect ONTAP EMS events named `LUN.offline`)
- `matches` list of name-value pairs used to further filter ONTAP events. Some EMS events include arguments and these name-value pairs provide a way to filter on those arguments. (e.g., only collect ONTAP EMS events where `volume_name` has the value `abc_vol`)
- `exports` list of EMS event parameters to export. These exported parameters are attached as labels to each matching EMS event.
  - labels that are prefixed with `^^` use that parameter to define [instance uniqueness](resources/templates-and-metrics.md#harvest-object-template).
- `resolve_when_ems` (applicable to bookend events only). Lists the resolving event that pairs with the issuing event
  - `name` is the ONTAP EMS event name of the resolving EMS event (`LUN.online`). When the resolving event is received, the issuing EMS event will be resolved. In this example, Harvest will raise an event when it finds the ONTAP EMS event named `LUN.offline` and that event will be resolved when the EMS event named `LUN.online` is received.
  - `resolve_after` (optional, Go duration, default = 28 days) resolve the issuing EMS after the specified duration has elapsed (`672h` = `28d`). If the bookend pair is not received within the `resolve_after` duration, the issuing EMS event expires.
  - `resolve_key` (optional) bookend key used to match bookend EMS events. Defaults to prefixed (`^^`) labels in the `exports` section. `resolve_key` allows you to override what is defined in the `exports` section.

Labels are only exported if they are included in the `exports` section.
Example template definition for the `LUN.offline` EMS event:

```yaml
  - name: LUN.offline
    matches:
      - name: volume_name
        value: abc_vol
    exports:
      - ^^parameters.object_uuid => object_uuid
      - parameters.object_type => object_type
      - parameters.lun_path => lun_path
      - parameters.volume_name => volume
      - parameters.volume_dsid => volume_ds_id
    resolve_when_ems:
      - name: LUN.online
        resolve_after: 672h
        resolve_key:
          - ^^parameters.object_uuid => object_uuid
```

### How do I find the full list of supported EMS events?

ONTAP documents the full list of EMS events in the [ONTAP EMS Event Catalog](https://mysupport.netapp.com/documentation/productlibrary/index.html?productID=62286).

You can also query a live system and ask the cluster for its event catalog like so:

```
curl --insecure --user "user:password" 'https://10.61.124.110/api/support/ems/messages?fields=*'
```

Example Output

```
{
  "records": [
    {
      "name": "AccessCache.NearLimits",
      "severity": "alert",
      "description": "This message occurs when the access cache module is near its limits for entries or export rules. Reaching these limits can prevent new clients from being able to mount and perform I/O on the storage system, and can also cause clients to be granted or denied access based on stale cached information.",
      "corrective_action": "Ensure that the number of clients accessing the storage system continues to be below the limits for access cache entries and export rules across those entries. If the set of clients accessing the storage system is constantly changing, consider using the \"vserver export-policy access-cache config modify\" command to reduce the harvest timeout parameter so that cache entries for clients that are no longer accessing the storage system can be evicted sooner.",
      "snmp_trap_type": "severity_based",
      "deprecated": false
    },
...
    {
      "name": "ztl.smap.online.status",
      "severity": "notice",
      "description": "This message occurs when the specified partition on a Software Defined Flash drive could not be onlined due to internal S/W or device error.",
      "corrective_action": "NONE",
      "snmp_trap_type": "severity_based",
      "deprecated": false
    }
  ],
  "num_records": 7273
}
```

## EMS Prometheus Alerts

Refer to [Prometheus Alerts](prometheus-exporter.md#prometheus-alerts).

diff --git a/docs/configure-grafana.md b/docs/configure-grafana.md new file mode 100644 index 000000000..ca69ef1e0 --- /dev/null +++ b/docs/configure-grafana.md @@ -0,0 +1 @@ Coming Soon \ No newline at end of file diff --git a/docs/configure-harvest-advanced.md b/docs/configure-harvest-advanced.md new file mode 100644 index 000000000..98e2df86e --- /dev/null +++ b/docs/configure-harvest-advanced.md @@ -0,0 +1,3 @@ This chapter describes additional advanced configuration possibilities of NetApp Harvest. For a typical installation, this level of detail is likely not needed.

diff --git a/docs/configure-harvest-basic.md b/docs/configure-harvest-basic.md new file mode 100644 index 000000000..4f0cb471a --- /dev/null +++ b/docs/configure-harvest-basic.md @@ -0,0 +1,164 @@ The main configuration file, `harvest.yml`, consists of the following sections, described below:

## Pollers

All pollers are defined in `harvest.yml`, the main configuration file of Harvest, under the section `Pollers`.
+ +| parameter | type | description | default | +|------------------------|------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------| +| Poller name (header) | **required** | Poller name, user-defined value | | +| `datacenter` | **required** | Datacenter name, user-defined value | | +| `addr` | required by some collectors | IPv4 or FQDN of the target system | | +| `collectors` | **required** | List of collectors to run for this poller | | +| `exporters` | **required** | List of exporter names from the `Exporters` section. Note: this should be the name of the exporter (e.g. `prometheus1`), not the value of the `exporter` key (e.g. `Prometheus`) | | +| `auth_style` | required by Zapi* collectors | Either `basic_auth` or `certificate_auth` | `basic_auth` | +| `username`, `password` | required if `auth_style` is `basic_auth` | | | +| `ssl_cert`, `ssl_key` | optional if `auth_style` is `certificate_auth` | Absolute paths to SSL (client) certificate and key used to authenticate with the target system.

If not provided, the poller will look for `.key` and `.pem` in `$HARVEST_HOME/cert/`.

To create certificates for ONTAP systems, see [using certificate authentication](prepare-cdot-clusters.md#using-certificate-authentication) | | +| `use_insecure_tls` | optional, bool | If true, disable TLS verification when connecting to ONTAP cluster | false | +| `credentials_file` | optional, string | Path to a yaml file that contains cluster credentials. The file should have the same shape as `harvest.yml`. See [here](configure-harvest-basic.md#credentials-file) for examples. Path can be relative to `harvest.yml` or absolute | | +| `tls_min_version` | optional, string | Minimum TLS version to use when connecting to ONTAP cluster: One of tls10, tls11, tls12 or tls13 | Platform decides | +| `labels` | optional, list of key-value pairs | Each of the key-value pairs will be added to a poller's metrics. Details [below](configure-harvest-basic.md#labels) | | +| `log_max_bytes` | | Maximum size of the log file before it will be rotated | `5_242_880` (5 MB) | +| `log_max_files` | | Number of rotated log files to keep | `5` | +| `log` | optional, list of collector names | Matching collectors log their ZAPI request/response | | + +## Defaults + +This section is optional. If there are parameters identical for all your pollers (e.g. datacenter, authentication +method, login preferences), they can be grouped under this section. The poller section will be checked first and if the +values aren't found there, the defaults will be consulted. + +## Exporters + +All exporters need two types of parameters: + +- `exporter parameters` - defined in `harvest.yml` under `Exporters` section +- `export_options` - these options are defined in the `Matrix` data structure that is emitted from collectors and + plugins + +The following two parameters are required for all exporters: + +| parameter | type | description | default | +|------------------------|--------------|------------------------------------------------------------------------------------------------------------------------|---------| +| Exporter name (header) | **required** | Name of the exporter instance, this is a user-defined value | | +| `exporter` | **required** | Name of the exporter class (e.g. Prometheus, InfluxDB, Http) - these can be found under the `cmd/exporters/` directory | | + +Note: when we talk about the *Prometheus Exporter* or *InfluxDB Exporter*, we mean the Harvest modules that send the +data to a database, NOT the names used to refer to the actual databases. + +### [Prometheus Exporter](prometheus-exporter.md) + +### [InfluxDB Exporter](influxdb-exporter.md) + +## Tools + +This section is optional. You can uncomment the `grafana_api_token` key and add your Grafana API token so `harvest` does +not prompt you for the key when importing dashboards. + +``` +Tools: + #grafana_api_token: 'aaa-bbb-ccc-ddd' +``` + +## Configuring collectors + +Collectors are configured by their own configuration files ([templates](configure-templates.md)), which are stored in subdirectories +in [conf/](https://github.com/NetApp/harvest/tree/main/conf). +Most collectors run concurrently and collect a subset of related metrics. +For example, node related metrics are grouped together and run independently of the disk related metrics. +Below is a snippet from `conf/zapi/default.yaml` + +In this example, the `default.yaml` template contains a list of objects (e.g. Node) that reference sub-templates (e.g. +node.yaml). 
This decomposition groups related metrics together and at runtime, a `Zapi` collector per object will be +created and each of these collectors will run concurrently. + +Using the snippet below, we expect there to be four `Zapi` collectors running, each with a different subtemplate and +object. + +``` +collector: Zapi +objects: + Node: node.yaml + Aggregate: aggr.yaml + Volume: volume.yaml + SnapMirror: snapmirror.yaml +``` + +At start-up, Harvest looks for two files (`default.yaml` and `custom.yaml`) in the `conf` directory of the +collector (e.g. `conf/zapi/default.yaml`). +The `default.yaml` is installed by default, while the `custom.yaml` is an optional file +you can create +to [add new templates](configure-templates.md#creatingediting-templates). + +When present, the `custom.yaml` file will be merged with the `default.yaml` file. +This behavior can be overridden in your `harvest.yml`, see +[here](https://github.com/NetApp/harvest/blob/main/pkg/conf/testdata/issue_396.yaml) for an example. + +For a list of collector-specific parameters, refer to their individual documentation. + +#### [Zapi and ZapiPerf](configure-zapi.md) + +#### [Rest and RestPerf](configure-rest.md) + +#### [EMS](configure-ems.md) + +#### [StorageGRID](configure-storagegrid.md) + +#### [Unix](configure-unix.md) + +## Labels + +Labels offer a way to add additional key-value pairs to a poller's metrics. These allow you to tag a cluster's metrics +in a cross-cutting fashion. Here's an example: + +``` + cluster-03: + datacenter: DC-01 + addr: 10.0.1.1 + labels: + - org: meg # add an org label with the value "meg" + - ns: rtp # add a namespace label with the value "rtp" +``` + +These settings add two key-value pairs to each metric collected from `cluster-03` like this: + +``` +node_vol_cifs_write_data{org="meg",ns="rtp",datacenter="DC-01",cluster="cluster-03",node="umeng-aff300-05"} 10 +``` + +Keep in mind that each unique combination of key-value pairs increases the amount of stored data. Use them sparingly. +See [PrometheusNaming](https://prometheus.io/docs/practices/naming/#labels) for details. + +## Credentials File + +If you would rather not list cluster credentials in your `harvest.yml`, you can use the `credentials_file` section +in your `harvest.yml` to point to a file that contains the credentials. +At runtime, the `credentials_file` will be read and the included credentials will be used to authenticate with the +matching cluster(s). + +This is handy when integrating with 3rd party credential stores. +See #884 for examples. + +The format of the `credentials_file` is similar to `harvest.yml` and can contain multiple cluster credentials. 
Example:

Snippet from `harvest.yml`:

```yaml
Pollers:
  cluster1:
    addr: 10.193.48.11
    credentials_file: secrets/cluster1.yml
    exporters:
      - prom1
```

File `secrets/cluster1.yml`:

```yaml
Pollers:
  cluster1:
    username: harvest
    password: foo
```
\ No newline at end of file diff --git a/docs/configure-rest.md b/docs/configure-rest.md new file mode 100644 index 000000000..ca69ef1e0 --- /dev/null +++ b/docs/configure-rest.md @@ -0,0 +1 @@ Coming Soon \ No newline at end of file diff --git a/docs/configure-storagegrid.md b/docs/configure-storagegrid.md new file mode 100644 index 000000000..ca69ef1e0 --- /dev/null +++ b/docs/configure-storagegrid.md @@ -0,0 +1 @@ Coming Soon \ No newline at end of file diff --git a/docs/configure-templates.md b/docs/configure-templates.md new file mode 100644 index 000000000..90e2859ee --- /dev/null +++ b/docs/configure-templates.md @@ -0,0 +1,395 @@ ## Creating/editing templates

This document covers how to use [Collector](configure-templates.md#collector-templates) and [Object](configure-templates.md#object-templates) templates to extend Harvest.

1. [How to add a new object template](configure-templates.md#create-a-new-object-template)
2. [How to extend an existing object template](configure-templates.md#extend-an-existing-object-template)

There are a couple of ways to learn about ZAPIs and their attributes:

- [ONTAP's documentation](https://mysupport.netapp.com/documentation/productlibrary/index.html?productID=60427)
- Using Harvest's `zapi` tool to explore available APIs and metrics on your cluster. Examples:

```sh
$ harvest zapi --poller <poller> show apis
  # will print list of apis that are available
  # usually apis with the "get-iter" suffix can provide useful metrics
$ harvest zapi --poller <poller> show attrs --api volume-get-iter
  # will print the attribute tree of the API
$ harvest zapi --poller <poller> show data --api volume-get-iter
  # will print raw data of the API attribute tree
```

(Replace `<poller>` with the name of a poller that can connect to an ONTAP system.)

## Collector templates

Collector templates define which set of objects Harvest should collect from the system being monitored. In your `harvest.yml` configuration file, when you say that you want to use a `Zapi` collector, that collector will read the matching `conf/zapi/default.yaml`; likewise, `ZapiPerf` will read the `conf/zapiperf/default.yaml` file. Below is a snippet from `conf/zapi/default.yaml`. Each object is mapped to a corresponding [object template](configure-templates.md#object-templates) file. For example, the `Node` object searches for the [most appropriate version](configure-templates.md#harvest-versioned-templates) of the `node.yaml` file in the `conf/zapi/cdot/**` directory.

```
collector: Zapi
objects:
  Node: node.yaml
  Aggregate: aggr.yaml
  Volume: volume.yaml
  Disk: disk.yaml
```

Each collector will also check if a matching file named `custom.yaml` exists, and if it does, it will read that file and merge it with `default.yaml`. The `custom.yaml` file should be located beside the matching `default.yaml` file (e.g. `conf/zapi/custom.yaml` is beside `conf/zapi/default.yaml`).

Let's take a look at some examples.

1. Define a poller that uses the default Zapi collector. Using the default template is the easiest and most used option.
```yaml
Pollers:
  jamaica:
    datacenter: munich
    addr: 10.10.10.10
    collectors:
      - Zapi # will use conf/zapi/default.yaml and optionally merge with conf/zapi/custom.yaml
```

2. Define a poller that uses the Zapi collector, but with a custom template file:

```yaml
Pollers:
  jamaica:
    datacenter: munich
    addr: 10.10.10.10
    collectors:
      - ZapiPerf:
          - limited.yaml # will use conf/zapiperf/limited.yaml
        # more templates can be added, they will be merged
```

## Object Templates

Object templates (example: `conf/zapi/cdot/9.8.0/lun.yaml`) describe what to collect and export. These templates are used by collectors to gather metrics and send them to your time-series db.

Object templates are made up of the following parts:

1. the name of the object (or resource) to collect
2. the ZAPI or REST query used to collect the object
3. a list of object counters to collect and how to export them

Instead of editing one of the existing templates, it's better to extend one of them. That way, your custom template will not be overwritten when upgrading Harvest. For example, if you want to extend `conf/zapi/cdot/9.8.0/aggr.yaml`, first create a copy (e.g., `conf/zapi/cdot/9.8.0/custom_aggr.yaml`), and then tell Harvest to use your custom template by adding these lines to `conf/zapi/custom.yaml`:

```yaml
objects:
  Aggregate: custom_aggr.yaml
```

After restarting your pollers, `aggr.yaml` and `custom_aggr.yaml` will be merged.

### Create a new object template

In this example, imagine that Harvest doesn't already collect environment sensor data and you want to collect it. Sensor data comes from the `environment-sensors-get-iter` ZAPI. Here are the steps to add a new object template.

Create the file `conf/zapi/cdot/9.8.0/sensor.yaml` (optionally replace `9.8.0` with the earliest version of ONTAP that supports sensor data; refer to [Harvest Versioned Templates](configure-templates.md#harvest-versioned-templates) for more information). Add the following content to your new `sensor.yaml` file.

```yaml
name: Sensor # this name must match the key in your custom.yaml file
query: environment-sensors-get-iter
object: sensor

metric_type: int64

counters:
  environment-sensors-info:
    - critical-high-threshold => critical_high
    - critical-low-threshold => critical_low
    - ^discrete-sensor-state => discrete_state
    - ^discrete-sensor-value => discrete_value
    - ^^node-name => node
    - ^^sensor-name => sensor
    - ^sensor-type => type
    - ^threshold-sensor-state => threshold_state
    - threshold-sensor-value => threshold_value
    - ^value-units => unit
    - ^warning-high-threshold => warning_high
    - ^warning-low-threshold => warning_low

export_options:
  include_all_labels: true
```

### Enable the new object template

To enable the new sensor object template, create the `conf/zapi/custom.yaml` file with the lines shown below.

```yaml
objects:
  Sensor: sensor.yaml # this key must match the name in your sensor.yaml file
```

The `Sensor` key used in the `custom.yaml` must match the name defined in the `sensor.yaml` file. That mapping is what connects this object with its template. In the future, if you add more object templates, you can add those in your existing `custom.yaml` file.

### Test your object template changes

Test your new `Sensor` template with a single poller like this:

```
./bin/harvest start <poller> --foreground --verbose --collectors Zapi --objects Sensor
```

Replace `<poller>` with the name of one of your ONTAP pollers.
+ +Once you have confirmed that the new template works, restart any already running pollers that you want to use the new +template(s). + +### Check the metrics + +If you are using the Prometheus exporter, you can scrape the poller's HTTP endpoint with curl or a web browser. E.g., my +poller exports its data on port 15001. Adjust as needed for your exporter. + +``` +curl -s 'http://localhost:15001/metrics' | grep ^sensor_ # sensor_ name matches the object: value in your sensor.yaml file. + +sensor_value{datacenter="WDRF",cluster="shopfloor",critical_high="3664",node="shopfloor-02",sensor="P3.3V STBY",type="voltage",warning_low="3040",critical_low="2960",threshold_state="normal",unit="mV",warning_high="3568"} 3280 +sensor_value{datacenter="WDRF",cluster="shopfloor",sensor="P1.2V STBY",type="voltage",threshold_state="normal",warning_high="1299",warning_low="1105",critical_low="1086",node="shopfloor-02",critical_high="1319",unit="mV"} 1193 +sensor_value{datacenter="WDRF",cluster="shopfloor",unit="mV",critical_high="15810",critical_low="0",node="shopfloor-02",sensor="P12V STBY",type="voltage",threshold_state="normal"} 11842 +sensor_value{datacenter="WDRF",cluster="shopfloor",sensor="P12V STBY Curr",type="current",threshold_state="normal",unit="mA",critical_high="3182",critical_low="0",node="shopfloor-02"} 748 +sensor_value{datacenter="WDRF",cluster="shopfloor",critical_low="1470",node="shopfloor-02",sensor="Sysfan2 F2 Speed",type="fan",threshold_state="normal",unit="RPM",warning_low="1560"} 2820 +sensor_value{datacenter="WDRF",cluster="shopfloor",sensor="PSU2 Fan1 Speed",type="fan",threshold_state="normal",unit="RPM",warning_low="4600",critical_low="4500",node="shopfloor-01"} 6900 +sensor_value{datacenter="WDRF",cluster="shopfloor",sensor="PSU1 InPwr Monitor",type="unknown",threshold_state="normal",unit="mW",node="shopfloor-01"} 132000 +sensor_value{datacenter="WDRF",cluster="shopfloor",critical_high="58",type="thermal",unit="C",warning_high="53",critical_low="0",node="shopfloor-01",sensor="Bat Temp",threshold_state="normal",warning_low="5"} 24 +sensor_value{datacenter="WDRF",cluster="shopfloor",critical_high="9000",node="shopfloor-01",sensor="Bat Charge Volt",type="voltage",threshold_state="normal",unit="mV",warning_high="8900"} 8200 +sensor_value{datacenter="WDRF",cluster="shopfloor",node="shopfloor-02",sensor="PSU1 InPwr Monitor",type="unknown",threshold_state="normal",unit="mW"} 132000 +``` + +## Extend an existing object template + +### How to extend a Rest/RestPerf/Ems collector's existing object template + +Instead of editing one of the existing templates, it's better to copy one and edit the copy. That way, your custom +template will not be overwritten when upgrading Harvest. For example, if you want to +change `conf/rest/cdot/9.12.0/aggr.yaml`, first create a copy (e.g., `conf/rest/cdot/9.12.0/custom_aggr.yaml`), then add +these lines to `conf/rest/custom.yaml`: + +```yaml +objects: + Aggregate: custom_aggr.yaml +``` + +After restarting pollers, `aggr.yaml` will be ignored and the new, `custom_aggr.yaml` subtemplate will be used instead. + +### How to extend a Zapi/ZapiPerf collector's existing object template + +In this example, we want to extend one of the existing object templates that Harvest ships with, +e.g. `conf/zapi/cdot/9.8.0/lun.yaml` and collect additional information as outlined below. + +Let's say you want to extend `lun.yaml` to: + +1. 
Increase `client_timeout` (You want to increase the default timeout of the lun ZAPI because it + keeps [timing out](https://github.com/NetApp/harvest/wiki/Troubleshooting-Harvest#client_timeout)) +2. Add additional counters, e.g. `multiprotocol-type`, `application` +3. Add a new counter to the already collected lun metrics using the `value_to_num` plugin +4. Add a new `application` instance_keys and labels to the collected metrics + +Let's assume the existing template is located at conf/zapi/cdot/9.8.0/lun.yaml and contains the following. + +```yaml +name: Lun +query: lun-get-iter +object: lun + +counters: + lun-info: + - ^node + - ^path + - ^qtree + - size + - size-used + - ^state + - ^^uuid + - ^volume + - ^vserver => svm + +plugins: + - LabelAgent: + # metric label zapi_value rest_value `default_value` + value_to_num: + - new_status state online online `0` + split: + - path `/` ,,,lun + +export_options: + instance_keys: + - node + - qtree + - lun + - volume + - svm + instance_labels: + - state + ``` + +To extend the out-of-the-box `lun.yaml` template, create a `conf/zapi/custom.yaml` file if it doesn't already exist and +add the lines shown below: + +```yaml +objects: + Lun: custom_lun.yaml +``` + +Create a new object template `conf/zapi/cdot/9.8.0/custom_lun.yaml` with the lines shown below. + +```yaml +client_timeout: 5m +counters: + lun-info: + - ^multiprotocol-type + - ^application + +plugins: + - LabelAgent: + value_to_num: + - custom_status state online online `0` + +export_options: + instance_keys: + - application + ``` + +When you restart your pollers, Harvest will take the out-of-the-box template (`lun.yaml`) and your new +one (`custom_lun.yaml`) and merge them into the following: + +```yaml +name: Lun +query: lun-get-iter +object: lun +counters: + lun-info: + - ^node + - ^path + - ^qtree + - size + - size-used + - ^state + - ^^uuid + - ^volume + - ^vserver => svm + - ^multiprotocol-type + - ^application +plugins: + LabelAgent: + value_to_num: + - new_status state online online `0` + - custom_status state online online `0` + split: + - path `/` ,,,lun +export_options: + instance_keys: + - node + - qtree + - lun + - volume + - svm + - application +client_timeout: 5m +``` + +To help understand the merging process and the resulting combined template, you can view the result with: + +```sh +bin/harvest doctor merge --template conf/zapi/cdot/9.8.0/lun.yaml --with conf/zapi/cdot/9.8.0/custom_lun.yaml +``` + +### Replace an existing object template for Zapi/ZapiPerf Collector + +You can only extend existing templates for Zapi/ZapiPerf Collector as +explained [above](configure-templates.md#extend-an-existing-object-template). +If you need to replace one of the existing object templates, let us know +on [Discord](https://github.com/NetApp/harvest/blob/main/SUPPORT.md#getting-help) or GitHub. + +## Harvest Versioned Templates + +Harvest ships with a set of versioned templates tailored for specific versions of ONTAP. At runtime, Harvest uses a +BestFit heuristic to pick the most appropriate template. The BestFit heuristic compares the list of Harvest templates +with the ONTAP version and selects the best match. There are versioned templates for both the ZAPI and REST collectors. 
Below is an example of how the BestFit algorithm works. Assume Harvest has these template versions:

- 9.6.0
- 9.6.1
- 9.8.0
- 9.9.0
- 9.10.1

If you are monitoring a cluster at these versions, Harvest will select the indicated template:

- ONTAP version 9.4.1, Harvest will select the templates for 9.6.0
- ONTAP version 9.6.0, Harvest will select the templates for 9.6.0
- ONTAP version 9.7.X, Harvest will select the templates for 9.6.1
- ONTAP version 9.12, Harvest will select the templates for 9.10.1

### counters

This section contains the complete or partial attribute tree of the queried API. Since the collector does not get counter metadata from the ONTAP system, two additional symbols are used for non-numeric attributes:

- `^` used as a prefix indicates that the attribute should be stored as a label
- `^^` indicates that the attribute is a label and an instance key (i.e., a label that uniquely identifies an instance, such as `name`, `uuid`). If a single label does not uniquely identify an instance, then multiple instance keys should be indicated.

Additionally, the symbol `=>` can be used to set a custom display name for both instance labels and numeric counters. Example:

```yaml
name: Spare
query: aggr-spare-get-iter
object: spare
collect_only_labels: true
counters:
  aggr-spare-disk-info:
    - ^^disk                        # creates label aggr-disk
    - ^disk-type                    # creates label aggr-disk-type
    - ^is-disk-zeroed => is_disk_zeroed   # creates label is_disk_zeroed
    - ^^original-owner => original_owner  # creates label original_owner
export_options:
  instance_keys:
    - disk
    - original_owner
  instance_labels:
    - disk_type
    - is_disk_zeroed
```

Harvest does its best to determine a unique display name for each template's label and metric. Instead of relying on this heuristic, it is better to be explicit in your templates and define a display name using the caret (`^`) mapping. For example, instead of this:

```
aggr-spare-disk-info:
  - ^^disk
  - ^disk-type
```

do this:

```
aggr-spare-disk-info:
  - ^^disk => disk
  - ^disk-type => disk_type
```

See also [#585](https://github.com/NetApp/harvest/issues/585) diff --git a/docs/configure-unix.md b/docs/configure-unix.md new file mode 100644 index 000000000..05ad9b3d3 --- /dev/null +++ b/docs/configure-unix.md @@ -0,0 +1,59 @@ # Unix

This collector polls resource usage by Harvest pollers on the local system. The collector might be extended in the future to monitor any local or remote process.

## Target System

The machine where Harvest is running ("localhost").

## Requirements

The collector requires an OS where the proc filesystem is available. If you are a developer, you are welcome to add support for other platforms. Currently, supported platforms include most Unix/Unix-like systems:

* Android / Termux
* DragonFly BSD
* FreeBSD
* IBM AIX
* Linux
* NetBSD
* Plan9
* Solaris

(On FreeBSD and NetBSD the proc filesystem needs to be manually mounted.)

## Parameters

| parameter | type | description | default |
|---------------|------------------|-------------------------------|---------|
| `mount_point` | string, optional | path to the `proc` filesystem | `/proc` |

## Metrics

The collector follows [the Linux proc(5) manual](https://man7.org/linux/man-pages/man5/procfs.5.html) to parse a static set of metrics.
Unless otherwise stated, the metric has a scalar value:

| metric | type | unit | description |
|------------------|----------------------|-------------|----------------------------------------------------------------------------------------------------------------------------------------------|
| `start_time` | counter, `float64` | seconds | process uptime |
| `cpu_percent` | gauge, `float64` | percent | CPU used since last poll |
| `memory_percent` | gauge, `float64` | percent | Memory used (RSS) since last poll |
| `cpu` | histogram, `float64` | seconds | CPU used since last poll (`system`, `user`, `iowait`) |
| `memory` | histogram, `uint64` | kB | Memory used since last poll (`rss`, `vms`, `swap`, etc) |
| `io` | histogram, `uint64` | byte, count | IOs performed by process: `rchar`, `wchar`, `read_bytes`, `write_bytes` - read/write IOs; `syscr`, `syscw` - syscalls for IO operations |
| `net` | histogram, `uint64` | count/byte | Different IO operations over network devices |
| `ctx` | histogram, `uint64` | count | Number of context switches (`voluntary`, `involuntary`) |
| `threads` | counter, `uint64` | count | Number of threads |
| `fds` | counter, `uint64` | count | Number of file descriptors |

Additionally, the collector provides the following instance labels:

| label | description |
|--------|--------------------|
| poller | name of the poller |
| pid | PID of the poller |

## Issues

* The collector will fail on WSL because some non-critical files in the proc filesystem are not present. \ No newline at end of file diff --git a/docs/configure-zapi.md b/docs/configure-zapi.md new file mode 100644 index 000000000..a4aa26ce5 --- /dev/null +++ b/docs/configure-zapi.md @@ -0,0 +1,191 @@ !!! tip "What about REST?"

    ZAPI will reach end of availability in ONTAP 9.13.1, released Q2 2023. Don't worry, Harvest has you covered. Switch to Harvest's REST collectors and collect identical metrics. See [REST Strategy](https://github.com/NetApp/harvest/blob/main/pkg/docs/architecture/rest-strategy.md) for more details.

## Zapi Collector

The Zapi collector uses the ZAPI protocol to collect data from ONTAP systems. The collector submits data as received from the target system, and does not perform any calculations or post-processing. Since the attributes of most APIs have an irregular tree structure, sometimes a plugin will be required to collect all metrics from an API.

The [ZapiPerf collector](#zapiperf-collector) is an extension of this collector; therefore, they share many parameters and configuration settings.

### Target System

The target system can be any cDot or 7Mode ONTAP system. Any version is supported; however, the default configuration files may not completely match older systems.

### Requirements

No SDK or other requirements. It is recommended to create a read-only user for Harvest on the ONTAP system (see [prepare monitored clusters](prepare-cdot-clusters.md) for details).

### Metrics

The collector collects a dynamic set of metrics. Since most ZAPIs have a tree structure, the collector converts that structure into a flat metric representation. No post-processing or calculation is performed on the collected data itself.

As an example, the `aggr-get-iter` ZAPI provides the following partial attribute tree:

```yaml
aggr-attributes:
  - aggr-raid-attributes:
      - disk-count
  - aggr-snapshot-attributes:
      - files-total
```

The Zapi collector will convert this tree into two "flat" metrics: `aggr_raid_disk_count` and `aggr_snapshot_files_total`. (The algorithm to generate a name for the metrics will attempt to keep it as simple as possible, but sometimes it's useful to manually set a short display name. See [counters](configure-templates.md#counters) for more details.)

### Parameters

The parameters and configuration are similar to those of the [ZapiPerf collector](#zapiperf-collector). Only the differences will be discussed below.
#### Collector configuration file

Parameters different from ZapiPerf:

| parameter | type | description | default |
|-------------------------|----------------|------------------------------------------------------------------------------------------------------------------|---------|
| `schedule` | required | same as for ZapiPerf, but only two elements: `instance` and `data` (collector does not run a `counter` poll) | |
| `no_max_records` | bool, optional | don't add `max-records` to the ZAPI request | |
| `collect_only_labels` | bool, optional | don't look for numeric metrics, only submit labels (suppresses the `ErrNoMetrics` error) | |
| `only_cluster_instance` | bool, optional | don't look for instance keys and assume the only instance is the cluster itself | |

#### Object configuration file

The Zapi collector does not have the `instance_key` and `override` parameters. The optional parameter `metric_type` allows you to override the default metric type (`uint64`). The value of this parameter should be one of the metric types supported by [the matrix data-structure](resources/matrix.md).

## ZapiPerf Collector

ZapiPerf collects performance metrics from ONTAP systems using the ZAPI protocol. The collector is designed to be easily extendable to collect new objects or to collect additional counters from already configured objects.

This collector is an extension of the [Zapi collector](#zapi-collector). The major difference between them is that ZapiPerf collects only the performance (`perf`) APIs. Additionally, ZapiPerf always calculates final values from the deltas of two subsequent polls.

## Metrics

The collector collects a dynamic set of metrics. The metric values are calculated from two consecutive polls (therefore, no metrics are emitted after the first poll). The calculation algorithm depends on the `property` and `base-counter` attributes of each metric; the following properties are supported:

| property | formula | description |
|----------|-----------------------------------------------------------------------------------------|-------------------------------------------------------------------|
| raw | x = x<sub>i</sub> | no post-processing, value **x** is submitted as it is |
| delta | x = x<sub>i</sub> - x<sub>i-1</sub> | delta of two poll values, **x<sub>i</sub>** and **x<sub>i-1</sub>** |
| rate | x = (x<sub>i</sub> - x<sub>i-1</sub>) / (t<sub>i</sub> - t<sub>i-1</sub>) | delta divided by the interval of the two polls in seconds |
| average | x = (x<sub>i</sub> - x<sub>i-1</sub>) / (y<sub>i</sub> - y<sub>i-1</sub>) | delta divided by the delta of the base counter **y** |
| percent | x = 100 * (x<sub>i</sub> - x<sub>i-1</sub>) / (y<sub>i</sub> - y<sub>i-1</sub>) | average multiplied by 100 |

## Parameters

The parameters of the collector are distributed across three files:

- [Harvest configuration file](configure-harvest-basic.md#pollers) (default: `harvest.yml`)
- ZapiPerf configuration file (default: `conf/zapiperf/default.yaml`)
- Each object has its own configuration file (located in `conf/zapiperf/cdot/` and `conf/zapiperf/7mode/` for cDot and 7Mode systems respectively)

Except for `addr`, `datacenter` and `auth_style`, all other parameters of the ZapiPerf collector can be defined in any of these three files. Parameters defined in the lower-level files override parameters in the higher-level files. This allows the user to configure each object individually, or use the same parameters for all objects.

The full set of parameters is described [below](#zapiperf-configuration-file).
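To make the `rate` and `average` properties from the table above concrete, here is a small worked example (the counter names and numbers are hypothetical, not taken from a real system):

```
# two consecutive polls, 60 seconds apart
total_ops:     x1 = 12_000      x2 = 18_000      # property: rate
rate    = (18_000 - 12_000) / 60 = 100 ops/s

read_latency:  x1 = 4_000_000   x2 = 4_600_000   # property: average, base counter: read_ops
read_ops:      y1 = 50_000      y2 = 80_000
average = (4_600_000 - 4_000_000) / (80_000 - 50_000) = 20 microsec per op
```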
+
+### Harvest configuration file
+
+Parameters in the poller section should define (at least) the address and authentication method of the target system:
+
+| parameter              | type             | description                                                                     | default      |
+|------------------------|------------------|---------------------------------------------------------------------------------|--------------|
+| `addr`                 | string, required | address (IP or FQDN) of the ONTAP system                                        | |
+| `datacenter`           | string, required | name of the datacenter where the target system is located                       | |
+| `auth_style`           | string, optional | authentication method: either `basic_auth` or `certificate_auth`                | `basic_auth` |
+| `ssl_cert`, `ssl_key`  | string, optional | full paths of the SSL certificate and key pair (when using `certificate_auth`)  | |
+| `username`, `password` | string, optional | username and password of the ONTAP user (when using `basic_auth`)               | |
+
+### ZapiPerf configuration file
+
+This configuration file (the "template") contains a list of objects that should be collected and the filenames of their
+configuration (explained in the next section).
+
+Additionally, this file contains the parameters that are applied as defaults to all objects. (As mentioned before, any
+of these parameters can be defined in the Harvest or object configuration files as well.)
+
+| parameter          | type                 | description                                                                                                         | default |
+|--------------------|----------------------|-----------------------------------------------------------------------------------------------------------------------|---------|
+| `use_insecure_tls` | bool, optional       | skip verifying the TLS certificate of the target system                                                                | `false` |
+| `client_timeout`   | duration (Go-syntax) | how long to wait for server responses                                                                                  | 30s     |
+| `batch_size`       | int, optional        | max instances per API request                                                                                          | `500`   |
+| `latency_io_reqd`  | int, optional        | threshold of IOPs for calculating latency metrics (latencies based on very few IOPs are unreliable)                    | `100`   |
+| `schedule`         | list, required       | the poll frequencies of the collector/object; it should include exactly these three elements, in this exact order:     | |
+| - `counter`        | duration (Go-syntax) | poll frequency of updating the counter metadata cache (example value: `1200s` = `20m`)                                 | |
+| - `instance`       | duration (Go-syntax) | poll frequency of updating the instance cache (example value: `600s` = `10m`)                                          | |
+| - `data`           | duration (Go-syntax) | poll frequency of updating the data cache (example value: `60s` = `1m`)                                                | |
+
+**Note** Harvest allows defining poll intervals at a sub-second level (e.g. `1ms`); however, keep the following in mind:
+
+- The API response of an ONTAP system can take several seconds, so the collector is likely to enter a failed state if the poll interval is less than `client_timeout`.
+- Small poll intervals create a significant workload on the ONTAP system, as many counters are aggregated on-demand.
+- Some metric values become less significant when they are calculated over very short intervals (e.g. latencies).
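+
+For reference, a `schedule` block using the example values from the table above would be written like this in the
+template:
+
+```yaml
+schedule:
+  - counter: 1200s
+  - instance: 600s
+  - data: 60s
+```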
+
+The template should define objects in the `objects` section. Example:
+
+```yaml
+objects:
+  SystemNode: system_node.yaml
+  HostAdapter: hostadapter.yaml
+```
+
+Note that for each object we only define the filename of the object configuration file. The object configuration files
+are located in subdirectories matching the ONTAP version that was used to create these files. It is possible to have
+multiple version subdirectories for multiple ONTAP versions. At runtime, the collector will select the object
+configuration file that most closely matches the version of the target ONTAP system. (A mismatch is tolerated since
+ZapiPerf will fetch and validate counter metadata from the system.)
+
+### Object configuration file
+
+The object configuration file ("subtemplate") should contain the following parameters:
+
+| parameter        | type                    | description                                                                          | default |
+|------------------|-------------------------|--------------------------------------------------------------------------------------|---------|
+| `name`           | string                  | display name of the collector that will collect this object                         | |
+| `object`         | string                  | short name of the object                                                             | |
+| `query`          | string                  | raw object name used to issue a ZAPI request                                         | |
+| `counters`       | list                    | list of counters to collect (see notes below)                                        | |
+| `instance_key`   | string                  | label to use as instance key (either `name` or `uuid`)                               | |
+| `override`       | list of key-value pairs | override counter properties that we get from ONTAP (allows circumventing ZAPI bugs)  | |
+| `plugins`        | list                    | plugins and their parameters to run on the collected data                            | |
+| `export_options` | list                    | parameters to pass to exporters (see notes below)                                    | |
+
+#### `counters`
+
+This section defines the list of counters that will be collected. These counters can be labels, numeric metrics, or
+histograms. The exact property of each counter is fetched from ONTAP and updated periodically.
+
+Some counters require a "base-counter" for post-processing. If the base-counter is missing, ZapiPerf will still run, but
+the missing data won't be exported.
+
+The display name of a counter can be changed with `=>` (e.g., `nfsv3_ops => ops`). There's one conversion Harvest does
+for you by default: the `instance_name` counter is renamed to the value of `object`.
+
+Counters that are stored as labels will only be exported if they are included in the `export_options` section.
+
+#### `export_options`
+
+Parameters in this section tell the exporters how to handle the collected data. The set of parameters varies by
+exporter. For the [Prometheus](prometheus-exporter.md) and [InfluxDB](influxdb-exporter.md)
+exporters, the following parameters can be defined:
+
+* `instance_keys` (list): display names of labels to export with each data-point
+* `instance_labels` (list): display names of labels to export as a separate data-point
+* `include_all_labels` (bool): export all labels with each data-point (overrides the previous two parameters)
diff --git a/docs/dashboards.md b/docs/dashboards.md
new file mode 100644
index 000000000..5b8a11e1e
--- /dev/null
+++ b/docs/dashboards.md
@@ -0,0 +1,31 @@
+Harvest can be used to import dashboards to Grafana.
+
+The `bin/harvest grafana` utility requires the address (hostname or IP) and port of the Grafana server,
+and a Grafana API token. The port can be omitted if Grafana is configured to redirect the URL. Use the `-d` flag to
+point to the directory that contains the dashboards.
+
+### Grafana API token
+
+The utility asks for an API token, which can be generated from the Grafana web GUI.
+
+![Grafana API](assets/grafana/grafana_api.png)
+
+Click on `Configuration` in the left menu bar (1), click on `API Keys` (2), and click on the `New API Key` button. Choose
+a key name (3), choose `Editor` for the role (4), and click on add (5). Copy the generated key and paste it into your terminal
+or add the token to the `Tools` section of your configuration file (see below).
+
+For example, let's say your Grafana server is on `http://my.grafana.server:3000` and you want to import the
+Prometheus-based dashboards from the `grafana` directory. You would run this:
+
+```
+$ bin/harvest grafana import --addr my.grafana.server:3000
+```
+
+Similarly, to export:
+
+```
+$ bin/harvest grafana export --addr my.grafana.server:3000 --directory /path/to/export/directory --serverfolder grafanaFolderName
+```
+
+By default, the dashboards are connected to the `Prometheus` datasource defined in Grafana. If your datasource has a
+different name, use the `--datasource` flag during import/export.
diff --git a/docs/index.md b/docs/index.md
index 25b564eda..7b16cb829 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -1,17 +1,16 @@
-# Welcome to Harvest
+> Harvest is the open-metrics endpoint for ONTAP and StorageGRID
 
-## Introduction to NetApp Harvest
+NetApp Harvest brings observability to ONTAP and StorageGRID clusters.
+Harvest collects performance, capacity and hardware metrics from ONTAP and StorageGRID,
+transforms them, and routes them to your choice of time-series database.
 
-> The open-metrics endpoint for ONTAP and StorageGRID
+The included Grafana dashboards deliver the datacenter insights you need, while
+new metrics can be collected with a few edits of the included template files.
 
-NetApp Harvest collects metrics from remote clusters, summarizes it, and stores it in metrics servers.
-
-The default package collects performance, capacity and hardware metrics from ONTAP and StorageGRID clusters.
-Metrics are delivered to Prometheus and InfluxDB databases - and displayed in Grafana dashboards.
-New metrics can be added by editing the config files.
-
-Harvest offers great flexibility in how it collects, augments, and exports data.
-You are more than welcome to contribute your own collector, plugin or exporter.
+Harvest is open-source, released under an [Apache2 license](https://github.com/NetApp/harvest/blob/main/LICENSE),
+and offers great flexibility in how you collect, augment, and export your datacenter metrics.
 
 !!! note
-    Hop onto our [Discord]() or GitHub [discussions]() and say hi. 👋🏽
+
+    Hop onto our [Discord](https://discordapp.com/channels/855068651522490400/1001963189124206732)
+    or GitHub [discussions](https://github.com/NetApp/harvest/discussions) and say hi. 👋🏽
diff --git a/docs/influxdb-exporter.md b/docs/influxdb-exporter.md
new file mode 100644
index 000000000..e40527ada
--- /dev/null
+++ b/docs/influxdb-exporter.md
@@ -0,0 +1,71 @@
+# InfluxDB Exporter
+
+???+ note "InfluxDB Install"
+
+    The information below describes how to set up Harvest's InfluxDB exporter.
+    If you need help installing or setting up InfluxDB, check
+    out [their documentation](https://docs.influxdata.com/).
+
+## Overview
+
+The InfluxDB Exporter formats metrics into
+InfluxDB's [line protocol](https://docs.influxdata.com/influxdb/v2.0/reference/syntax/line-protocol/#naming-restrictions)
+and writes them to a bucket.
+The Exporter is compatible with InfluxDB v2.0.
+For an explanation of `bucket`, `org`, and `precision`,
+see the [InfluxDB API documentation](https://docs.influxdata.com/influxdb/v2.0/api/#tag/Write).
+
+If you are monitoring both CDOT and 7mode clusters, it is strongly recommended to use two different buckets.
+
+## Parameters
+
+An overview of all parameters is provided below. Exactly one of `url` or `addr` must be provided.
+If `addr` is specified, it should be a valid TCP address or hostname of the InfluxDB server and should not include the
+scheme.
+When using `addr`, the `bucket`, `org`, and `token` key/values are required.
+
+> `addr` only works with HTTP. If you need to use HTTPS, you should use `url` instead.
+
+If `url` is specified, you must add all arguments to the URL.
+Harvest will do no additional processing and use exactly what you specify
+(e.g. `url: https://influxdb.example.com:8086/write?db=netapp&u=user&p=pass&precision=2`).
+When using `url`, the `bucket`, `org`, `port`, and `precision` fields will be ignored.
+
+| parameter        | type                         | description                                                                                         | default |
+|------------------|------------------------------|------------------------------------------------------------------------------------------------------|---------|
+| `url`            | string                       | URL of the database, format: `SCHEME://HOST[:PORT]`                                                 | |
+| `addr`           | string                       | address of the database, format: `HOST` (HTTP only)                                                 | |
+| `port`           | int, optional                | port of the database                                                                                | `8086`  |
+| `bucket`         | string, required with `addr` | InfluxDB bucket to write to                                                                         | |
+| `org`            | string, required with `addr` | InfluxDB organization name                                                                          | |
+| `precision`      | string, required with `addr` | Preferred timestamp precision in seconds                                                            | `2`     |
+| `client_timeout` | int, optional                | client timeout in seconds                                                                           | `5`     |
+| `token`          | string                       | [token for authentication](https://docs.influxdata.com/influxdb/v2.0/security/tokens/view-tokens/)  | |
+
+### Example
+
+Snippet from `harvest.yml` using `addr` (supports HTTP only):
+
+```yaml
+Exporters:
+  my_influx:
+    exporter: InfluxDB
+    addr: localhost
+    bucket: harvest
+    org: harvest
+    token: ZTTrt%24@#WNFM2VZTTNNT25wZWUdtUmhBZEdVUmd3dl@#
+```
+
+Snippet from `harvest.yml` using `url` (supports both HTTP/HTTPS):
+
+```yaml
+Exporters:
+  influx2:
+    exporter: InfluxDB
+    url: https://localhost:8086/api/v2/write?org=harvest&bucket=harvest&precision=s
+    token: my-token==
+```
+
+Notice: InfluxDB stores a token in `~/.influxdbv2/configs`, but you can also retrieve it from the UI (usually serving
+on `localhost:8086`): click on "Data" on the left task bar, then on "Tokens".
diff --git a/docs/install.md b/docs/install.md
new file mode 100644
index 000000000..fd7141430
--- /dev/null
+++ b/docs/install.md
@@ -0,0 +1,117 @@
+Get up and running with Harvest on your preferred platform.
+We provide pre-compiled binaries for Linux, RPMs, and Debs, as well
+as prebuilt container images for both [Nightly](https://github.com/NetApp/harvest/releases/tag/nightly)
+and stable [releases](https://github.com/NetApp/harvest/releases).
+
+## Native
+
+Visit the [Releases page](https://github.com/NetApp/harvest/releases) and copy the `tar.gz` link
+for the latest release. For example, to download the `v22.08.0` release:
+```
+wget https://github.com/NetApp/harvest/releases/download/v22.08.0/harvest-22.08.0-1_linux_amd64.tar.gz
+tar -xvf harvest-22.08.0-1_linux_amd64.tar.gz
+cd harvest-22.08.0-1_linux_amd64
+
+# Run Harvest with the default unix localhost collector
+bin/harvest start
+```
+
+??? info "With curl"
+
+    If you don't have `wget` installed, you can use `curl` like so:
+
+    ```
+    curl -L -O https://github.com/NetApp/harvest/releases/download/v22.08.0/harvest-22.08.0-1_linux_amd64.tar.gz
+    ```
+
+It's best to run Harvest as a non-root user. Make sure the user running Harvest can write to `/var/log/harvest/` or tell Harvest to write the logs somewhere else with the `HARVEST_LOGS` environment variable.
+
+If something goes wrong, examine the log files in `/var/log/harvest`, check out
+the [troubleshooting](https://github.com/NetApp/harvest/wiki/Troubleshooting-Harvest) section on the wiki, and jump
+onto [Discord](https://github.com/NetApp/harvest/blob/main/SUPPORT.md#getting-help) and ask for help.
+
+## Containers
+
+See [Harvest and containers](https://github.com/NetApp/harvest/blob/main/docker/README.md).
+
+## Package managers
+
+### Redhat
+
+> Installation and upgrade of the Harvest package may require root or administrator privileges
+
+Download the latest rpm of [Harvest](https://github.com/NetApp/harvest/releases/latest) from the releases
+tab and install or upgrade with yum.
+
+```
+sudo yum install harvest.XXX.rpm
+```
+
+Once the installation has finished, edit the [harvest.yml configuration](configure-harvest-basic.md) file
+located in `/opt/harvest/harvest.yml`.
+
+After editing `/opt/harvest/harvest.yml`, manage Harvest with `systemctl start|stop|restart harvest`.
+
+After upgrading, re-import all dashboards (either with the `bin/harvest grafana import` CLI or via the Grafana UI) to
+get any new enhancements in dashboards.
+
+> To ensure that you don't run
+> into [permission issues](https://github.com/NetApp/harvest/issues/122#issuecomment-856138831), make sure you manage
+> Harvest using `systemctl` instead of running the harvest binary directly.
+
+??? quote "Changes install makes"
+
+    * Directories `/var/log/harvest/` and `/var/log/run/` are created
+    * A `harvest` user and group are created and the installed files are chowned to harvest
+    * Systemd `/etc/systemd/system/harvest.service` file is created and enabled
+
+### Debian
+
+> Installation and upgrade of the Harvest package may require root or administrator privileges
+
+Download the latest deb of [Harvest](https://github.com/NetApp/harvest/releases/latest) from the releases
+tab and install or upgrade with apt.
+
+```
+sudo apt update
+sudo apt install|upgrade ./harvest-.amd64.deb
+```
+
+Once the installation has finished, edit the [harvest.yml configuration](configure-harvest-basic.md) file
+located in `/opt/harvest/harvest.yml`.
+
+After editing `/opt/harvest/harvest.yml`, manage Harvest with `systemctl start|stop|restart harvest`.
+
+After upgrading, re-import all dashboards (either with the `bin/harvest grafana import` CLI or via the Grafana UI) to
+get any new enhancements in dashboards.
+
+> To ensure that you don't run
+> into [permission issues](https://github.com/NetApp/harvest/issues/122#issuecomment-856138831), make sure you manage
+> Harvest using `systemctl` instead of running the harvest binary directly.
+
+??? quote "Changes install makes"
+
+    * Directories `/var/log/harvest/` and `/var/log/run/` are created
+    * A `harvest` user and group are created and the installed files are chowned to harvest
+    * Systemd `/etc/systemd/system/harvest.service` file is created and enabled
+
+## Nabox
+
+Instructions on how to install Harvest via [NAbox](https://nabox.org/documentation/installation/).
+
+## Source
+
+To build Harvest from source code, first make sure you have a working Go environment
+with [version 1.19 or greater installed](https://golang.org/doc/install).
+
+Clone the repo and build everything.
+
+```
+git clone https://github.com/NetApp/harvest.git
+cd harvest
+make build
+bin/harvest version
+```
+
+If you're building on a Mac, use `GOOS=darwin make build`.
+
+Check out the `Makefile` for other targets of interest.
diff --git a/docs/license.md b/docs/license.md
new file mode 100644
index 000000000..bb541e5c8
--- /dev/null
+++ b/docs/license.md
@@ -0,0 +1 @@
+[Harvest's License](https://github.com/NetApp/harvest/blob/main/LICENSE)
\ No newline at end of file
diff --git a/docs/manage-harvest.md b/docs/manage-harvest.md
new file mode 100644
index 000000000..ca69ef1e0
--- /dev/null
+++ b/docs/manage-harvest.md
@@ -0,0 +1 @@
+Coming Soon
\ No newline at end of file
diff --git a/docs/prepare-7mode-clusters.md b/docs/prepare-7mode-clusters.md
new file mode 100644
index 000000000..98e5b2a8e
--- /dev/null
+++ b/docs/prepare-7mode-clusters.md
@@ -0,0 +1,61 @@
+NetApp Harvest requires login credentials to access monitored hosts. Although a generic admin account can be used, it
+is best practice to create a dedicated monitoring account with least-privilege access.
+
+ONTAP 7-mode supports only username / password based authentication with NetApp Harvest.
+Harvest communicates with monitored systems exclusively via HTTPS, which is not enabled by default in Data
+ONTAP 7-mode. Log in as a user with full administrative privileges and execute the following steps.
+
+# Enabling HTTPS and TLS (ONTAP 7-mode only)
+
+Verify SSL is configured:
+
+```
+secureadmin status ssl
+```
+
+If SSL is ‘active’, continue. If not, set up SSL and be sure to choose a key length (bits) of 2048:
+
+```
+secureadmin setup ssl
+```
+
+```
+SSL Setup has already been done before. Do you want to proceed? [no] yes
+Country Name (2 letter code) [US]: NL
+State or Province Name (full name) [California]: Noord-Holland
+Locality Name (city, town, etc.) [Santa Clara]: Schiphol
+Organization Name (company) [Your Company]: NetApp
+Organization Unit Name (division): SalesEngineering
+Common Name (fully qualified domain name) [sdt-7dot1a.nltestlab.hq.netapp.com]:
+Administrator email: noreply@netapp.com
+Days until expires [5475] :5475
+Key length (bits) [512] :2048
+```
+
+Enable management via SSL and enable TLS:
+
+```
+options httpd.admin.ssl.enable on
+options tls.enable on
+```
+
+## Creating ONTAP user
+
+### Create the role with required capabilities
+
+```
+role add netapp-harvest-role -c "Role for performance monitoring by NetApp Harvest" -a login-http-admin,api-system-get-version,api-system-get-info,api-perf-object-*,api-ems-autosupport-log
+```
+
+### Create a group for this role
+
+```
+useradmin group add netapp-harvest-group -c "Group for performance monitoring by NetApp Harvest" -r netapp-harvest-role
+```
+
+### Create a user for the role and enter the password when prompted
+
+```
+useradmin user add netapp-harvest -c "User account for performance monitoring by NetApp Harvest" -n "NetApp Harvest" -g netapp-harvest-group
+```
+
+The user is now created and can be configured for use by NetApp Harvest.
\ No newline at end of file
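+
+As a minimal sketch, the matching poller entry in `harvest.yml` could look like this (the poller name, datacenter, and
+address are illustrative placeholders; replace them with your own values):
+
+```yaml
+Pollers:
+  netapp-7mode-01:           # illustrative poller name
+    datacenter: dc-01        # illustrative datacenter name
+    addr: 10.0.0.10          # address of your 7-mode system
+    auth_style: basic_auth
+    username: netapp-harvest # the user created above
+    password: your-password
+```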
diff --git a/docs/prepare-cdot-clusters.md b/docs/prepare-cdot-clusters.md
new file mode 100644
index 000000000..02ca991bb
--- /dev/null
+++ b/docs/prepare-cdot-clusters.md
@@ -0,0 +1,581 @@
+
+## Prepare ONTAP cDOT cluster
+
+NetApp Harvest requires login credentials to access monitored hosts. Although a generic admin account can be used, it
+is best practice to create a dedicated monitoring account with least-privilege access.
+
+In the examples below, the user, group, roles, etc., use a naming convention of ‘netapp-harvest’. These can be
+modified as needed to match your environment standards.
+
+There are a few steps required to prepare each monitored system for collection.
+Harvest supports two authentication styles (`auth_style`) to connect to ONTAP clusters.
+They are `basic_auth` or `certificate_auth`. Both work well, but if you're starting fresh, the recommendation is to
+create a read-only harvest user on your ONTAP server and use certificate-based TLS authentication.
+
+Here's a summary of what we're going to do:
+
+1. Create an ONTAP role with the necessary capabilities that Harvest will use to authenticate and collect data
+2. Create a user account using the role created in step #1.
+
+## Creating ONTAP user
+
+There are two ways to create a read-only user:
+
+1. Create a user with read-only access to **all** API objects
+2. Create a user with read-only access to only the APIs Harvest collects today
+
+The second option has a smaller attack surface, but each time you want to collect counters for a new object, you will
+need to update the user's privileges.
+
+Below we explain how to create an ONTAP user and role for Harvest using ONTAP System Manager (Classic interface & New
+interface) and the CLI.
+
+### System Manager: New interface
+
+*Note: in this section we add a user with read-only access to all API objects. For limited access, use either the
+classic interface or the CLI*
+
+Open System Manager. Click on *CLUSTER* in the left menu bar, then *Settings* and *Users and Roles*.
+
+![System Manager Settings](assets/prepare-ontap/ontap_user_sm_0.png)
+
+In the right column, under *Roles*, click on *Add* to add a new role.
+
+![System Manager Settings](assets/prepare-ontap/ontap_user_sm_1.png)
+
+Choose a role name (e.g. *harvest2-role*). In the *REST API PATH* field, type */api* and select *Read-Only* for
+*ACCESS*. Click on *Save*.
+
+![System Manager Settings](assets/prepare-ontap/ontap_user_sm_2.png)
+
+In the left column, under *Users*, click on *Add* to create a new user. Choose a username. Under *Role*, select the role
+that we just created. Under *User Login Methods* select *ONTAPI*, and select one of the two authentication methods. Type
+in a password if you chose *Password*. Click on *Save*.
+
+![System Manager Settings](assets/prepare-ontap/ontap_user_sm_3.png)
+
+If you chose *Password*, you can add the username and password to the Harvest configuration file and start Harvest. If
+you chose *Certificate*, jump to [Using Certificate Authentication](#using-certificate-authentication) to generate
+certificate files.
+
+### System Manager: Classic interface
+
+Open System Manager. Click on the Settings icon in the top-right corner of the window.
+
+![System Manager Classic Settings](assets/prepare-ontap/ontap_user_smc_0.png)
+
+Click on *Roles* in the left menu bar and click *Add*. Choose a role name (e.g. *harvest2-role*).
+
+![System Manager Classic Settings](assets/prepare-ontap/ontap_user_smc_1.png)
+
+If you want to give Harvest read-only access to **all** API objects, then under *Role Attributes* click on *Add*, under
+*Command* type *DEFAULT*, leave *Query* empty, select *readonly* under *Access Level*, click on *OK* and *Add*.
+
+If you want to limit the API objects, then under *Role Attributes*, add each of the following lines as an entry. All of
+these should be entered under the *Command* column, *Query* should be left blank, and *Access Level* should be set to
+*readonly*.
+
+* cluster
+* lun
+* snapmirror
+* statistics
+* storage aggregate
+* storage disk
+* storage shelf
+* system node
+* version
+* volume
+
+After you click on *Add*, this is what you should see:
+
+![System Manager Classic Settings](assets/prepare-ontap/ontap_user_smc_2.png)
+
+Now we need to create a user. Click on *Users* in the left menu bar and *Add*. Choose a username and password. Under
+*User Login Methods*, click on *Add*, select *ontapi* as *Application*, and select the role that we just created as
+*Role*. Click on *Add* in the pop-up window to save.
+
+![System Manager Classic Settings](assets/prepare-ontap/ontap_user_smc_3.png)
+
+Now add the username and password to `harvest.yml` and start Harvest.
+
+### ONTAP CLI
+
+We are going to:
+
+- create a Harvest role with read-only access to the API objects
+- create a Harvest user and assign it to that role
+
+You should decide if you want to limit the Harvest role to only the subset of API objects Harvest requires or
+give Harvest access to all API objects. In both cases, Harvest's access will be read-only.
+
+Either approach is fine; following the principle of least privilege, we recommend the limited approach.
+
+Log in to the CLI of your cDOT ONTAP system using SSH.
+
+#### Least-privilege approach
+
+Verify there are no errors when you copy/paste these. Warnings are fine.
+
+```bash
+security login role create -role harvest2-role -access readonly -cmddirname "cluster"
+security login role create -role harvest2-role -access readonly -cmddirname "lun"
+security login role create -role harvest2-role -access readonly -cmddirname "qos workload show"
+security login role create -role harvest2-role -access readonly -cmddirname "snapmirror"
+security login role create -role harvest2-role -access readonly -cmddirname "statistics"
+security login role create -role harvest2-role -access readonly -cmddirname "storage aggregate"
+security login role create -role harvest2-role -access readonly -cmddirname "storage disk"
+security login role create -role harvest2-role -access readonly -cmddirname "storage shelf"
+security login role create -role harvest2-role -access readonly -cmddirname "system health status show"
+security login role create -role harvest2-role -access readonly -cmddirname "system health subsystem show"
+security login role create -role harvest2-role -access readonly -cmddirname "system node"
+security login role create -role harvest2-role -access readonly -cmddirname "version"
+security login role create -role harvest2-role -access readonly -cmddirname "volume"
+
+# Permissions required for Harvest 22.05+ security dashboard
+security login role create -role harvest2-role -access readonly -cmddirname "network interface"
+security login role create -role harvest2-role -access readonly -cmddirname "security"
+security login role create -role harvest2-role -access readonly -cmddirname "storage encryption disk"
+security login role create -role harvest2-role -access readonly -cmddirname "vserver"
+```
+
+#### All APIs read-only approach
+
+```bash
+security login role create -role harvest2-role -access readonly -cmddirname "DEFAULT"
+```
+
+#### Create the harvest user and associate it with the role
+
+Use this for password authentication:
+
+```bash
+# ZAPI based access
+security login create -user-or-group-name harvest2 -application ontapi -role harvest2-role -authentication-method password
+
+# REST based access
+security login create -user-or-group-name harvest2 -application http -role harvest2-role -authentication-method password
+```
+
+Or this for certificate authentication:
+
+```bash
+security login create -user-or-group-name harvest2 -application ontapi \
+ -role harvest2-role -authentication-method cert
+```
+
+#### 7-Mode CLI
+
+Log in to the CLI of your 7-Mode ONTAP system (e.g. using SSH). First, we create a user role. If you want to give the
+user readonly access to **all** API objects, type in the following command:
+
+```bash
+useradmin role modify harvest2-role -a login-http-admin,api-system-get-version, \
+api-system-get-info,api-perf-object-*,api-ems-autosupport-log,api-diagnosis-status-get, \
+api-lun-list-info,api-diagnosis-subsystem-config-get-iter,api-disk-list-info, \
+api-diagnosis-config-get-iter,api-aggr-list-info,api-volume-list-info, \
+api-storage-shelf-environment-list-info,api-qtree-list,api-quota-report
+```
+
+# Using Certificate Authentication
+
+See [comments here for troubleshooting](https://github.com/NetApp/harvest/issues/314#issuecomment-882120238) client
+certificate authentication.
+
+Client certificate authentication allows you to authenticate with your ONTAP cluster without including
+username/passwords in your `harvest.yml` file. The process to set up client certificates is straightforward, although
+self-signed certificates introduce more work, as does Go's strict treatment of common names.
+
+Unless you've installed production certificates on your ONTAP cluster, you'll need to replace your cluster's
+common-name-based self-signed certificates with a subject alternative name (SAN) based certificate. After that step is
+completed, we'll create client certificates and add those for passwordless login.
+
+If you can't or don't want to replace your ONTAP cluster certificates, there are some workarounds. You can:
+
+- Use `use_insecure_tls: true` in your `harvest.yml` to disable certificate verification
+- Change your `harvest.yml` to connect via hostname instead of IP address
+
+## Create Self-Signed Subject Alternate Name Certificates for ONTAP
+
+Subject alternate name (SAN) certificates allow multiple hostnames in a single certificate. Starting with Go 1.3, when
+connecting to a cluster via its IP address, the CN field in the server certificate is ignored. This often causes errors
+like this:
+`x509: cannot validate certificate for 127.0.0.1 because it doesn't contain any IP SANs`
+
+### Overview of steps to create a self-signed SAN certificate and make ONTAP use it
+
+1. Create a root key
+2. Create a root certificate authority certificate
+3. Create a SAN certificate for your ONTAP cluster, using #2 to create it
+4. Install the root CA certificate created in step #2 on the cluster
+5. Install the SAN certificate created in step #3 on your cluster
+6. Modify your cluster/SVM to use the new certificate installed in step #5
+
+#### Setup
+
+```
+# create a place to store the certificate authority files, adjust as needed
+mkdir -p ca/{private,certs}
+```
+
+#### Create a root key
+
+```
+cd ca
+# generate a private key that we will use to create our self-signed certificate authority
+openssl genrsa -out private/ca.key.pem 4096
+chmod 400 private/ca.key.pem
+```
+
+#### Create a root certificate authority certificate
+
+Download the sample [samples/openssl.cnf] file and put it in the directory we created in [setup](#setup). Edit line 9,
+changing `dir` to point to your `ca` directory created in [setup](#setup).
+
+```
+openssl req -config openssl.cnf -key private/ca.key.pem -new -x509 -days 7300 -sha256 -extensions v3_ca -out certs/ca.cert.pem
+
+# Verify
+openssl x509 -noout -text -in certs/ca.cert.pem
+
+# Make sure these are present
+    Signature Algorithm: sha256WithRSAEncryption   <======== Signature Algorithm can not be sha-1
+    X509v3 extensions:
+        X509v3 Subject Key Identifier:
+            --removed
+        X509v3 Authority Key Identifier:
+            --removed
+
+        X509v3 Basic Constraints: critical
+            CA:TRUE                                <======== CA must be true
+        X509v3 Key Usage: critical
+            Digital Signature, Certificate Sign, CRL Sign   <======== Digital and certificate signature
+```
+
+#### Create a SAN certificate for your ONTAP cluster
+
+First, we'll create the certificate signing request and then the certificate. In this example, the ONTAP cluster is
+named `umeng-aff300-05-06`; update accordingly.
+
+Download the sample [samples/server_cert.cnf] file and put it in the directory we created in [setup](#setup). Edit lines
+18-21 to include your ONTAP cluster hostnames and IP addresses. Edit lines 6-11 with new names as needed.
+
+```
+openssl req -new -newkey rsa:4096 -nodes -sha256 -subj "/" -config server_cert.cnf -outform pem -out umeng-aff300-05-06.csr -keyout umeng-aff300-05-06.key
+
+# Verify
+openssl req -text -noout -in umeng-aff300-05-06.csr
+
+# Make sure these are present
+    Attributes:
+    Requested Extensions:
+        X509v3 Subject Alternative Name:           <======== Section that lists alternate DNS and IP names
+            DNS:umeng-aff300-05-06-cm.rtp.openenglab.netapp.com, DNS:umeng-aff300-05-06, IP Address:10.193.48.11, IP Address:10.193.48.11
+    Signature Algorithm: sha256WithRSAEncryption   <======== Signature Algorithm can not be sha-1
+```
+
+We'll now use the certificate signing request and the recently created certificate authority to create a new SAN
+certificate for our cluster.
+
+```
+openssl x509 -req -sha256 -days 30 -in umeng-aff300-05-06.csr -CA certs/ca.cert.pem -CAkey private/ca.key.pem -CAcreateserial -out umeng-aff300-05-06.crt -extensions req_ext -extfile server_cert.cnf
+
+# Verify
+openssl x509 -text -noout -in umeng-aff300-05-06.crt
+
+# Make sure these are present
+X509v3 extensions:
+    X509v3 Subject Alternative Name:               <======== Section that lists alternate DNS and IP names
+        DNS:umeng-aff300-05-06-cm.rtp.openenglab.netapp.com, DNS:umeng-aff300-05-06, IP Address:10.193.48.11, IP Address:10.193.48.11
+    Signature Algorithm: sha256WithRSAEncryption   <======== Signature Algorithm can not be sha-1
+```
+
+#### Install Root CA Certificate On Cluster
+
+Log in to your cluster with admin credentials and install the server certificate authority.
+
+```
+ssh admin@IP
+umeng-aff300-05-06::*> security certificate install -type server-ca
+
+Please enter Certificate: Press <Enter> when done
+-----BEGIN CERTIFICATE-----
+...
+-----END CERTIFICATE-----
+
+You should keep a copy of the CA-signed digital certificate for future reference.
+
+The installed certificate's CA and serial number for reference:
+CA: ntap
+Serial: 46AFFC7A3A9999999E8FB2FEB0
+
+The certificate's generated name for reference: ntap
+```
+
+Now install the server certificate we created above with SAN.
+
+```
+umeng-aff300-05-06::*> security certificate install -type server
+
+Please enter Certificate: Press <Enter> when done
+-----BEGIN CERTIFICATE-----
+..
+-----END CERTIFICATE-----
+
+Please enter Private Key: Press <Enter> when done
+-----BEGIN PRIVATE KEY-----
+...
+-----END PRIVATE KEY-----
+
+Please enter certificates of Certification Authorities (CA) which form the certificate chain of the server certificate. This starts with the issuing CA certificate of the server certificate and can range up to the root CA certificate.
+
+Do you want to continue entering root and/or intermediate certificates {y|n}: n
+```
+
+If ONTAP tells you the provided certificate does not have a common name in the subject field, type the hostname of the
+cluster like this:
+
+```
+The provided certificate does not have a common name in the subject field.
+
+Enter a valid common name to continue installation of the certificate:
+
+Enter a valid common name to continue installation of the certificate: umeng-aff300-05-06-cm.rtp.openenglab.netapp.com
+
+You should keep a copy of the private key and the CA-signed digital certificate for future reference.
+
+The installed certificate's CA and serial number for reference:
+CA: ntap
+Serial: 67A94AA25B229A68AC5BABACA8939A835AA998A58
+
+The certificate's generated name for reference: umeng-aff300-05-06-cm.rtp.openenglab.netapp.com
+```
+
+#### Modify the admin SVM to use the new certificate
+
+We'll modify the cluster's admin SVM to use the just-installed server certificate and certificate authority.
+
+```
+vserver show -type admin -fields vserver,type
+vserver            type
+------------------ -----
+umeng-aff300-05-06 admin
+
+umeng-aff300-05-06::*> ssl modify -vserver umeng-aff300-05-06 -server-enabled true -serial 67A94AA25B229A68AC5BABACA8939A835AA998A58 -ca ntap
+  (security ssl modify)
+```
+
+You can verify the certificate(s) are installed and working by using `openssl` like so:
+
+```
+openssl s_client -CAfile certs/ca.cert.pem -showcerts -servername server -connect umeng-aff300-05-06-cm.rtp.openenglab.netapp.com:443
+
+CONNECTED(00000005)
+depth=1 C = US, ST = NC, L = RTP, O = ntap, OU = ntap
+verify return:1
+depth=0
+verify return:1
+...
+```
+
+Without the `-CAfile`, `openssl` will report:
+
+```
+CONNECTED(00000005)
+depth=0
+verify error:num=20:unable to get local issuer certificate
+verify return:1
+depth=0
+verify error:num=21:unable to verify the first certificate
+verify return:1
+---
+```
+
+## Create Client Certificates for Password-less Login
+
+Copy the server certificate we created above into the Harvest install directory.
+
+```
+cp ca/umeng-aff300-05-06.crt /opt/harvest
+cd /opt/harvest
+```
+
+Create a self-signed client key and certificate with the same name as the hostname where Harvest is running. It's not
+required to name the key/cert pair after the hostname, but if you do, Harvest will load them automatically when you
+specify `auth_style: certificate_auth`; otherwise, you can point to them directly.
+See [Pollers](https://github.com/NetApp/harvest#pollers) for details.
+
+Change the common name to the ONTAP user you set up with the harvest role above, e.g. `harvest2`.
+
+```
+cd /opt/harvest
+mkdir cert
+openssl req -x509 -nodes -days 1095 -newkey rsa:2048 -keyout cert/$(hostname).key -out cert/$(hostname).pem -subj "/CN=harvest2"
+```
+
+## Install Client Certificates on Cluster
+
+Log in to your cluster with admin credentials and install the client certificate.
+
+```
+ssh admin@IP
+umeng-aff300-05-06::*> security certificate install -type client-ca -vserver umeng-aff300-05-06
+
+Please enter Certificate: Press <Enter> when done
+-----BEGIN CERTIFICATE-----
+...
+-----END CERTIFICATE-----
+
+You should keep a copy of the CA-signed digital certificate for future reference.
+
+The installed certificate's CA and serial number for reference:
+CA: cbg
+Serial: B77B59444444CCCC
+
+The certificate's generated name for reference: cbg_B77B59444444CCCC
+```
+
+Now that the client certificate is installed, let's enable it.
+
+```
+umeng-aff300-05-06::*> ssl modify -vserver umeng-aff300-05-06 -client-enabled true
+  (security ssl modify)
+```
+
+Verify with a recent version of `curl`. If you are running on a Mac, [see below](#macos).
+
+```
+curl --cacert umeng-aff300-05-06.crt --key cert/$(hostname).key --cert cert/$(hostname).pem https://umeng-aff300-05-06-cm.rtp.openenglab.netapp.com/api/storage/disks
+```
+
+## Update Harvest.yml to use client certificates
+
+Update the poller section with `auth_style: certificate_auth` like this:
+
+```
+  u2-cert:
+    auth_style: certificate_auth
+    addr: umeng-aff300-05-06-cm.rtp.openenglab.netapp.com
+```
+
+Restart your poller and enjoy your password-less lifestyle.
+
+### macOS
+
+The version of `curl` installed on macOS up through Monterey is not recent enough to work with self-signed SAN certs.
+You will need to install a newer version of `curl` via Homebrew, MacPorts, source, etc.
+
+Example of a failure when running with an older version of `curl` - you will see this
+in the [client auth](#install-client-certificates-on-cluster) test step above.
+
+```
+curl --version
+curl 7.64.1 (x86_64-apple-darwin20.0) libcurl/7.64.1 (SecureTransport) LibreSSL/2.8.3 zlib/1.2.11 nghttp2/1.41.0
+
+curl --cacert umeng-aff300-05-06.crt --key cert/cgrindst-mac-0.key --cert cert/cgrindst-mac-0.pem https://umeng-aff300-05-06-cm.rtp.openenglab.netapp.com/api/storage/disks
+
+curl: (60) SSL certificate problem: unable to get local issuer certificate
+```
+
+Let's install `curl` via Homebrew. Make sure you don't miss the message that Homebrew prints about your path.
+
+```
+If you need to have curl first in your PATH, run:
+  echo 'export PATH="/usr/local/opt/curl/bin:$PATH"' >> /Users/cgrindst/.bash_profile
+```
+
+Now when we make a client auth request with our self-signed certificate, it works! `\o/`
+
+```
+brew install curl
+
+curl --version
+curl 7.80.0 (x86_64-apple-darwin20.6.0) libcurl/7.80.0 (SecureTransport) OpenSSL/1.1.1l zlib/1.2.11 brotli/1.0.9 zstd/1.5.0 libidn2/2.3.2 libssh2/1.10.0 nghttp2/1.46.0 librtmp/2.3 OpenLDAP/2.6.0
+Release-Date: 2021-11-10
+Protocols: dict file ftp ftps gopher gophers http https imap imaps ldap ldaps mqtt pop3 pop3s rtmp rtsp scp sftp smb smbs smtp smtps telnet tftp
+Features: alt-svc AsynchDNS brotli GSS-API HSTS HTTP2 HTTPS-proxy IDN IPv6 Kerberos Largefile libz MultiSSL NTLM NTLM_WB SPNEGO SSL TLS-SRP UnixSockets zstd
+
+curl --cacert umeng-aff300-05-06.crt --key cert/cgrindst-mac-0.key --cert cert/cgrindst-mac-0.pem https://umeng-aff300-05-06-cm.rtp.openenglab.netapp.com/api/storage/disks
+
+{
+  "records": [
+    {
+      "name": "1.1.22",
+      "_links": {
+        "self": {
+          "href": "/api/storage/disks/1.1.22"
+        }
+      }
+    }
+  ]
+}
+```
+
+----
+
+Change directory to your Harvest home directory (replace `/opt/harvest/` if this is not the default):
+
+```bash
+$ cd /opt/harvest/
+```
+
+Generate an SSL cert and key pair with the following command. Note that it's preferred to generate these files using the
+hostname of the local machine. The command below assumes `debian8` as our hostname and `harvest2` as the user we
+created in the previous step:
+
+```bash
+openssl req -x509 -nodes -days 1095 -newkey rsa:2048 -keyout cert/debian8.key \
+ -out cert/debian8.pem -subj "/CN=harvest2"
+```
+
+Next, open the public key (`debian8.pem` in our example) and copy all of its content. Log in to your ONTAP CLI and run
+this command by replacing **CLUSTER** with the name of your cluster.
+
+```bash
+security certificate install -type client-ca -vserver CLUSTER
+```
+
+Paste the public key content and hit Enter. Output should be similar to this:
+
+```bash
+jamaica::> security certificate install -type client-ca -vserver jamaica
+
+Please enter Certificate: Press <Enter> when done
+-----BEGIN CERTIFICATE-----
+MIIDETCCAfmgAwIBAgIUP9EUXyl2BDSUOkNEcDU0yqbJ29IwDQYJKoZIhvcNAQEL
+BQAwGDEWMBQGA1UEAwwNaGFydmVzdDItY2xpMzAeFw0yMDEwMDkxMjA0MDhaFw0y
+MzEwMDktcGFueSBMdGQxFzAVBgNVBAMlc3QyLWNsaTMwggEiMA0tcGFueSBGCSqG
+SIb3DQEBAQUAA4IBDwAwggEKAoIBAQCVVy25BeCRoGCJWFOlyUL7Ddkze4Hl2/6u
+qye/3mk5vBNsGuXUrtad5XfBB70Ez9hWl5sraLiY68ro6MyX1icjiUTeaYDvS/76
+Iw7HeXJ5Pyb/fWth1nePunytoLyG/vaTCySINkIV5nlxC+k0X3wWFJdfJzhloPtt
+1Vdm7aCF2q6a2oZRnUEBGQb6t5KyF0/Xh65mvfgB0pl/AS2HY5Gz+~L54Xyvs+BY
+V7UmTop7WBYl0L3QXLieERpHXnyOXmtwlm1vG5g4n/0DVBNTBXjEdvc6oRh8sxBN
+ZlQWRApE7pa/I1bLD7G2AiS4UcPmR4cEpPRVEsOFOaAN3Z3YskvnAgMBAAGjUzBR
+MB0GA1UdDgQWBBQr4syV6TCcgO/5EcU/F8L2YYF15jAfBgNVHSMEGDAWgBQr4syV
+6TCcgO/5EcU/F8L2YYF15jAPBgNVHRMdfdfwerH/MA0GCSqGSIb^ECd3DQEBCwUA
+A4IBAQBjP1BVhClRKkO/M3zlWa2L9Ztce6SuGwSnm6Ebmbs+iMc7o2N9p3RmV6Xl
+h6NcdXRzzPAVrUoK8ewhnBzdghgIPoCI6inAf1CUhcCX2xcnE/osO+CfvKuFnPYE
+WQ7UNLsdfka0a9kTK13r3GMs09z/VsDs0gD8UhPjoeO7LQhdU9tJ/qOaSP3s48pv
+sYzZurHUgKmVOaOE4t9DAdevSECEWCETRETA$Vbn%@@@%%rcdrctru65ryFaByb+
+hTtGhDnoHwzt/cAGvLGV/RyWdGFAbu7Fb1rV94ceggE7nh1FqbdLH9siot6LlnQN
+MhEWp5PYgndOW49dDYUxoauCCkiA
+-----END CERTIFICATE-----
+
+
+You should keep a copy of the CA-signed digital certificate for future reference.
+
+The installed certificate's CA and serial number for reference:
+CA: harvest2
+Serial: 3FD1145F2976043012213d3009095534CCRDBD2
+
+The certificate's generated name for reference: harvest2
+```
+
+Finally, we need to enable SSL authentication with the following command (replace **CLUSTER** with the name of your
+cluster):
+
+```bash
+security ssl modify -client-enabled true -vserver CLUSTER
+```
+
+## Reference
+
+- https://github.com/jcbsmpsn/golang-https-example
diff --git a/docs/prepare-storagegrid-clusters.md b/docs/prepare-storagegrid-clusters.md
new file mode 100644
index 000000000..ca69ef1e0
--- /dev/null
+++ b/docs/prepare-storagegrid-clusters.md
@@ -0,0 +1 @@
+Coming Soon
\ No newline at end of file
diff --git a/docs/prometheus-exporter.md b/docs/prometheus-exporter.md
new file mode 100644
index 000000000..b7cb85f62
--- /dev/null
+++ b/docs/prometheus-exporter.md
@@ -0,0 +1,324 @@
+# Prometheus Exporter
+
+???+ note "Prometheus Install"
+
+    The information below describes how to set up Harvest's Prometheus exporter.
+    If you need help installing or setting up Prometheus, check
+    out [their documentation](https://prometheus.io/docs/prometheus/latest/getting_started/).
+
+## Overview
+
+The Prometheus exporter is responsible for:
+
+- formatting metrics into the Prometheus [line protocol](https://prometheus.io/docs/instrumenting/exposition_formats/)
+- creating a web-endpoint on `http://<ADDR>:<PORT>/metrics` for Prometheus to scrape
+
+A web end-point is required because Prometheus scrapes Harvest by polling that end-point.
+
+In addition to the `/metrics` end-point, the Prometheus exporter also serves an overview of all metrics and collectors
+available on its root address `http://<ADDR>:<PORT>/`.
+
+Because Prometheus polls Harvest, don't forget
+to [update your Prometheus configuration](#configure-prometheus-to-scrape-harvest-pollers) and tell Prometheus how to
+scrape each poller.
+
+There are two ways to configure the Prometheus exporter: using a `port range` or individual `port`s.
+
+The `port range` is more flexible and should be used when you want multiple pollers all exporting to the same instance
+of Prometheus. Both options are explained below.
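+
+For instance, a minimal sketch of the individual-`port` style (the exporter and poller names are illustrative):
+
+```yaml
+Exporters:
+  prom-single:
+    exporter: Prometheus
+    port: 12990        # fixed HTTP end-point for this exporter
+Pollers:
+  cluster-01:
+    exporters:
+      - prom-single
+```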
+
+## Parameters
+
+All parameters of the exporter are defined in the `Exporters` section of `harvest.yml`.
+
+An overview of all parameters:
+
+| parameter           | type                                           | description                                                                                                                                                                                                     | default   |
+|---------------------|------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------|
+| `port_range`        | int-int (range), overrides `port` if specified | lower port to upper port (inclusive) of the HTTP end-point to create when a poller specifies this exporter. Starting at the lower port, each free port will be tried sequentially up to the upper port.        | |
+| `port`              | int, required if `port_range` is not specified | port of the HTTP end-point                                                                                                                                                                                     | |
+| `local_http_addr`   | string, optional                               | address of the HTTP server Harvest starts for Prometheus to scrape: <br/>use `localhost` to serve only on the local machine, <br/>use `0.0.0.0` (default) if Prometheus is scraping from another machine       | `0.0.0.0` |
+| `global_prefix`     | string, optional                               | add a prefix to all metrics (e.g. `netapp_`)                                                                                                                                                                   | |
+| `allow_addrs`       | list of strings, optional                      | allow access only if the host matches any of the provided addresses                                                                                                                                            | |
+| `allow_addrs_regex` | list of strings, optional                      | allow access only if the host address matches at least one of the regular expressions                                                                                                                          | |
+| `cache_max_keep`    | string (Go duration format), optional          | maximum amount of time metrics are cached (in case Prometheus does not collect the metrics in time)                                                                                                            | `300s`    |
+| `add_meta_tags`     | bool, optional                                 | add `HELP` and `TYPE` [metatags](https://prometheus.io/docs/instrumenting/exposition_formats/#comments-help-text-and-type-information) to metrics (currently no useful information, but required by some tools) | `false`   |
+| `sort_labels`       | bool, optional                                 | sort metric labels before exporting. Some [open-metrics scrapers report](https://github.com/NetApp/harvest/issues/756) stale metrics when labels are not sorted.                                               | `false`   |
+
+A few examples:
+
+#### port_range
+
+```yaml
+Exporters:
+  prom-prod:
+    exporter: Prometheus
+    port_range: 2000-2030
+Pollers:
+  cluster-01:
+    exporters:
+      - prom-prod
+  cluster-02:
+    exporters:
+      - prom-prod
+  cluster-03:
+    exporters:
+      - prom-prod
+  # ... more
+  cluster-16:
+    exporters:
+      - prom-prod
+```
+
+Sixteen pollers will collect metrics from 16 clusters and make those metrics available to a single instance of
+Prometheus named `prom-prod`. Sixteen web end-points will be created on the first 16 available free ports between 2000
+and 2030 (inclusive).
+
+After starting the pollers in the example above, running `bin/harvest status` shows the following. Note that ports 2000
+and 2003 were not available, so the next free port in the range was selected. If no free port can be found, an error
+will be logged.
+
+```
+Datacenter   Poller       PID     PromPort  Status
+++++++++++++ +++++++++++  +++++++ +++++++++ ++++++++++++++++++++
+DC-01        cluster-01   2339    2001      running
+DC-01        cluster-02   2343    2002      running
+DC-01        cluster-03   2351    2004      running
+...
+DC-01        cluster-14   2405    2015      running
+DC-01        cluster-15   2502    2016      running
+DC-01        cluster-16   2514    2017      running
+```
+
+#### allow_addrs
+
+```yaml
+Exporters:
+  my_prom:
+    allow_addrs:
+      - 192.168.0.102
+      - 192.168.0.103
+```
+
+will only allow access from exactly these two addresses.
+
+#### allow_addrs_regex
+
+```yaml
+Exporters:
+  my_prom:
+    allow_addrs_regex:
+      - `^192.168.0.\d+$`
+```
+
+will only allow access from the IPv4 range `192.168.0.0`-`192.168.0.255`.
+
+## Configure Prometheus to scrape Harvest pollers
+
+There are two ways to tell Prometheus how to scrape Harvest: using HTTP service discovery (SD) or listing each poller
+individually.
+
+HTTP service discovery is the more flexible of the two. It is also less error-prone and easier to manage. Combined with
+the port_range configuration described above, SD is the least effort to configure Prometheus and the easiest way to keep
+both Harvest and Prometheus in sync.
+
+**NOTE** HTTP service discovery does not work with Docker yet. With Docker, you will need to list each poller
+individually or, if possible, use the [Docker Compose](https://github.com/NetApp/harvest/tree/main/docker) workflow that
+uses file service discovery to achieve a similar ease-of-use as HTTP service discovery.
+
+See the [example](#prometheus-http-service-discovery-and-port-range) below for how to use HTTP SD and port_range
+together.
+
+### Prometheus HTTP Service Discovery
+
+[HTTP service discovery](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#http_sd_config) was
+introduced in Prometheus version 2.28.0. Make sure you're using that version or later.
+
+The way service discovery works is:
+
+- shortly after a poller starts up, it registers with the SD node (if one exists)
+- the poller sends a heartbeat to the SD node, by default every 45s
+- if a poller fails to send a heartbeat, the SD node removes the poller from the list of active targets after a minute
+- the SD end-point is reachable via `SCHEME://<IP>:<PORT>/api/v1/sd`
+
+To use HTTP service discovery you need to:
+
+1. tell [Harvest to start the HTTP service discovery process](#enable-http-service-discovery-in-harvest)
+2. tell [Prometheus to use the HTTP service discovery endpoint](#enable-http-service-discovery-in-prometheus)
+
+#### Enable HTTP service discovery in Harvest
+
+Add the following to your `harvest.yml`
+
+```yaml
+Admin:
+  httpsd:
+    listen: :8887
+```
+
+This tells Harvest to create an HTTP service discovery end-point on interface `0.0.0.0:8887`. If you want to only listen
+on localhost, use `127.0.0.1:<PORT>` instead. See [net.Dial](https://pkg.go.dev/net#Dial) for details on the supported
+listen formats.
+
+Start the SD process by running `bin/harvest admin start`. Once it is started, you can curl the end-point for the list
+of running Harvest pollers.
+
+```
+curl -s 'http://localhost:8887/api/v1/sd' | jq .
+[
+  {
+    "targets": [
+      "10.0.1.55:12990",
+      "10.0.1.55:15037",
+      "127.0.0.1:15511",
+      "127.0.0.1:15008",
+      "127.0.0.1:15191",
+      "10.0.1.55:15343"
+    ]
+  }
+]
+```
+
+#### Harvest HTTP Service Discovery options
+
+HTTP service discovery (SD) is configured in the `Admin > httpsd` section of your `harvest.yml`.
+
+| parameter                         | type                                                                   | description                                                                                                                                                                                                                                                                                                                                      | default |
+|-----------------------------------|------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------|
+| `listen`                          | **required**                                                           | Interface and port to listen on, use localhost:PORT or :PORT for all interfaces                                                                                                                                                                                                                                                                 | |
+| `auth_basic`                      | optional                                                               | If present, enables basic authentication on the `/api/v1/sd` end-point                                                                                                                                                                                                                                                                          | |
+| auth_basic `username`, `password` | **required** child of `auth_basic`                                     |                                                                                                                                                                                                                                                                                                                                                  | |
+| `tls`                             | optional                                                               | If present, enables TLS transport. If running in a container, see [note](https://github.com/NetApp/harvest/issues/672#issuecomment-1036338589)                                                                                                                                                                                                  | |
+| tls `cert_file`, `key_file`       | **required** child of `tls`                                            | Relative or absolute path to TLS certificate and key file. TLS 1.3 certificates required. <br/>FIPS compliant P-256 TLS 1.3 certificates can be created with `bin/harvest admin tls create server`                                                                                                                                              | |
+| `ssl_cert`, `ssl_key`             | optional if `auth_style` is `certificate_auth`                         | Absolute paths to SSL (client) certificate and key used to authenticate with the target system. <br/>If not provided, the poller will look for `<hostname>.key` and `<hostname>.pem` in `$HARVEST_HOME/cert/`. <br/>To create certificates for ONTAP systems, see [using certificate authentication](prepare-cdot-clusters.md#using-certificate-authentication) | |
+| `heart_beat`                      | optional, [Go Duration format](https://pkg.go.dev/time#ParseDuration)  | How frequently each poller sends a heartbeat message to the SD node                                                                                                                                                                                                                                                                             | 45s     |
+| `expire_after`                    | optional, [Go Duration format](https://pkg.go.dev/time#ParseDuration)  | If a poller fails to send a heartbeat, the SD node removes the poller after this duration                                                                                                                                                                                                                                                       | 1m      |
+
+#### Enable HTTP service discovery in Prometheus
+
+Edit your `prometheus.yml` and add the following section:
+
+`$ vim /etc/prometheus/prometheus.yml`
+
+```yaml
+scrape_configs:
+  - job_name: harvest
+    http_sd_configs:
+      - url: http://localhost:8887/api/v1/sd
+```
+
+Harvest and Prometheus both support basic authentication for HTTP SD end-points. To enable basic auth, add the following
+to your Harvest config.
+
+```yaml
+Admin:
+  httpsd:
+    listen: :8887
+    # Basic auth protects GETs and publishes
+    auth_basic:
+      username: admin
+      password: admin
+```
+
+Don't forget to also update your Prometheus config with the
+matching [basic_auth](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#http_sd_config)
+credentials.
+
+### Prometheus HTTP Service Discovery and Port Range
+
+HTTP SD combined with Harvest's `port_range` feature leads to significantly less configuration in your `harvest.yml`.
+For example, if your clusters all export to the same Prometheus instance, you can refactor the per-poller exporter into
+a single exporter shared by all clusters in `Defaults` as shown below:
+
+Notice that none of the pollers specify an exporter. Instead, all the pollers share the single exporter
+named `prometheus-r` listed in `Defaults`. `prometheus-r` is the only exporter defined and, as specified, will manage up
+to 1,000 Harvest Prometheus exporters.
+
+If you add or remove more clusters in the `Pollers` section, you do not have to change Prometheus since it dynamically
+pulls the targets from the Harvest admin node.
+
+```yaml
+Admin:
+  httpsd:
+    listen: :8887
+
+Exporters:
+  prometheus-r:
+    exporter: Prometheus
+    port_range: 13000-13999
+
+Defaults:
+  collectors:
+    - Zapi
+    - ZapiPerf
+  use_insecure_tls: false
+  auth_style: basic_auth
+  username: admin
+  password: pass
+  exporters:
+    - prometheus-r
+
+Pollers:
+  umeng_aff300:
+    datacenter: meg
+    addr: 10.193.48.11
+
+  F2240-127-26:
+    datacenter: meg
+    addr: 10.193.6.61
+
+  # ... add more clusters
+```
+
+### Static Scrape Targets
+
+If we define four Prometheus exporters at ports 12990, 12991, 14567, and 14568, you need to add four sections to
+your `prometheus.yml`.
+
+```bash
+$ vim /etc/prometheus/prometheus.yml
+```
+
+Scroll down to near the end of the file and add the following lines:
+
+```yaml
+  - job_name: 'harvest'
+    scrape_interval: 60s
+    static_configs:
+      - targets:
+          - 'localhost:12990'
+          - 'localhost:12991'
+          - 'localhost:14567'
+          - 'localhost:14568'
+```
+
+**NOTE** If Prometheus is not on the same machine as Harvest, then replace `localhost` with the IP address of your
+Harvest machine. Also note the scrape interval above is set to 60s. That matches the polling frequency of the default
+Harvest collectors. If you change the polling frequency of a Harvest collector to a lower value, you should also change
+the scrape interval.
+
+# Prometheus Alerts
+
+Prometheus includes out-of-the-box support for simple alerting. Alert rules are configured in your `prometheus.yml`
+file. Setup and details can be found in the Prometheus
+Setup and details can be found in the Prometheus guide on [alerting](https://prometheus.io/docs/practices/alerting/).
+
+Harvest also includes [EMS alerts](https://github.com/NetApp/harvest/blob/main/docker/prometheus/ems_alert_rules.yml) and [sample alerts](https://github.com/NetApp/harvest/blob/main/docker/prometheus/alert_rules.yml) for reference. Refer to the [EMS Collector](https://github.com/NetApp/harvest/blob/main/cmd/collectors/ems/README.md) for more details about EMS events.
+
+## Alertmanager
+
+Prometheus's built-in alerts are good for simple workflows. They do a nice job telling you what's happening at the moment. If you need a richer solution that includes summarization, notification, advanced delivery, deduplication, etc., check out [Alertmanager](https://prometheus.io/docs/alerting/latest/alertmanager/).
+
+## Reference
+
+- [Prometheus Alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/)
+- [Alertmanager](https://prometheus.io/docs/alerting/latest/alertmanager/)
+- [Alertmanager's notification metrics](https://utcc.utoronto.ca/~cks/space/blog/sysadmin/AlertmanagerNotificationMetrics)
+- [Prometheus Linter](https://github.com/cloudflare/pint)
+- [Collection of example Prometheus Alerts](https://github.com/samber/awesome-prometheus-alerts)
\ No newline at end of file
diff --git a/docs/quickstart.md b/docs/quickstart.md
new file mode 100644
index 000000000..205d5ad5b
--- /dev/null
+++ b/docs/quickstart.md
@@ -0,0 +1,62 @@
+## 1. Configuration file
+
+Harvest's configuration information is defined in `harvest.yml`. There are a few ways to tell Harvest how to load this file:
+
+* If you don't use the `--config` flag, the `harvest.yml` file located in the current working directory will be used
+
+* If you specify the `--config` flag like so `harvest status --config /opt/harvest/harvest.yml`, Harvest will use that file
+
+To start collecting metrics, you need to define at least one `poller` and one `exporter` in your configuration file. The default configuration comes with a pre-configured poller named `unix`, which collects metrics from the local system. This is useful if you want to monitor resource usage by Harvest and serves as a good example. Feel free to delete it if you want.
+
+The next step is to add pollers for your ONTAP clusters in the [Pollers](configure-harvest-basic.md#pollers) section of the Harvest configuration file, `harvest.yml`.
+
+## 2. Start Harvest
+
+Start all Harvest pollers as daemons:
+
+```bash
+bin/harvest start
+```
+
+Or start specific pollers. In this case, we're starting two pollers named `jamaica` and `grenada`.
+
+```bash
+bin/harvest start jamaica grenada
+```
+
+Replace `jamaica` and `grenada` with the poller names you defined in `harvest.yml`. The logs of each poller can be found in `/var/log/harvest/`.
+
+## 3. Import Grafana dashboards
+
+The Grafana dashboards are located in the `$HARVEST_HOME/grafana` directory. You can manually import the dashboards or use the `bin/harvest grafana` command ([more documentation](dashboards.md)).
+
+Note: the current dashboards specify Prometheus as the datasource. If you use the InfluxDB exporter, you will need to create your own dashboards.
+
+## 4. Verify the metrics
+
+If you use a Prometheus Exporter, open a browser and navigate to [http://0.0.0.0:12990/](http://0.0.0.0:12990/) (replace `12990` with the port number of your poller). This is the Harvest-created HTTP end-point for your Prometheus exporter.
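+
+As a quick check from the command line, you can fetch the same pages with `curl` (assuming the example port `12990`; substitute your poller's port):
+
+```bash
+# overview page: running collectors and the names of exported metrics
+curl -s http://0.0.0.0:12990/
+
+# the metric data that Prometheus will scrape
+curl -s http://0.0.0.0:12990/metrics/ | head
+```
+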
+The overview page provides a real-time generated list of running collectors and names of exported metrics.
+
+The metric data that is exported for Prometheus to scrape is available at [http://0.0.0.0:12990/metrics/](http://0.0.0.0:12990/metrics/).
+
+More information on configuring the exporter can be found in the [Prometheus exporter](prometheus-exporter.md) documentation.
+
+If you can't access the URL, check the logs of your pollers. These are located in `/var/log/harvest/`.
+
+## 5. (Optional) Setup Systemd service files
+
+If you're running Harvest on a system with Systemd, you may want to [take advantage of systemd instantiated units](https://github.com/NetApp/harvest/tree/main/service/contrib) to manage your pollers.
diff --git a/docs/release-notes.md b/docs/release-notes.md
new file mode 100644
index 000000000..9af5048d5
--- /dev/null
+++ b/docs/release-notes.md
@@ -0,0 +1,2 @@
+- [Changelog](https://github.com/NetApp/harvest/blob/main/CHANGELOG.md)
+- [Releases](https://github.com/NetApp/harvest/releases)
\ No newline at end of file
diff --git a/docs/resources/matrix.md b/docs/resources/matrix.md
new file mode 100644
index 000000000..1275a81ca
--- /dev/null
+++ b/docs/resources/matrix.md
@@ -0,0 +1,247 @@
+## Matrix
+
+The ℳatriχ package provides the `matrix.Matrix` data-structure for storage, manipulation and transmission of both numeric and non-numeric (string) data. It is utilized by core components of Harvest, including collectors, plugins and exporters. It furthermore serves as an interface between these components, such that "the left hand does not know what the right hand does".
+
+Internally, the Matrix is a collection of metrics (`matrix.Metric`) and instances (`matrix.Instance`) in the form of a 2-dimensional array:
+
+![matrix image](../assets/matrix.png)
+
+Since we use hash tables for accessing the elements of the array, all metrics and instances added to the matrix must have a unique key. Metrics are typed and contain the numeric data (i.e. rows) of the Matrix. Instances only serve as pointers to the columns of the Matrix, but they also store non-numeric data as labels (`*dict.Dict`).
+
+This package is the architectural backbone of Harvest; therefore, understanding it is key for an advanced user or contributor.
+
+# Basic Usage
+## Initialize
+```go
+func matrix.New(name, object, identifier string) *Matrix
+// always succeeds and returns a pointer to a new (empty) Matrix
+```
+This section describes how to properly initialize a new Matrix instance. Note that if you write a collector, a Matrix instance is already properly initialized for you (as `MyCollector.matrix`), and if you write a plugin or exporter, it is passed to you from the collector. That means most of the time you don't have to worry about initializing the Matrix.
+
+`matrix.New()` requires three arguments:
+* `name` is by convention the collector name (e.g. `MyCollector`) if the Matrix comes from a collector, or the collector name and the plugin name concatenated with a `.` (e.g. `MyCollector.MyPlugin`) if the Matrix comes from a plugin.
+* `object` is a description of the instances of the Matrix. For example, if we collect data about cars and our instances are cars, a good name would be `car`.
+* `identifier` is a unique key used to identify a matrix instance.
+
+Note that `identifier` should uniquely identify a Matrix instance. This is not a strict requirement, but doing so guarantees that your data is properly handled by exporters.
+
+### Example
+Here is an example from the point of view of a collector:
+
+```go
+
+import "github.com/netapp/harvest/v2/pkg/matrix"
+
+var myMatrix *matrix.Matrix
+
+myMatrix = matrix.New("CarCollector", "car", "car")
+```
+
+The next step is to add metrics and instances to our Matrix.
+
+## Add instances and instance labels
+```go
+func (x *Matrix) NewInstance(key string) (*Instance, error)
+// returns a pointer to a new Instance, or nil with an error (if key is not unique)
+```
+
+```go
+func (i *Instance) SetLabel(key, value string)
+// always successful, overwrites existing values
+```
+```go
+func (i *Instance) GetLabel(key string) string
+// always returns a value; if the label is not set, returns an empty string
+```
+
+Once we have initialized a Matrix, we can add instances and add labels to our instances.
+
+### Example
+
+```go
+
+var (
+    instance *matrix.Instance
+    err      error
+)
+if instance, err = myMatrix.NewInstance("SomeCarMark"); err != nil {
+    return err
+    // or handle err, but beware that instance is nil
+}
+instance.SetLabel("mark", "SomeCarMark")
+instance.SetLabel("color", "red")
+instance.SetLabel("style", "coupe")
+// add as many labels as you like
+instance.GetLabel("color") // returns "red"
+instance.GetLabel("owner") // returns ""
+```
+
+## Add Metrics
+```go
+func (x *Matrix) NewMetricInt64(key string) (Metric, error)
+// returns a pointer to a new MetricInt64, or nil with an error (if key is not unique)
+// note that Metric is an interface
+```
+
+Metrics are typed and there are currently 8 types; all can be created with the same signature as above:
+* `MetricUint8`
+* `MetricUint32`
+* `MetricUint64`
+* `MetricInt`
+* `MetricInt32`
+* `MetricInt64`
+* `MetricFloat32`
+* `MetricFloat64`
+
+We are able to read from and write to a metric instance using different types (as displayed in the next section); however, choosing a type wisely ensures that this is done efficiently and overflow does not occur.
+
+We can add labels to metrics just like instances. This is usually done when we deal with histograms:
+
+```go
+func (m Metric) SetLabel(key, value string)
+// always successful, overwrites existing values
+```
+```go
+func (m Metric) GetLabel(key string) string
+// always returns a value; if the label is not set, returns an empty string
+```
+
+### Example
+
+Continuing our Matrix for collecting car-related data:
+
+```go
+var (
+    speed, length matrix.Metric
+    err           error
+)
+
+if speed, err = myMatrix.NewMetricUint32("max_speed"); err != nil {
+    return err
+}
+if length, err = myMatrix.NewMetricFloat32("length_in_mm"); err != nil {
+    return err
+}
+```
+
+## Write numeric data
+
+```go
+func (x *Matrix) Reset()
+// flush numeric data from previous poll
+```
+```go
+func (m Metric) SetValueInt64(i *Instance, v int64) error
+func (m Metric) SetValueUint8(i *Instance, v uint8) error
+func (m Metric) SetValueUint64(i *Instance, v uint64) error
+func (m Metric) SetValueFloat64(i *Instance, v float64) error
+func (m Metric) SetValueBytes(i *Instance, v []byte) error
+func (m Metric) SetValueString(i *Instance, v string) error
+// sets the numeric value for the instance i to v
+// returns an error if v is invalid (explained below)
+```
+```go
+func (m Metric) AddValueInt64(i *Instance, v int64) error
+// increments the numeric value for the instance i by v
+// same signatures for all the types defined above
+```
+
+When possible, you should reuse a Matrix for each data poll, but to do that, you need to call `Reset()` to drop old data from the Matrix.
+It is safe to add new instances and metrics after calling this method.
+
+The `SetValue*()` and `AddValue*()` methods are typed the same as the metrics. Even though you are not required to use the same type as the metric, it is the safest and most efficient way.
+
+Since most collectors get their data as bytes or strings, it is recommended to use the `SetValueString()` and `SetValueBytes()` methods.
+
+These methods return an error if the value `v` cannot be converted to the type of the metric. The error is always `nil` when the type of `v` matches the type of the metric.
+
+### Example
+
+Continuing with the previous examples:
+
+```go
+
+// flush numeric data from the previous poll
+myMatrix.Reset()
+
+// write numbers to the matrix using the instance and the metrics we have created
+
+// let the metric do the conversion for us
+if err = speed.SetValueString(instance, "500"); err != nil {
+    logger.Error(me.Prefix, "set speed value: ", err)
+}
+// here we ignore err, since the value's type matches the metric's type
+length.SetValueFloat64(instance, 10000.00)
+
+// safe to add new instances after Reset()
+var instance2 *matrix.Instance
+if instance2, err = myMatrix.NewInstance("SomeOtherCar"); err != nil {
+    return err
+}
+
+// possible and safe even though length has type Float32
+if err = length.SetValueInt64(instance2, 13000); err != nil {
+    logger.Error(me.Prefix, "set length value:", err)
+}
+
+// possible, but will overflow since speed is unsigned
+if err = speed.SetValueInt64(instance2, -500); err != nil {
+    logger.Error(me.Prefix, "set speed value:", err)
+}
+```
+
+## Read metrics and instances
+In this section, we switch gears and look at the Matrix from the point of view of plugins and exporters. Both of those components need to read from the Matrix and have no knowledge of its origin or contents.
+
+```go
+func (x *Matrix) GetMetrics() map[string]Metric
+// returns all metrics in the Matrix
+```
+```go
+func (x *Matrix) GetInstances() map[string]*Instance
+// returns all instances in the Matrix
+```
+
+Usually we will do a nested loop with these two methods to read all data in the Matrix. See the examples below.
+
+### Example: Iterate over instances
+
+In this example, the method `PrintKeys()` will iterate over a Matrix and print all instance keys.
+
+```go
+func PrintKeys(x *matrix.Matrix) {
+    for instanceKey := range x.GetInstances() {
+        fmt.Println("instance key=", instanceKey)
+    }
+}
+```
+
+### Example: Read instance labels
+
+Each instance has a set of labels. We can iterate over these labels with the `GetLabel()` and `GetLabels()` methods. In this example, we write a function that prints all labels of an instance:
+
+```go
+func PrintLabels(instance *matrix.Instance) {
+    for label, value := range instance.GetLabels().Map() {
+        fmt.Printf("%s=%s\n", label, value)
+    }
+}
+```
+
+### Example: Read metric values
+
+Similar to the `SetValue*` and `AddValue*` methods, you can choose a type when reading from a metric. If you don't know the type of the metric, it is safe to read it as a string.
+In this example, we write a function that prints the value of a metric for all instances in a Matrix:
+
+```go
+func PrintMetricValues(x *matrix.Matrix, m matrix.Metric) {
+    for key, instance := range x.GetInstances() {
+        if value, has := m.GetValueString(instance); has {
+            fmt.Printf("instance %s = %s\n", key, value)
+        } else {
+            fmt.Printf("instance %s has no value\n", key)
+        }
+    }
+}
+```
\ No newline at end of file
diff --git a/docs/resources/templates-and-metrics.md b/docs/resources/templates-and-metrics.md
new file mode 100644
index 000000000..95f0c869a
--- /dev/null
+++ b/docs/resources/templates-and-metrics.md
@@ -0,0 +1,357 @@
+# Harvest Templates and Metrics
+
+Harvest collects ONTAP counter information, augments it, and stores it in a time-series DB.
+
+```mermaid
+flowchart RL
+    Harvest[Harvest<br>Get & Augment] -- REST<br>ZAPI --> ONTAP
+    id1[(Prometheus<br>Store)] -- Scrape --> Harvest
+```
+
+Three concepts work in unison to collect ONTAP metrics data, prepare it, and make it available to Prometheus:
+
+- ZAPI/REST
+- Harvest templates
+- Exporters
+
+We're going to walk through an example from a running system, focusing on the `disk` object.
+
+At a high level, Harvest templates describe what ZAPIs to send to ONTAP and how to interpret the responses.
+
+- ONTAP defines two ZAPIs to collect `disk` info
+  - Config information is collected via `storage-disk-get-iter`
+  - Performance counters are collected via `disk:constituent`
+- These ZAPIs are found in their corresponding object template files `conf/zapi/cdot/9.8.0/disk.yaml` and `conf/zapiperf/cdot/9.8.0/disk.yaml`. These files also describe how to map the ZAPI responses into a time-series-friendly format
+- Prometheus uniquely identifies a time series by its metric name and optional key-value pairs called labels.
+
+## Handy Tools
+
+- [dasel](https://github.com/TomWright/dasel) is useful to convert between XML, YAML, JSON, etc. We'll use it to make displaying some of the data easier.
+
+## ONTAP ZAPI disk example
+
+We'll use the `bin/zapi` tool to interrogate the cluster and gather information about the counters. This is one way you can send ZAPIs to ONTAP and explore the return types and values.
+
+```
+bin/zapi -p u2 show attrs --api storage-disk-get-iter
+```
+
+> Output edited for brevity and line numbers added on left
+
+The hierarchy and return type of each counter is shown below. We'll use this hierarchy to build a matching Harvest template. For example, line `3` is the `bytes-per-sector` counter, which has an integer value, and is the child of `storage-disk-info > disk-inventory-info`.
+
+To capture that counter's value as a metric in Harvest, the ZAPI template must use the same hierarchical path. The matching path can be seen [below](#harvest-object-template).
+
+```
+building tree for attribute [attributes-list] => [storage-disk-info]
+
+ 1 [storage-disk-info]            - *
+ 2   [disk-inventory-info]        -
+ 3     [bytes-per-sector]         - integer
+ 4     [capacity-sectors]         - integer
+ 5     [disk-type]                - string
+ 6     [is-shared]                - boolean
+ 7     [model]                    - string
+ 8     [serial-number]            - string
+ 9     [shelf]                    - string
+10     [shelf-bay]                - string
+11   [disk-name]                  - string
+12   [disk-ownership-info]        -
+13     [home-node-name]           - string
+14     [is-failed]                - boolean
+15     [owner-node-name]          - string
+16   [disk-raid-info]             -
+17     [container-type]           - string
+18     [disk-outage-info]         -
+19       [is-in-fdr]              - boolean
+20       [reason]                 - string
+21   [disk-stats-info]            -
+22     [average-latency]          - integer
+23     [disk-io-kbps]             - integer
+24     [power-on-time-interval]   - integer
+25     [sectors-read]             - integer
+26     [sectors-written]          - integer
+27   [disk-uid]                   - string
+28   [node-name]                  - string
+29   [storage-disk-state]         - integer
+30   [storage-disk-state-flags]   - integer
+```
+
+## Harvest Templates
+
+To understand templates, there are a few concepts to cover. Three kinds of information included in templates define what Harvest collects and exports:
+
+1. Configuration information is exported into the `_labels` metric (e.g. `disk_labels`; see below)
+2. Metrics data is exported as `disk_"metric name"`, e.g. `disk_bytes_per_sector`, `disk_sectors`, etc. Metrics are leaf nodes that are not prefixed with a ^ or ^^. Metrics must be one of the number types: float or int.
+3. Plugins may add additional metrics, increasing the number of metrics exported in #2
+
+A resource will typically have multiple instances.
+Using disk as an example, that means there will be one `disk_labels` and a metric row per instance. If we have 24 disks and the disk template lists seven metrics to capture, Harvest will export a total of 192 rows of Prometheus data.
+
+`24 instances * (7 metrics per instance + 1 label per instance) = 192 rows`
+
+Here's the sum of disk metrics that Harvest exports:
+
+```
+curl -s 'http://localhost:14002/metrics' | grep ^disk | cut -d'{' -f1 | sort | uniq -c
+  24 disk_bytes_per_sector
+  24 disk_labels
+  24 disk_sectors
+  24 disk_stats_average_latency
+  24 disk_stats_io_kbps
+  24 disk_stats_sectors_read
+  24 disk_stats_sectors_written
+  24 disk_uptime
+# 192 rows
+```
+
+Read on to see how we control which labels from #1 and which metrics from #2 are included in the exported data.
+
+### Instance Keys and Labels
+
+- Instance key - An instance key defines the set of attributes Harvest uses to construct a key that uniquely identifies an object. For example, the disk template uses the `node` + `disk` attributes to determine uniqueness. Using `node` or `disk` alone wouldn't be sufficient since disks on separate nodes can have the same name. If a single label does not uniquely identify an instance, combine multiple keys for uniqueness. Instance keys must refer to attributes that are of type `string`.
+
+  Because instance keys define uniqueness, these keys are also added to each metric as a key-value pair. (see [Control What Labels and Metrics are Exported](#control-what-labels-and-metrics-are-exported) for examples)
+
+- Instance label - Labels are key-value pairs used to gather configuration information about each instance. All of the key-value pairs are combined into a single metric named `disk_labels`. There will be one `disk_labels` for each monitored instance. Here's an example reformatted so it's easier to read:
+
+```
+disk_labels{
+  datacenter="dc-1",
+  cluster="umeng-aff300-05-06",
+  node="umeng-aff300-06",
+  disk="1.1.23",
+  type="SSD",
+  model="X371_S1643960ATE",
+  outage="",
+  owner_node="umeng-aff300-06",
+  shared="true",
+  shelf="1",
+  shelf_bay="23",
+  serial_number="S3SENE0K500532",
+  failed="false",
+  container_type="shared"
+}
+```
+
+### Harvest Object Template
+
+Continuing with the disk example, below is the `conf/zapi/cdot/9.8.0/disk.yaml` that tells Harvest which ZAPI to send to ONTAP (`storage-disk-get-iter`) and describes how to interpret and export the response.
+
+- Line `1` defines the name of this resource and is an exact match to the object defined in your `default.yaml` or `custom.yaml` file, e.g.
+
+```
+# default.yaml
+objects:
+  Disk: disk.yaml
+```
+
+- Line `2` is the name of the ZAPI that Harvest will send to collect disk resources
+- Line `3` is the prefix used to export metrics associated with this object, i.e. all metrics will be of the form `disk_*`
+- Line `5` the [counter section](https://github.com/NetApp/harvest/tree/main/conf#counters) is where we define the metrics, labels, and what constitutes instance uniqueness
+- Line `7` the double hat prefix `^^` means this attribute is an instance key used to determine uniqueness. Instance keys are also included as labels. UUIDs are good choices for uniqueness
+- Line `13` the single hat prefix `^` means this attribute should be stored as a label. That means we can include it in the `export_options` section as one of the key-value pairs in `disk_labels`
+- Lines `10`, `11`, `23`, `24`, `25`, `26`, `27` are the metrics rows - metrics are leaf nodes that are not prefixed with a ^ or ^^.
+  If you refer back to the [ONTAP ZAPI disk example](#ontap-zapi-disk-example) above, you'll notice each of these attributes is an integer type.
+- Line `43` defines the set of labels to use when constructing the `disk_labels` metrics. As mentioned [above](#instance-keys-and-labels), these labels capture config-related attributes per instance.
+
+> Output edited for brevity and line numbers added for reference.
+
+```
+ 1 name:             Disk
+ 2 query:            storage-disk-get-iter
+ 3 object:           disk
+ 4
+ 5 counters:
+ 6   storage-disk-info:
+ 7     - ^^disk-uid
+ 8     - ^^disk-name => disk
+ 9     - disk-inventory-info:
+10        - bytes-per-sector => bytes_per_sector   # notice this has the same hierarchical path we saw from bin/zapi
+11        - capacity-sectors => sectors
+12        - ^disk-type => type
+13        - ^is-shared => shared
+14        - ^model => model
+15        - ^serial-number => serial_number
+16        - ^shelf => shelf
+17        - ^shelf-bay => shelf_bay
+18     - disk-ownership-info:
+19        - ^home-node-name => node
+20        - ^owner-node-name => owner_node
+21        - ^is-failed => failed
+22     - disk-stats-info:
+23        - average-latency
+24        - disk-io-kbps
+25        - power-on-time-interval => uptime
+26        - sectors-read
+27        - sectors-written
+28     - disk-raid-info:
+29        - ^container-type => container_type
+30        - disk-outage-info:
+31           - ^reason => outage
+32
+33 plugins:
+34   - LabelAgent:
+35     # metric label zapi_value rest_value `default_value`
+36     value_to_num:
+37       - new_status outage - - `0` #ok_value is empty value, '-' would be converted to blank while processing.
+38
+39 export_options:
+40   instance_keys:
+41     - node
+42     - disk
+43   instance_labels:
+44     - type
+45     - model
+46     - outage
+47     - owner_node
+48     - shared
+49     - shelf
+50     - shelf_bay
+51     - serial_number
+52     - failed
+53     - container_type
+```
+
+### Control What Labels and Metrics are Exported
+
+Let's continue with `disk` and look at a few examples. We'll use `curl` to examine the Prometheus wire format that Harvest uses to export the metrics from `conf/zapi/cdot/9.8.0/disk.yaml`.
+
+The curl below shows all exported disk metrics. There are 24 disks on this cluster; Harvest is collecting seven metrics + one `disk_labels` + one plugin-created metric, `disk_new_status`, for a total of 216 rows.
+
+```
+curl -s 'http://localhost:14002/metrics' | grep ^disk | cut -d'{' -f1 | sort | uniq -c
+  24 disk_bytes_per_sector          # metric
+  24 disk_labels                    # labels
+  24 disk_new_status                # plugin created metric
+  24 disk_sectors                   # metric
+  24 disk_stats_average_latency     # metric
+  24 disk_stats_io_kbps             # metric
+  24 disk_stats_sectors_read        # metric
+  24 disk_stats_sectors_written     # metric
+  24 disk_uptime                    # metric
+# sum = ((7 + 1 + 1) * 24 = 216 rows)
+```
+
+Here's a `disk_labels` for one instance, reformatted to make it easier to read:
+
+```
+curl -s 'http://localhost:14002/metrics' | grep ^disk_labels | head -1
+
+disk_labels{
+  datacenter = "dc-1",                 # always included - value taken from datacenter in harvest.yml
+  cluster = "umeng-aff300-05-06",      # always included
+  node = "umeng-aff300-06",            # node is in the list of export_options instance_keys
+  disk = "1.1.13",                     # disk is in the list of export_options instance_keys
+  type = "SSD",                        # remainder are included because they are listed in the template's instance_labels
+  model = "X371_S1643960ATE",
+  outage = "",
+  owner_node = "umeng-aff300-06",
+  shared = "true",
+  shelf = "1",
+  shelf_bay = "13",
+  serial_number = "S3SENE0K500572",
+  failed = "false",
+  container_type = "",
+} 1.0
+```
+
+Here's the `disk_sectors` metric for a single instance:
+
+```
+curl -s 'http://localhost:14002/metrics' | grep ^disk_sectors | head -1
+
+disk_sectors{                        # prefix of disk_ + metric name (line 11 in template)
+  datacenter = "dc-1",               # always included - value taken from datacenter in harvest.yml
+  cluster = "umeng-aff300-05-06",    # always included
+  node = "umeng-aff300-06",          # node is in the list of export_options instance_keys
+  disk = "1.1.17",                   # disk is in the list of export_options instance_keys
+} 1875385008                         # metric value - number of sectors for this disk instance
+```
+
+```
+Number of rows for each template = number of instances * (number of metrics + 1 (for _labels row) + plugin additions)
+Number of metrics = number of counters which are not labels or keys, those without a ^ or ^^
+```
+
+## Common Errors and Troubleshooting
+
+### 1. Failed to parse any metrics
+
+You add a new template to Harvest, restart your poller, and get an error message:
+
+```
+WRN ./poller.go:649 > init collector-object (Zapi:NetPort): no metrics => failed to parse any
+```
+
+This means the collector, `Zapi NetPort`, was unable to find any metrics. Recall that [metrics](#harvest-templates) are lines without prefixes. In cases where you don't have any metrics, but still want to collect labels, add the `collect_only_labels: true` key-value to your template. This flag tells Harvest to ignore that you don't have metrics and continue. [Example](https://github.com/NetApp/harvest/blob/7334f11419075bf98b45fd14aee41dc2c16e4531/conf/zapi/cdot/9.8.0/qtree.yaml#L17).
+
+### 2. Missing Data
+
+1. What happens if an attribute is listed in the list of `instance_labels` (line 43 above), but that label is missing from the list of counters captured at line 5?
+
+The label will still be written into `disk_labels`, but the value will be empty since it's missing. E.g., if line 29 was deleted, `container_type` would still be present in `disk_labels{container_type=""}`.
+
+## Prometheus Wire Format
+
+https://prometheus.io/docs/instrumenting/exposition_formats/
+
+Keep in mind that Prometheus does not permit dashes (`-`) in labels. That's why Harvest templates use name replacement to convert dashed-names to underscored-names with `=>`, e.g. `bytes-per-sector => bytes_per_sector` converts `bytes-per-sector` into the Prometheus-accepted `bytes_per_sector`.
+
+Every time series is uniquely identified by its metric name and optional key-value pairs called labels.
+
+Labels enable Prometheus's dimensional data model: any combination of labels for the same metric name identifies a particular dimensional instantiation of that metric (for example: all HTTP requests that used the method POST to the /api/tracks handler). The query language allows filtering and aggregation based on these dimensions. Changing any label value, including adding or removing a label, will create a new time series.
+
+> `metric_name{label_name="label_value", ...} value [ timestamp ]`
+>
+> - metric_name and label_name carry the usual Prometheus expression language restrictions
+> - label_value can be any sequence of UTF-8 characters, but the backslash (\), double-quote ("), and line feed (\n) characters have to be escaped as \\, \", and \n, respectively.
> - value is a float represented as required by Go's ParseFloat() function. In addition to standard numerical values, NaN, +Inf, and -Inf are valid values representing not a number, positive infinity, and negative infinity, respectively.
+> - timestamp is an int64 (milliseconds since epoch, i.e.
+  1970-01-01 00:00:00 UTC, excluding leap seconds), represented as required by Go's ParseInt() function.
+>
+> [Exposition formats](https://prometheus.io/docs/instrumenting/exposition_formats/#comments-help-text-and-type-information)
diff --git a/docs/system-requirements.md b/docs/system-requirements.md
new file mode 100644
index 000000000..a6985e1db
--- /dev/null
+++ b/docs/system-requirements.md
@@ -0,0 +1,16 @@
+Harvest is written in Go, which means it runs on recent Linux systems. It also runs on Macs for development.
+
+Hardware requirements depend on how many clusters you monitor and the number of metrics you choose to collect. With the default configuration, when monitoring 10 clusters, we recommend:
+
+- CPU: 2 cores
+- Memory: 1 GB
+- Disk: 500 MB (mostly used by log files)
+
+Harvest is compatible with:
+
+- Prometheus: `2.26` or higher
+- InfluxDB: `v2`
+- Grafana: `8.1.X` or higher
+- Docker: `20.10.0` or higher
diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md
new file mode 100644
index 000000000..e38dffac2
--- /dev/null
+++ b/docs/troubleshooting.md
@@ -0,0 +1,3 @@
+- [Troubleshooting Harvest](https://github.com/NetApp/harvest/wiki/Troubleshooting-Harvest)
+- [FAQ](https://github.com/NetApp/harvest/wiki/FAQ)
+- [NABox Troubleshooting](https://nabox.org/documentation/troubleshooting/)
\ No newline at end of file
diff --git a/docs/upgrade.md b/docs/upgrade.md
new file mode 100644
index 000000000..9c35566eb
--- /dev/null
+++ b/docs/upgrade.md
@@ -0,0 +1,24 @@
+To upgrade Harvest:
+
+Stop Harvest:
+```
+cd <Harvest install dir>
+bin/harvest stop
+```
+
+Verify that all pollers have stopped:
+```
+bin/harvest status
+or
+pgrep --full '\-\-poller'    # should return nothing if all pollers are stopped
+```
+
+Follow the [installation](install.md) instructions to download and install Harvest and then copy your old `harvest.yml` into the new install directory like so:
+
+```
+cp /path/to/old/harvest/harvest.yml /path/to/new/harvest.yml
+```
+
+After the upgrade, re-import all dashboards (either via the `bin/harvest grafana import` CLI or the Grafana UI) to get any new enhancements in dashboards.
diff --git a/docs/what-is-harvest.md b/docs/what-is-harvest.md
deleted file mode 100644
index 7b16cb829..000000000
--- a/docs/what-is-harvest.md
+++ /dev/null
@@ -1,16 +0,0 @@
-> Harvest is the open-metrics endpoint for ONTAP and StorageGRID
-
-NetApp Harvest brings observability to ONTAP and StorageGRID clusters.
-Harvest collects performance, capacity and hardware metrics from ONTAP and StorageGRID,
-transforms them, and routes them to your choice of time-series database.
-
-The included Grafana dashboards deliver the datacenter insights you need, while
-new metrics can be collected with a few edits of the included template files.
-
-Harvest is open-source, released under an [Apache2 license](https://github.com/NetApp/harvest/blob/main/LICENSE),
-and offers great flexibility in how you collect, augment, and export your datacenter metrics.
-
-!!! note
-
-    Hop onto our [Discord](https://discordapp.com/channels/855068651522490400/1001963189124206732)
-    or GitHub [discussions](https://github.com/NetApp/harvest/discussions) and say hi.
👋🏽 diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 000000000..46375240c --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,88 @@ +site_name: Harvest +repo_url: https://github.com/NetApp/harvest +repo_name: NetApp/harvest +copyright: Copyright © NetApp +edit_uri: "" + +nav: + - What is Harvest?: 'index.md' + - Quickstart: 'quickstart.md' + - Installation: 'install.md' + - System Requirements: 'system-requirements.md' + - Upgrade: 'upgrade.md' + - Prepare Monitored Clusters: + - 'ONTAP cDOT': 'prepare-cdot-clusters.md' + - 'ONTAP 7mode': 'prepare-7mode-clusters.md' + - 'StorageGRID': 'prepare-storagegrid-clusters.md' + - Configure Harvest (basic): 'configure-harvest-basic.md' + - Configure Exporters: + - 'Prometheus': 'prometheus-exporter.md' + - 'InfluxDB': 'influxdb-exporter.md' + - Configure Grafana: 'configure-grafana.md' + - Configure Collectors: + - 'ZAPI': 'configure-zapi.md' + - 'REST': 'configure-rest.md' + - 'EMS': 'configure-ems.md' + - 'StorageGRID': 'configure-storagegrid.md' + - 'Unix': 'configure-unix.md' + - Templates: 'configure-templates.md' + - Dashboards: 'dashboards.md' + - Manage Harvest Pollers: 'manage-harvest.md' + - Configure Harvest (advanced): 'configure-harvest-advanced.md' + - Troubleshoot: 'troubleshooting.md' + - Reference: + - 'Templates And Metrics': 'resources/templates-and-metrics.md' + - 'Matrix': 'resources/matrix.md' + - About: + - 'License': 'license.md' + - 'Release Notes': 'release-notes.md' + +theme: + logo: assets/harvest.svg + favicon: assets/harvest.svg + name: material + palette: + # Palette toggle for automatic mode + - media: "(prefers-color-scheme)" + toggle: + icon: material/brightness-auto + name: Switch to light mode + + # Palette toggle for light mode + - media: "(prefers-color-scheme: light)" + scheme: default + toggle: + icon: material/brightness-7 + name: Switch to dark mode + + # Palette toggle for dark mode + - media: "(prefers-color-scheme: dark)" + scheme: slate + toggle: + icon: material/brightness-4 + name: Switch to system preference + +extra: + version: + provider: mike + social: + - icon: fontawesome/brands/discord + link: https://discordapp.com/channels/855068651522490400/1001963189124206732 + - icon: fontawesome/brands/github + link: https://github.com/NetApp/harvest/discussions + +markdown_extensions: + - admonition + - toc: + permalink: true + - pymdownx.details + - pymdownx.emoji: + emoji_index: !!python/name:materialx.emoji.twemoji + emoji_generator: !!python/name:materialx.emoji.to_svg + - pymdownx.superfences: + custom_fences: + - name: mermaid + class: mermaid + format: !!python/name:pymdownx.superfences.fence_code_format + +plugins: [] # search does not work on gh-pages
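
Since the docs are versioned with [mike](https://github.com/jimporter/mike) (`provider: mike` in the `extra.version` section above), a sketch of publishing a new version of the site could look like the following, assuming mike is installed; the version number and alias are illustrative, not taken from this change:

```bash
# build the site and commit it to the gh-pages branch under the given version, aliased as "latest"
mike deploy --push --update-aliases 22.05 latest

# make the site root redirect to the "latest" alias
mike set-default --push latest
```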