Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Harvest should include SnapMirror Active Sync EMS events #2588

Merged
merged 1 commit into from
Jan 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions conf/ems/9.6.0/ems.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -942,3 +942,57 @@ events:
resolve_when_ems:
- name: wafl.vvol.online
resolve_after: 672h

# EMS event: sms.resync.attempt.failed
# Each export maps an EMS parameter (left of `=>`) to the name it is
# published under (right of `=>`).
# NOTE(review): `^^` presumably marks the instance key — confirm against
# the collector documentation.
- name: sms.resync.attempt.failed
  exports:
    - ^^parameters.relationship_id => relationship_id
    - parameters.dstpath => dst_path
    - parameters.srcpath => src_path
    - parameters.next_resync_interval => next_resync_interval

# EMS event: sms.common.snapshot.failed
# Each export maps an EMS parameter (left of `=>`) to the name it is
# published under (right of `=>`).
# NOTE(review): `^^` presumably marks the instance key — confirm against
# the collector documentation.
- name: sms.common.snapshot.failed
  exports:
    - ^^parameters.relationship_id => relationship_id
    - parameters.dstpath => dst_path
    - parameters.srcpath => src_path
    - parameters.error_msg => error_msg
    - parameters.css_fail_interval => css_fail_interval

# EMS event: sms.snap.not.replicated
# The camelCase EMS parameters are published under snake_case names
# (right of `=>`); consumers must use the renamed form.
# No export in this entry carries the `^^` prefix.
- name: sms.snap.not.replicated
  exports:
    - parameters.snapshot => snapshot
    - parameters.transferId => transfer_id
    - parameters.volumeDSID => volume_DSID
    - parameters.volumePath => volume_path
    - parameters.failureReason => failure_reason

# EMS event: sms.fanout.comm.snap.deleted
# Publishes the SnapMirror operation and the relationship id under the
# same names the EMS parameters already use.
# NOTE(review): `^^` presumably marks the instance key — confirm against
# the collector documentation.
- name: sms.fanout.comm.snap.deleted
  exports:
    - parameters.sm_operation => sm_operation
    - ^^parameters.relationship_id => relationship_id

# EMS event: smc.snapmir.init.fail
# Each export maps an EMS parameter (left of `=>`) to the name it is
# published under (right of `=>`).
# NOTE(review): `^^` presumably marks the instance key — confirm against
# the collector documentation.
- name: smc.snapmir.init.fail
  exports:
    - ^^parameters.relationship_id => relationship_id
    - parameters.dstpath => dst_path
    - parameters.srcpath => src_path
    - parameters.error => error
  # Paired with sms.status.in.sync as the resolving event.
  # 672h is 28 days.
  resolve_when_ems:
    - name: sms.status.in.sync
      resolve_after: 672h

# EMS event: smbc.aufo.failed
# Only the destination path is exported, published as dst_path.
- name: smbc.aufo.failed
  exports:
    - parameters.dstpath => dst_path

# EMS event: smbc.aufo.completed
# Only the destination path is exported, published as dst_path.
- name: smbc.aufo.completed
  exports:
    - parameters.dstpath => dst_path

# EMS event: smbc.pfo.failed
# Only the destination path is exported, published as dst_path.
- name: smbc.pfo.failed
  exports:
    - parameters.dstpath => dst_path

# EMS event: smbc.pfo.completed
# Only the destination path is exported, published as dst_path.
- name: smbc.pfo.completed
  exports:
    - parameters.dstpath => dst_path
212 changes: 205 additions & 7 deletions container/prometheus/ems_alert_rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1213,7 +1213,7 @@ groups:
annotations:
summary: "ONTAP Mediator (version {{ $labels.version }}) is added on cluster '{{ $labels.cluster }}' having peer cluster '{{ $labels.peerCluster }}' and mediator IP address '{{ $labels.ipAddress }}'."

- alert: SMBC CA Certificate Expired
- alert: SnapMirror Active Sync CA Certificate Expired
expr: last_over_time(ems_events{message="sm.mediator.cacert.expired"}[5m]) == 1
labels:
severity: >
Expand All @@ -1235,7 +1235,7 @@ groups:
annotations:
summary: "CA certificate of the ONTAP Mediator (IP: {{ $labels.ipAddress }}) expired on {{ $labels.expiryDate }}."

- alert: SMBC CA Certificate Expiring
- alert: SnapMirror Active Sync CA Certificate Expiring
expr: last_over_time(ems_events{message="sm.mediator.cacert.expiring"}[5m]) == 1
labels:
severity: >
Expand All @@ -1257,7 +1257,7 @@ groups:
annotations:
summary: "CA certificate for the ONTAP Mediator (IP: {{ $labels.ipAddress }}) will expire in {{ $labels.daysToExpire }} days. Expiry: {{ $labels.expiryDate }}."

- alert: SMBC Client Certificate Expired
- alert: SnapMirror Active Sync Client Certificate Expired
expr: last_over_time(ems_events{message="sm.mediator.clientc.expired"}[5m]) == 1
labels:
severity: >
Expand All @@ -1279,7 +1279,7 @@ groups:
annotations:
summary: "Client certificate of the ONTAP Mediator (IP: {{ $labels.ipAddress }}) expired on {{ $labels.expiryDate }}."

- alert: SMBC Client Certificate Expiring
- alert: SnapMirror Active Sync Client Certificate Expiring
expr: last_over_time(ems_events{message="sm.mediator.clientc.expiring"}[5m]) == 1
labels:
severity: >
Expand Down Expand Up @@ -1345,7 +1345,7 @@ groups:
annotations:
summary: "ONTAP Mediator (version {{ $labels.version }}) was removed on cluster '{{ $labels.cluster }}' having peer cluster '{{ $labels.peerCluster }}' and mediator IP address '{{ $labels.ipAddress }}'."

- alert: SMBC Server Certificate Expired
- alert: SnapMirror Active Sync Server Certificate Expired
expr: last_over_time(ems_events{message="sm.mediator.serverc.expired"}[5m]) == 1
labels:
severity: >
Expand All @@ -1367,7 +1367,7 @@ groups:
annotations:
summary: "Server certificate of the ONTAP Mediator (IP: {{ $labels.ipAddress }}) expired on {{ $labels.expiryDate }}."

- alert: SMBC Server Certificate Expiring
- alert: SnapMirror Active Sync Server Certificate Expiring
expr: last_over_time(ems_events{message="sm.mediator.serverc.expiring"}[5m]) == 1
labels:
severity: >
Expand Down Expand Up @@ -1433,7 +1433,7 @@ groups:
annotations:
summary: "Source volume \"{{ $labels.srcpath }}\" and destination volume \"{{ $labels.dstpath }}\" with relationship UUID \"{{ $labels.relationship_id }}\" is in \"out-of-sync\" status due to the following reason: \"{{ $labels.error_msg }}\"."

- alert: SMBC Relationship Out of Sync
- alert: SnapMirror Active Sync Relationship Out of Sync
expr: last_over_time(ems_events{message="sms.status.out.of.sync.cg"}[4w]) == 1
labels:
severity: >
Expand Down Expand Up @@ -1652,3 +1652,201 @@ groups:
{{- end -}}
annotations:
summary: "vol=\"{{ $labels.vol }}\", app=\"{{ $labels.app }}\", volident=\"{{ $labels.volident }}\", instuuid=\"{{ $labels.instuuid }}\""

# Alerts on a sms.resync.attempt.failed EMS event whose last sample in
# the past 4 weeks equals 1.
- alert: SnapMirror Relationship Resync Attempt Failed
  expr: last_over_time(ems_events{message="sms.resync.attempt.failed"}[4w]) == 1
  labels:
    # Translate the EMS severity label into an alert severity:
    # alert/emergency -> critical, error -> warning,
    # notice/informational -> info; any other value passes through.
    # The {{- -}} trim markers strip the whitespace this layout adds.
    severity: >
      {{- if $labels.severity -}}
        {{- if eq $labels.severity "alert" -}}
          critical
        {{- else if eq $labels.severity "error" -}}
          warning
        {{- else if eq $labels.severity "emergency" -}}
          critical
        {{- else if eq $labels.severity "notice" -}}
          info
        {{- else if eq $labels.severity "informational" -}}
          info
        {{- else -}}
          {{ $labels.severity }}
        {{- end -}}
      {{- end -}}
  annotations:
    # The ems.yaml exports for this event publish srcpath/dstpath as
    # src_path/dst_path, so the summary must read the renamed labels.
    summary: "Resynchronize operation between source volume \"{{ $labels.src_path }}\" and destination volume \"{{ $labels.dst_path }}\" with relationship UUID \"{{ $labels.relationship_id }}\" has failed. The next auto-resync will be attempted after \"{{ $labels.next_resync_interval }}\" mins."

# Alerts on a sms.common.snapshot.failed EMS event whose last sample in
# the past 4 weeks equals 1.
- alert: SnapMirror Relationship Common Snapshot Failed
  expr: last_over_time(ems_events{message="sms.common.snapshot.failed"}[4w]) == 1
  labels:
    # Translate the EMS severity label into an alert severity:
    # alert/emergency -> critical, error -> warning,
    # notice/informational -> info; any other value passes through.
    # The {{- -}} trim markers strip the whitespace this layout adds.
    severity: >
      {{- if $labels.severity -}}
        {{- if eq $labels.severity "alert" -}}
          critical
        {{- else if eq $labels.severity "error" -}}
          warning
        {{- else if eq $labels.severity "emergency" -}}
          critical
        {{- else if eq $labels.severity "notice" -}}
          info
        {{- else if eq $labels.severity "informational" -}}
          info
        {{- else -}}
          {{ $labels.severity }}
        {{- end -}}
      {{- end -}}
  annotations:
    # The ems.yaml exports for this event publish srcpath/dstpath as
    # src_path/dst_path, so the summary must read the renamed labels.
    summary: "Creating a common Snapshot copy for source volume \"{{ $labels.src_path }}\" and destination volume \"{{ $labels.dst_path }}\" with relationship UUID \"{{ $labels.relationship_id }}\" has failed due to the following reason: \"{{ $labels.error_msg }}\". Elapsed time since the latest successful common Snapshot copy is \"{{ $labels.css_fail_interval }}\"."

# Alerts on a sms.snap.not.replicated EMS event whose last sample in
# the past 4 weeks equals 1.
- alert: SnapMirror Relationship Snapshot is not Replicated
  expr: last_over_time(ems_events{message="sms.snap.not.replicated"}[4w]) == 1
  labels:
    # Translate the EMS severity label into an alert severity:
    # alert/emergency -> critical, error -> warning,
    # notice/informational -> info; any other value passes through.
    # The {{- -}} trim markers strip the whitespace this layout adds.
    severity: >
      {{- if $labels.severity -}}
        {{- if eq $labels.severity "alert" -}}
          critical
        {{- else if eq $labels.severity "error" -}}
          warning
        {{- else if eq $labels.severity "emergency" -}}
          critical
        {{- else if eq $labels.severity "notice" -}}
          info
        {{- else if eq $labels.severity "informational" -}}
          info
        {{- else -}}
          {{ $labels.severity }}
        {{- end -}}
      {{- end -}}
  annotations:
    # The ems.yaml exports for this event rename the camelCase parameters
    # to transfer_id, volume_DSID, volume_path and failure_reason, so the
    # summary must read those names rather than the raw EMS ones.
    summary: "Snapshot copy \"{{ $labels.snapshot }}\" is not successfully replicated for the relationship \"{{ $labels.transfer_id }}\" with source volume DSID \"{{ $labels.volume_DSID }}\" and path \"{{ $labels.volume_path }}\". Reason: \"{{ $labels.failure_reason }}\"."

# Alerts on a sms.fanout.comm.snap.deleted EMS event whose last sample
# in the past 4 weeks equals 1.
- alert: Fanout SnapMirror Relationship Common Snapshot Deleted
  expr: last_over_time(ems_events{message="sms.fanout.comm.snap.deleted"}[4w]) == 1
  labels:
    # Translate the EMS severity label into an alert severity:
    # alert/emergency -> critical, error -> warning,
    # notice/informational -> info; any other value passes through.
    # The {{- -}} trim markers strip the whitespace this layout adds.
    severity: >
      {{- if $labels.severity -}}
        {{- if eq $labels.severity "alert" -}}
          critical
        {{- else if eq $labels.severity "error" -}}
          warning
        {{- else if eq $labels.severity "emergency" -}}
          critical
        {{- else if eq $labels.severity "notice" -}}
          info
        {{- else if eq $labels.severity "informational" -}}
          info
        {{- else -}}
          {{ $labels.severity }}
        {{- end -}}
      {{- end -}}
  annotations:
    # sm_operation and relationship_id match the names exported for this
    # event in ems.yaml.
    summary: "SnapMirror Synchronous operation \"{{ $labels.sm_operation }}\" for relationship \"{{ $labels.relationship_id }}\" has cleaned up some of the old base Snapshot copies between the synchronous source and synchronous destination, which could result in no common Snapshot copy existing between the synchronous and asynchronous destinations."

# Alerts on a smc.snapmir.init.fail EMS event whose last sample in the
# past 4 weeks equals 1.
- alert: SnapMirror Relationship Initialization Failed
  expr: last_over_time(ems_events{message="smc.snapmir.init.fail"}[4w]) == 1
  labels:
    # Translate the EMS severity label into an alert severity:
    # alert/emergency -> critical, error -> warning,
    # notice/informational -> info; any other value passes through.
    # The {{- -}} trim markers strip the whitespace this layout adds.
    severity: >
      {{- if $labels.severity -}}
        {{- if eq $labels.severity "alert" -}}
          critical
        {{- else if eq $labels.severity "error" -}}
          warning
        {{- else if eq $labels.severity "emergency" -}}
          critical
        {{- else if eq $labels.severity "notice" -}}
          info
        {{- else if eq $labels.severity "informational" -}}
          info
        {{- else -}}
          {{ $labels.severity }}
        {{- end -}}
      {{- end -}}
  annotations:
    # The ems.yaml exports for this event publish srcpath/dstpath as
    # src_path/dst_path, so the summary must read the renamed labels.
    summary: "Initialize from source volume \"{{ $labels.src_path }}\" to destination volume \"{{ $labels.dst_path }}\" with relationship UUID \"{{ $labels.relationship_id }}\" failed with error \"{{ $labels.error }}\"."

# Alerts on a smbc.aufo.failed EMS event whose last sample in the past
# 4 weeks equals 1.
- alert: SnapMirror Active Sync Automatic Unplanned Failover Failed
  expr: last_over_time(ems_events{message="smbc.aufo.failed"}[4w]) == 1
  labels:
    # Translate the EMS severity label into an alert severity:
    # alert/emergency -> critical, error -> warning,
    # notice/informational -> info; any other value passes through.
    # The {{- -}} trim markers strip the whitespace this layout adds.
    severity: >
      {{- if $labels.severity -}}
        {{- if eq $labels.severity "alert" -}}
          critical
        {{- else if eq $labels.severity "error" -}}
          warning
        {{- else if eq $labels.severity "emergency" -}}
          critical
        {{- else if eq $labels.severity "notice" -}}
          info
        {{- else if eq $labels.severity "informational" -}}
          info
        {{- else -}}
          {{ $labels.severity }}
        {{- end -}}
      {{- end -}}
  annotations:
    # The ems.yaml export for this event publishes dstpath as dst_path,
    # so the summary must read the renamed label.
    summary: "SnapMirror automatic failover failed for Destination path: \"{{ $labels.dst_path }}\"."

# Alerts on a smbc.aufo.completed EMS event whose last sample in the
# past 4 weeks equals 1.
- alert: SnapMirror Active Sync Automatic Unplanned Failover Completed
  expr: last_over_time(ems_events{message="smbc.aufo.completed"}[4w]) == 1
  labels:
    # Translate the EMS severity label into an alert severity:
    # alert/emergency -> critical, error -> warning,
    # notice/informational -> info; any other value passes through.
    # The {{- -}} trim markers strip the whitespace this layout adds.
    severity: >
      {{- if $labels.severity -}}
        {{- if eq $labels.severity "alert" -}}
          critical
        {{- else if eq $labels.severity "error" -}}
          warning
        {{- else if eq $labels.severity "emergency" -}}
          critical
        {{- else if eq $labels.severity "notice" -}}
          info
        {{- else if eq $labels.severity "informational" -}}
          info
        {{- else -}}
          {{ $labels.severity }}
        {{- end -}}
      {{- end -}}
  annotations:
    # The ems.yaml export for this event publishes dstpath as dst_path,
    # so the summary must read the renamed label.
    summary: "SnapMirror automatic failover completed for Destination path: \"{{ $labels.dst_path }}\"."

# Alerts on a smbc.pfo.failed EMS event whose last sample in the past
# 4 weeks equals 1.
- alert: SnapMirror Active Sync Planned Failover Failed
  expr: last_over_time(ems_events{message="smbc.pfo.failed"}[4w]) == 1
  labels:
    # Translate the EMS severity label into an alert severity:
    # alert/emergency -> critical, error -> warning,
    # notice/informational -> info; any other value passes through.
    # The {{- -}} trim markers strip the whitespace this layout adds.
    severity: >
      {{- if $labels.severity -}}
        {{- if eq $labels.severity "alert" -}}
          critical
        {{- else if eq $labels.severity "error" -}}
          warning
        {{- else if eq $labels.severity "emergency" -}}
          critical
        {{- else if eq $labels.severity "notice" -}}
          info
        {{- else if eq $labels.severity "informational" -}}
          info
        {{- else -}}
          {{ $labels.severity }}
        {{- end -}}
      {{- end -}}
  annotations:
    # The ems.yaml export for this event publishes dstpath as dst_path,
    # so the summary must read the renamed label.
    summary: "SnapMirror Active Sync planned failover operation failed for Destination path: \"{{ $labels.dst_path }}\"."

# Alerts on a smbc.pfo.completed EMS event whose last sample in the
# past 4 weeks equals 1.
- alert: SnapMirror Active Sync Planned Failover Completed
  expr: last_over_time(ems_events{message="smbc.pfo.completed"}[4w]) == 1
  labels:
    # Translate the EMS severity label into an alert severity:
    # alert/emergency -> critical, error -> warning,
    # notice/informational -> info; any other value passes through.
    # The {{- -}} trim markers strip the whitespace this layout adds.
    severity: >
      {{- if $labels.severity -}}
        {{- if eq $labels.severity "alert" -}}
          critical
        {{- else if eq $labels.severity "error" -}}
          warning
        {{- else if eq $labels.severity "emergency" -}}
          critical
        {{- else if eq $labels.severity "notice" -}}
          info
        {{- else if eq $labels.severity "informational" -}}
          info
        {{- else -}}
          {{ $labels.severity }}
        {{- end -}}
      {{- end -}}
  annotations:
    # The ems.yaml export for this event publishes dstpath as dst_path,
    # so the summary must read the renamed label.
    summary: "SnapMirror Active Sync planned failover operation completed for Destination path: \"{{ $labels.dst_path }}\"."
Loading