Skip to content

Commit

Permalink
feat: Harvest should include SnapMirror Active Sync EMS events
Browse files Browse the repository at this point in the history
  • Loading branch information
cgrinds committed Jan 16, 2024
1 parent 47c03c6 commit 83324bc
Show file tree
Hide file tree
Showing 2 changed files with 259 additions and 7 deletions.
54 changes: 54 additions & 0 deletions conf/ems/9.6.0/ems.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -942,3 +942,57 @@ events:
resolve_when_ems:
- name: wafl.vvol.online
resolve_after: 672h

# EMS event: sms.resync.attempt.failed
# Each export lists an EMS parameter on the left of `=>` and the label name it
# is published under on the right.
# NOTE(review): the `^^` prefix presumably marks relationship_id as the
# instance key - confirm against the Harvest EMS collector docs.
- name: sms.resync.attempt.failed
  exports:
    - ^^parameters.relationship_id => relationship_id
    - parameters.dstpath => dst_path
    - parameters.srcpath => src_path
    - parameters.next_resync_interval => next_resync_interval

# EMS event: sms.common.snapshot.failed
# Exports the relationship id, both volume paths, the error message and the
# css_fail_interval parameter; the name right of `=>` is the published label.
# NOTE(review): `^^` presumably marks the instance key - confirm.
- name: sms.common.snapshot.failed
  exports:
    - ^^parameters.relationship_id => relationship_id
    - parameters.dstpath => dst_path
    - parameters.srcpath => src_path
    - parameters.error_msg => error_msg
    - parameters.css_fail_interval => css_fail_interval

# EMS event: sms.snap.not.replicated
# The camelCase EMS parameters are renamed to snake_case labels on export
# (note that volumeDSID becomes volume_DSID, keeping the upper-case suffix).
- name: sms.snap.not.replicated
  exports:
    - parameters.snapshot => snapshot
    - parameters.transferId => transfer_id
    - parameters.volumeDSID => volume_DSID
    - parameters.volumePath => volume_path
    - parameters.failureReason => failure_reason

# EMS event: sms.fanout.comm.snap.deleted
# Exports the SnapMirror operation name and the relationship id.
# NOTE(review): `^^` presumably marks relationship_id as the instance key - confirm.
- name: sms.fanout.comm.snap.deleted
  exports:
    - parameters.sm_operation => sm_operation
    - ^^parameters.relationship_id => relationship_id

# EMS event: smc.snapmir.init.fail
# This is the only event in this group with a resolving counterpart:
# sms.status.in.sync is listed under resolve_when_ems.
# resolve_after is 672h, i.e. 28 days.
# NOTE(review): presumably the event auto-resolves after that time if no
# resolving event arrives - confirm against the Harvest EMS collector docs.
- name: smc.snapmir.init.fail
  exports:
    - ^^parameters.relationship_id => relationship_id
    - parameters.dstpath => dst_path
    - parameters.srcpath => src_path
    - parameters.error => error
  resolve_when_ems:
    - name: sms.status.in.sync
      resolve_after: 672h

# EMS event: smbc.aufo.failed
# Only the destination path is exported, published as the dst_path label.
- name: smbc.aufo.failed
  exports:
    - parameters.dstpath => dst_path

# EMS event: smbc.aufo.completed
# Only the destination path is exported, published as the dst_path label.
- name: smbc.aufo.completed
  exports:
    - parameters.dstpath => dst_path

# EMS event: smbc.pfo.failed
# Only the destination path is exported, published as the dst_path label.
- name: smbc.pfo.failed
  exports:
    - parameters.dstpath => dst_path

# EMS event: smbc.pfo.completed
# Only the destination path is exported, published as the dst_path label.
- name: smbc.pfo.completed
  exports:
    - parameters.dstpath => dst_path
212 changes: 205 additions & 7 deletions container/prometheus/ems_alert_rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1213,7 +1213,7 @@ groups:
annotations:
summary: "ONTAP Mediator (version {{ $labels.version }}) is added on cluster '{{ $labels.cluster }}' having peer cluster '{{ $labels.peerCluster }}' and mediator IP address '{{ $labels.ipAddress }}'."

- alert: SMBC CA Certificate Expired
- alert: SnapMirror Active Sync CA Certificate Expired
expr: last_over_time(ems_events{message="sm.mediator.cacert.expired"}[5m]) == 1
labels:
severity: >
Expand All @@ -1235,7 +1235,7 @@ groups:
annotations:
summary: "CA certificate of the ONTAP Mediator (IP: {{ $labels.ipAddress }}) expired on {{ $labels.expiryDate }}."

- alert: SMBC CA Certificate Expiring
- alert: SnapMirror Active Sync CA Certificate Expiring
expr: last_over_time(ems_events{message="sm.mediator.cacert.expiring"}[5m]) == 1
labels:
severity: >
Expand All @@ -1257,7 +1257,7 @@ groups:
annotations:
summary: "CA certificate for the ONTAP Mediator (IP: {{ $labels.ipAddress }}) will expire in {{ $labels.daysToExpire }} days. Expiry: {{ $labels.expiryDate }}."

- alert: SMBC Client Certificate Expired
- alert: SnapMirror Active Sync Client Certificate Expired
expr: last_over_time(ems_events{message="sm.mediator.clientc.expired"}[5m]) == 1
labels:
severity: >
Expand All @@ -1279,7 +1279,7 @@ groups:
annotations:
summary: "Client certificate of the ONTAP Mediator (IP: {{ $labels.ipAddress }}) expired on {{ $labels.expiryDate }}."

- alert: SMBC Client Certificate Expiring
- alert: SnapMirror Active Sync Client Certificate Expiring
expr: last_over_time(ems_events{message="sm.mediator.clientc.expiring"}[5m]) == 1
labels:
severity: >
Expand Down Expand Up @@ -1345,7 +1345,7 @@ groups:
annotations:
summary: "ONTAP Mediator (version {{ $labels.version }}) was removed on cluster '{{ $labels.cluster }}' having peer cluster '{{ $labels.peerCluster }}' and mediator IP address '{{ $labels.ipAddress }}'."

- alert: SMBC Server Certificate Expired
- alert: SnapMirror Active Sync Server Certificate Expired
expr: last_over_time(ems_events{message="sm.mediator.serverc.expired"}[5m]) == 1
labels:
severity: >
Expand All @@ -1367,7 +1367,7 @@ groups:
annotations:
summary: "Server certificate of the ONTAP Mediator (IP: {{ $labels.ipAddress }}) expired on {{ $labels.expiryDate }}."

- alert: SMBC Server Certificate Expiring
- alert: SnapMirror Active Sync Server Certificate Expiring
expr: last_over_time(ems_events{message="sm.mediator.serverc.expiring"}[5m]) == 1
labels:
severity: >
Expand Down Expand Up @@ -1433,7 +1433,7 @@ groups:
annotations:
summary: "Source volume \"{{ $labels.srcpath }}\" and destination volume \"{{ $labels.dstpath }}\" with relationship UUID \"{{ $labels.relationship_id }}\" is in \"out-of-sync\" status due to the following reason: \"{{ $labels.error_msg }}\"."

- alert: SMBC Relationship Out of Sync
- alert: SnapMirror Active Sync Relationship Out of Sync
expr: last_over_time(ems_events{message="sms.status.out.of.sync.cg"}[4w]) == 1
labels:
severity: >
Expand Down Expand Up @@ -1652,3 +1652,201 @@ groups:
{{- end -}}
annotations:
summary: "vol=\"{{ $labels.vol }}\", app=\"{{ $labels.app }}\", volident=\"{{ $labels.volident }}\", instuuid=\"{{ $labels.instuuid }}\""

# Alerts while an sms.resync.attempt.failed EMS sample with value 1 exists in
# the trailing 4-week window.
- alert: SnapMirror Relationship Resync Attempt Failed
  expr: last_over_time(ems_events{message="sms.resync.attempt.failed"}[4w]) == 1
  labels:
    # Translate the EMS severity label into an alert severity:
    #   alert, emergency      -> critical
    #   error                 -> warning
    #   notice, informational -> info
    # Any other value is passed through; no severity label yields an empty value.
    severity: >
      {{- if $labels.severity -}}
        {{- if eq $labels.severity "alert" -}}
          critical
        {{- else if eq $labels.severity "error" -}}
          warning
        {{- else if eq $labels.severity "emergency" -}}
          critical
        {{- else if eq $labels.severity "notice" -}}
          info
        {{- else if eq $labels.severity "informational" -}}
          info
        {{- else -}}
          {{ $labels.severity }}
        {{- end -}}
      {{- end -}}
  annotations:
    # Label names must match the names exported for this event in
    # conf/ems/9.6.0/ems.yaml (src_path / dst_path, not srcpath / dstpath).
    summary: "Resynchronize operation between source volume \"{{ $labels.src_path }}\" and destination volume \"{{ $labels.dst_path }}\" with relationship UUID \"{{ $labels.relationship_id }}\" has failed. The next auto-resync will be attempted after \"{{ $labels.next_resync_interval }}\" mins."

# Alerts while an sms.common.snapshot.failed EMS sample with value 1 exists in
# the trailing 4-week window.
- alert: SnapMirror Relationship Common Snapshot Failed
  expr: last_over_time(ems_events{message="sms.common.snapshot.failed"}[4w]) == 1
  labels:
    # Translate the EMS severity label into an alert severity:
    #   alert, emergency      -> critical
    #   error                 -> warning
    #   notice, informational -> info
    # Any other value is passed through; no severity label yields an empty value.
    severity: >
      {{- if $labels.severity -}}
        {{- if eq $labels.severity "alert" -}}
          critical
        {{- else if eq $labels.severity "error" -}}
          warning
        {{- else if eq $labels.severity "emergency" -}}
          critical
        {{- else if eq $labels.severity "notice" -}}
          info
        {{- else if eq $labels.severity "informational" -}}
          info
        {{- else -}}
          {{ $labels.severity }}
        {{- end -}}
      {{- end -}}
  annotations:
    # Label names must match the names exported for this event in
    # conf/ems/9.6.0/ems.yaml (src_path / dst_path, not srcpath / dstpath).
    summary: "Creating a common Snapshot copy for source volume \"{{ $labels.src_path }}\" and destination volume \"{{ $labels.dst_path }}\" with relationship UUID \"{{ $labels.relationship_id }}\" has failed due to the following reason: \"{{ $labels.error_msg }}\". Elapsed time since the latest successful common Snapshot copy is \"{{ $labels.css_fail_interval }}\"."

# Alerts while an sms.snap.not.replicated EMS sample with value 1 exists in
# the trailing 4-week window.
- alert: SnapMirror Relationship Snapshot is not Replicated
  expr: last_over_time(ems_events{message="sms.snap.not.replicated"}[4w]) == 1
  labels:
    # Translate the EMS severity label into an alert severity:
    #   alert, emergency      -> critical
    #   error                 -> warning
    #   notice, informational -> info
    # Any other value is passed through; no severity label yields an empty value.
    severity: >
      {{- if $labels.severity -}}
        {{- if eq $labels.severity "alert" -}}
          critical
        {{- else if eq $labels.severity "error" -}}
          warning
        {{- else if eq $labels.severity "emergency" -}}
          critical
        {{- else if eq $labels.severity "notice" -}}
          info
        {{- else if eq $labels.severity "informational" -}}
          info
        {{- else -}}
          {{ $labels.severity }}
        {{- end -}}
      {{- end -}}
  annotations:
    # Label names must match the snake_case names exported for this event in
    # conf/ems/9.6.0/ems.yaml (transfer_id, volume_DSID, volume_path,
    # failure_reason), not the raw camelCase EMS parameter names.
    summary: "Snapshot copy \"{{ $labels.snapshot }}\" is not successfully replicated for the relationship \"{{ $labels.transfer_id }}\" with source volume DSID \"{{ $labels.volume_DSID }}\" and path \"{{ $labels.volume_path }}\". Reason: \"{{ $labels.failure_reason }}\"."

# Alerts while an sms.fanout.comm.snap.deleted EMS sample with value 1 exists
# in the trailing 4-week window.
- alert: Fanout SnapMirror Relationship Common Snapshot Deleted
  expr: last_over_time(ems_events{message="sms.fanout.comm.snap.deleted"}[4w]) == 1
  labels:
    # EMS severity -> alert severity. Multi-argument `eq` is true when the
    # first argument equals any of the following ones, so each pair of EMS
    # severities that shares a target is handled in one branch. Unknown values
    # pass through unchanged; no severity label yields an empty value.
    severity: >
      {{- if $labels.severity -}}
      {{- if eq $labels.severity "alert" "emergency" -}}
      critical
      {{- else if eq $labels.severity "error" -}}
      warning
      {{- else if eq $labels.severity "notice" "informational" -}}
      info
      {{- else -}}
      {{ $labels.severity }}
      {{- end -}}
      {{- end -}}
  annotations:
    summary: "SnapMirror Synchronous operation \"{{ $labels.sm_operation }}\" for relationship \"{{ $labels.relationship_id }}\" has cleaned up some of the old base Snapshot copies between the synchronous source and synchronous destination, which could result in no common Snapshot copy existing between the synchronous and asynchronous destinations."

# Alerts while an smc.snapmir.init.fail EMS sample with value 1 exists in the
# trailing 4-week window.
- alert: SnapMirror Relationship Initialization Failed
  expr: last_over_time(ems_events{message="smc.snapmir.init.fail"}[4w]) == 1
  labels:
    # Translate the EMS severity label into an alert severity:
    #   alert, emergency      -> critical
    #   error                 -> warning
    #   notice, informational -> info
    # Any other value is passed through; no severity label yields an empty value.
    severity: >
      {{- if $labels.severity -}}
        {{- if eq $labels.severity "alert" -}}
          critical
        {{- else if eq $labels.severity "error" -}}
          warning
        {{- else if eq $labels.severity "emergency" -}}
          critical
        {{- else if eq $labels.severity "notice" -}}
          info
        {{- else if eq $labels.severity "informational" -}}
          info
        {{- else -}}
          {{ $labels.severity }}
        {{- end -}}
      {{- end -}}
  annotations:
    # Label names must match the names exported for this event in
    # conf/ems/9.6.0/ems.yaml (src_path / dst_path, not srcpath / dstpath).
    summary: "Initialize from source volume \"{{ $labels.src_path }}\" to destination volume \"{{ $labels.dst_path }}\" with relationship UUID \"{{ $labels.relationship_id }}\" failed with error \"{{ $labels.error }}\"."

# Alerts while an smbc.aufo.failed EMS sample with value 1 exists in the
# trailing 4-week window.
- alert: SnapMirror Active Sync Automatic Unplanned Failover Failed
  expr: last_over_time(ems_events{message="smbc.aufo.failed"}[4w]) == 1
  labels:
    # Translate the EMS severity label into an alert severity:
    #   alert, emergency      -> critical
    #   error                 -> warning
    #   notice, informational -> info
    # Any other value is passed through; no severity label yields an empty value.
    severity: >
      {{- if $labels.severity -}}
        {{- if eq $labels.severity "alert" -}}
          critical
        {{- else if eq $labels.severity "error" -}}
          warning
        {{- else if eq $labels.severity "emergency" -}}
          critical
        {{- else if eq $labels.severity "notice" -}}
          info
        {{- else if eq $labels.severity "informational" -}}
          info
        {{- else -}}
          {{ $labels.severity }}
        {{- end -}}
      {{- end -}}
  annotations:
    # The destination path is exported as dst_path in conf/ems/9.6.0/ems.yaml.
    summary: "SnapMirror automatic failover failed for Destination path: \"{{ $labels.dst_path }}\"."

# Alerts while an smbc.aufo.completed EMS sample with value 1 exists in the
# trailing 4-week window.
- alert: SnapMirror Active Sync Automatic Unplanned Failover Completed
  expr: last_over_time(ems_events{message="smbc.aufo.completed"}[4w]) == 1
  labels:
    # Translate the EMS severity label into an alert severity:
    #   alert, emergency      -> critical
    #   error                 -> warning
    #   notice, informational -> info
    # Any other value is passed through; no severity label yields an empty value.
    severity: >
      {{- if $labels.severity -}}
        {{- if eq $labels.severity "alert" -}}
          critical
        {{- else if eq $labels.severity "error" -}}
          warning
        {{- else if eq $labels.severity "emergency" -}}
          critical
        {{- else if eq $labels.severity "notice" -}}
          info
        {{- else if eq $labels.severity "informational" -}}
          info
        {{- else -}}
          {{ $labels.severity }}
        {{- end -}}
      {{- end -}}
  annotations:
    # The destination path is exported as dst_path in conf/ems/9.6.0/ems.yaml.
    summary: "SnapMirror automatic failover completed for Destination path: \"{{ $labels.dst_path }}\"."

# Alerts while an smbc.pfo.failed EMS sample with value 1 exists in the
# trailing 4-week window.
- alert: SnapMirror Active Sync Planned Failover Failed
  expr: last_over_time(ems_events{message="smbc.pfo.failed"}[4w]) == 1
  labels:
    # Translate the EMS severity label into an alert severity:
    #   alert, emergency      -> critical
    #   error                 -> warning
    #   notice, informational -> info
    # Any other value is passed through; no severity label yields an empty value.
    severity: >
      {{- if $labels.severity -}}
        {{- if eq $labels.severity "alert" -}}
          critical
        {{- else if eq $labels.severity "error" -}}
          warning
        {{- else if eq $labels.severity "emergency" -}}
          critical
        {{- else if eq $labels.severity "notice" -}}
          info
        {{- else if eq $labels.severity "informational" -}}
          info
        {{- else -}}
          {{ $labels.severity }}
        {{- end -}}
      {{- end -}}
  annotations:
    # The destination path is exported as dst_path in conf/ems/9.6.0/ems.yaml.
    summary: "SnapMirror Active Sync planned failover operation failed for Destination path: \"{{ $labels.dst_path }}\"."

# Alerts while an smbc.pfo.completed EMS sample with value 1 exists in the
# trailing 4-week window.
- alert: SnapMirror Active Sync Planned Failover Completed
  expr: last_over_time(ems_events{message="smbc.pfo.completed"}[4w]) == 1
  labels:
    # Translate the EMS severity label into an alert severity:
    #   alert, emergency      -> critical
    #   error                 -> warning
    #   notice, informational -> info
    # Any other value is passed through; no severity label yields an empty value.
    severity: >
      {{- if $labels.severity -}}
        {{- if eq $labels.severity "alert" -}}
          critical
        {{- else if eq $labels.severity "error" -}}
          warning
        {{- else if eq $labels.severity "emergency" -}}
          critical
        {{- else if eq $labels.severity "notice" -}}
          info
        {{- else if eq $labels.severity "informational" -}}
          info
        {{- else -}}
          {{ $labels.severity }}
        {{- end -}}
      {{- end -}}
  annotations:
    # The destination path is exported as dst_path in conf/ems/9.6.0/ems.yaml.
    summary: "SnapMirror Active Sync planned failover operation completed for Destination path: \"{{ $labels.dst_path }}\"."

0 comments on commit 83324bc

Please sign in to comment.