Merge pull request #196 from STCLab-Inc/181-metric-integrations-from-telegraf

feat(controller): add metric integrations from telegraf
STCLab-Inc · Nov 4, 2023 · 3d0e922 · 3d0e922
2 parents 92130b0 + 6d2e022
commit 3d0e922
Show file tree

Hide file tree

Showing 11 changed files with 1,028 additions and 0 deletions.
diff --git a/examples/metrics/telegraf/amazon_cloudwatch/amazon_dynamodb.yaml b/examples/metrics/telegraf/amazon_cloudwatch/amazon_dynamodb.yaml
@@ -0,0 +1,178 @@
+---
+# Collects AWS/DynamoDB CloudWatch metrics for a single table through the
+# telegraf collector, one `cloudwatch` input entry per metric.
+# Template variables used below: region, profile, table_name.
+# Every entry uses "1m" for period, delay and interval, and the Sum statistic.
+kind: Metric
+id: telegraf_cloudwatch_amazon_dynamodb_metrics
+collector: telegraf
+metadata:
+  inputs:
+    cloudwatch:
+      # ConsumedReadCapacityUnits (Sum): usage of read capacity units in the table.
+      # It represents the current workload and drives scale-in/scale-out decisions;
+      # high usage may mean throughput is reaching its limits (scale out).
+      # Query equivalent:
+      #   SELECT SUM(ConsumedReadCapacityUnits) FROM SCHEMA("AWS/DynamoDB", TableName) WHERE TableName = '{{ table_name }}'
+      - region: "{{ region }}"
+        profile: "{{ profile }}"
+        period: "1m"
+        delay: "1m"
+        interval: "1m"
+        namespaces: ["AWS/DynamoDB"]
+        metrics:
+          - names: ["ConsumedReadCapacityUnits"]
+            statistic_include: ["sum"]
+            dimensions:
+              - name: TableName
+                value: "{{ table_name }}"
+      # ConsumedWriteCapacityUnits (Sum): usage of write capacity units in the table.
+      # It represents the current workload and drives scale-in/scale-out decisions;
+      # high usage may mean throughput is reaching its limits (scale out).
+      # Query equivalent:
+      #   SELECT SUM(ConsumedWriteCapacityUnits) FROM SCHEMA("AWS/DynamoDB", TableName) WHERE TableName = '{{ table_name }}'
+      - region: "{{ region }}"
+        profile: "{{ profile }}"
+        period: "1m"
+        delay: "1m"
+        interval: "1m"
+        namespaces: ["AWS/DynamoDB"]
+        metrics:
+          - names: ["ConsumedWriteCapacityUnits"]
+            statistic_include: ["sum"]
+            dimensions:
+              - name: TableName
+                value: "{{ table_name }}"
+      # ProvisionedReadCapacityUnits (Sum): provisioned read units for the table,
+      # used to compare the current load with the provisioned throughput.
+      # NOTE(review): AWS lists Minimum/Maximum/Average for this metric; "sum" is
+      # kept so the emitted field name stays the same - confirm it is intended.
+      # Query equivalent:
+      #   SELECT SUM(ProvisionedReadCapacityUnits) FROM SCHEMA("AWS/DynamoDB", TableName) WHERE TableName = '{{ table_name }}'
+      - region: "{{ region }}"
+        profile: "{{ profile }}"
+        period: "1m"
+        delay: "1m"
+        interval: "1m"
+        namespaces: ["AWS/DynamoDB"]
+        metrics:
+          - names: ["ProvisionedReadCapacityUnits"]
+            statistic_include: ["sum"]
+            dimensions:
+              - name: TableName
+                value: "{{ table_name }}"
+      # ProvisionedWriteCapacityUnits (Sum): provisioned write units for the table,
+      # used to compare the current load with the provisioned throughput.
+      # NOTE(review): AWS lists Minimum/Maximum/Average for this metric; "sum" is
+      # kept so the emitted field name stays the same - confirm it is intended.
+      # Query equivalent:
+      #   SELECT SUM(ProvisionedWriteCapacityUnits) FROM SCHEMA("AWS/DynamoDB", TableName) WHERE TableName = '{{ table_name }}'
+      - region: "{{ region }}"
+        profile: "{{ profile }}"
+        period: "1m"
+        delay: "1m"
+        interval: "1m"
+        namespaces: ["AWS/DynamoDB"]
+        metrics:
+          - names: ["ProvisionedWriteCapacityUnits"]
+            statistic_include: ["sum"]
+            dimensions:
+              - name: TableName
+                value: "{{ table_name }}"
+      # ReadThrottleEvents (Sum): number of read requests that were throttled.
+      # Throttling occurs when requests exceed the table's capacity limits, which
+      # indicates a need for scale-out or for more provisioned capacity.
+      # Query equivalent:
+      #   SELECT SUM(ReadThrottleEvents) FROM SCHEMA("AWS/DynamoDB", TableName) WHERE TableName = '{{ table_name }}'
+      - region: "{{ region }}"
+        profile: "{{ profile }}"
+        period: "1m"
+        delay: "1m"
+        interval: "1m"
+        namespaces: ["AWS/DynamoDB"]
+        metrics:
+          - names: ["ReadThrottleEvents"]
+            statistic_include: ["sum"]
+            dimensions:
+              - name: TableName
+                value: "{{ table_name }}"
+      # WriteThrottleEvents (Sum): number of write requests that were throttled.
+      # Throttling occurs when requests exceed the table's capacity limits, which
+      # indicates a need for scale-out or for more provisioned capacity.
+      # Query equivalent:
+      #   SELECT SUM(WriteThrottleEvents) FROM SCHEMA("AWS/DynamoDB", TableName) WHERE TableName = '{{ table_name }}'
+      - region: "{{ region }}"
+        profile: "{{ profile }}"
+        period: "1m"
+        delay: "1m"
+        interval: "1m"
+        namespaces: ["AWS/DynamoDB"]
+        metrics:
+          - names: ["WriteThrottleEvents"]
+            statistic_include: ["sum"]
+            dimensions:
+              - name: TableName
+                value: "{{ table_name }}"
+      # ReturnedItemCount (Sum): number of items returned by query or scan
+      # operations; a high value reflects a heavy workload, so scale-out may be
+      # considered.
+      # DynamoDB publishes this metric per (TableName, Operation) and Telegraf
+      # needs every dimension listed, so Operation is matched with a wildcard
+      # (one series per operation; wildcards make Telegraf call ListMetrics).
+      # Query equivalent:
+      #   SELECT SUM(ReturnedItemCount) FROM SCHEMA("AWS/DynamoDB", Operation, TableName) WHERE TableName = '{{ table_name }}' GROUP BY Operation
+      - region: "{{ region }}"
+        profile: "{{ profile }}"
+        period: "1m"
+        delay: "1m"
+        interval: "1m"
+        namespaces: ["AWS/DynamoDB"]
+        metrics:
+          - names: ["ReturnedItemCount"]
+            statistic_include: ["sum"]
+            dimensions:
+              - name: TableName
+                value: "{{ table_name }}"
+              - name: Operation
+                value: "*"
+      # UserErrors (Sum): number of errors generated by users; errors may indicate
+      # a need for scale-out or for improvements in application code.
+      # AWS publishes UserErrors per account and Region without dimensions, so no
+      # TableName filter is configured (a per-table filter matches no data).
+      # Query equivalent:
+      #   SELECT SUM(UserErrors) FROM SCHEMA("AWS/DynamoDB")
+      - region: "{{ region }}"
+        profile: "{{ profile }}"
+        period: "1m"
+        delay: "1m"
+        interval: "1m"
+        namespaces: ["AWS/DynamoDB"]
+        metrics:
+          - names: ["UserErrors"]
+            statistic_include: ["sum"]
+      # SystemErrors (Sum): number of errors originating from DynamoDB itself.
+      # System errors can affect the table's scalability, so monitoring them
+      # supports scale-in and scale-out decisions.
+      # DynamoDB publishes this metric per (TableName, Operation) and Telegraf
+      # needs every dimension listed, so Operation is matched with a wildcard
+      # (one series per operation; wildcards make Telegraf call ListMetrics).
+      # Query equivalent:
+      #   SELECT SUM(SystemErrors) FROM SCHEMA("AWS/DynamoDB", Operation, TableName) WHERE TableName = '{{ table_name }}' GROUP BY Operation
+      - region: "{{ region }}"
+        profile: "{{ profile }}"
+        period: "1m"
+        delay: "1m"
+        interval: "1m"
+        namespaces: ["AWS/DynamoDB"]
+        metrics:
+          - names: ["SystemErrors"]
+            statistic_include: ["sum"]
+            dimensions:
+              - name: TableName
+                value: "{{ table_name }}"
+              - name: Operation
+                value: "*"
+  outputs:
+    wave-autoscale: {}
+---
+
diff --git a/examples/metrics/telegraf/amazon_cloudwatch/amazon_ec2.yaml b/examples/metrics/telegraf/amazon_cloudwatch/amazon_ec2.yaml
@@ -0,0 +1,96 @@
+---
+# Collects AWS/EC2 CloudWatch metrics for one Auto Scaling group through the
+# telegraf collector, one `cloudwatch` input entry per metric.
+# Template variables used below: region, profile, auto_scaling_group_name.
+# Every entry uses "1m" for period, delay and interval, and the Maximum statistic.
+kind: Metric
+id: telegraf_cloudwatch_amazon_ec2_metrics
+collector: telegraf
+metadata:
+  inputs:
+    cloudwatch:
+      # CPUUtilization (Maximum): CPU usage reflects the instance's current
+      # workload. High usage may mean the instance is overloaded and is a signal
+      # to scale out; if it stays high, more instances may be needed to spread load.
+      # Query equivalent:
+      #   SELECT MAX(CPUUtilization) FROM SCHEMA("AWS/EC2", AutoScalingGroupName) WHERE AutoScalingGroupName = '{{ auto_scaling_group_name }}'
+      - region: "{{ region }}"
+        profile: "{{ profile }}"
+        period: "1m"
+        delay: "1m"
+        interval: "1m"
+        namespaces: ["AWS/EC2"]
+        metrics:
+          - names: ["CPUUtilization"]
+            statistic_include: ["maximum"]
+            dimensions:
+              - name: AutoScalingGroupName
+                value: "{{ auto_scaling_group_name }}"
+      # DiskReadOps (Maximum): high disk read operations can indicate a growing
+      # I/O load on the disks, which is a signal to consider scaling out.
+      # Query equivalent:
+      #   SELECT MAX(DiskReadOps) FROM SCHEMA("AWS/EC2", AutoScalingGroupName) WHERE AutoScalingGroupName = '{{ auto_scaling_group_name }}'
+      - region: "{{ region }}"
+        profile: "{{ profile }}"
+        period: "1m"
+        delay: "1m"
+        interval: "1m"
+        namespaces: ["AWS/EC2"]
+        metrics:
+          - names: ["DiskReadOps"]
+            statistic_include: ["maximum"]
+            dimensions:
+              - name: AutoScalingGroupName
+                value: "{{ auto_scaling_group_name }}"
+      # DiskWriteOps (Maximum): high disk write operations can indicate a growing
+      # write I/O load on the disks, which is a signal to consider scaling out.
+      # Query equivalent:
+      #   SELECT MAX(DiskWriteOps) FROM SCHEMA("AWS/EC2", AutoScalingGroupName) WHERE AutoScalingGroupName = '{{ auto_scaling_group_name }}'
+      - region: "{{ region }}"
+        profile: "{{ profile }}"
+        period: "1m"
+        delay: "1m"
+        interval: "1m"
+        namespaces: ["AWS/EC2"]
+        metrics:
+          - names: ["DiskWriteOps"]
+            statistic_include: ["maximum"]
+            dimensions:
+              - name: AutoScalingGroupName
+                value: "{{ auto_scaling_group_name }}"
+      # NetworkIn (Maximum): high network ingress indicates a high network traffic
+      # load on the instance, which is a signal to consider scaling out.
+      # Query equivalent:
+      #   SELECT MAX(NetworkIn) FROM SCHEMA("AWS/EC2", AutoScalingGroupName) WHERE AutoScalingGroupName = '{{ auto_scaling_group_name }}'
+      - region: "{{ region }}"
+        profile: "{{ profile }}"
+        period: "1m"
+        delay: "1m"
+        interval: "1m"
+        namespaces: ["AWS/EC2"]
+        metrics:
+          - names: ["NetworkIn"]
+            statistic_include: ["maximum"]
+            dimensions:
+              - name: AutoScalingGroupName
+                value: "{{ auto_scaling_group_name }}"
+      # NetworkOut (Maximum): high network egress indicates a high network traffic
+      # load leaving the instance, which is a signal to consider scaling out.
+      # Query equivalent:
+      #   SELECT MAX(NetworkOut) FROM SCHEMA("AWS/EC2", AutoScalingGroupName) WHERE AutoScalingGroupName = '{{ auto_scaling_group_name }}'
+      - region: "{{ region }}"
+        profile: "{{ profile }}"
+        period: "1m"
+        delay: "1m"
+        interval: "1m"
+        namespaces: ["AWS/EC2"]
+        metrics:
+          - names: ["NetworkOut"]
+            statistic_include: ["maximum"]
+            dimensions:
+              - name: AutoScalingGroupName
+                value: "{{ auto_scaling_group_name }}"
+  outputs:
+    wave-autoscale: {}
+---
+
diff --git a/examples/metrics/telegraf/amazon_cloudwatch/amazon_ecs.yaml b/examples/metrics/telegraf/amazon_cloudwatch/amazon_ecs.yaml
@@ -0,0 +1,107 @@
+---
+# Collects AWS/ECS CloudWatch metrics for one cluster through the telegraf
+# collector, one `cloudwatch` input entry per metric.
+# Template variables used below: region, profile, cluster_name.
+# Every entry uses "1m" for period, delay and interval, and the Maximum statistic.
+kind: Metric
+id: telegraf_cloudwatch_amazon_ecs_metrics
+collector: telegraf
+metadata:
+  inputs:
+    cloudwatch:
+      # CPUUtilization (Maximum): a decrease means less need for CPU resources and
+      # makes the cluster a scale-in candidate (terminate unneeded EC2 instances or
+      # reduce Fargate tasks); an increase means more demand and makes it a
+      # scale-out candidate (launch more EC2 instances or expand Fargate tasks).
+      # Query equivalent:
+      #   SELECT MAX(CPUUtilization) FROM SCHEMA("AWS/ECS", ClusterName) WHERE ClusterName = '{{ cluster_name }}'
+      - region: "{{ region }}"
+        profile: "{{ profile }}"
+        period: "1m"
+        delay: "1m"
+        interval: "1m"
+        namespaces: ["AWS/ECS"]
+        metrics:
+          - names: ["CPUUtilization"]
+            statistic_include: ["maximum"]
+            dimensions:
+              - name: ClusterName
+                value: "{{ cluster_name }}"
+      # MemoryUtilization (Maximum): a decrease means less need for memory and
+      # makes the cluster a scale-in candidate (terminate unneeded EC2 instances or
+      # reduce Fargate tasks); an increase means more demand and makes it a
+      # scale-out candidate (launch more EC2 instances or expand Fargate tasks).
+      # Query equivalent:
+      #   SELECT MAX(MemoryUtilization) FROM SCHEMA("AWS/ECS", ClusterName) WHERE ClusterName = '{{ cluster_name }}'
+      - region: "{{ region }}"
+        profile: "{{ profile }}"
+        period: "1m"
+        delay: "1m"
+        interval: "1m"
+        namespaces: ["AWS/ECS"]
+        metrics:
+          - names: ["MemoryUtilization"]
+            statistic_include: ["maximum"]
+            dimensions:
+              - name: ClusterName
+                value: "{{ cluster_name }}"
+      # ActiveConnectionCount (Maximum): fewer active connections may justify
+      # scale-in (release unneeded resources or reduce tasks); more active
+      # connections may justify scale-out (add tasks or network resources).
+      # NOTE(review): AWS documents this as a Service Connect metric keyed by
+      # DiscoveryName (optionally with ServiceName and ClusterName), so a
+      # ClusterName-only filter may return no data - confirm before relying on it.
+      # Query equivalent:
+      #   SELECT MAX(ActiveConnectionCount) FROM SCHEMA("AWS/ECS", ClusterName) WHERE ClusterName = '{{ cluster_name }}'
+      - region: "{{ region }}"
+        profile: "{{ profile }}"
+        period: "1m"
+        delay: "1m"
+        interval: "1m"
+        namespaces: ["AWS/ECS"]
+        metrics:
+          - names: ["ActiveConnectionCount"]
+            statistic_include: ["maximum"]
+            dimensions:
+              - name: ClusterName
+                value: "{{ cluster_name }}"
+      # NewConnectionCount (Maximum): a drop in new connections may justify
+      # scale-in (reduce tasks or remove extra network resources); a rise may
+      # justify scale-out (add tasks or network resources).
+      # NOTE(review): also a Service Connect metric per the AWS documentation, so
+      # a ClusterName-only filter may return no data - confirm before relying on it.
+      # Query equivalent:
+      #   SELECT MAX(NewConnectionCount) FROM SCHEMA("AWS/ECS", ClusterName) WHERE ClusterName = '{{ cluster_name }}'
+      - region: "{{ region }}"
+        profile: "{{ profile }}"
+        period: "1m"
+        delay: "1m"
+        interval: "1m"
+        namespaces: ["AWS/ECS"]
+        metrics:
+          - names: ["NewConnectionCount"]
+            statistic_include: ["maximum"]
+            dimensions:
+              - name: ClusterName
+                value: "{{ cluster_name }}"
+      # GPUReservation (Maximum): when GPU workloads need additional GPU resources,
+      # scale-out may be considered (launch more EC2 GPU instances or expand
+      # Fargate GPU tasks).
+      # Query equivalent:
+      #   SELECT MAX(GPUReservation) FROM SCHEMA("AWS/ECS", ClusterName) WHERE ClusterName = '{{ cluster_name }}'
+      - region: "{{ region }}"
+        profile: "{{ profile }}"
+        period: "1m"
+        delay: "1m"
+        interval: "1m"
+        namespaces: ["AWS/ECS"]
+        metrics:
+          - names: ["GPUReservation"]
+            statistic_include: ["maximum"]
+            dimensions:
+              - name: ClusterName
+                value: "{{ cluster_name }}"
+  outputs:
+    wave-autoscale: {}
+---
+