feat(storage): use finer granularity to monitor request latency #7586

Merged (8 commits) on Feb 2, 2023
5 changes: 3 additions & 2 deletions grafana/risingwave-dashboard.dashboard.py
@@ -480,6 +480,7 @@ def metric(name, filter=None):

def quantile(f, percentiles):
quantile_map = {
"60": ["0.6", "60"],
Contributor:

Cool, I have two questions:

  1. Don't we need p999 anymore?
  2. Can we extend these fine-grained bucket statistics to other metrics, such as the read/write durations?

Contributor Author:

p999 and max are duplicates of each other; I think we only need to keep max. In fact, unless I hide the max and p999 series, I can hardly see the p60 and p90 values at all.

Contributor:

Yes, I often run into this problem too. This PR only modifies create_iter_time. Is there any update planned for the other metrics?

Contributor Author:

I will roll it back, but I still do not think we need both p999 and max.

"50": ["0.5", "50"],
"90": ["0.9", "90"],
"99": ["0.99", "99"],
@@ -1617,13 +1618,13 @@ def section_hummock(panels):
*quantile(
lambda quantile, legend: panels.target(
f"histogram_quantile({quantile}, sum(rate({metric('state_store_iter_duration_bucket')}[$__rate_interval])) by (le, job, instance, table_id))",
f"total_time p{legend} - {{{{table_id}}}} @ {{{{job}}}} @ {{{{instance}}}}",
f"create_iter_time p{legend} - {{{{table_id}}}} @ {{{{job}}}} @ {{{{instance}}}}",
),
[90, 99, 999, "max"],
),
panels.target(
f"sum by(le, job, instance)(rate({metric('state_store_iter_duration_sum')}[$__rate_interval])) / sum by(le, job,instance) (rate({metric('state_store_iter_duration_count')}[$__rate_interval]))",
"total_time avg - {{job}} @ {{instance}}",
"create_iter_time avg - {{job}} @ {{instance}}",
),
*quantile(
lambda quantile, legend: panels.target(
2 changes: 1 addition & 1 deletion grafana/risingwave-dashboard.json

Large diffs are not rendered by default.

14 changes: 10 additions & 4 deletions src/storage/src/monitor/monitored_storage_metrics.rs
@@ -14,7 +14,7 @@

use prometheus::core::{AtomicU64, GenericCounterVec};
use prometheus::{
-exponential_buckets, histogram_opts, register_histogram_vec_with_registry,
+exponential_buckets, histogram_opts, linear_buckets, register_histogram_vec_with_registry,
register_histogram_with_registry, register_int_counter_vec_with_registry, Histogram,
HistogramVec, Registry,
};
@@ -58,10 +58,16 @@ impl MonitoredStorageMetrics {
let get_value_size =
register_histogram_vec_with_registry!(opts, &["table_id"], registry).unwrap();

+let mut buckets = exponential_buckets(0.000004, 2.0, 4).unwrap(); // 4 ~ 32us
+buckets.extend(linear_buckets(0.00006, 0.00004, 5).unwrap()); // 60 ~ 220us.
+buckets.extend(linear_buckets(0.0003, 0.0001, 3).unwrap()); // 300 ~ 500us.
+buckets.extend(exponential_buckets(0.001, 2.0, 5).unwrap()); // 1 ~ 16ms.
+buckets.extend(exponential_buckets(0.05, 4.0, 5).unwrap()); // 0.05 ~ 12.8s.
+buckets.push(16.0); // 16s
let get_duration_opts = histogram_opts!(
"state_store_get_duration",
"Total latency of get that have been issued to state store",
-exponential_buckets(0.00001, 2.0, 21).unwrap() // max 10s
+buckets.clone(),
);
let get_duration =
register_histogram_vec_with_registry!(get_duration_opts, &["table_id"], registry)
@@ -86,15 +92,15 @@ impl MonitoredStorageMetrics {
let opts = histogram_opts!(
"state_store_iter_duration",
"Histogram of iterator scan and initialization time that have been issued to state store",
-exponential_buckets(0.0001, 2.0, 21).unwrap() // max 104s
+buckets.clone(),
);
let iter_duration =
register_histogram_vec_with_registry!(opts, &["table_id"], registry).unwrap();

let opts = histogram_opts!(
"state_store_iter_scan_duration",
"Histogram of iterator scan time that have been issued to state store",
-exponential_buckets(0.0001, 2.0, 21).unwrap() // max 104s
+buckets,
);
let iter_scan_duration =
register_histogram_vec_with_registry!(opts, &["table_id"], registry).unwrap();
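
For reference, the combined bucket layout can be reproduced with a small standalone sketch (not part of this PR; it only assumes the `prometheus` crate's `exponential_buckets` and `linear_buckets` helpers), which makes the resulting boundaries easy to see:

```rust
// Illustrative only: rebuild the bucket list used above and print its boundaries.
use prometheus::{exponential_buckets, linear_buckets};

fn main() {
    let mut buckets = exponential_buckets(0.000004, 2.0, 4).unwrap(); // 4us, 8us, 16us, 32us
    buckets.extend(linear_buckets(0.00006, 0.00004, 5).unwrap()); // 60us, 100us, 140us, 180us, 220us
    buckets.extend(linear_buckets(0.0003, 0.0001, 3).unwrap()); // 300us, 400us, 500us
    buckets.extend(exponential_buckets(0.001, 2.0, 5).unwrap()); // 1ms, 2ms, 4ms, 8ms, 16ms
    buckets.extend(exponential_buckets(0.05, 4.0, 5).unwrap()); // 50ms, 200ms, 800ms, 3.2s, 12.8s
    buckets.push(16.0); // final 16s upper bound

    // Prints 23 boundaries, most of them in the sub-millisecond range.
    println!("{} buckets: {:?}", buckets.len(), buckets);
}
```

Compared with the old uniform powers-of-two buckets, this layout concentrates resolution below one millisecond, where most state-store requests land, which is the "finer granularity" the PR title refers to.
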
10 changes: 4 additions & 6 deletions src/storage/src/monitor/monitored_store.rs
@@ -68,6 +68,10 @@ impl<S: StateStoreRead> MonitoredStateStore<S> {
.await
.inspect_err(|e| error!("Failed in iter: {:?}", e))?;

+self.storage_metrics
+.iter_duration
+.with_label_values(&[table_id_label.as_str()])
+.observe(start_time.elapsed().as_secs_f64());
// statistics of iter in process count to estimate the read ops in the same time
self.storage_metrics
.iter_in_process_counts
@@ -80,7 +84,6 @@ impl<S: StateStoreRead> MonitoredStateStore<S> {
stats: MonitoredStateStoreIterStats {
total_items: 0,
total_size: 0,
-start_time,
scan_time: minstant::Instant::now(),
storage_metrics: self.storage_metrics.clone(),
table_id,
@@ -279,7 +282,6 @@ pub struct MonitoredStateStoreIter<S> {
struct MonitoredStateStoreIterStats {
total_items: usize,
total_size: usize,
-start_time: minstant::Instant,
scan_time: minstant::Instant,
storage_metrics: Arc<MonitoredStorageMetrics>,

@@ -311,10 +313,6 @@ impl Drop for MonitoredStateStoreIterStats {
fn drop(&mut self) {
let table_id_label = self.table_id.to_string();

-self.storage_metrics
-.iter_duration
-.with_label_values(&[table_id_label.as_str()])
-.observe(self.start_time.elapsed().as_secs_f64());
self.storage_metrics
.iter_scan_duration
.with_label_values(&[table_id_label.as_str()])
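
To summarize the behavioral change in monitored_store.rs: state_store_iter_duration is now observed as soon as the iterator is created (hence the new create_iter_time legend in the dashboard), while the scan itself is still covered by state_store_iter_scan_duration, observed when the iterator stats are dropped. The sketch below is a simplified, hypothetical illustration of the two observation points; the type and method names are stand-ins, not the actual RisingWave APIs:

```rust
// Hypothetical sketch of where the two histograms are observed after this PR.
use std::time::Instant;

struct Metrics; // stand-in for MonitoredStorageMetrics
impl Metrics {
    fn observe_iter_duration(&self, secs: f64) {
        println!("state_store_iter_duration (create_iter_time): {secs:.6}s");
    }
    fn observe_iter_scan_duration(&self, secs: f64) {
        println!("state_store_iter_scan_duration: {secs:.6}s");
    }
}

fn iter_and_scan(metrics: &Metrics) {
    let start_time = Instant::now();
    // ... build the underlying iterator ...
    // After this PR: observed right after creation, so the metric no longer
    // includes the time the caller spends consuming the iterator.
    metrics.observe_iter_duration(start_time.elapsed().as_secs_f64());

    let scan_time = Instant::now();
    // ... the caller consumes the iterator ...
    // Still observed at the end of the scan (before this PR, both
    // histograms were observed here, in Drop).
    metrics.observe_iter_scan_duration(scan_time.elapsed().as_secs_f64());
}

fn main() {
    iter_and_scan(&Metrics);
}
```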