feat(storage): use finer granularity to monitor request latency #7586

Merged (8 commits) on Feb 2, 2023
5 changes: 3 additions & 2 deletions grafana/risingwave-dashboard.dashboard.py
@@ -480,6 +480,7 @@ def metric(name, filter=None):

def quantile(f, percentiles):
quantile_map = {
"60": ["0.6", "60"],
Contributor:

Cool, I have two questions:

  1. Don't we need p999 anymore?
  2. Can we extend these fine-grained bucket statistics to other metrics, such as the read/write durations?

Contributor Author:

p999 and max are duplicates of each other; I think we only need to keep max. In fact, unless I hide the max and p999 series, I can hardly see the p60 and p90 values at all.

Contributor:

Yes, I often run into this problem too. This PR only modifies create_iter_time. Is there any update planned for the other metrics?

Contributor Author:

I will roll it back, but I still do not think we need both p999 and max.

"50": ["0.5", "50"],
"90": ["0.9", "90"],
"99": ["0.99", "99"],
@@ -1617,13 +1618,13 @@ def section_hummock(panels):
*quantile(
lambda quantile, legend: panels.target(
f"histogram_quantile({quantile}, sum(rate({metric('state_store_iter_duration_bucket')}[$__rate_interval])) by (le, job, instance, table_id))",
f"total_time p{legend} - {{{{table_id}}}} @ {{{{job}}}} @ {{{{instance}}}}",
f"create_iter_time p{legend} - {{{{table_id}}}} @ {{{{job}}}} @ {{{{instance}}}}",
),
[90, 99, 999, "max"],
),
panels.target(
f"sum by(le, job, instance)(rate({metric('state_store_iter_duration_sum')}[$__rate_interval])) / sum by(le, job,instance) (rate({metric('state_store_iter_duration_count')}[$__rate_interval]))",
"total_time avg - {{job}} @ {{instance}}",
"create_iter_time avg - {{job}} @ {{instance}}",
),
*quantile(
lambda quantile, legend: panels.target(
2 changes: 1 addition & 1 deletion grafana/risingwave-dashboard.json

Large diffs are not rendered by default.

14 changes: 10 additions & 4 deletions src/storage/src/monitor/monitored_storage_metrics.rs
@@ -14,7 +14,7 @@

use prometheus::core::{AtomicU64, GenericCounterVec};
use prometheus::{
-exponential_buckets, histogram_opts, register_histogram_vec_with_registry,
+exponential_buckets, histogram_opts, linear_buckets, register_histogram_vec_with_registry,
register_histogram_with_registry, register_int_counter_vec_with_registry, Histogram,
HistogramVec, Registry,
};
@@ -58,10 +58,16 @@ impl MonitoredStorageMetrics {
let get_value_size =
register_histogram_vec_with_registry!(opts, &["table_id"], registry).unwrap();

+let mut buckets = exponential_buckets(0.000004, 2.0, 4).unwrap(); // 4 ~ 32us
+buckets.extend(linear_buckets(0.00006, 0.00004, 5).unwrap()); // 60 ~ 220us.
+buckets.extend(linear_buckets(0.0003, 0.0001, 3).unwrap()); // 300 ~ 500us.
+buckets.extend(exponential_buckets(0.001, 2.0, 5).unwrap()); // 1 ~ 16ms.
+buckets.extend(exponential_buckets(0.05, 4.0, 5).unwrap()); // 0.05 ~ 12.8s.
+buckets.push(16.0); // 16s
let get_duration_opts = histogram_opts!(
"state_store_get_duration",
"Total latency of get that have been issued to state store",
-exponential_buckets(0.00001, 2.0, 21).unwrap() // max 10s
+buckets.clone(),
);
let get_duration =
register_histogram_vec_with_registry!(get_duration_opts, &["table_id"], registry)
@@ -86,15 +92,15 @@ impl MonitoredStorageMetrics {
let opts = histogram_opts!(
"state_store_iter_duration",
"Histogram of iterator scan and initialization time that have been issued to state store",
-exponential_buckets(0.0001, 2.0, 21).unwrap() // max 104s
+buckets.clone(),
);
let iter_duration =
register_histogram_vec_with_registry!(opts, &["table_id"], registry).unwrap();

let opts = histogram_opts!(
"state_store_iter_scan_duration",
"Histogram of iterator scan time that have been issued to state store",
-exponential_buckets(0.0001, 2.0, 21).unwrap() // max 104s
+buckets,
);
let iter_scan_duration =
register_histogram_vec_with_registry!(opts, &["table_id"], registry).unwrap();
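
For reference, the combined bucket layout can be reproduced with a small standalone sketch (not part of this PR; it only assumes the `prometheus` crate's `exponential_buckets` and `linear_buckets` helpers), which makes the resulting boundaries easy to see:

```rust
// Illustrative only: rebuild the bucket list used above and print its boundaries.
use prometheus::{exponential_buckets, linear_buckets};

fn main() {
    let mut buckets = exponential_buckets(0.000004, 2.0, 4).unwrap(); // 4us, 8us, 16us, 32us
    buckets.extend(linear_buckets(0.00006, 0.00004, 5).unwrap()); // 60us, 100us, 140us, 180us, 220us
    buckets.extend(linear_buckets(0.0003, 0.0001, 3).unwrap()); // 300us, 400us, 500us
    buckets.extend(exponential_buckets(0.001, 2.0, 5).unwrap()); // 1ms, 2ms, 4ms, 8ms, 16ms
    buckets.extend(exponential_buckets(0.05, 4.0, 5).unwrap()); // 50ms, 200ms, 800ms, 3.2s, 12.8s
    buckets.push(16.0); // final 16s upper bound

    // Prints 23 boundaries, most of them in the sub-millisecond range.
    println!("{} buckets: {:?}", buckets.len(), buckets);
}
```

Compared with the old uniform powers-of-two buckets, this layout concentrates resolution below one millisecond, where most state-store requests land, which is the "finer granularity" the PR title refers to.
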
10 changes: 4 additions & 6 deletions src/storage/src/monitor/monitored_store.rs
@@ -68,6 +68,10 @@ impl<S: StateStoreRead> MonitoredStateStore<S> {
.await
.inspect_err(|e| error!("Failed in iter: {:?}", e))?;

+self.storage_metrics
+.iter_duration
+.with_label_values(&[table_id_label.as_str()])
+.observe(start_time.elapsed().as_secs_f64());
// statistics of iter in process count to estimate the read ops in the same time
self.storage_metrics
.iter_in_process_counts
@@ -80,7 +84,6 @@ impl<S: StateStoreRead> MonitoredStateStore<S> {
stats: MonitoredStateStoreIterStats {
total_items: 0,
total_size: 0,
-start_time,
scan_time: minstant::Instant::now(),
storage_metrics: self.storage_metrics.clone(),
table_id,
@@ -279,7 +282,6 @@ pub struct MonitoredStateStoreIter<S> {
struct MonitoredStateStoreIterStats {
total_items: usize,
total_size: usize,
-start_time: minstant::Instant,
scan_time: minstant::Instant,
storage_metrics: Arc<MonitoredStorageMetrics>,

@@ -311,10 +313,6 @@ impl Drop for MonitoredStateStoreIterStats {
fn drop(&mut self) {
let table_id_label = self.table_id.to_string();

-self.storage_metrics
-.iter_duration
-.with_label_values(&[table_id_label.as_str()])
-.observe(self.start_time.elapsed().as_secs_f64());
self.storage_metrics
.iter_scan_duration
.with_label_values(&[table_id_label.as_str()])
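
To summarize the behavioral change in monitored_store.rs: state_store_iter_duration is now observed as soon as the iterator is created (hence the new create_iter_time legend in the dashboard), while the scan itself is still covered by state_store_iter_scan_duration, observed when the iterator stats are dropped. The sketch below is a simplified, hypothetical illustration of the two observation points; the type and method names are stand-ins, not the actual RisingWave APIs:

```rust
// Hypothetical sketch of where the two histograms are observed after this PR.
use std::time::Instant;

struct Metrics; // stand-in for MonitoredStorageMetrics
impl Metrics {
    fn observe_iter_duration(&self, secs: f64) {
        println!("state_store_iter_duration (create_iter_time): {secs:.6}s");
    }
    fn observe_iter_scan_duration(&self, secs: f64) {
        println!("state_store_iter_scan_duration: {secs:.6}s");
    }
}

fn iter_and_scan(metrics: &Metrics) {
    let start_time = Instant::now();
    // ... build the underlying iterator ...
    // After this PR: observed right after creation, so the metric no longer
    // includes the time the caller spends consuming the iterator.
    metrics.observe_iter_duration(start_time.elapsed().as_secs_f64());

    let scan_time = Instant::now();
    // ... the caller consumes the iterator ...
    // Still observed at the end of the scan (before this PR, both
    // histograms were observed here, in Drop).
    metrics.observe_iter_scan_duration(scan_time.elapsed().as_secs_f64());
}

fn main() {
    iter_and_scan(&Metrics);
}
```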