diff --git a/CHANGELOG.md b/CHANGELOG.md index 155d49855c8..9e30bd05ba2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,6 @@ ## main / unreleased + +* [ENHANCEMENT] Add tracing integration to profiling endpoints [#3276](https://github.com/grafana/tempo/pull/3276) (@cyriltovena) * [FEATURE] Add configuration on tempo-query plugin for fetch services older than complete_block_timeout [#3262](https://github.com/grafana/tempo/pull/3350) (@rubenvp8510) * [FEATURE] Add support for multi-tenant queries in streaming search [#3262](https://github.com/grafana/tempo/pull/3262) (@electron0zero) * [FEATURE] TraceQL metrics queries [#3227](https://github.com/grafana/tempo/pull/3227) [#3252](https://github.com/grafana/tempo/pull/3252) [#3258](https://github.com/grafana/tempo/pull/3258) (@mdisibio @zalegrala) @@ -15,10 +17,11 @@ * [CHANGE] Make vParquet3 the default block encoding [#2526](https://github.com/grafana/tempo/pull/3134) (@stoewer) * [CHANGE] Set `autocomplete_filtering_enabled` to `true` by default [#3178](https://github.com/grafana/tempo/pull/3178) (@mapno) * [CHANGE] Update Alpine image version to 3.19 [#3289](https://github.com/grafana/tempo/pull/3289) (@zalegrala) -* [CHANGE] Introduce localblocks process config option to select only server spans 3303https://github.com/grafana/tempo/pull/3303 (@zalegrala) +* [CHANGE] Introduce localblocks process config option to select only server spans 3303 (@zalegrala) * [CHANGE] Localblocks processor honor tenant max trace size limit [3305](https://github.com/grafana/tempo/pull/3305) (@mdisibio) * [CHANGE] Major cache refactor to allow multiple role based caches to be configured [#3166](https://github.com/grafana/tempo/pull/3166). **BREAKING CHANGE** Deprecate the following fields. These have all been migrated to a top level "cache:" field. + ``` storage: trace: @@ -29,6 +32,7 @@ memcached: redis: ``` + * [FEATURE] Introduce list_blocks_concurrency on GCS and S3 backends to control backend load and performance. [#2652](https://github.com/grafana/tempo/pull/2652) (@zalegrala) * [FEATURE] Add per-tenant compaction window [#3129](https://github.com/grafana/tempo/pull/3129) (@zalegrala) * [ENHANCEMENT] Make the trace ID label name configurable for remote written exemplars [#3074](https://github.com/grafana/tempo/pull/3074) @@ -57,7 +61,7 @@ ## v2.3.1 / 2023-11-28 * [BUGFIX] Include statusMessage intrinsic attribute in tag search. [#3084](https://github.com/grafana/tempo/pull/3084) (@rcrowe) -* [BUGFIX] Fix compactor ignore configured S3 headers [#3149](https://github.com/grafana/tempo/pull/3154) (@Batkilin) +* [BUGFIX] Fix compactor ignore configured S3 headers [#3149](https://github.com/grafana/tempo/pull/3154) (@Batkilin) * [BUGFIX] Readd session token to s3 credentials. [#3144](https://github.com/grafana/tempo/pull/3144) (@farodin91) ## v2.3.0 / 2023-10-30 @@ -74,6 +78,7 @@ * [CHANGE] Overrides module refactor [#2688](https://github.com/grafana/tempo/pull/2688) (@mapno) Added new `defaults` block to the overrides' module. Overrides change to indented syntax. 
Old config: + ``` overrides: ingestion_rate_strategy: local @@ -83,7 +88,9 @@ overrides: forwarders: ['foo'] metrics_generator_processors: [service-graphs, span-metrics] ``` + New config: + ``` overrides: defaults: @@ -96,7 +103,8 @@ defaults: forwarders: ['foo'] metrics_generator: processors: [service-graphs, span-metrics] -``` +``` + * [CHANGE] Bump Jaeger query docker image to 1.50.0 [#2998](https://github.com/grafana/tempo/pull/2998) (@pavolloffay) * [FEATURE] New TraceQL structural operators ancestor (<<), parent (<) [#2877](https://github.com/grafana/tempo/pull/2877) (@kousikmitra) * [FEATURE] Add the `/api/status/buildinfo` endpoint [#2702](https://github.com/grafana/tempo/pull/2702) (@fabrizio-grafana) @@ -126,7 +134,7 @@ defaults: * [ENHANCEMENT] Add new config block to distributors to produce debug metrics. [#3008](https://github.com/grafana/tempo/pull/3008) (@joe-elliott) **Breaking Change** Removed deprecated config option: distributor.log_received_spans * [ENHANCEMENT] added a metrics generator config option to enable/disable X-Scope-OrgID headers on remote write. [#2974](https://github.com/grafana/tempo/pull/2974) (@vineetjp) -* [ENHANCEMENT] Correctly return RetryInfo to Otel Collector/Grafana Agent on ResourceExhausted. This allows the agents to honor their own retry +* [ENHANCEMENT] Correctly return RetryInfo to Otel Collector/Grafana Agent on ResourceExhausted. This allows the agents to honor their own retry settings. [#3019](https://github.com/grafana/tempo/pull/3019) (@joe-elliott) * [BUGFIX] Unescape tag names [#2894](https://github.com/grafana/tempo/pull/2894) (@fabrizio-grafana) * [BUGFIX] Load defaults for the internal server [#3041](https://github.com/grafana/tempo/pull/3041) (@rubenvp8510) @@ -136,17 +144,19 @@ defaults: * [BUGFIX] Only search ingester blocks that fall within the request time range. [#2783](https://github.com/grafana/tempo/pull/2783) (@joe-elliott) * [BUGFIX] Align tempo_query_frontend_queries_total and tempo_query_frontend_queries_within_slo_total. [#2840](https://github.com/grafana/tempo/pull/2840) (@joe-elliott) This query will now correctly tell you %age of requests that are within SLO: + ``` - sum(rate(tempo_query_frontend_queries_within_slo_total{}[1m])) by (op) + sum(rate(tempo_query_frontend_queries_within_slo_total{}[1m])) by (op) / sum(rate(tempo_query_frontend_queries_total{}[1m])) by (op) ``` - **BREAKING CHANGE** Removed: tempo_query_frontend_queries_total{op="searchtags|metrics"}. + + **BREAKING CHANGE** Removed: tempo_query_frontend_queries_total{op="searchtags|metrics"}. * [BUGFIX] To support blob storage in Azure Stack Hub as backend. [#2853](https://github.com/grafana/tempo/pull/2853) (@chlislb) * [BUGFIX] Respect spss on GRPC streaming. [#2971](https://github.com/grafana/tempo/pull/2840) (@joe-elliott) * [BUGFIX] Moved empty root span substitution from `querier` to `query-frontend`. 
[#2671](https://github.com/grafana/tempo/issues/2671) (@galalen) * [BUGFIX] Correctly propagate ingester errors on the query path [#2935](https://github.com/grafana/tempo/issues/2935) (@joe-elliott) -* [BUGFIX] Fix issue where ingester doesn't stop query after timeout [#3031](https://github.com/grafana/tempo/pull/3031) (@mdisibio) +* [BUGFIX] Fix issue where ingester doesn't stop query after timeout [#3031](https://github.com/grafana/tempo/pull/3031) (@mdisibio) * [BUGFIX] Fix cases where empty filter {} wouldn't return expected results [#2498](https://github.com/grafana/tempo/issues/2498) (@mdisibio) * [BUGFIX] Reorder S3 credential chain and upgrade minio-go. `native_aws_auth_enabled` is deprecated [#3006](https://github.com/grafana/tempo/pull/3006) (@ekristen, @mapno) * [BUGFIX] Update parquet-go dependency including a bugfix that prevents corrupted blocks from being written [#3068](https://github.com/grafana/tempo/pull/3068) (@stoewer) @@ -173,7 +183,6 @@ defaults: ## v2.2.0 / 2023-07-31 - * [CHANGE] Make vParquet2 the default block format [#2526](https://github.com/grafana/tempo/pull/2526) (@stoewer) * [CHANGE] Disable tempo-query by default in Jsonnet libs. [#2462](https://github.com/grafana/tempo/pull/2462) (@electron0zero) * [CHANGE] Integrate `gofumpt` into CI for formatting requirements [2584](https://github.com/grafana/tempo/pull/2584) (@zalegrala) @@ -181,19 +190,23 @@ defaults: * [CHANGE] Remove `tenant_header_key` option from `tempo-query` config [#2414](https://github.com/grafana/tempo/pull/2414) (@kousikmitra) * [CHANGE] **Breaking Change** Remove support tolerate_failed_blocks. [#2416](https://github.com/grafana/tempo/pull/2416) (@joe-elliott) Removed config option: + ``` query_frontend: tolerate_failed_blocks: ``` + * [CHANGE] Upgrade memcached version in jsonnet microservices [#2466](https://github.com/grafana/tempo/pull/2466) (@zalegrala) * [CHANGE] Prefix service graph extra dimensions labels with `server_` and `client_` if `enable_client_server_prefix` is enabled [#2335](https://github.com/grafana/tempo/pull/2335) (@domasx2) * [CHANGE] **Breaking Change** Rename s3.insecure_skip_verify [#2407](https://github.com/grafana/tempo/pull/2407) (@zalegrala) + ```yaml storage: trace: s3: insecure_skip_verify: true // renamed to tls_insecure_skip_verify ``` + * [CHANGE] Ignore context canceled errors in the queriers [#2440](https://github.com/grafana/tempo/pull/2440) (@joe-elliott) * [CHANGE] Start flush queue worker after wal replay and block rediscovery [#2456](https://github.com/grafana/tempo/pull/2456) (@ie-pham) * [CHANGE] Update Go to 1.20.4 [#2486](https://github.com/grafana/tempo/pull/2486) (@ie-pham) @@ -203,7 +216,7 @@ storage: * [FEATURE] Add user-configurable overrides module [#2543](https://github.com/grafana/tempo/pull/2543) [#2682](https://github.com/grafana/tempo/pull/2682) [#2681](https://github.com/grafana/tempo/pull/2681) (@electron0zero @kvrhdn) * [FEATURE] Add support for `q` query param in `/api/v2/search//values` to filter results based on a TraceQL query [#2253](https://github.com/grafana/tempo/pull/2253) (@mapno) To make use of filtering, configure `autocomplete_filtering_enabled`. -* [FEATURE] Add support for `by()` and `coalesce()` to TraceQL. [#2490](https://github.com/grafana/tempo/pull/2490) +* [FEATURE] Add support for `by()` and `coalesce()` to TraceQL. 
[#2490](https://github.com/grafana/tempo/pull/2490) * [FEATURE] Add a GRPC streaming endpoint for traceql search [#2366](https://github.com/grafana/tempo/pull/2366) (@joe-elliott) * [FEATURE] Add new API to summarize span metrics from generators [#2481](https://github.com/grafana/tempo/pull/2481) (@zalegrala) * [FEATURE] Add `select()` to TraceQL [#2494](https://github.com/grafana/tempo/pull/2494) (@joe-elliott) @@ -223,18 +236,22 @@ To make use of filtering, configure `autocomplete_filtering_enabled`. * [ENHANCEMENT] Add `prefix` configuration option to `storage.trace.azure` and `storage.trace.gcs` [#2386](https://github.com/grafana/tempo/pull/2386) (@kousikmitra) * [ENHANCEMENT] Add `prefix` configuration option to `storage.trace.s3` [#2362](https://github.com/grafana/tempo/pull/2362) (@kousikmitra) * [ENHANCEMENT] Add support for `concurrent_shards` under `trace_by_id` [#2416](https://github.com/grafana/tempo/pull/2416) (@joe-elliott) + ``` query_frontend: trace_by_id: concurrent_shards: 3 ``` + * [ENHANCEMENT] Enable cross cluster querying by adding two config options. [#2598](https://github.com/grafana/tempo/pull/2598) (@joe-elliott) + ``` querier: secondary_ingester_ring: metrics_generator: override_ring_key: ``` + * [ENHANCEMENT] Add `scope` parameter to `/api/search/tags` [#2282](https://github.com/grafana/tempo/pull/2282) (@joe-elliott) Create new endpoint `/api/v2/search/tags` that returns all tags organized by scope. * [ENHANCEMENT] Ability to toggle off latency or count metrics in metrics-generator [#2070](https://github.com/grafana/tempo/pull/2070) (@AlexDHoffer) @@ -250,7 +267,7 @@ To make use of filtering, configure `autocomplete_filtering_enabled`. * [BUGFIX] Fix Search SLO by routing tags to a new handler. [#2468](https://github.com/grafana/tempo/issues/2468) (@electron0zero) * [BUGFIX] tempodb integer divide by zero error [#2167](https://github.com/grafana/tempo/issues/2167) (@kroksys) * [BUGFIX] metrics-generator: ensure Prometheus will scale up shards when remote write is lagging behind [#2463](https://github.com/grafana/tempo/issues/2463) (@kvrhdn) -* [BUGFIX] Fixes issue where matches and other spanset level attributes were not persisted to the TraceQL results. [#2490](https://github.com/grafana/tempo/pull/2490) +* [BUGFIX] Fixes issue where matches and other spanset level attributes were not persisted to the TraceQL results. [#2490](https://github.com/grafana/tempo/pull/2490) * [BUGFIX] Fixes issue where ingester search could occasionally fail with file does not exist error [#2534](https://github.com/grafana/tempo/issues/2534) (@mdisibio) * [BUGFIX] Tempo failed to find meta.json path after adding prefix in S3/GCS/Azure configuration. [#2585](https://github.com/grafana/tempo/issues/2585) (@WildCatFish) * [BUGFIX] Delay logging config warnings until the logger has been initialized [#2645](https://github.com/grafana/tempo/pull/2645) (@kvrhdn) @@ -258,21 +275,25 @@ To make use of filtering, configure `autocomplete_filtering_enabled`. * [FEATURE] Add `tempo-cli` commands `analyse block` and `analyse blocks` to analyse parquet blocks and output summaries of generic attribute columns [#2622](https://github.com/grafana/tempo/pull/2622) (@mapno) ## v2.1.1 / 2023-04-28 + * [BUGFIX] Fix issue where Tempo sometimes flips booleans from false->true at storage time. 
[#2400](https://github.com/grafana/tempo/issues/2400) (@joe-elliott) ## v2.1.0 / 2023-04-26 + * [CHANGE] Capture and update search metrics for TraceQL [#2087](https://github.com/grafana/tempo/pull/2087) (@electron0zero) * [CHANGE] tempo-mixin: disable auto refresh every 10 seconds [#2290](https://github.com/grafana/tempo/pull/2290) (@electron0zero) * [CHANGE] Update tempo-mixin to show request in Resources dashboard [#2281](https://github.com/grafana/tempo/pull/2281) (@electron0zero) * [CHANGE] Add support for s3 session token in static config [#2093](https://github.com/grafana/tempo/pull/2093) (@farodin91) * [CHANGE] **Breaking Change** Remove support for search on v2 blocks. [#2159](https://github.com/grafana/tempo/pull/2159) (@joe-elliott) Removed config options: + ``` overrides: max_search_bytes_per_trace: search_tags_allow_list: search_tags_deny_list: ``` + Removed metrics: `tempo_ingester_trace_search_bytes_discarded_total` * [CHANGE] Stop caching parquet files for search [#2164](https://github.com/grafana/tempo/pull/2164) (@mapno) @@ -286,7 +307,7 @@ To make use of filtering, configure `autocomplete_filtering_enabled`. * [FEATURE] Add support for span `kind` to TraceQL [#2217](https://github.com/grafana/tempo/pull/2217) (@joe-elliott) * [FEATURE] Add support for min/max/avg aggregates to TraceQL[#2255](https://github.com/grafana/tempo/pull/2255) (@joe-elliott) * [ENHANCEMENT] Add Throughput and SLO Metrics with SLOConfig in Query Frontend [#2008](https://github.com/grafana/tempo/pull/2008) (@electron0zero) - - **BREAKING CHANGE** `query_frontend_result_metrics_inspected_bytes` metric removed in favour of `query_frontend_bytes_processed_per_second` + * **BREAKING CHANGE** `query_frontend_result_metrics_inspected_bytes` metric removed in favour of `query_frontend_bytes_processed_per_second` * [ENHANCEMENT] Metrics generator to make use of counters earlier [#2068](https://github.com/grafana/tempo/pull/2068) (@zalegrala) * [ENHANCEMENT] Log when a trace is too large to compact [#2105](https://github.com/grafana/tempo/pull/2105) (@scalalang2) * [ENHANCEMENT] Add support for arbitrary arithemtic to TraceQL queries [#2146](https://github.com/grafana/tempo/pull/2146) (@joe-elliott) @@ -320,6 +341,7 @@ To make use of filtering, configure `autocomplete_filtering_enabled`. * [CHANGE] **BREAKING CHANGE** Use snake case on Azure Storage config [#1879](https://github.com/grafana/tempo/issues/1879) (@faustodavid) Example of using snake case on Azure Storage config: + ``` # config.yaml storage: @@ -328,6 +350,7 @@ storage: storage_account_key: container_name: ``` + * [CHANGE] Increase default values for `server.grpc_server_max_recv_msg_size` and `server.grpc_server_max_send_msg_size` from 4MB to 16MB [#1688](https://github.com/grafana/tempo/pull/1688) (@mapno) * [CHANGE] Propagate Ingesters search errors correctly [#2023](https://github.com/grafana/tempo/pull/2023) (@electron0zero) * [CHANGE] **BREAKING CHANGE** Use storage.trace.block.row_group_size_bytes to cut rows during compaction instead of @@ -338,7 +361,7 @@ storage: Old config will still work but will be removed in a future release. [#1735](https://github.com/grafana/tempo/pull/1735) (@mapno) * [CHANGE] Update alpine image version to 3.16. [#1784](https://github.com/grafana/tempo/pull/1784) (@zalegrala) * [CHANGE] Delete TempoRequestErrors alert from mixin [#1810](https://github.com/grafana/tempo/pull/1810) (@zalegrala) - - **BREAKING CHANGE** Any jsonnet users relying on this alert should copy this into their own environment. 
+ * **BREAKING CHANGE** Any jsonnet users relying on this alert should copy this into their own environment. * [CHANGE] Update and replace a few go modules [#1945](https://github.com/grafana/tempo/pull/1945) (@zalegrala) * Replace `github.com/thanos-io/thanos/pkg/discovery/dns` use with `github.com/grafana/dskit/dns` * Upgrade `github.com/grafana/dskit` @@ -346,6 +369,7 @@ Old config will still work but will be removed in a future release. [#1735](http * Upgrade `github.com/minio/minio-go/v7` * [CHANGE] Config updates to prepare for Tempo 2.0. [#1978](https://github.com/grafana/tempo/pull/1978) (@joe-elliott) Defaults updated: + ``` query_frontend: max_oustanding_per_tenant: 2000 @@ -373,7 +397,9 @@ Old config will still work but will be removed in a future release. [#1735](http read_buffer_count: 32 read_buffer_size_bytes: 1048576 ``` + **BREAKING CHANGE** Renamed/removed/moved + ``` query_frontend: query_shards: // removed. use trace_by_id.query_shards @@ -396,12 +422,14 @@ Old config will still work but will be removed in a future release. [#1735](http encoding: // renamed to v2_encoding row_group_size_bytes: // renamed to parquet_row_group_size_bytes ``` + * [CHANGE] **BREAKING CHANGE** Remove `search_enabled` and `metrics_generator_enabled`. Both default to true. [#2004](https://github.com/grafana/tempo/pull/2004) (@joe-elliott) * [CHANGE] Update OTel collector to v0.57.2 [#1757](https://github.com/grafana/tempo/pull/1757) (@mapno) -* [FEATURE] TraceQL support https://grafana.com/docs/tempo/latest/traceql/ +* [FEATURE] TraceQL support * [FEATURE] Parquet backend is GA and default * [FEATURE] Add generic forwarder and implement otlpgrpc forwarder [#1775](https://github.com/grafana/tempo/pull/1775) (@Blinkuu) New config options and example configuration: + ``` # config.yaml distributor: @@ -420,11 +448,13 @@ overrides: "example-tenant-2": forwarders: ['otel-forwarder'] ``` + * [ENHANCEMENT] Add support for TraceQL in Parquet WAL and Local Blocks. [#1966](https://github.com/grafana/tempo/pull/1966) (@electron0zero) * [ENHANCEMENT] Add `/status/usage-stats` endpoint to show usage stats data [#1782](https://github.com/grafana/tempo/pull/1782) (@electron0zero) * [ENHANCEMENT] Add TLS support to jaeger query plugin. [#1999](https://github.com/grafana/tempo/pull/1999) (@rubenvp8510) * [ENHANCEMENT] Collect inspectedBytes from SearchMetrics [#1975](https://github.com/grafana/tempo/pull/1975) (@electron0zero) * [ENHANCEMENT] Add zone awareness replication for ingesters. [#1936](https://github.com/grafana/tempo/pull/1936) (@manohar-koukuntla) + ``` # use the following fields in _config field of jsonnet config, to enable zone aware ingester multi_zone_ingester_enabled: false, @@ -432,12 +462,13 @@ overrides: multi_zone_ingester_replicas: 0, multi_zone_ingester_max_unavailable: 25, ``` + * [ENHANCEMENT] Support global and wildcard overrides in generic forwarder feature [#1871](https://github.com/grafana/tempo/pull/1871) (@Blinkuu) * [ENHANCEMENT] Add new data-type aware searchtagvalues v2 api [#1956](https://github.com/grafana/tempo/pull/1956) (@mdisibio) * [ENHANCEMENT] Refactor queueManager into generic queue.Queue [#1796](https://github.com/grafana/tempo/pull/1796) (@Blinkuu) - - **BREAKING CHANGE** Rename `tempo_distributor_forwarder_queue_length` metric to `tempo_distributor_queue_length`. New metric has two custom labels: `name` and `tenant`. - - Deprecated `tempo_distributor_forwarder_pushes_total` metric in favor of `tempo_distributor_queue_pushes_total`. 
- - Deprecated `tempo_distributor_forwarder_pushes_failures_total` metric in favor of `tempo_distributor_queue_pushes_failures_total`. + * **BREAKING CHANGE** Rename `tempo_distributor_forwarder_queue_length` metric to `tempo_distributor_queue_length`. New metric has two custom labels: `name` and `tenant`. + * Deprecated `tempo_distributor_forwarder_pushes_total` metric in favor of `tempo_distributor_queue_pushes_total`. + * Deprecated `tempo_distributor_forwarder_pushes_failures_total` metric in favor of `tempo_distributor_queue_pushes_failures_total`. * [ENHANCEMENT] Filter namespace by cluster in tempo dashboards variables [#1771](https://github.com/grafana/tempo/pull/1771) (@electron0zero) * [ENHANCEMENT] Exit early from sharded search requests [#1742](https://github.com/grafana/tempo/pull/1742) (@electron0zero) * [ENHANCEMENT] Upgrade prometheus/prometheus to `51a44e6657c3` [#1829](https://github.com/grafana/tempo/pull/1829) (@mapno) @@ -449,7 +480,7 @@ overrides: * [ENHANCEMENT] Improve parquet compaction memory profile when dropping spans [#1692](https://github.com/grafana/tempo/pull/1692) (@joe-elliott) * [ENHANCEMENT] Use Parquet for local block search, tag search and tag value search instead of flatbuffers. A configuration value (`ingester.use_flatbuffer_search`) is provided to continue using flatbuffers. - - **BREAKING CHANGE** Makes Parquet the default encoding. + * **BREAKING CHANGE** Makes Parquet the default encoding. * [ENHANCEMENT] Return 200 instead of 206 when blocks failed is < tolerate_failed_blocks. [#1725](https://github.com/grafana/tempo/pull/1725) (@joe-elliott) * [ENHANCEMENT] Add GOMEMLIMIT variable to compactor jsonnet and set the value to equal compactor memory limit. [#1758](https://github.com/grafana/tempo/pull/1758/files) (@ie-pham) * [ENHANCEMENT] Add capability to configure the used S3 Storage Class [#1697](https://github.com/grafana/tempo/pull/1714) (@amitsetty) @@ -462,14 +493,16 @@ overrides: * [ENHANCEMENT] metrics-generator: filter out older spans before metrics are aggregated [#1612](https://github.com/grafana/tempo/pull/1612) (@ie-pham) * [ENHANCEMENT] Add hedging to trace by ID lookups created by the frontend. [#1735](https://github.com/grafana/tempo/pull/1735) (@mapno) New config options and defaults: + ``` query_frontend: trace_by_id: hedge_requests_at: 5s hedge_requests_up_to: 3 ``` + * [ENHANCEMENT] Vulture now has improved distribution of the random traces it searches. [#1763](https://github.com/grafana/tempo/pull/1763) (@rfratto) -* [ENHANCEMENT] Upgrade opentelemetry-proto submodule to v0.18.0 Internal types are updated to use `scope` instead of `instrumentation_library`. +* [ENHANCEMENT] Upgrade opentelemetry-proto submodule to v0.18.0 Internal types are updated to use `scope` instead of `instrumentation_library`. This is a breaking change in trace by ID queries if JSON is requested. [#1754](https://github.com/grafana/tempo/pull/1754) (@mapno) * [ENHANCEMENT] Add TLS support to the vulture [#1874](https://github.com/grafana/tempo/pull/1874) (@zalegrala) * [ENHANCEMENT] metrics-generator: extract `status_message` field from spans [#1786](https://github.com/grafana/tempo/pull/1786), [#1794](https://github.com/grafana/tempo/pull/1794) (@stoewer) @@ -505,24 +538,31 @@ query_frontend: * [CHANGE] **BREAKING CHANGE** Change spanmetrics metric names and labels to match OTel conventions. [#1478](https://github.com/grafana/tempo/pull/1478) (@mapno) * [FEATURE] Add support for time picker in jaeger query plugin. 
[#1631](https://github.com/grafana/tempo/pull/1631) (@rubenvp8510) Old metric names: + ``` traces_spanmetrics_duration_seconds_{sum,count,bucket} ``` + New metric names: + ``` traces_spanmetrics_latency_{sum,count,bucket} ``` + Additionally, default label `span_status` is renamed to `status_code`. + * [CHANGE] Update to Go 1.18 [#1504](https://github.com/grafana/tempo/pull/1504) (@annanay25) * [CHANGE] Change tag/value lookups to return partial results when reaching response size limit instead of failing [#1517](https://github.com/grafana/tempo/pull/1517) (@mdisibio) * [CHANGE] Change search to be case-sensitive [#1547](https://github.com/grafana/tempo/issues/1547) (@mdisibio) * [CHANGE] Relax Hedged request defaults for external endpoints. [#1566](https://github.com/grafana/tempo/pull/1566) (@joe-elliott) + ``` querier: search: external_hedge_requests_at: 4s -> 8s external_hedge_requests_up_to: 3 -> 2 ``` + * [CHANGE] **BREAKING CHANGE** Include emptyDir for metrics generator wal storage in jsonnet [#1556](https://github.com/grafana/tempo/pull/1556) (@zalegrala) Jsonnet users will now need to specify a storage request and limit for the generator wal. _config+:: { @@ -544,7 +584,7 @@ Jsonnet users will now need to specify a storage request and limit for the gener * [ENHANCEMENT] Add metric to track feature enablement [#1459](https://github.com/grafana/tempo/pull/1459) (@zalegrala) * [ENHANCEMENT] Added s3 config option `insecure_skip_verify` [#1470](https://github.com/grafana/tempo/pull/1470) (@zalegrala) * [ENHANCEMENT] Added polling option to reduce issues in Azure `blocklist_poll_jitter_ms` [#1518](https://github.com/grafana/tempo/pull/1518) (@joe-elliott) -* [ENHANCEMENT] Add a config to query single ingester instance based on trace id hash for Trace By ID API. (1484)[https://github.com/grafana/tempo/pull/1484] (@sagarwala, @bikashmishra100, @ashwinidulams) +* [ENHANCEMENT] Add a config to query single ingester instance based on trace id hash for Trace By ID API. [1484](https://github.com/grafana/tempo/pull/1484) (@sagarwala, @bikashmishra100, @ashwinidulams) * [ENHANCEMENT] Add blocklist metrics for total backend objects and total backend bytes [#1519](https://github.com/grafana/tempo/pull/1519) (@ie-pham) * [ENHANCEMENT] Adds `tempo_querier_external_endpoint_hedged_roundtrips_total` to count the total hedged requests [#1558](https://github.com/grafana/tempo/pull/1558) (@joe-elliott) **BREAKING CHANGE** Removed deprecated metrics `tempodb_(gcs|s3|azure)_request_duration_seconds` in favor of `tempodb_backend_request_duration_seconds`. These metrics @@ -602,21 +642,26 @@ Jsonnet users will now need to specify a storage request and limit for the gener * [ENHANCEMENT] Add forwarding queue from distributor to metrics-generator. [#1331](https://github.com/grafana/tempo/pull/1331) (@mapno) * [ENHANCEMENT] Add hedging to queries to external endpoints. 
[#1350](https://github.com/grafana/tempo/pull/1350) (@joe-elliott) New config options and defaults: + ``` querier: search: external_hedge_requests_at: 5s external_hedge_requests_up_to: 3 ``` - ** BREAKING CHANGE ** + + **BREAKING CHANGE** Querier options related to search have moved under a `search` block: + ``` querier: search_query_timeout: 30s search_external_endpoints: [] search_prefer_self: 2 ``` + becomes + ``` querier: search: @@ -624,6 +669,7 @@ Jsonnet users will now need to specify a storage request and limit for the gener prefer_self: 2 external_endpoints: [] ``` + * [ENHANCEMENT] Added tenant ID (instance ID) to `trace too large message`. [#1385](https://github.com/grafana/tempo/pull/1385) (@cristiangsp) * [ENHANCEMENT] Add a startTime and endTime parameter to the Trace by ID Tempo Query API to improve query performance [#1388](https://github.com/grafana/tempo/pull/1388) (@sagarwala, @bikashmishra100, @ashwinidulams) * [BUGFIX] Correct issue where Azure "Blob Not Found" errors were sometimes not handled correctly [#1390](https://github.com/grafana/tempo/pull/1390) (@joe-elliott) @@ -633,24 +679,29 @@ Jsonnet users will now need to specify a storage request and limit for the gener * [BUGFIX] Update OTLP port in examples (docker-compose & kubernetes) from legacy ports (55680/55681) to new ports (4317/4318) [#1294](https://github.com/grafana/tempo/pull/1294) (@mapno) * [BUGFIX] Fixes min/max time on blocks to be based on span times instead of ingestion time. [#1314](https://github.com/grafana/tempo/pull/1314) (@joe-elliott) * Includes new configuration option to restrict the amount of slack around now to update the block start/end time. [#1332](https://github.com/grafana/tempo/pull/1332) (@joe-elliott) + ``` storage: trace: wal: ingestion_time_range_slack: 2m0s ``` + * Includes a new metric to determine how often this range is exceeded: `tempo_warnings_total{reason="outside_ingestion_time_slack"}` * [BUGFIX] Prevent data race / ingester crash during searching by trace id by using xxhash instance as a local variable. [#1387](https://github.com/grafana/tempo/pull/1387) (@bikashmishra100, @sagarwala, @ashwinidulams) * [BUGFIX] Fix spurious "failed to mark block compacted during retention" errors [#1372](https://github.com/grafana/tempo/issues/1372) (@mdisibio) * [BUGFIX] Fix error message "Writer is closed" by resetting compression writer correctly on the error path. [#1379](https://github.com/grafana/tempo/issues/1379) (@annanay25) ## v1.3.2 / 2022-02-23 + * [BUGFIX] Fixed an issue where the query-frontend would corrupt start/end time ranges on searches which included the ingesters [#1295] (@joe-elliott) ## v1.3.1 / 2022-02-02 + * [BUGFIX] Fixed panic when using etcd as ring's kvstore [#1260](https://github.com/grafana/tempo/pull/1260) (@mapno) ## v1.3.0 / 2022-01-24 + * [FEATURE]: Add support for [inline environments](https://tanka.dev/inline-environments). [#1184](https://github.com/grafana/tempo/pull/1184) @irizzant * [CHANGE] Search: Add new per-tenant limit `max_bytes_per_tag_values_query` to limit the size of tag-values response. 
[#1068](https://github.com/grafana/tempo/pull/1068) (@annanay25) * [CHANGE] Reduce MaxSearchBytesPerTrace `ingester.max-search-bytes-per-trace` default to 5KB [#1129](https://github.com/grafana/tempo/pull/1129) @annanay25 @@ -693,15 +744,17 @@ Jsonnet users will now need to specify a storage request and limit for the gener * [BUGFIX] tempo-mixin: remove TempoDB Access panel from `Tempo / Reads`, metrics don't exist anymore [#1218](https://github.com/grafana/tempo/issues/1218) (@kvrhdn) ## v1.2.1 / 2021-11-15 + * [BUGFIX] Fix defaults for MaxBytesPerTrace (ingester.max-bytes-per-trace) and MaxSearchBytesPerTrace (ingester.max-search-bytes-per-trace) [#1109](https://github.com/grafana/tempo/pull/1109) (@bitprocessor) * [BUGFIX] Ignore empty objects during compaction [#1113](https://github.com/grafana/tempo/pull/1113) (@mdisibio) ## v1.2.0 / 2021-11-05 + * [CHANGE] **BREAKING CHANGE** Drop support for v0 and v1 blocks. See [1.1 changelog](https://github.com/grafana/tempo/releases/tag/v1.1.0) for details [#919](https://github.com/grafana/tempo/pull/919) (@joe-elliott) * [CHANGE] Renamed CLI flag from `--storage.trace.maintenance-cycle` to `--storage.trace.blocklist_poll`. This is a **breaking change** [#897](https://github.com/grafana/tempo/pull/897) (@mritunjaysharma394) * [CHANGE] update jsonnet alerts and recording rules to use `job_selectors` and `cluster_selectors` for configurable unique identifier labels [#935](https://github.com/grafana/tempo/pull/935) (@kevinschoonover) * [CHANGE] Modify generated tag keys in Vulture for easier filtering [#934](https://github.com/grafana/tempo/pull/934) (@zalegrala) -* [CHANGE] **BREAKING CHANGE** Consolidate status information onto /status endpoint [ #952 ](https://github.com/grafana/tempo/pull/952) @zalegrala) +* [CHANGE] **BREAKING CHANGE** Consolidate status information onto /status endpoint [#952](https://github.com/grafana/tempo/pull/952) @zalegrala) The following endpoints moved. `/runtime_config` moved to `/status/runtime_config` `/config` moved to `/status/config` @@ -722,6 +775,7 @@ Jsonnet users will now need to specify a storage request and limit for the gener * [ENHANCEMENT] Make `overrides_config` block name consistent with Loki and Cortex in microservice mode. [#906](https://github.com/grafana/tempo/pull/906) (@kavirajk) * [ENHANCEMENT] Changes the metrics name from `cortex_runtime_config_last_reload_successful` to `tempo_runtime_config_last_reload_successful` [#945](https://github.com/grafana/tempo/pull/945) (@kavirajk) * [ENHANCEMENT] Updated config defaults to reflect better capture operational knowledge. [#913](https://github.com/grafana/tempo/pull/913) (@joe-elliott) + ``` ingester: trace_idle_period: 30s => 10s # reduce ingester memory requirements with little impact on querying @@ -740,6 +794,7 @@ Jsonnet users will now need to specify a storage request and limit for the gener chunk_size_bytes: 10 MiB => 5 MiB # will reduce compactor memory needs compaction_window: 4h => 1h # will allow more compactors to participate in compaction without substantially increasing blocks ``` + * [ENHANCEMENT] Make s3 backend readError logic more robust [#905](https://github.com/grafana/tempo/pull/905) (@wei840222) * [ENHANCEMENT] Include additional detail when searching for traces [#916](https://github.com/grafana/tempo/pull/916) (@zalegrala) * [ENHANCEMENT] Add `gen index` and `gen bloom` commands to tempo-cli. 
[#903](https://github.com/grafana/tempo/pull/903) (@annanay25) @@ -782,6 +837,7 @@ Jsonnet users will now need to specify a storage request and limit for the gener * [BUGFIX] Update go-kit logger package to remove spurious debug logs [#1094](https://github.com/grafana/tempo/pull/1094) (@bboreham) ## v1.1.0 / 2021-08-26 + * [CHANGE] Upgrade Cortex from v1.9.0 to v1.9.0-131-ga4bf10354 [#841](https://github.com/grafana/tempo/pull/841) (@aknuds1) * [CHANGE] Change default tempo port from 3100 to 3200 [#770](https://github.com/grafana/tempo/pull/809) (@MurzNN) * [CHANGE] Jsonnet: use dedicated configmaps for distributors and ingesters [#775](https://github.com/grafana/tempo/pull/775) (@kvrhdn) @@ -817,7 +873,6 @@ Jsonnet users will now need to specify a storage request and limit for the gener * [BUGFIX] Update port spec for GCS docker-compose example [#869](https://github.com/grafana/tempo/pull/869) (@zalegrala) * [BUGFIX] Cortex upgrade to fix an issue where unhealthy compactors can't be forgotten [#878](https://github.com/grafana/tempo/pull/878) (@joe-elliott) - ## v1.0.1 / 2021-06-14 * [BUGFIX] Guard against negative dataLength [#763](https://github.com/grafana/tempo/pull/763) (@joe-elliott) diff --git a/cmd/tempo/main.go b/cmd/tempo/main.go index 5747a03af08..c79cd046188 100644 --- a/cmd/tempo/main.go +++ b/cmd/tempo/main.go @@ -14,6 +14,7 @@ import ( "github.com/go-kit/log/level" "github.com/grafana/dskit/flagext" dslog "github.com/grafana/dskit/log" + "github.com/grafana/dskit/spanprofiler" "github.com/grafana/dskit/tracing" ot "github.com/opentracing/opentracing-go" "github.com/prometheus/client_golang/prometheus" @@ -228,6 +229,8 @@ func installOpenTracingTracer(config *app.Config) (func(), error) { if err != nil { return nil, fmt.Errorf("error initialising tracer: %w", err) } + ot.SetGlobalTracer(spanprofiler.NewTracer(ot.GlobalTracer())) + return func() { if err := trace.Close(); err != nil { level.Error(log.Logger).Log("msg", "error closing tracing", "err", err) @@ -286,7 +289,7 @@ func installOpenTelemetryTracer(config *app.Config) (func(), error) { bridgeTracer.SetWarningHandler(func(msg string) { level.Warn(log.Logger).Log("msg", msg, "source", "BridgeTracer.OnWarningHandler") }) - ot.SetGlobalTracer(bridgeTracer) + ot.SetGlobalTracer(spanprofiler.NewTracer(bridgeTracer)) // Install the OpenCensus bridge oc_bridge.InstallTraceBridge(oc_bridge.WithTracerProvider(tp)) diff --git a/modules/distributor/distributor_test.go b/modules/distributor/distributor_test.go index 863e9c9896d..f9cf8c33b18 100644 --- a/modules/distributor/distributor_test.go +++ b/modules/distributor/distributor_test.go @@ -1492,6 +1492,10 @@ func (r mockRing) ShuffleShardWithLookback(string, int, time.Duration, time.Time return r } +func (r mockRing) GetTokenRangesForInstance(_ string) (ring.TokenRanges, error) { + return nil, nil +} + func (r mockRing) InstancesCount() int { return len(r.ingesters) } @@ -1506,7 +1510,3 @@ func (r mockRing) CleanupShuffleShardCache(string) { func (r mockRing) GetInstanceState(string) (ring.InstanceState, error) { return ring.ACTIVE, nil } - -func (r mockRing) GetTokenRangesForInstance(_ string) (ring.TokenRanges, error) { - return nil, nil -} diff --git a/vendor/github.com/grafana/dskit/spanprofiler/README.md b/vendor/github.com/grafana/dskit/spanprofiler/README.md new file mode 100644 index 00000000000..a415985f664 --- /dev/null +++ b/vendor/github.com/grafana/dskit/spanprofiler/README.md @@ -0,0 +1,104 @@ +# Span Profiler for OpenTracing-Go + +## Overview + +The Span Profiler 
for OpenTracing-Go is a package that seamlessly integrates `opentracing-go` instrumentation with
+profiling through the use of pprof labels.
+
+Accessing trace span profiles is made convenient through the Grafana Explore view. You can find a complete example setup
+with Grafana Tempo in the [Pyroscope repository](https://github.com/grafana/pyroscope/tree/main/examples/tracing/tempo):
+
+![image](https://github.com/grafana/otel-profiling-go/assets/12090599/31e33cd1-818b-4116-b952-c9ec7b1fb593)
+
+## Usage
+
+There are two primary ways to use the Span Profiler:
+
+### 1. Wrap the Global Tracer.
+
+You can wrap the global tracer using `spanprofiler.NewTracer`:
+
+```go
+import (
+	"github.com/opentracing/opentracing-go"
+	"github.com/grafana/dskit/spanprofiler"
+)
+
+func main() {
+	// Initialize your OpenTracing tracer
+	tracer := opentracing.GlobalTracer()
+	// Wrap it with the tracer-profiler
+	wrappedTracer := spanprofiler.NewTracer(tracer)
+	// Use the wrapped tracer in your application
+	opentracing.SetGlobalTracer(wrappedTracer)
+
+	// Or, as a one-liner:
+	// opentracing.SetGlobalTracer(spanprofiler.NewTracer(opentracing.GlobalTracer()))
+
+	// Your application logic here
+}
+```
+
+For efficiency, the tracer selectively records profiles for _root_ spans — the initial _local_ span in a process — since
+a trace may encompass thousands of spans. All stack trace samples accumulated during the execution of their child spans
+contribute to the root span's profile. In practical terms, this signifies that, for instance, an HTTP request results
+in a singular profile, irrespective of the numerous spans within the trace. It's important to note that these profiles
+don't extend beyond the boundaries of a single process.
+
+The limitation of this approach is that only spans created within the same goroutine as the parent, or in its child
+goroutines, are taken into account. Consequently, in scenarios involving asynchronous execution, where the parent span
+context is passed to another goroutine, explicit profiling becomes necessary using `spanprofiler.StartSpanFromContext`.
+
+### 2. Profile individual spans.
+
+The `spanprofiler.StartSpanFromContext` function allows you to control exactly which spans to profile:
+
+```go
+func YourOperationName(ctx context.Context) {
+	// Start a span and enable profiling for it
+	span, ctx := spanprofiler.StartSpanFromContext(ctx, "YourOperationName")
+	defer span.Finish() // Finish the span when done
+
+	// Use the span in your application logic
+}
+```
+
+The function guarantees that the span will be profiled.
+
+Both methods can be used together or independently. We recommend wrapping the tracer for seamless
+integration and reserving explicit span profiling for spans spawned in detached goroutines.
+
+## Implementation details
+
+When a new trace span is created and is eligible for profiling, the tracer sets `span_id` and `span_name` [pprof labels](https://github.com/google/pprof/blob/master/doc/README.md#tag-filtering)
+that point to the respective span. These labels are stored in the goroutine's local storage and inherited by any
+subsequent child goroutines.
+
+`span_name` is available as a regular label and can be used in query expressions.
For example, the following query
+will show you the profile for code that is not covered by traces:
+```
+{service_name="my-service",span_name=""}
+```
+
+Additionally, trace spans are identified by the `pyroscope.profile.id` attribute, indicating the associated profile.
+This allows you to find such spans in the trace view (shown in the screenshot above) and fetch profiles for specific spans.
+
+It's important to note that the presence of this attribute does not guarantee profile availability; stack trace samples
+might not be collected if the CPU time utilized falls below the sample interval (10ms).
+
+It is crucial to understand that this module doesn't directly control the pprof profiler; its initialization is still
+necessary for profile collection. This initialization can be achieved through the `runtime/pprof` package, or using the
+[Pyroscope client](https://github.com/grafana/pyroscope-go).
+
+Limitations:
+ - Only CPU profiling is fully supported at the moment.
+ - Only the [Jaeger tracer](https://github.com/jaegertracing/jaeger-client-go) implementation is supported.
+
+## Performance implications
+
+The typical performance impact is generally imperceptible and primarily arises from the cost of pprof labeling. However,
+intensive use of pprof labels may have a negative impact on the profiled application.
+
+In the case of the tracer provided by this package, the `StartSpan` method wrapper introduces an approximate 20% increase
+in CPU time compared to the original call. In the vast majority of cases, the overhead constitutes less than 0.01% of the total
+CPU time and is considered safe for deployment in production systems.
diff --git a/vendor/github.com/grafana/dskit/spanprofiler/spanprofiler.go b/vendor/github.com/grafana/dskit/spanprofiler/spanprofiler.go
new file mode 100644
index 00000000000..8481d04498d
--- /dev/null
+++ b/vendor/github.com/grafana/dskit/spanprofiler/spanprofiler.go
@@ -0,0 +1,107 @@
+package spanprofiler
+
+import (
+	"context"
+	"runtime/pprof"
+
+	"github.com/opentracing/opentracing-go"
+	"github.com/uber/jaeger-client-go"
+)
+
+// StartSpanFromContext starts and returns a Span with `operationName`, using
+// any Span found within `ctx` as a ChildOfRef. If no such parent could be
+// found, StartSpanFromContext creates a root (parentless) Span.
+//
+// The call sets `operationName` as the `span_name` pprof label, and the new span
+// identifier as the `span_id` pprof label, if the trace is sampled.
+//
+// The second return value is a context.Context object built around the
+// returned Span.
+//
+// Example usage:
+//
+//	SomeFunction(ctx context.Context, ...) {
+//	    sp, ctx := spanprofiler.StartSpanFromContext(ctx, "SomeFunction")
+//	    defer sp.Finish()
+//	    ...
+//	}
+func StartSpanFromContext(ctx context.Context, operationName string, opts ...opentracing.StartSpanOption) (opentracing.Span, context.Context) {
+	return StartSpanFromContextWithTracer(ctx, opentracing.GlobalTracer(), operationName, opts...)
+}
+
+// StartSpanFromContextWithTracer starts and returns a span with `operationName`
+// using a span found within the context as a ChildOfRef. If that doesn't exist
+// it creates a root span. It also returns a context.Context object built
+// around the returned span.
+//
+// The call sets `operationName` as the `span_name` pprof label, and the new span
+// identifier as the `span_id` pprof label, if the trace is sampled.
+//
+// Its behavior is identical to StartSpanFromContext except that it takes an explicit
+// tracer as opposed to using the global tracer.
+func StartSpanFromContextWithTracer(ctx context.Context, tracer opentracing.Tracer, operationName string, opts ...opentracing.StartSpanOption) (opentracing.Span, context.Context) { + span, ctx := opentracing.StartSpanFromContextWithTracer(ctx, tracer, operationName, opts...) + spanCtx, ok := span.Context().(jaeger.SpanContext) + if ok { + span = wrapJaegerSpanWithGoroutineLabels(ctx, span, operationName, sampledSpanID(spanCtx)) + } + return span, ctx +} + +func wrapJaegerSpanWithGoroutineLabels( + parentCtx context.Context, + span opentracing.Span, + operationName string, + spanID string, +) *spanWrapper { + // Note that pprof labels are propagated through the goroutine's local + // storage and are always copied to child goroutines. This way, stack + // trace samples collected during execution of child spans will be taken + // into account at the root. + var ctx context.Context + if spanID != "" { + ctx = pprof.WithLabels(parentCtx, pprof.Labels( + spanNameLabelName, operationName, + spanIDLabelName, spanID)) + } else { + // Even if the trace has not been sampled, we still need to keep track + // of samples that belong to the span (all spans with the given name). + ctx = pprof.WithLabels(parentCtx, pprof.Labels( + spanNameLabelName, operationName)) + } + // Goroutine labels should be set as early as possible, + // in order to capture the overhead of the function call. + pprof.SetGoroutineLabels(ctx) + // We create a span wrapper to ensure we remove the newly attached pprof + // labels when span finishes. The need of this wrapper is questioned: + // as we do not have the original context, we could leave the goroutine + // labels – normally, span is finished at the very end of the goroutine's + // lifetime, so no significant side effects should take place. + w := spanWrapper{ + parentPprofCtx: parentCtx, + currentPprofCtx: ctx, + } + w.Span = span.SetTag(profileIDTagKey, spanID) + return &w +} + +type spanWrapper struct { + parentPprofCtx context.Context + currentPprofCtx context.Context + opentracing.Span +} + +func (s *spanWrapper) Finish() { + s.Span.Finish() + pprof.SetGoroutineLabels(s.parentPprofCtx) + s.currentPprofCtx = s.parentPprofCtx +} + +// sampledSpanID returns the span ID, if the span is sampled, +// otherwise an empty string is returned. +func sampledSpanID(spanCtx jaeger.SpanContext) string { + if spanCtx.IsSampled() { + return spanCtx.SpanID().String() + } + return "" +} diff --git a/vendor/github.com/grafana/dskit/spanprofiler/tracer.go b/vendor/github.com/grafana/dskit/spanprofiler/tracer.go new file mode 100644 index 00000000000..c28b52b11d4 --- /dev/null +++ b/vendor/github.com/grafana/dskit/spanprofiler/tracer.go @@ -0,0 +1,109 @@ +package spanprofiler + +import ( + "context" + "unsafe" + + "github.com/opentracing/opentracing-go" + "github.com/uber/jaeger-client-go" +) + +const ( + profileIDTagKey = "pyroscope.profile.id" + + spanIDLabelName = "span_id" + spanNameLabelName = "span_name" +) + +type tracer struct{ opentracing.Tracer } + +// NewTracer creates a new opentracing.Tracer with the span profiler integrated. +// +// For efficiency, the tracer selectively records profiles for _root_ spans +// — the initial _local_ span in a process — since a trace may encompass +// thousands of spans. All stack trace samples accumulated during the execution +// of their child spans contribute to the root span's profile. 
In practical +// terms, this signifies that, for instance, an HTTP request results in a +// singular profile, irrespective of the numerous spans within the trace. It's +// important to note that these profiles don't extend beyond the boundaries of +// a single process. +// +// The limitation of this approach is that only spans created within the same +// goroutine, or its children, as the parent are taken into account. +// Consequently, in scenarios involving asynchronous execution, where the parent +// span context is passed to another goroutine, explicit profiling becomes +// necessary using `spanprofiler.StartSpanFromContext`. +func NewTracer(tr opentracing.Tracer) opentracing.Tracer { return &tracer{tr} } + +func (t *tracer) StartSpan(operationName string, opts ...opentracing.StartSpanOption) opentracing.Span { + span := t.Tracer.StartSpan(operationName, opts...) + spanCtx, ok := span.Context().(jaeger.SpanContext) + if !ok { + return span + } + // pprof labels are attached only once, at the span root level. + if !isRootSpan(opts...) { + return span + } + // The pprof label API assumes that pairs of labels are passed through the + // context. Unfortunately, the opentracing Tracer API doesn't match this + // concept: this makes it impossible to save an existing pprof context and + // all the original pprof labels associated with the goroutine. + ctx := context.Background() + return wrapJaegerSpanWithGoroutineLabels(ctx, span, operationName, sampledSpanID(spanCtx)) +} + +// isRootSpan reports whether the span is a root span. +// +// There are only two valid cases: if the span is the first span in the trace, +// or is the first _local_ span in the trace. +// +// An exception is made for FollowsFrom reference: spans without an explicit +// parent are considered as root ones. +func isRootSpan(opts ...opentracing.StartSpanOption) bool { + parent, ok := parentSpanContextFromRef(opts...) + return !ok || isRemoteSpan(parent) +} + +// parentSpanContextFromRef returns the first parent reference. +func parentSpanContextFromRef(options ...opentracing.StartSpanOption) (sc jaeger.SpanContext, ok bool) { + var sso opentracing.StartSpanOptions + for _, option := range options { + option.Apply(&sso) + } + for _, ref := range sso.References { + if ref.Type == opentracing.ChildOfRef && ref.ReferencedContext != nil { + sc, ok = ref.ReferencedContext.(jaeger.SpanContext) + return sc, ok + } + } + return sc, ok +} + +// isRemoteSpan reports whether the span context represents a remote parent. +// +// NOTE(kolesnikovae): this is ugly, but the only reliable method I found. +// The opentracing-go package and Jaeger client are not meant to change as +// both are deprecated. +func isRemoteSpan(c jaeger.SpanContext) bool { + jaegerCtx := *(*jaegerSpanCtx)(unsafe.Pointer(&c)) + return jaegerCtx.remote +} + +// jaegerSpanCtx represents memory layout of the jaeger.SpanContext type. +type jaegerSpanCtx struct { + traceID [16]byte // TraceID + spanID [8]byte // SpanID + parentID [8]byte // SpanID + baggage uintptr // map[string]string + debugID [2]uintptr // string + + // samplingState is a pointer to a struct that has "localRootSpan" member, + // which we could probably use: that would allow omitting quite expensive + // parentSpanContextFromRef call. However, interpreting the pointer and + // the complex struct memory layout is more complicated and dangerous. 
+ samplingState uintptr + + // remote indicates that span context represents a remote parent + remote bool +} diff --git a/vendor/modules.txt b/vendor/modules.txt index 6a9af4803a2..52b0b55b823 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -528,6 +528,7 @@ github.com/grafana/dskit/server github.com/grafana/dskit/services github.com/grafana/dskit/signals github.com/grafana/dskit/spanlogger +github.com/grafana/dskit/spanprofiler github.com/grafana/dskit/tenant github.com/grafana/dskit/tracing github.com/grafana/dskit/user
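The vendored README and the `NewTracer` godoc above both point out that spans started in detached goroutines need explicit profiling via `spanprofiler.StartSpanFromContext`, but neither shows that pattern end to end. The sketch below is a minimal, hypothetical illustration and is not part of this diff: `job`, `worker`, and `HandleRequest.async` are made-up names, and the global tracer is left as the default no-op tracer for brevity, so in a real service a Jaeger-backed tracer would need to be installed first (as `cmd/tempo/main.go` does above) for the pprof labels to actually be applied.

```go
package main

import (
	"context"
	"sync"

	"github.com/grafana/dskit/spanprofiler"
	"github.com/opentracing/opentracing-go"
)

// job carries a request-scoped context to a long-lived worker goroutine.
type job struct {
	ctx  context.Context
	name string
}

// worker goroutines are started from main, long before any request span
// exists, so they never inherit the request goroutine's pprof labels.
// Each job therefore starts its own explicitly profiled span.
func worker(jobs <-chan job, wg *sync.WaitGroup) {
	defer wg.Done()
	for j := range jobs {
		span, ctx := spanprofiler.StartSpanFromContext(j.ctx, j.name)
		work(ctx) // hypothetical workload
		span.Finish()
	}
}

func work(ctx context.Context) { _ = ctx }

func main() {
	// Assumed setup: mirrors the README one-liner. With the default no-op
	// global tracer this is inert; a real Jaeger tracer must be configured
	// for spanprofiler to attach span_name/span_id labels.
	opentracing.SetGlobalTracer(spanprofiler.NewTracer(opentracing.GlobalTracer()))

	jobs := make(chan job)
	var wg sync.WaitGroup
	wg.Add(1)
	go worker(jobs, &wg)

	// A request handler hands its span context to the detached worker.
	reqSpan, ctx := opentracing.StartSpanFromContext(context.Background(), "HandleRequest")
	jobs <- job{ctx: ctx, name: "HandleRequest.async"}
	reqSpan.Finish()

	close(jobs)
	wg.Wait()
}
```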