-
Notifications
You must be signed in to change notification settings - Fork 55
/
Copy pathtsdb.libsonnet
290 lines (249 loc) · 14.6 KB
/
tsdb.libsonnet
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
{
  // Short aliases for the Kubernetes API object builders used throughout
  // this file (supplied by the ksonnet/k8s-libsonnet libraries via `$`).
  local pvc = $.core.v1.persistentVolumeClaim,
  local volumeMount = $.core.v1.volumeMount,
  local container = $.core.v1.container,
  local statefulSet = $.apps.v1.statefulSet,
  local service = $.core.v1.service,

  // Tunables for the blocks (TSDB) storage deployment; merged into the
  // importing environment's _config object.
  _config+:: {
    // Enforce blocks storage
    storage_backend: 'none',
    storage_engine: 'blocks',

    // Allow to configure the ingester disk.
    cortex_ingester_data_disk_size: '100Gi',
    cortex_ingester_data_disk_class: 'fast',

    // Allow to configure the store-gateway disk.
    cortex_store_gateway_data_disk_size: '50Gi',
    cortex_store_gateway_data_disk_class: 'standard',

    // Allow to configure the compactor disk.
    cortex_compactor_data_disk_size: '250Gi',
    cortex_compactor_data_disk_class: 'standard',

    // Allow to fine tune compactor.
    cortex_compactor_max_concurrency: 1,
    // While this is the default value, we want to pass the same to the
    // -blocks-storage.bucket-store.sync-interval (see bucket_index_config).
    cortex_compactor_cleanup_interval: '15m',

    // Enable use of bucket index by querier, ruler and store-gateway.
    // Bucket index is generated by compactor from Cortex 1.7, there is no
    // flag required to enable this on compactor.
    cortex_bucket_index_enabled: false,
  },
// CLI flags wiring the bucket store's index-cache and chunks-cache to
// memcached. Each cache is gated on its own feature toggle in $._config and
// contributes nothing (an empty object) when disabled.
blocks_chunks_caching_config::
  local index_cache_flags =
    if !$._config.memcached_index_queries_enabled then {} else {
      'blocks-storage.bucket-store.index-cache.backend': 'memcached',
      'blocks-storage.bucket-store.index-cache.memcached.addresses': 'dnssrvnoa+memcached-index-queries.%(namespace)s.svc.cluster.local:11211' % $._config,
      'blocks-storage.bucket-store.index-cache.memcached.timeout': '200ms',
      'blocks-storage.bucket-store.index-cache.memcached.max-item-size': $._config.memcached_index_queries_max_item_size_mb * 1024 * 1024,
      'blocks-storage.bucket-store.index-cache.memcached.max-async-buffer-size': '25000',
      'blocks-storage.bucket-store.index-cache.memcached.max-async-concurrency': '50',
      'blocks-storage.bucket-store.index-cache.memcached.max-get-multi-batch-size': '100',
    };
  local chunks_cache_flags =
    if !$._config.memcached_chunks_enabled then {} else {
      'blocks-storage.bucket-store.chunks-cache.backend': 'memcached',
      'blocks-storage.bucket-store.chunks-cache.memcached.addresses': 'dnssrvnoa+memcached.%(namespace)s.svc.cluster.local:11211' % $._config,
      'blocks-storage.bucket-store.chunks-cache.memcached.timeout': '200ms',
      'blocks-storage.bucket-store.chunks-cache.memcached.max-item-size': $._config.memcached_chunks_max_item_size_mb * 1024 * 1024,
      'blocks-storage.bucket-store.chunks-cache.memcached.max-async-buffer-size': '25000',
      'blocks-storage.bucket-store.chunks-cache.memcached.max-async-concurrency': '50',
      'blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-batch-size': '100',
    };
  index_cache_flags + chunks_cache_flags,
// Memcached-backed metadata cache for the bucket store; an empty object
// when $._config.memcached_metadata_enabled is false.
blocks_metadata_caching_config:: if $._config.memcached_metadata_enabled then {
  'blocks-storage.bucket-store.metadata-cache.backend': 'memcached',
  'blocks-storage.bucket-store.metadata-cache.memcached.addresses': 'dnssrvnoa+memcached-metadata.%(namespace)s.svc.cluster.local:11211' % $._config,
  'blocks-storage.bucket-store.metadata-cache.memcached.timeout': '200ms',
  'blocks-storage.bucket-store.metadata-cache.memcached.max-item-size': $._config.memcached_metadata_max_item_size_mb * 1024 * 1024,
  'blocks-storage.bucket-store.metadata-cache.memcached.max-async-buffer-size': '25000',
  'blocks-storage.bucket-store.metadata-cache.memcached.max-async-concurrency': '50',
  'blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-batch-size': '100',
} else {},

// Flags enabling the bucket index, gated on cortex_bucket_index_enabled.
bucket_index_config:: if $._config.cortex_bucket_index_enabled then {
  'blocks-storage.bucket-store.bucket-index.enabled': true,
  // Bucket index is updated by compactor on each cleanup cycle, so keep the
  // sync-interval in lockstep with the compactor cleanup interval.
  'blocks-storage.bucket-store.sync-interval': $._config.cortex_compactor_cleanup_interval,
} else {},

// Querier and ruler both read blocks storage and share the metadata-cache
// and bucket-index flags.
querier_args+:: $._config.queryBlocksStorageConfig + $.blocks_metadata_caching_config + $.bucket_index_config,
ruler_args+:: $._config.queryBlocksStorageConfig + $.blocks_metadata_caching_config + $.bucket_index_config,
// The ingesters should persist TSDB blocks and WAL on a persistent
// volume in order to be crash resilient.
local ingester_data_pvc =
  pvc.new() +
  pvc.mixin.spec.resources.withRequests({ storage: $._config.cortex_ingester_data_disk_size }) +
  pvc.mixin.spec.withAccessModes(['ReadWriteOnce']) +
  pvc.mixin.spec.withStorageClassName($._config.cortex_ingester_data_disk_class) +
  pvc.mixin.metadata.withName('ingester-data'),

// Blanked out — presumably overrides an ingester Deployment defined in
// another file, since here the ingester runs as a StatefulSet instead.
// TODO(review): confirm against the importing jsonnet.
ingester_deployment: {},

ingester_args+:: {
  'blocks-storage.tsdb.dir': '/data/tsdb',
  'blocks-storage.tsdb.block-ranges-period': '2h',
  'blocks-storage.tsdb.retention-period': '96h',  // 4 days protection against blocks not being uploaded from ingesters.
  'blocks-storage.tsdb.ship-interval': '1m',

  // Disable TSDB blocks transfer because of persistent volumes
  'ingester.max-transfer-retries': 0,
  'ingester.join-after': '0s',

  // Persist ring tokens so that when the ingester will be restarted
  // it will pick the same tokens
  'ingester.tokens-file-path': '/data/tokens',
},

// Builds the ingester StatefulSet: 3 replicas, each mounting the data PVC
// at /data, high pod priority, optional anti-affinity.
newIngesterStatefulSet(name, container, with_anti_affinity=true)::
  statefulSet.new(name, 3, [
    container + $.core.v1.container.withVolumeMountsMixin([
      volumeMount.new('ingester-data', '/data'),
    ]),
  ], ingester_data_pvc) +
  statefulSet.mixin.spec.withServiceName(name) +
  statefulSet.mixin.metadata.withNamespace($._config.namespace) +
  statefulSet.mixin.metadata.withLabels({ name: name }) +
  statefulSet.mixin.spec.template.metadata.withLabels({ name: name } + $.ingester_deployment_labels) +
  statefulSet.mixin.spec.selector.withMatchLabels({ name: name }) +
  statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) +
  // When the ingester needs to flush blocks to the storage, it may take quite a lot of time.
  // For this reason, we grant a high termination grace period.
  // NOTE(review): 1200s is 20 minutes, but this comment previously claimed
  // "80 minutes" (which would be 4800s) — confirm which value is intended.
  statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(1200) +
  statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') +
  $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') +
  $.util.podPriority('high') +
  // Scale up/down ingester instances in parallel instead of starting them
  // one by one. This does NOT affect rolling updates: they will continue to be
  // rolled out one by one (the next pod will be rolled out once the previous is
  // ready).
  statefulSet.mixin.spec.withPodManagementPolicy('Parallel') +
  (if with_anti_affinity then $.util.antiAffinity else {}),

ingester_statefulset: self.newIngesterStatefulSet('ingester', $.ingester_container),

ingester_service:
  $.util.serviceFor($.ingester_statefulset, $.ingester_service_ignored_labels),
// The compactor runs a statefulset with a single replica, because
// it does not support horizontal scalability yet.
local compactor_data_pvc =
  pvc.new() +
  pvc.mixin.spec.resources.withRequests({ storage: $._config.cortex_compactor_data_disk_size }) +
  pvc.mixin.spec.withAccessModes(['ReadWriteOnce']) +
  pvc.mixin.spec.withStorageClassName($._config.cortex_compactor_data_disk_class) +
  pvc.mixin.metadata.withName('compactor-data'),

// CLI flags for the compactor process (merged over the shared gRPC/storage
// config maps from $._config).
compactor_args::
  $._config.grpcConfig +
  $._config.storageConfig +
  $._config.blocksStorageConfig +
  $._config.compactorLimitsConfig +
  {
    target: 'compactor',

    // Compactor config.
    'compactor.block-ranges': '2h,12h,24h',
    'compactor.data-dir': '/data',
    'compactor.compaction-interval': '30m',
    'compactor.compaction-concurrency': $._config.cortex_compactor_max_concurrency,
    'compactor.cleanup-interval': $._config.cortex_compactor_cleanup_interval,

    // Enable sharding.
    'compactor.sharding-enabled': true,
    'compactor.ring.store': 'consul',
    'compactor.ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace,
    'compactor.ring.prefix': '',

    // Limits config.
    'runtime-config.file': '/etc/cortex/overrides.yaml',
  },

compactor_ports:: $.util.defaultPorts,

compactor_container::
  container.new('compactor', $._images.compactor) +
  container.withPorts($.compactor_ports) +
  container.withArgsMixin($.util.mapToFlags($.compactor_args)) +
  container.withVolumeMountsMixin([volumeMount.new('compactor-data', '/data')]) +
  // Do not limit compactor CPU and request enough cores to honor configured max concurrency.
  $.util.resourcesRequests($._config.cortex_compactor_max_concurrency, '6Gi') +
  $.util.resourcesLimits(null, '6Gi') +
  $.util.readinessProbe +
  $.jaeger_mixin,

// Builds the compactor StatefulSet: 1 replica, data PVC mounted at /data.
newCompactorStatefulSet(name, container)::
  statefulSet.new(name, 1, [container], compactor_data_pvc) +
  statefulSet.mixin.spec.withServiceName(name) +
  statefulSet.mixin.metadata.withNamespace($._config.namespace) +
  statefulSet.mixin.metadata.withLabels({ name: name }) +
  statefulSet.mixin.spec.template.metadata.withLabels({ name: name }) +
  statefulSet.mixin.spec.selector.withMatchLabels({ name: name }) +
  statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) +
  statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') +
  statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(900) +
  // Scale up/down compactor instances in parallel instead of starting them
  // one by one. This does NOT affect rolling updates: they will continue to be
  // rolled out one by one (the next pod will be rolled out once the previous is
  // ready).
  statefulSet.mixin.spec.withPodManagementPolicy('Parallel') +
  $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex'),

compactor_statefulset:
  $.newCompactorStatefulSet('compactor', $.compactor_container),
// The store-gateway runs a statefulset.
local store_gateway_data_pvc =
  pvc.new() +
  pvc.mixin.spec.resources.withRequests({ storage: $._config.cortex_store_gateway_data_disk_size }) +
  pvc.mixin.spec.withAccessModes(['ReadWriteOnce']) +
  pvc.mixin.spec.withStorageClassName($._config.cortex_store_gateway_data_disk_class) +
  pvc.mixin.metadata.withName('store-gateway-data'),

// CLI flags for the store-gateway process.
store_gateway_args::
  $._config.grpcConfig +
  $._config.storageConfig +
  $._config.blocksStorageConfig +
  $._config.queryBlocksStorageConfig +
  {
    target: 'store-gateway',
    'runtime-config.file': '/etc/cortex/overrides.yaml',

    // Persist ring tokens so that when the store-gateway will be restarted
    // it will pick the same tokens
    'store-gateway.sharding-ring.tokens-file-path': '/data/tokens',

    // Block index-headers are pre-downloaded but lazy mmaped and loaded at query time.
    // NOTE(review): the boolean is passed as the string 'true' here, while
    // bucket_index_config uses a real boolean — presumably both render the
    // same through $.util.mapToFlags; confirm.
    'blocks-storage.bucket-store.index-header-lazy-loading-enabled': 'true',
    'blocks-storage.bucket-store.index-header-lazy-loading-idle-timeout': '60m',

    'blocks-storage.bucket-store.max-chunk-pool-bytes': 12 * 1024 * 1024 * 1024,

    // We should keep a number of idle connections equal to the max "get" concurrency,
    // in order to avoid re-opening connections continuously (this would be slower
    // and fill up the conntrack table too).
    //
    // The downside of this approach is that we'll end up with an higher number of
    // active connections to memcached, so we have to make sure connections limit
    // set in memcached is high enough.
    'blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency': 100,
    'blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-concurrency': 100,
    'blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-concurrency': 100,
    // Self-references below keep each idle-connections value tied to the
    // matching max-get-multi-concurrency value above.
    'blocks-storage.bucket-store.index-cache.memcached.max-idle-connections': $.store_gateway_args['blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency'],
    'blocks-storage.bucket-store.chunks-cache.memcached.max-idle-connections': $.store_gateway_args['blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-concurrency'],
    'blocks-storage.bucket-store.metadata-cache.memcached.max-idle-connections': $.store_gateway_args['blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-concurrency'],
  } +
  $.blocks_chunks_caching_config +
  $.blocks_metadata_caching_config +
  $.bucket_index_config,

store_gateway_ports:: $.util.defaultPorts,

store_gateway_container::
  container.new('store-gateway', $._images.store_gateway) +
  container.withPorts($.store_gateway_ports) +
  container.withArgsMixin($.util.mapToFlags($.store_gateway_args)) +
  container.withVolumeMountsMixin([volumeMount.new('store-gateway-data', '/data')]) +
  $.util.resourcesRequests('1', '12Gi') +
  $.util.resourcesLimits(null, '18Gi') +
  $.util.readinessProbe +
  $.jaeger_mixin,

// Builds the store-gateway StatefulSet: 3 replicas, data PVC at /data.
newStoreGatewayStatefulSet(name, container)::
  statefulSet.new(name, 3, [container], store_gateway_data_pvc) +
  statefulSet.mixin.spec.withServiceName(name) +
  statefulSet.mixin.metadata.withNamespace($._config.namespace) +
  statefulSet.mixin.metadata.withLabels({ name: name }) +
  statefulSet.mixin.spec.template.metadata.withLabels({ name: name }) +
  statefulSet.mixin.spec.selector.withMatchLabels({ name: name }) +
  statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) +
  statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') +
  statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(120) +
  // Scale up/down store-gateway instances in parallel instead of starting them
  // one by one. This does NOT affect rolling updates: they will continue to be
  // rolled out one by one (the next pod will be rolled out once the previous is
  // ready).
  statefulSet.mixin.spec.withPodManagementPolicy('Parallel') +
  $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex'),

store_gateway_statefulset: self.newStoreGatewayStatefulSet('store-gateway', $.store_gateway_container),

store_gateway_service:
  $.util.serviceFor($.store_gateway_statefulset),
local podDisruptionBudget = $.policy.v1beta1.podDisruptionBudget,

// PodDisruptionBudget limiting voluntary evictions of store-gateway pods.
store_gateway_pdb:
  podDisruptionBudget.new() +
  podDisruptionBudget.mixin.metadata.withName('store-gateway-pdb') +
  podDisruptionBudget.mixin.metadata.withLabels({ name: 'store-gateway-pdb' }) +
  podDisruptionBudget.mixin.spec.selector.withMatchLabels({ name: 'store-gateway' }) +
  // To avoid any disruption in the read path we need at least 1 replica of each
  // block available, so the disruption budget depends on the blocks replication factor.
  podDisruptionBudget.mixin.spec.withMaxUnavailable(if $._config.store_gateway_replication_factor > 1 then $._config.store_gateway_replication_factor - 1 else 1),
}