Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Export geth metrics on VM testnet #1351

Merged
merged 9 commits into from
Nov 11, 2019
5 changes: 5 additions & 0 deletions packages/celotool/src/cmds/deploy/destroy/vm-testnet.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import { switchToClusterFromEnv } from 'src/lib/cluster'
import { removeHelmRelease } from 'src/lib/prom-to-sd-utils'
import { destroy } from 'src/lib/vm-testnet-utils'
import { DestroyArgv } from '../../deploy/destroy'

Expand All @@ -6,5 +8,8 @@ export const describe = 'destroy an existing VM-based testnet'
export const builder = {}

/**
 * Destroys an existing VM-based testnet and then removes the
 * prometheus-to-stackdriver helm release that belongs to it.
 *
 * @param argv parsed CLI arguments; only `celoEnv` is read here
 */
export const handler = async (argv: DestroyArgv) => {
  // Cluster selection happens first so the helm removal below targets it
  await switchToClusterFromEnv()

  const { celoEnv } = argv
  await destroy(celoEnv)

  // destroy prometheus to stackdriver statefulset
  await removeHelmRelease(celoEnv)
}
11 changes: 10 additions & 1 deletion packages/celotool/src/cmds/deploy/initial/vm-testnet.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,19 @@
import { deploy } from '../../../lib/vm-testnet-utils'
import { createClusterIfNotExists, setupCluster, switchToClusterFromEnv } from 'src/lib/cluster'
import { installHelmChart } from 'src/lib/prom-to-sd-utils'
import { deploy } from 'src/lib/vm-testnet-utils'
import { InitialArgv } from '../../deploy/initial'

export const command = 'vm-testnet'
export const describe = 'upgrade a testnet on a VM'
export const builder = {}

/**
 * Performs the initial deployment of a VM testnet: prepares the Kubernetes
 * cluster, deploys the VMs, then installs the prom-to-sd helm chart.
 *
 * @param argv parsed CLI arguments; only `celoEnv` is read here
 */
export const handler = async (argv: InitialArgv) => {
  // set up Kubernetes cluster that will have prometheus to stackdriver statefulset
  const clusterCreationResult = await createClusterIfNotExists()
  await switchToClusterFromEnv()

  const { celoEnv } = argv
  await setupCluster(celoEnv, clusterCreationResult)

  // deploy VM testnet with Terraform
  await deploy(celoEnv)

  // deploy prom to sd statefulset
  await installHelmChart(celoEnv)
}
6 changes: 6 additions & 0 deletions packages/celotool/src/cmds/deploy/upgrade/vm-testnet.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import { switchToClusterFromEnv } from 'src/lib/cluster'
import { upgradeHelmChart } from 'src/lib/prom-to-sd-utils'
import { deploy, taintTestnet, untaintTestnet } from 'src/lib/vm-testnet-utils'
import yargs from 'yargs'
import { UpgradeArgv } from '../../deploy/upgrade'
Expand All @@ -18,10 +20,14 @@ export const builder = (argv: yargs.Argv) => {
}

/**
 * Upgrades a VM testnet and then upgrades its prom-to-sd helm release.
 *
 * When `argv.reset` is set the testnet is tainted before deploying, and the
 * deploy is given a failure callback that untaints it again.
 *
 * @param argv parsed CLI arguments; `celoEnv` and `reset` are read here
 */
export const handler = async (argv: VmTestnetArgv) => {
  await switchToClusterFromEnv()

  const { celoEnv, reset } = argv

  // Without a reset there is nothing to roll back, so the callback is a no-op
  const onDeployFailed: () => Promise<void> = reset
    ? () => untaintTestnet(celoEnv)
    : () => Promise.resolve()

  if (reset) {
    await taintTestnet(celoEnv)
  }

  await deploy(celoEnv, onDeployFailed)

  // upgrade prom to sd statefulset
  await upgradeHelmChart(celoEnv)
}
4 changes: 4 additions & 0 deletions packages/celotool/src/lib/env-utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ export enum envVar {
GETH_ACCOUNT_SECRET = 'GETH_ACCOUNT_SECRET',
GETH_BOOTNODE_DOCKER_IMAGE_REPOSITORY = 'GETH_BOOTNODE_DOCKER_IMAGE_REPOSITORY',
GETH_BOOTNODE_DOCKER_IMAGE_TAG = 'GETH_BOOTNODE_DOCKER_IMAGE_TAG',
GETH_EXPORTER_DOCKER_IMAGE_REPOSITORY = 'GETH_EXPORTER_DOCKER_IMAGE_REPOSITORY',
GETH_EXPORTER_DOCKER_IMAGE_TAG = 'GETH_EXPORTER_DOCKER_IMAGE_TAG',
GETH_NODES_BACKUP_CRONJOB_ENABLED = 'GETH_NODES_BACKUP_CRONJOB_ENABLED',
GETH_NODE_DOCKER_IMAGE_REPOSITORY = 'GETH_NODE_DOCKER_IMAGE_REPOSITORY',
GETH_NODE_DOCKER_IMAGE_TAG = 'GETH_NODE_DOCKER_IMAGE_TAG',
Expand All @@ -55,6 +57,8 @@ export enum envVar {
NEXMO_KEY = 'NEXMO_KEY',
NEXMO_SECRET = 'NEXMO_SECRET',
NOTIFICATION_SERVICE_FIREBASE_DB = 'NOTIFICATION_SERVICE_FIREBASE_DB',
PROMTOSD_EXPORT_INTERVAL = 'PROMTOSD_EXPORT_INTERVAL',
PROMTOSD_SCRAPE_INTERVAL = 'PROMTOSD_SCRAPE_INTERVAL',
SMS_RETRIEVER_HASH_CODE = 'SMS_RETRIEVER_HASH_CODE',
STACKDRIVER_MONITORING_DASHBOARD = 'STACKDRIVER_MONITORING_DASHBOARD',
STACKDRIVER_NOTIFICATION_APPLICATIONS_PREFIX = 'STACKDRIVER_NOTIFICATION_APPLICATIONS_PREFIX',
Expand Down
85 changes: 85 additions & 0 deletions packages/celotool/src/lib/prom-to-sd-utils.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import sleep from 'sleep-promise'
import { envVar, fetchEnv } from 'src/lib/env-utils'
import { installGenericHelmChart, removeGenericHelmChart } from 'src/lib/helm_deploy'
import { getStatefulSetReplicas, scaleResource } from 'src/lib/kubernetes'
import { execCmdWithExitOnFailure } from 'src/lib/utils'
import { getInternalTxNodeIPs, getInternalValidatorIPs } from 'src/lib/vm-testnet-utils'

const helmChartPath = '../helm-charts/prometheus-to-sd'

// This deploys a helm chart to Kubernetes that exports prometheus metrics from
// VM testnets to Stackdriver

/**
 * Installs the prometheus-to-sd helm chart for the given environment.
 *
 * @param celoEnv environment name; used as the first argument to the generic
 *   installer and to derive the release name
 * @returns whatever `installGenericHelmChart` resolves to
 */
export async function installHelmChart(celoEnv: string) {
  const release = releaseName(celoEnv)
  const parameters = await helmParameters(celoEnv)
  return installGenericHelmChart(celoEnv, release, helmChartPath, parameters)
}

/**
 * Removes the prometheus-to-sd helm release of the given environment.
 *
 * @param celoEnv environment name the release name is derived from
 */
export async function removeHelmRelease(celoEnv: string) {
  const release = releaseName(celoEnv)
  await removeGenericHelmChart(release)
}

/**
 * Upgrades the prometheus-to-sd helm release of the given environment.
 *
 * Sequence: record the StatefulSet's current replica count, scale it to 0,
 * wait five seconds, run `helm upgrade` (preceded by a `--debug --dry-run`
 * pass when CELOTOOL_VERBOSE is 'true'), then scale back to the recorded count.
 *
 * NOTE(review): the final scale-up restores the replica count read *before*
 * the upgrade, while the chart is upgraded with its own `replicaCount`
 * parameter. Confirm this is intended when the number of nodes changes.
 *
 * @param celoEnv environment name; also used as the Kubernetes namespace
 */
export async function upgradeHelmChart(celoEnv: string) {
  const release = releaseName(celoEnv)
  console.info(`Upgrading helm release ${release}`)

  const statefulSet = `${celoEnv}-prom-to-sd`
  const previousReplicas = await getStatefulSetReplicas(celoEnv, statefulSet)

  console.info('Scaling StatefulSet down to 0...')
  await scaleResource(celoEnv, 'statefulset', statefulSet, 0)
  await sleep(5000)

  const parameters = await helmParameters(celoEnv)
  const upgradeArgs = [
    release,
    helmChartPath,
    '--namespace',
    celoEnv,
    parameters.join(' '),
  ].join(' ')

  // In verbose mode, show what helm would do before actually doing it
  const verbose = process.env.CELOTOOL_VERBOSE === 'true'
  if (verbose) {
    await execCmdWithExitOnFailure(`helm upgrade --debug --dry-run ${upgradeArgs}`)
  }
  await execCmdWithExitOnFailure(`helm upgrade ${upgradeArgs}`)
  console.info(`Helm release ${release} upgrade successful`)

  console.info(`Scaling StatefulSet back up to ${previousReplicas}...`)
  await scaleResource(celoEnv, 'statefulset', statefulSet, previousReplicas)
}

/**
 * Builds the `--set ...` arguments passed to helm for the prom-to-sd chart.
 *
 * Metrics source URLs are derived from the internal validator and tx-node IPs
 * (validators first), pod ids are generated from the VALIDATORS and TX_NODES
 * env var counts in the same order, and the replica count is the sum of the
 * two counts.
 *
 * @param celoEnv environment name; used as the pod id prefix and namespace id
 * @returns list of helm `--set` arguments
 */
async function helmParameters(celoEnv: string) {
  // Generates `<celoEnv>-<role>-0 ... <celoEnv>-<role>-(count-1)`
  const podIdsFor = (role: string, count: number) =>
    Array.from({ length: count }, (_, index) => `${celoEnv}-${role}-${index}`)

  // The metrics endpoints are only exposed internally
  const validatorIps = await getInternalValidatorIPs(celoEnv)
  const validatorCount = parseInt(fetchEnv(envVar.VALIDATORS), 10)

  const txNodeIps = await getInternalTxNodeIPs(celoEnv)
  const txNodeCount = parseInt(fetchEnv(envVar.TX_NODES), 10)

  const metricsUrls = validatorIps
    .concat(txNodeIps)
    .map((ip: string) => `http://${ip}:9200/metrics`)
  const podIds = [
    ...podIdsFor('validator', validatorCount),
    ...podIdsFor('tx-node', txNodeCount),
  ]

  // Commas inside a `--set` value are backslash-escaped
  const escapedComma = '\\,'

  return [
    `--set metricsSources.geth="${metricsUrls.join(escapedComma)}"`,
    `--set promtosd.scrape_interval=${fetchEnv(envVar.PROMTOSD_SCRAPE_INTERVAL)}`,
    `--set promtosd.export_interval=${fetchEnv(envVar.PROMTOSD_EXPORT_INTERVAL)}`,
    `--set promtosd.podIds="${podIds.join(escapedComma)}"`,
    `--set promtosd.namespaceId=${celoEnv}`,
    `--set replicaCount=${validatorCount + txNodeCount}`,
  ]
}

/**
 * Returns the helm release name for an environment: `<celoEnv>-prom-to-sd`.
 *
 * @param celoEnv environment name used as the prefix
 */
function releaseName(celoEnv: string) {
  const suffix = '-prom-to-sd'
  return celoEnv + suffix
}
12 changes: 12 additions & 0 deletions packages/celotool/src/lib/vm-testnet-utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ const testnetEnvVars: TerraformVars = {
geth_verbosity: envVar.GETH_VERBOSITY,
geth_bootnode_docker_image_repository: envVar.GETH_BOOTNODE_DOCKER_IMAGE_REPOSITORY,
geth_bootnode_docker_image_tag: envVar.GETH_BOOTNODE_DOCKER_IMAGE_TAG,
geth_exporter_docker_image_repository: envVar.GETH_EXPORTER_DOCKER_IMAGE_REPOSITORY,
geth_exporter_docker_image_tag: envVar.GETH_EXPORTER_DOCKER_IMAGE_TAG,
geth_node_docker_image_repository: envVar.GETH_NODE_DOCKER_IMAGE_REPOSITORY,
geth_node_docker_image_tag: envVar.GETH_NODE_DOCKER_IMAGE_TAG,
in_memory_discovery_table: envVar.IN_MEMORY_DISCOVERY_TABLE,
Expand Down Expand Up @@ -297,6 +299,16 @@ export async function getTxNodeLoadBalancerIP(celoEnv: string) {
return outputs.tx_node_lb_ip_address.value
}

/**
 * Reads the `validator_internal_ip_addresses` value from the testnet outputs.
 *
 * @param celoEnv environment whose testnet outputs are queried
 * @returns the `.value` of that output
 */
export async function getInternalValidatorIPs(celoEnv: string) {
  const { validator_internal_ip_addresses: validatorIps } = await getTestnetOutputs(celoEnv)
  return validatorIps.value
}

/**
 * Reads the `tx_node_internal_ip_addresses` value from the testnet outputs.
 *
 * @param celoEnv environment whose testnet outputs are queried
 * @returns the `.value` of that output
 */
export async function getInternalTxNodeIPs(celoEnv: string) {
  const { tx_node_internal_ip_addresses: txNodeIps } = await getTestnetOutputs(celoEnv)
  return txNodeIps.value
}

function getTerraformBackendConfigVars(celoEnv: string, terraformModule: string) {
return {
prefix: `${celoEnv}/${terraformModule}`,
Expand Down
51 changes: 42 additions & 9 deletions packages/helm-charts/prometheus-to-sd/templates/deployment.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
apiVersion: apps/v1beta1
kind: Deployment
kind: StatefulSet
metadata:
name: {{ template "prometheus-to-sd.fullname" . }}
labels:
Expand All @@ -9,6 +9,7 @@ metadata:
heritage: {{ .Release.Service }}
spec:
replicas: {{ .Values.replicaCount }}
serviceName: {{ template "prometheus-to-sd.fullname" . }}
template:
metadata:
labels:
Expand All @@ -23,16 +24,48 @@ spec:
- name: profiler
containerPort: {{ .Values.port }}
command:
- /monitor
- --stackdriver-prefix=custom.googleapis.com
{{- range $key, $value := .Values.metricsSources }}
- --source={{ $key }}:{{ $value }}
{{- end }}
- --scrape-interval={{ .Values.promtosd.scrape_interval }}
- --export-interval={{ .Values.promtosd.export_interval }}
- /bin/sh
- "-c"
- |-
INDEX=${POD_NAME##*-}

NAMESPACE_ID="{{ .Values.promtosd.namespaceId }}"
NAMESPACE_ID_FLAG=""
[ "$NAMESPACE_ID" ] && NAMESPACE_ID_FLAG="--namespace-id=$NAMESPACE_ID"

POD_ID=`echo -n {{ .Values.promtosd.podIds }} | cut -d ',' -f $((INDEX + 1))`
POD_ID_FLAG=""
[ "$POD_ID" ] && POD_ID_FLAG="--pod-id=$POD_ID"

/monitor \
--stackdriver-prefix=custom.googleapis.com \
{{- range $key, $value := .Values.metricsSources }}
--source={{ $key }}:$(echo -n "{{ $value }}" | cut -d ',' -f $((INDEX + 1))) \
{{- end }}
--scrape-interval={{ .Values.promtosd.scrape_interval }} \
--export-interval={{ .Values.promtosd.export_interval }} \
$POD_ID_FLAG \
$NAMESPACE_ID_FLAG
resources:
{{ toYaml .Values.resources | indent 12 }}
env:
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
{{- if .Values.nodeSelector }}
nodeSelector:
{{ toYaml .Values.nodeSelector | indent 8 }}
{{- end }}
{{- end }}
---

apiVersion: v1
kind: Service
metadata:
name: {{ template "prometheus-to-sd.fullname" . }}
labels:
component: {{ template "prometheus-to-sd.fullname" . }}
spec:
clusterIP: None
selector:
app: {{ template "prometheus-to-sd.name" . }}
10 changes: 8 additions & 2 deletions packages/helm-charts/prometheus-to-sd/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,13 @@ image:
repository: gcr.io/google-containers/prometheus-to-sd
tag: v0.3.2
pullPolicy: IfNotPresent
resources: {}
resources:
requests:
memory: 50M
cpu: 5m
port: 6060
metricsSources: {}
nodeSelector: {}
nodeSelector: {}
promtosd:
podIds: ""
namespaceId: ""
32 changes: 32 additions & 0 deletions packages/terraform-modules/testnet/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ data "terraform_remote_state" "state" {
}
}

# Network tag lists referenced by the firewall rules in this module via
# `target_tags`. Both are derived from var.celo_env:
#   - "<celo_env>-bootnode" for bootnode-only rules
#   - "<celo_env>-node" for rules applying to geth nodes
# NOTE(review): instances must carry the matching tag for a rule to apply —
# confirm each instance module sets its `tags` accordingly.
locals {
firewall_target_tags_bootnode = ["${var.celo_env}-bootnode"]
firewall_target_tags_node = ["${var.celo_env}-node"]
}

data "google_compute_network" "network" {
name = var.network_name
}
Expand All @@ -28,6 +33,8 @@ resource "google_compute_firewall" "ssh_firewall" {
name = "${var.celo_env}-ssh-firewall"
network = data.google_compute_network.network.name

target_tags = concat(local.firewall_target_tags_bootnode, local.firewall_target_tags_node)

allow {
protocol = "tcp"
ports = ["22"]
Expand All @@ -38,6 +45,8 @@ resource "google_compute_firewall" "geth_firewall" {
name = "${var.celo_env}-geth-firewall"
network = data.google_compute_network.network.name

target_tags = local.firewall_target_tags_node

allow {
protocol = "tcp"
ports = ["30303"]
Expand All @@ -49,10 +58,27 @@ resource "google_compute_firewall" "geth_firewall" {
}
}

# Firewall rule that allows TCP traffic on port 9200 to instances carrying the
# node target tag, restricted to sources in 10.0.0.0/8.
resource "google_compute_firewall" "geth_metrics_firewall" {
name = "${var.celo_env}-geth-metrics-firewall"
network = data.google_compute_network.network.name

# only instances tagged "<celo_env>-node" are targeted by this rule
target_tags = local.firewall_target_tags_node

# allow all IPs internal to the VPC
source_ranges = ["10.0.0.0/8"]

allow {
protocol = "tcp"
# 9200 is the port addressed when building the `/metrics` source URLs
ports = ["9200"]
}
}

resource "google_compute_firewall" "rpc_firewall" {
name = "${var.celo_env}-rpc-firewall"
network = data.google_compute_network.network.name

target_tags = local.firewall_target_tags_node

allow {
protocol = "tcp"
ports = ["8545", "8546"]
Expand All @@ -63,6 +89,8 @@ resource "google_compute_firewall" "bootnode_firewall" {
name = "${var.celo_env}-bootnode-firewall"
network = data.google_compute_network.network.name

target_tags = local.firewall_target_tags_bootnode

allow {
protocol = "udp"
ports = ["30301"]
Expand Down Expand Up @@ -93,6 +121,8 @@ module "tx_node" {
gcloud_secrets_bucket = var.gcloud_secrets_bucket
gcloud_vm_service_account_email = var.gcloud_vm_service_account_email
genesis_content_base64 = var.genesis_content_base64
geth_exporter_docker_image_repository = var.geth_exporter_docker_image_repository
geth_exporter_docker_image_tag = var.geth_exporter_docker_image_tag
geth_node_docker_image_repository = var.geth_node_docker_image_repository
geth_node_docker_image_tag = var.geth_node_docker_image_tag
geth_verbosity = var.geth_verbosity
Expand Down Expand Up @@ -123,6 +153,8 @@ module "validator" {
gcloud_secrets_bucket = var.gcloud_secrets_bucket
gcloud_vm_service_account_email = var.gcloud_vm_service_account_email
genesis_content_base64 = var.genesis_content_base64
geth_exporter_docker_image_repository = var.geth_exporter_docker_image_repository
geth_exporter_docker_image_tag = var.geth_exporter_docker_image_tag
geth_node_docker_image_repository = var.geth_node_docker_image_repository
geth_node_docker_image_tag = var.geth_node_docker_image_tag
geth_verbosity = var.geth_verbosity
Expand Down
2 changes: 2 additions & 0 deletions packages/terraform-modules/testnet/modules/bootnode/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ resource "google_compute_instance" "bootnode" {
name = local.name_prefix
machine_type = "n1-standard-1"

tags = [local.name_prefix]

allow_stopping_for_update = true

boot_disk {
Expand Down
Loading