Skip to content

Commit

Permalink
roachprod: promhelper fix project and wipe
Browse files Browse the repository at this point in the history
This PR fixes a bug introduced in #138711 and also adds deletion of
Prometheus targets at cluster wipe.

Before #138711, the GCE provider defaults were defined during init().
This logic was moved to an init function to allow the `drtprod` command
to define its own defaults via environment variables.
This introduced a state in which the promhelper client's default for
`SupportedPromProjects` was initialized with `gce.DefaultProject()`
before that value was itself initialized, so no Prometheus targets were
ever pushed.

This PR removes `promhelperclient.DefaultPromClient`, which should no
longer be used, and computes the defaults in `NewPromClient()` instead.
This PR also delegates the checks on whether or not providers and
projects are supported to the promhelperclient package to simplify the
logic in the callers.

Also, prior to this PR, if an `insecure` cluster was reused as a
`secure` cluster during a `roachtest` run, the promhelper client would
delete the `secure` configuration during cluster destruction, but would
leave the `insecure` configuration in place (the promhelper client tries
to delete `secure` first, and only tries `insecure` if `secure` is not
found). This was creating stale Prometheus targets.

This PR introduces the deletion of the Prometheus targets at cluster
wipe to fix this.

Epic: none
Release note: None

Signed-off-by: Ludovic Leroux <[email protected]>
  • Loading branch information
golgeek committed Jan 13, 2025
1 parent ac65101 commit f53d7cb
Show file tree
Hide file tree
Showing 4 changed files with 111 additions and 52 deletions.
68 changes: 47 additions & 21 deletions pkg/roachprod/cloud/cluster_cloud.go
Original file line number Diff line number Diff line change
Expand Up @@ -402,35 +402,61 @@ func ShrinkCluster(l *logger.Logger, c *Cluster, numNodes int) error {
})
}

// DestroyCluster TODO(peter): document
func DestroyCluster(l *logger.Logger, c *Cluster) error {
func (c *Cluster) DeletePrometheusConfig(ctx context.Context, l *logger.Logger) error {

cl := promhelperclient.NewPromClient()

stopSpinner := ui.NewDefaultSpinner(l, "Destroying Prometheus configs").Start()
// check if any node is supported as promhelper cluster
defer stopSpinner()

for _, node := range c.VMs {
if _, ok := promhelperclient.SupportedPromProjects[node.Project]; ok &&
node.Provider == gce.ProviderName {
if err := promhelperclient.NewPromClient().DeleteClusterConfig(context.Background(),
c.Name, false, false /* insecure */, l); err != nil {
// TODO(bhaskar): Obtain secure cluster information.
// Cluster does not have the information on secure or not. So, we retry as insecure
// if delete fails with cluster as secure
if promhelperclient.IsNotFoundError(err) {
if err = promhelperclient.NewPromClient().DeleteClusterConfig(context.Background(),
c.Name, false, true /* insecure */, l); err != nil {
l.Errorf("Failed to delete the cluster config with cluster as insecure and secure: %v", err)
}
} else {
l.Errorf("Failed to delete the cluster config with cluster as secure: %v", err)
}

// only gce is supported for prometheus
if !cl.IsSupportedNodeProvider(node.Provider) {
continue
}
if !cl.IsSupportedPromProject(node.Project) {
continue
}

err := cl.DeleteClusterConfig(ctx, c.Name, false, false /* insecure */, l)
if err != nil {

if !promhelperclient.IsNotFoundError(err) {
return errors.Wrapf(
err,
"failed to delete the cluster config with cluster as secure",
)
}

// TODO(bhaskar): Obtain secure cluster information.
// Cluster does not have the information on secure or not.
// So, we retry as insecure if delete fails with cluster as secure.
if err = cl.DeleteClusterConfig(ctx, c.Name, false, true /* insecure */, l); err != nil {
return errors.Wrapf(
err,
"failed to delete the cluster config with cluster as insecure and secure",
)
}
break

}
break

}

return nil
}

// DestroyCluster TODO(peter): document
func DestroyCluster(l *logger.Logger, c *Cluster) error {

if err := c.DeletePrometheusConfig(context.Background(), l); err != nil {
l.Printf("WARNING: failed to delete the prometheus config (already wiped?): %s", err)
}
stopSpinner()

// DNS entries are destroyed first to ensure that the GC job will not try
// and clean-up entries prematurely.
stopSpinner = ui.NewDefaultSpinner(l, "Destroying DNS entries").Start()
stopSpinner := ui.NewDefaultSpinner(l, "Destroying DNS entries").Start()
dnsErr := vm.FanOutDNS(c.VMs, func(p vm.DNSProvider, vms vm.List) error {
return p.DeleteRecordsBySubdomain(context.Background(), c.Name)
})
Expand Down
12 changes: 11 additions & 1 deletion pkg/roachprod/install/cluster_synced.go
Original file line number Diff line number Diff line change
Expand Up @@ -589,7 +589,7 @@ func (c *SyncedCluster) Wipe(ctx context.Context, l *logger.Logger, preserveCert
if err := c.Stop(ctx, l, int(unix.SIGKILL), true /* wait */, 0 /* gracePeriod */, ""); err != nil {
return err
}
return c.Parallel(ctx, l, WithNodes(c.Nodes).WithDisplay(display), func(ctx context.Context, node Node) (*RunResultDetails, error) {
err := c.Parallel(ctx, l, WithNodes(c.Nodes).WithDisplay(display), func(ctx context.Context, node Node) (*RunResultDetails, error) {
var cmd string
if c.IsLocal() {
// Not all shells like brace expansion, so we'll do it here
Expand Down Expand Up @@ -618,6 +618,16 @@ func (c *SyncedCluster) Wipe(ctx context.Context, l *logger.Logger, preserveCert
}
return c.runCmdOnSingleNode(ctx, l, node, cmd, defaultCmdOpts("wipe"))
})
if err != nil {
return err
}

err = c.Cluster.DeletePrometheusConfig(ctx, l)
if err != nil {
l.Printf("WARNING: failed to delete the prometheus config (already wiped?): %s", err)
}

return nil
}

// NodeStatus contains details about the status of a node.
Expand Down
40 changes: 28 additions & 12 deletions pkg/roachprod/promhelperclient/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,6 @@ const (
ErrorMessagePrefix = "request failed with status %d"
)

// SupportedPromProjects are the projects supported for prometheus target
var SupportedPromProjects = map[string]struct{}{gce.DefaultProject(): {}}

// The URL for the Prometheus registration service. An empty string means that the
// Prometheus integration is disabled. Should be accessed through
// getPrometheusRegistrationUrl().
Expand All @@ -63,24 +60,29 @@ type PromClient struct {
// newTokenSource is the token generator source.
newTokenSource func(ctx context.Context, audience string, opts ...idtoken.ClientOption) (
oauth2.TokenSource, error)
}

// DefaultPromClient is the default instance of PromClient. This instance should
// be used unless custom configuration is needed.
var DefaultPromClient = NewPromClient()
// supportedPromProviders are the providers supported for prometheus target
supportedPromProviders map[string]struct{}

// supportedPromProjects are the projects supported for prometheus target
supportedPromProjects map[string]struct{}
}

// IsNotFoundError returns true if the error is a 404 error.
//
// The check is textual: err's message must contain ErrorMessagePrefix
// formatted with http.StatusNotFound. A nil error is never a not-found
// error, so callers may pass the result of a call without checking it first.
func IsNotFoundError(err error) bool {
	// Guard against nil: calling Error() on a nil error value panics.
	if err == nil {
		return false
	}
	return strings.Contains(err.Error(), fmt.Sprintf(ErrorMessagePrefix, http.StatusNotFound))
}

// NewPromClient returns a new instance of PromClient
func NewPromClient() *PromClient {
return &PromClient{
promUrl: promRegistrationUrl,
disabled: promRegistrationUrl == "",
httpPut: httputil.Put,
httpDelete: httputil.Delete,
newTokenSource: idtoken.NewTokenSource,
promUrl: promRegistrationUrl,
disabled: promRegistrationUrl == "",
httpPut: httputil.Put,
httpDelete: httputil.Delete,
newTokenSource: idtoken.NewTokenSource,
supportedPromProviders: map[string]struct{}{gce.ProviderName: {}},
supportedPromProjects: map[string]struct{}{gce.DefaultProject(): {}},
}
}

Expand Down Expand Up @@ -183,6 +185,20 @@ func getUrl(promUrl, clusterName string) string {
return fmt.Sprintf("%s/%s/%s/%s", promUrl, resourceVersion, resourceName, clusterName)
}

// IsSupportedNodeProvider reports whether provider is one of the client's
// supportedPromProviders, i.e. whether nodes from that provider are
// supported for prometheus target.
func (c *PromClient) IsSupportedNodeProvider(provider string) (supported bool) {
	// Set-membership lookup; the stored struct{} value carries no information.
	_, supported = c.supportedPromProviders[provider]
	return supported
}

// IsSupportedPromProject reports whether project is one of the client's
// supportedPromProjects, i.e. whether nodes in that project are supported
// for prometheus target.
func (c *PromClient) IsSupportedPromProject(project string) (supported bool) {
	// Set-membership lookup; the stored struct{} value carries no information.
	_, supported = c.supportedPromProjects[project]
	return supported
}

// CCParams are the params for the cluster configs
type CCParams struct {
Targets []string `yaml:"targets"`
Expand Down
43 changes: 25 additions & 18 deletions pkg/roachprod/roachprod.go
Original file line number Diff line number Diff line change
Expand Up @@ -803,32 +803,39 @@ func updatePrometheusTargets(
return err
}

cl := promhelperclient.NewPromClient()
nodeIPPorts := make(map[int]*promhelperclient.NodeInfo)
nodeIPPortsMutex := syncutil.RWMutex{}
var wg sync.WaitGroup
for _, node := range c.Nodes {
if _, ok := promhelperclient.SupportedPromProjects[c.VMs[node-1].Project]; ok &&
c.VMs[node-1].Provider == gce.ProviderName {
wg.Add(1)
go func(index int, v vm.VM) {
defer wg.Done()
// only gce is supported for prometheus
desc, err := c.DiscoverService(ctx, install.Node(index), "", install.ServiceTypeUI, 0)
if err != nil {
l.Errorf("error getting the port for node %d: %v", index, err)
return
}
nodeInfo := fmt.Sprintf("%s:%d", v.PrivateIP, desc.Port)
nodeIPPortsMutex.Lock()
// ensure atomicity in map update
nodeIPPorts[index] = &promhelperclient.NodeInfo{Target: nodeInfo, CustomLabels: createLabels(v)}
nodeIPPortsMutex.Unlock()
}(int(node), c.VMs[node-1])

// only gce is supported for prometheus
if !cl.IsSupportedNodeProvider(c.VMs[node-1].Provider) {
continue
}
if !cl.IsSupportedPromProject(c.VMs[node-1].Project) {
continue
}

wg.Add(1)
go func(index int, v vm.VM) {
defer wg.Done()
desc, err := c.DiscoverService(ctx, install.Node(index), "", install.ServiceTypeUI, 0)
if err != nil {
l.Errorf("error getting the port for node %d: %v", index, err)
return
}
nodeInfo := fmt.Sprintf("%s:%d", v.PrivateIP, desc.Port)
nodeIPPortsMutex.Lock()
// ensure atomicity in map update
nodeIPPorts[index] = &promhelperclient.NodeInfo{Target: nodeInfo, CustomLabels: createLabels(v)}
nodeIPPortsMutex.Unlock()
}(int(node), c.VMs[node-1])

}
wg.Wait()
if len(nodeIPPorts) > 0 {
if err := promhelperclient.DefaultPromClient.UpdatePrometheusTargets(ctx,
if err := cl.UpdatePrometheusTargets(ctx,
c.Name, false, nodeIPPorts, !c.Secure, l); err != nil {
l.Errorf("creating cluster config failed for the ip:ports %v: %v", nodeIPPorts, err)
}
Expand Down

0 comments on commit f53d7cb

Please sign in to comment.