From fbfe739defbbca27b11aabeb1e893d6225df2b3e Mon Sep 17 00:00:00 2001 From: Naorem Khogendro Singh Date: Tue, 28 Jan 2025 14:33:18 -0800 Subject: [PATCH] [PLAT-16610] Verify that a cloud instance searched by the name, has the same node UUID tag before performing any action in python ybops Summary: Check again with node uuid tag value if it is set before picking up the node. Test Plan: Manually created universes on aws and gcp and modified the tag to make sure the nodes are not deleted. Reviewers: cwang, muthu, yshchetinin Reviewed By: yshchetinin Subscribers: yugaware Differential Revision: https://phorge.dev.yugabyte.com/D41535 --- .../devops/opscli/ybops/cloud/aws/cloud.py | 21 +++++++++++-------- .../devops/opscli/ybops/cloud/aws/method.py | 9 +++++--- .../devops/opscli/ybops/cloud/azure/cloud.py | 4 +++- .../devops/opscli/ybops/cloud/azure/utils.py | 16 +++++++++----- .../opscli/ybops/cloud/common/method.py | 6 ++++-- .../devops/opscli/ybops/cloud/gcp/cloud.py | 18 +++++++++------- .../devops/opscli/ybops/cloud/gcp/utils.py | 13 +++++++++--- .../com/yugabyte/yw/common/NodeManager.java | 8 +++---- .../yugabyte/yw/common/NodeManagerTest.java | 12 +++++------ 9 files changed, 67 insertions(+), 40 deletions(-) diff --git a/managed/devops/opscli/ybops/cloud/aws/cloud.py b/managed/devops/opscli/ybops/cloud/aws/cloud.py index 0d1c5501f9b4..b0bf42329627 100644 --- a/managed/devops/opscli/ybops/cloud/aws/cloud.py +++ b/managed/devops/opscli/ybops/cloud/aws/cloud.py @@ -365,7 +365,9 @@ def get_host_info(self, args, get_all=False, private_ip=None): """ region = args.region search_pattern = args.search_pattern - return self.get_host_info_specific_args(region, search_pattern, get_all, private_ip) + node_uuid = args.node_uuid + return self.get_host_info_specific_args(region, search_pattern, get_all, private_ip, + node_uuid=node_uuid) def get_host_info_specific_args(self, region, search_pattern, get_all=False, private_ip=None, filters=None, node_uuid=None): @@ -389,11 +391,6 @@ def get_host_info_specific_args(self, region, search_pattern, get_all=False, "Name": "tag:Name", "Values": [search_pattern] }) - if node_uuid: - filters.append({ - "Name": "tag:node-uuid", - "Values": [node_uuid] - }) instances = [] for _, client in self._get_clients(region=region).items(): instances.extend(list(client.instances.filter(Filters=filters))) @@ -401,7 +398,7 @@ def get_host_info_specific_args(self, region, search_pattern, get_all=False, for instance in instances: data = instance.meta.data if private_ip is not None and private_ip != data["PrivateIpAddress"]: - logging.warn("Node name {} is not unique. Expected IP {}, got IP {}".format( + logging.warning("Node name {} is not unique. Expected IP {}, got IP {}".format( search_pattern, private_ip, data["PrivateIpAddress"])) continue zone = data["Placement"]["AvailabilityZone"] @@ -416,7 +413,13 @@ def get_host_info_specific_args(self, region, search_pattern, get_all=False, node_uuid_tags = [t["Value"] for t in data["Tags"] if t["Key"] == "node-uuid"] universe_uuid_tags = [t["Value"] for t in data["Tags"] if t["Key"] == "universe-uuid"] - + host_node_uuid = node_uuid_tags[0] if node_uuid_tags else None + # Matching tag or no tag for backward compatibility. + if host_node_uuid is not None and node_uuid is not None \ + and host_node_uuid != node_uuid: + logging.warning("VM {}({}) with node UUID {} is not found.". + format(search_pattern, host_node_uuid, node_uuid)) + continue primary_private_ip = None secondary_private_ip = None primary_subnet = None @@ -447,7 +450,7 @@ def get_host_info_specific_args(self, region, search_pattern, get_all=False, instance_type=data["InstanceType"], server_type=server_tags[0] if server_tags else None, launched_by=launched_by_tags[0] if launched_by_tags else None, - node_uuid=node_uuid_tags[0] if node_uuid_tags else None, + node_uuid=host_node_uuid, universe_uuid=universe_uuid_tags[0] if universe_uuid_tags else None, vpc=data["VpcId"], instance_state=instance_state, diff --git a/managed/devops/opscli/ybops/cloud/aws/method.py b/managed/devops/opscli/ybops/cloud/aws/method.py index ef6be6d4b785..2ec7071d9d44 100644 --- a/managed/devops/opscli/ybops/cloud/aws/method.py +++ b/managed/devops/opscli/ybops/cloud/aws/method.py @@ -205,7 +205,8 @@ def callback(self, args): args.search_pattern, get_all=False, private_ip=args.node_ip, - filters=filters + filters=filters, + node_uuid=args.node_uuid ) if not host_info: logging.error("Host {} does not exist.".format(args.search_pattern)) @@ -234,7 +235,8 @@ def callback(self, args): args.region, args.search_pattern, get_all=False, - private_ip=args.node_ip + private_ip=args.node_ip, + node_uuid=args.node_uuid ) if not host_info: @@ -263,7 +265,8 @@ def callback(self, args): args.region, args.search_pattern, get_all=False, - private_ip=args.node_ip + private_ip=args.node_ip, + node_uuid=args.node_uuid ) if not host_info: diff --git a/managed/devops/opscli/ybops/cloud/azure/cloud.py b/managed/devops/opscli/ybops/cloud/azure/cloud.py index 9ba3506c1d65..acf8b8777398 100644 --- a/managed/devops/opscli/ybops/cloud/azure/cloud.py +++ b/managed/devops/opscli/ybops/cloud/azure/cloud.py @@ -210,7 +210,8 @@ def get_instance_types(self, args): return self.get_admin().get_instance_types(regions) def get_host_info(self, args, get_all=False): - return self.get_admin().get_host_info(args.search_pattern, get_all) + return self.get_admin().get_host_info(args.search_pattern, get_all, + node_uuid=args.node_uuid) def get_device_names(self, args): return ["sd{}".format(chr(i)) for i in range(ord('c'), ord('c') + args.num_volumes)] @@ -245,6 +246,7 @@ def modify_tags(self, args): instance = self.get_host_info(args) if not instance: raise YBOpsRuntimeError("Could not find instance {}".format(args.search_pattern)) + # TODO this method is not present! modify_tags(args.region, instance["id"], args.instance_tags, args.remove_tags) def start_instance(self, host_info, server_ports): diff --git a/managed/devops/opscli/ybops/cloud/azure/utils.py b/managed/devops/opscli/ybops/cloud/azure/utils.py index 5c0a83033a35..b669013a5a7d 100644 --- a/managed/devops/opscli/ybops/cloud/azure/utils.py +++ b/managed/devops/opscli/ybops/cloud/azure/utils.py @@ -1166,12 +1166,21 @@ def get_ultra_instances(self, regions, folder): json.dump(vms, writefile) return - def get_host_info(self, vm_name, get_all=False): + def get_host_info(self, vm_name, get_all=False, node_uuid=None): try: vm = self.compute_client.virtual_machines.get(RESOURCE_GROUP, vm_name, 'instanceView') except Exception as e: logging.error("Failed to get VM info for {} with error {}".format(vm_name, e)) return None + host_node_uuid = vm.tags.get("node-uuid", None) if vm.tags else None + server_type = vm.tags.get("yb-server-type", None) if vm.tags else None + universe_uuid = vm.tags.get("universe-uuid", None) if vm.tags else None + # Matching tag or no tag for backward compatibility. + if host_node_uuid is not None and node_uuid is not None \ + and host_node_uuid != node_uuid: + logging.warning("VM {}({}) with node UUID {} is not found.". + format(vm_name, host_node_uuid, node_uuid)) + return None nic_name = id_to_name(vm.network_profile.network_interfaces[0].id) nic = self.network_client.network_interfaces.get(NETWORK_RESOURCE_GROUP, nic_name) region = vm.location @@ -1185,16 +1194,13 @@ def get_host_info(self, vm_name, get_all=False): public_ip = (self.network_client.public_ip_addresses .get(NETWORK_RESOURCE_GROUP, ip_name).ip_address) subnet = id_to_name(nic.ip_configurations[0].subnet.id) - server_type = vm.tags.get("yb-server-type", None) if vm.tags else None - node_uuid = vm.tags.get("node-uuid", None) if vm.tags else None - universe_uuid = vm.tags.get("universe-uuid", None) if vm.tags else None zone_full = "{}-{}".format(region, zone) if zone is not None else region instance_state = self.extract_vm_instance_state(vm.instance_view) is_running = True if instance_state == "running" else False return {"private_ip": private_ip, "public_ip": public_ip, "region": region, "zone": zone_full, "name": vm.name, "ip_name": ip_name, "instance_type": vm.hardware_profile.vm_size, "server_type": server_type, - "subnet": subnet, "nic": nic_name, "id": vm.name, "node_uuid": node_uuid, + "subnet": subnet, "nic": nic_name, "id": vm.name, "node_uuid": host_node_uuid, "universe_uuid": universe_uuid, "instance_state": instance_state, "is_running": is_running, "root_volume": root_volume} diff --git a/managed/devops/opscli/ybops/cloud/common/method.py b/managed/devops/opscli/ybops/cloud/common/method.py index c3d77cee0d78..7c704467aee1 100644 --- a/managed/devops/opscli/ybops/cloud/common/method.py +++ b/managed/devops/opscli/ybops/cloud/common/method.py @@ -180,6 +180,10 @@ def add_extra_args(self): self.parser.add_argument("--instance_tags", required=False, help="Tags for instances being created.") + self.parser.add_argument("--node_uuid", + default=None, + required=False, + help="The uuid of the instance.") self.parser.add_argument("--systemd_services", action="store_true", default=False, @@ -624,8 +628,6 @@ def add_extra_args(self): action="store_true", default=False, help="Delete the static public ip.") - self.parser.add_argument("--node_uuid", default=None, - help="The uuid of the instance to delete.") def callback(self, args): self.update_ansible_vars_with_args(args) diff --git a/managed/devops/opscli/ybops/cloud/gcp/cloud.py b/managed/devops/opscli/ybops/cloud/gcp/cloud.py index 4df57fb3a68d..0c7f8a190afb 100644 --- a/managed/devops/opscli/ybops/cloud/gcp/cloud.py +++ b/managed/devops/opscli/ybops/cloud/gcp/cloud.py @@ -110,10 +110,11 @@ def unmount_disk(self, args, name): name, args['search_pattern'], args['zone'])) self.get_admin().unmount_disk(args['zone'], args['search_pattern'], name) - def stop_instance(self, args): - instance = self.get_admin().get_instances(args['zone'], args['search_pattern']) + def stop_instance(self, host_info): + instance = self.get_admin().get_instances(host_info['zone'], host_info['search_pattern'], + node_uuid=host_info.get('node_uuid', None)) if not instance: - logging.error("Host {} does not exist".format(args['search_pattern'])) + logging.error("Host {} does not exist".format(host_info['search_pattern'])) return instance_state = instance['instance_state'] if instance_state == 'TERMINATED': @@ -128,10 +129,11 @@ def stop_instance(self, args): raise YBOpsRuntimeError("Host {} cannot be stopped while in '{}' state".format( instance['name'], instance_state)) - def start_instance(self, args, server_ports): - instance = self.get_admin().get_instances(args['zone'], args['search_pattern']) + def start_instance(self, host_info, server_ports): + instance = self.get_admin().get_instances(host_info['zone'], host_info['search_pattern'], + node_uuid=host_info.get('node_uuid', None)) if not instance: - logging.error("Host {} does not exist".format(args['search_pattern'])) + logging.error("Host {} does not exist".format(host_info['search_pattern'])) return instance_state = instance['instance_state'] if instance_state == 'RUNNING': @@ -335,7 +337,9 @@ def get_host_info(self, args, get_all=False, filters=None): """ zone = args.zone search_pattern = args.search_pattern - return self.get_admin().get_instances(zone, search_pattern, get_all, filters=filters) + node_uuid = args.node_uuid + return self.get_admin().get_instances(zone, search_pattern, get_all, filters=filters, + node_uuid=node_uuid) def get_device_names(self, args): # Boot disk is also a persistent disk, so add persistent disks starting at index 1 diff --git a/managed/devops/opscli/ybops/cloud/gcp/utils.py b/managed/devops/opscli/ybops/cloud/gcp/utils.py index 8f08fac68aaf..00df6e142d5d 100644 --- a/managed/devops/opscli/ybops/cloud/gcp/utils.py +++ b/managed/devops/opscli/ybops/cloud/gcp/utils.py @@ -817,11 +817,11 @@ def get_pricing_map(self, ): pricing_map[key] = pricing_map_all[key] return pricing_map except Exception as e: - logging.warn("[app] Exception {} determining GCP pricing info".format(e)) + logging.warning("[app] Exception {} determining GCP pricing info".format(e)) return {} @gcp_request_limit_retry - def get_instances(self, zone, instance_name, get_all=False, filters=None): + def get_instances(self, zone, instance_name, get_all=False, filters=None, node_uuid=None): # TODO: filter should work to do (zone eq args.zone), but it doesn't right now... filter_params = [] if filters: @@ -853,6 +853,13 @@ def get_instances(self, zone, instance_name, get_all=False, filters=None): server_types = [i["value"] for i in metadata if i["key"] == "server_type"] node_uuid_tags = [i["value"] for i in metadata if i["key"] == "node-uuid"] universe_uuid_tags = [i["value"] for i in metadata if i["key"] == "universe-uuid"] + host_node_uuid = node_uuid_tags[0] if node_uuid_tags else None + # Matching label or no label for backward compatibility. + if host_node_uuid is not None and node_uuid is not None \ + and host_node_uuid != node_uuid: + logging.warning("VM {}({}) with node UUID {} is not found.". + format(instance_name, host_node_uuid, node_uuid)) + continue private_ip = None primary_subnet = None secondary_private_ip = None @@ -891,7 +898,7 @@ def get_instances(self, zone, instance_name, get_all=False, filters=None): zone=zone, instance_type=machine_type, server_type=server_types[0] if server_types else None, - node_uuid=node_uuid_tags[0] if node_uuid_tags else None, + node_uuid=host_node_uuid, universe_uuid=universe_uuid_tags[0] if universe_uuid_tags else None, launched_by=None, launch_time=data.get("creationTimestamp"), diff --git a/managed/src/main/java/com/yugabyte/yw/common/NodeManager.java b/managed/src/main/java/com/yugabyte/yw/common/NodeManager.java index c2c121185f60..4fb5e4491736 100644 --- a/managed/src/main/java/com/yugabyte/yw/common/NodeManager.java +++ b/managed/src/main/java/com/yugabyte/yw/common/NodeManager.java @@ -2226,10 +2226,6 @@ && imdsv2required(arch, userIntent, provider)) { if (taskParam.useSystemd) { commandArgs.add("--systemd_services"); } - if (taskParam.nodeUuid != null) { - commandArgs.add("--node_uuid"); - commandArgs.add(taskParam.nodeUuid.toString()); - } if (taskParam.deviceInfo != null) { commandArgs.addAll(getDeviceArgs(taskParam)); } @@ -2642,6 +2638,10 @@ && imdsv2required(arch, userIntent, provider)) { } return localNodeManager.nodeCommand(type, nodeTaskParam, commandArgs); } + if (nodeTaskParam.nodeUuid != null) { + commandArgs.add("--node_uuid"); + commandArgs.add(nodeTaskParam.nodeUuid.toString()); + } commandArgs.add(nodeTaskParam.nodeName); try { Map envVars = diff --git a/managed/src/test/java/com/yugabyte/yw/common/NodeManagerTest.java b/managed/src/test/java/com/yugabyte/yw/common/NodeManagerTest.java index 7ee23d0fccea..5f95bbb0dd75 100644 --- a/managed/src/test/java/com/yugabyte/yw/common/NodeManagerTest.java +++ b/managed/src/test/java/com/yugabyte/yw/common/NodeManagerTest.java @@ -1172,8 +1172,6 @@ private List nodeCommand( expectedCommand.add(destroyParams.instanceType); expectedCommand.add("--node_ip"); expectedCommand.add(destroyParams.nodeIP); - expectedCommand.add("--node_uuid"); - expectedCommand.add(destroyParams.nodeUuid.toString()); break; case Tags: InstanceActions.Params tagsParams = (InstanceActions.Params) params; @@ -1292,6 +1290,8 @@ private List nodeCommand( } expectedCommand.add("--remote_tmp_dir"); expectedCommand.add("/tmp"); + expectedCommand.add("--node_uuid"); + expectedCommand.add(params.nodeUuid.toString()); expectedCommand.add(params.nodeName); if (addGflags) { expectedCommand.add("--gflags"); @@ -1590,7 +1590,7 @@ private void runAndTestProvisionWithAccessKeyAndSG(String sgId) { addValidDeviceInfo(t, params); // Set up expected command - int accessKeyIndexOffset = 9; + int accessKeyIndexOffset = 11; if (t.cloudType.equals(Common.CloudType.aws) && params.deviceInfo.storageType.equals(PublicCloudConstants.StorageType.IO1)) { accessKeyIndexOffset += 2; @@ -1809,7 +1809,7 @@ private void runAndTestCreateWithAccessKeyAndSG(String sgId) { addValidDeviceInfo(t, params); // Set up expected command - int accessKeyIndexOffset = 8; + int accessKeyIndexOffset = 10; if (t.cloudType.equals(Common.CloudType.aws)) { accessKeyIndexOffset += 2; if (params.deviceInfo.storageType.equals(PublicCloudConstants.StorageType.IO1)) { @@ -2071,7 +2071,7 @@ public void testConfigureNodeCommandWithAccessKey() { "/path/to/private.key", "--custom_ssh_port", "3333"); - expectedCommand.addAll(expectedCommand.size() - 9, accessKeyCommand); + expectedCommand.addAll(expectedCommand.size() - 11, accessKeyCommand); reset(shellProcessHandler); nodeManager.nodeCommand(NodeManager.NodeCommandType.Configure, params); verify(shellProcessHandler, times(1)) @@ -3872,7 +3872,7 @@ public void testCronCheckWithAccessKey() { "/path/to/private.key")); accessKeyCommands.add("--custom_ssh_port"); accessKeyCommands.add("3333"); - expectedCommand.addAll(expectedCommand.size() - 3, accessKeyCommands); + expectedCommand.addAll(expectedCommand.size() - 5, accessKeyCommands); reset(shellProcessHandler); nodeManager.nodeCommand(NodeManager.NodeCommandType.CronCheck, params); verify(shellProcessHandler, times(1))