Skip to content

Commit

Permalink
[PLAT-16610] Verify that a cloud instance searched by the name, has t…
Browse files Browse the repository at this point in the history
…he same node UUID tag before performing any action in python ybops

Summary: Check again with node uuid tag value if it is set before picking up the node.

Test Plan: Manually created universes on aws and gcp and modified the tag to make sure the nodes are not deleted.

Reviewers: cwang, muthu, yshchetinin

Reviewed By: yshchetinin

Subscribers: yugaware

Differential Revision: https://phorge.dev.yugabyte.com/D41535
  • Loading branch information
nkhogen committed Jan 30, 2025
1 parent d93407a commit fbfe739
Show file tree
Hide file tree
Showing 9 changed files with 67 additions and 40 deletions.
21 changes: 12 additions & 9 deletions managed/devops/opscli/ybops/cloud/aws/cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,9 @@ def get_host_info(self, args, get_all=False, private_ip=None):
"""
region = args.region
search_pattern = args.search_pattern
return self.get_host_info_specific_args(region, search_pattern, get_all, private_ip)
node_uuid = args.node_uuid
return self.get_host_info_specific_args(region, search_pattern, get_all, private_ip,
node_uuid=node_uuid)

def get_host_info_specific_args(self, region, search_pattern, get_all=False,
private_ip=None, filters=None, node_uuid=None):
Expand All @@ -389,19 +391,14 @@ def get_host_info_specific_args(self, region, search_pattern, get_all=False,
"Name": "tag:Name",
"Values": [search_pattern]
})
if node_uuid:
filters.append({
"Name": "tag:node-uuid",
"Values": [node_uuid]
})
instances = []
for _, client in self._get_clients(region=region).items():
instances.extend(list(client.instances.filter(Filters=filters)))
results = []
for instance in instances:
data = instance.meta.data
if private_ip is not None and private_ip != data["PrivateIpAddress"]:
logging.warn("Node name {} is not unique. Expected IP {}, got IP {}".format(
logging.warning("Node name {} is not unique. Expected IP {}, got IP {}".format(
search_pattern, private_ip, data["PrivateIpAddress"]))
continue
zone = data["Placement"]["AvailabilityZone"]
Expand All @@ -416,7 +413,13 @@ def get_host_info_specific_args(self, region, search_pattern, get_all=False,
node_uuid_tags = [t["Value"] for t in data["Tags"] if t["Key"] == "node-uuid"]
universe_uuid_tags = [t["Value"] for t in data["Tags"]
if t["Key"] == "universe-uuid"]

host_node_uuid = node_uuid_tags[0] if node_uuid_tags else None
# Matching tag or no tag for backward compatibility.
if host_node_uuid is not None and node_uuid is not None \
and host_node_uuid != node_uuid:
logging.warning("VM {}({}) with node UUID {} is not found.".
format(search_pattern, host_node_uuid, node_uuid))
continue
primary_private_ip = None
secondary_private_ip = None
primary_subnet = None
Expand Down Expand Up @@ -447,7 +450,7 @@ def get_host_info_specific_args(self, region, search_pattern, get_all=False,
instance_type=data["InstanceType"],
server_type=server_tags[0] if server_tags else None,
launched_by=launched_by_tags[0] if launched_by_tags else None,
node_uuid=node_uuid_tags[0] if node_uuid_tags else None,
node_uuid=host_node_uuid,
universe_uuid=universe_uuid_tags[0] if universe_uuid_tags else None,
vpc=data["VpcId"],
instance_state=instance_state,
Expand Down
9 changes: 6 additions & 3 deletions managed/devops/opscli/ybops/cloud/aws/method.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,8 @@ def callback(self, args):
args.search_pattern,
get_all=False,
private_ip=args.node_ip,
filters=filters
filters=filters,
node_uuid=args.node_uuid
)
if not host_info:
logging.error("Host {} does not exist.".format(args.search_pattern))
Expand Down Expand Up @@ -234,7 +235,8 @@ def callback(self, args):
args.region,
args.search_pattern,
get_all=False,
private_ip=args.node_ip
private_ip=args.node_ip,
node_uuid=args.node_uuid
)

if not host_info:
Expand Down Expand Up @@ -263,7 +265,8 @@ def callback(self, args):
args.region,
args.search_pattern,
get_all=False,
private_ip=args.node_ip
private_ip=args.node_ip,
node_uuid=args.node_uuid
)

if not host_info:
Expand Down
4 changes: 3 additions & 1 deletion managed/devops/opscli/ybops/cloud/azure/cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,8 @@ def get_instance_types(self, args):
return self.get_admin().get_instance_types(regions)

def get_host_info(self, args, get_all=False):
return self.get_admin().get_host_info(args.search_pattern, get_all)
return self.get_admin().get_host_info(args.search_pattern, get_all,
node_uuid=args.node_uuid)

def get_device_names(self, args):
return ["sd{}".format(chr(i)) for i in range(ord('c'), ord('c') + args.num_volumes)]
Expand Down Expand Up @@ -245,6 +246,7 @@ def modify_tags(self, args):
instance = self.get_host_info(args)
if not instance:
raise YBOpsRuntimeError("Could not find instance {}".format(args.search_pattern))
# TODO this method is not present!
modify_tags(args.region, instance["id"], args.instance_tags, args.remove_tags)

def start_instance(self, host_info, server_ports):
Expand Down
16 changes: 11 additions & 5 deletions managed/devops/opscli/ybops/cloud/azure/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1166,12 +1166,21 @@ def get_ultra_instances(self, regions, folder):
json.dump(vms, writefile)
return

def get_host_info(self, vm_name, get_all=False):
def get_host_info(self, vm_name, get_all=False, node_uuid=None):
try:
vm = self.compute_client.virtual_machines.get(RESOURCE_GROUP, vm_name, 'instanceView')
except Exception as e:
logging.error("Failed to get VM info for {} with error {}".format(vm_name, e))
return None
host_node_uuid = vm.tags.get("node-uuid", None) if vm.tags else None
server_type = vm.tags.get("yb-server-type", None) if vm.tags else None
universe_uuid = vm.tags.get("universe-uuid", None) if vm.tags else None
# Matching tag or no tag for backward compatibility.
if host_node_uuid is not None and node_uuid is not None \
and host_node_uuid != node_uuid:
logging.warning("VM {}({}) with node UUID {} is not found.".
format(vm_name, host_node_uuid, node_uuid))
return None
nic_name = id_to_name(vm.network_profile.network_interfaces[0].id)
nic = self.network_client.network_interfaces.get(NETWORK_RESOURCE_GROUP, nic_name)
region = vm.location
Expand All @@ -1185,16 +1194,13 @@ def get_host_info(self, vm_name, get_all=False):
public_ip = (self.network_client.public_ip_addresses
.get(NETWORK_RESOURCE_GROUP, ip_name).ip_address)
subnet = id_to_name(nic.ip_configurations[0].subnet.id)
server_type = vm.tags.get("yb-server-type", None) if vm.tags else None
node_uuid = vm.tags.get("node-uuid", None) if vm.tags else None
universe_uuid = vm.tags.get("universe-uuid", None) if vm.tags else None
zone_full = "{}-{}".format(region, zone) if zone is not None else region
instance_state = self.extract_vm_instance_state(vm.instance_view)
is_running = True if instance_state == "running" else False
return {"private_ip": private_ip, "public_ip": public_ip, "region": region,
"zone": zone_full, "name": vm.name, "ip_name": ip_name,
"instance_type": vm.hardware_profile.vm_size, "server_type": server_type,
"subnet": subnet, "nic": nic_name, "id": vm.name, "node_uuid": node_uuid,
"subnet": subnet, "nic": nic_name, "id": vm.name, "node_uuid": host_node_uuid,
"universe_uuid": universe_uuid, "instance_state": instance_state,
"is_running": is_running, "root_volume": root_volume}

Expand Down
6 changes: 4 additions & 2 deletions managed/devops/opscli/ybops/cloud/common/method.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,10 @@ def add_extra_args(self):
self.parser.add_argument("--instance_tags",
required=False,
help="Tags for instances being created.")
self.parser.add_argument("--node_uuid",
default=None,
required=False,
help="The uuid of the instance.")
self.parser.add_argument("--systemd_services",
action="store_true",
default=False,
Expand Down Expand Up @@ -624,8 +628,6 @@ def add_extra_args(self):
action="store_true",
default=False,
help="Delete the static public ip.")
self.parser.add_argument("--node_uuid", default=None,
help="The uuid of the instance to delete.")

def callback(self, args):
self.update_ansible_vars_with_args(args)
Expand Down
18 changes: 11 additions & 7 deletions managed/devops/opscli/ybops/cloud/gcp/cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,10 +110,11 @@ def unmount_disk(self, args, name):
name, args['search_pattern'], args['zone']))
self.get_admin().unmount_disk(args['zone'], args['search_pattern'], name)

def stop_instance(self, args):
instance = self.get_admin().get_instances(args['zone'], args['search_pattern'])
def stop_instance(self, host_info):
instance = self.get_admin().get_instances(host_info['zone'], host_info['search_pattern'],
node_uuid=host_info.get('node_uuid', None))
if not instance:
logging.error("Host {} does not exist".format(args['search_pattern']))
logging.error("Host {} does not exist".format(host_info['search_pattern']))
return
instance_state = instance['instance_state']
if instance_state == 'TERMINATED':
Expand All @@ -128,10 +129,11 @@ def stop_instance(self, args):
raise YBOpsRuntimeError("Host {} cannot be stopped while in '{}' state".format(
instance['name'], instance_state))

def start_instance(self, args, server_ports):
instance = self.get_admin().get_instances(args['zone'], args['search_pattern'])
def start_instance(self, host_info, server_ports):
instance = self.get_admin().get_instances(host_info['zone'], host_info['search_pattern'],
node_uuid=host_info.get('node_uuid', None))
if not instance:
logging.error("Host {} does not exist".format(args['search_pattern']))
logging.error("Host {} does not exist".format(host_info['search_pattern']))
return
instance_state = instance['instance_state']
if instance_state == 'RUNNING':
Expand Down Expand Up @@ -335,7 +337,9 @@ def get_host_info(self, args, get_all=False, filters=None):
"""
zone = args.zone
search_pattern = args.search_pattern
return self.get_admin().get_instances(zone, search_pattern, get_all, filters=filters)
node_uuid = args.node_uuid
return self.get_admin().get_instances(zone, search_pattern, get_all, filters=filters,
node_uuid=node_uuid)

def get_device_names(self, args):
# Boot disk is also a persistent disk, so add persistent disks starting at index 1
Expand Down
13 changes: 10 additions & 3 deletions managed/devops/opscli/ybops/cloud/gcp/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -817,11 +817,11 @@ def get_pricing_map(self, ):
pricing_map[key] = pricing_map_all[key]
return pricing_map
except Exception as e:
logging.warn("[app] Exception {} determining GCP pricing info".format(e))
logging.warning("[app] Exception {} determining GCP pricing info".format(e))
return {}

@gcp_request_limit_retry
def get_instances(self, zone, instance_name, get_all=False, filters=None):
def get_instances(self, zone, instance_name, get_all=False, filters=None, node_uuid=None):
# TODO: filter should work to do (zone eq args.zone), but it doesn't right now...
filter_params = []
if filters:
Expand Down Expand Up @@ -853,6 +853,13 @@ def get_instances(self, zone, instance_name, get_all=False, filters=None):
server_types = [i["value"] for i in metadata if i["key"] == "server_type"]
node_uuid_tags = [i["value"] for i in metadata if i["key"] == "node-uuid"]
universe_uuid_tags = [i["value"] for i in metadata if i["key"] == "universe-uuid"]
host_node_uuid = node_uuid_tags[0] if node_uuid_tags else None
# Matching label or no label for backward compatibility.
if host_node_uuid is not None and node_uuid is not None \
and host_node_uuid != node_uuid:
logging.warning("VM {}({}) with node UUID {} is not found.".
format(instance_name, host_node_uuid, node_uuid))
continue
private_ip = None
primary_subnet = None
secondary_private_ip = None
Expand Down Expand Up @@ -891,7 +898,7 @@ def get_instances(self, zone, instance_name, get_all=False, filters=None):
zone=zone,
instance_type=machine_type,
server_type=server_types[0] if server_types else None,
node_uuid=node_uuid_tags[0] if node_uuid_tags else None,
node_uuid=host_node_uuid,
universe_uuid=universe_uuid_tags[0] if universe_uuid_tags else None,
launched_by=None,
launch_time=data.get("creationTimestamp"),
Expand Down
8 changes: 4 additions & 4 deletions managed/src/main/java/com/yugabyte/yw/common/NodeManager.java
Original file line number Diff line number Diff line change
Expand Up @@ -2226,10 +2226,6 @@ && imdsv2required(arch, userIntent, provider)) {
if (taskParam.useSystemd) {
commandArgs.add("--systemd_services");
}
if (taskParam.nodeUuid != null) {
commandArgs.add("--node_uuid");
commandArgs.add(taskParam.nodeUuid.toString());
}
if (taskParam.deviceInfo != null) {
commandArgs.addAll(getDeviceArgs(taskParam));
}
Expand Down Expand Up @@ -2642,6 +2638,10 @@ && imdsv2required(arch, userIntent, provider)) {
}
return localNodeManager.nodeCommand(type, nodeTaskParam, commandArgs);
}
if (nodeTaskParam.nodeUuid != null) {
commandArgs.add("--node_uuid");
commandArgs.add(nodeTaskParam.nodeUuid.toString());
}
commandArgs.add(nodeTaskParam.nodeName);
try {
Map<String, String> envVars =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1172,8 +1172,6 @@ private List<String> nodeCommand(
expectedCommand.add(destroyParams.instanceType);
expectedCommand.add("--node_ip");
expectedCommand.add(destroyParams.nodeIP);
expectedCommand.add("--node_uuid");
expectedCommand.add(destroyParams.nodeUuid.toString());
break;
case Tags:
InstanceActions.Params tagsParams = (InstanceActions.Params) params;
Expand Down Expand Up @@ -1292,6 +1290,8 @@ private List<String> nodeCommand(
}
expectedCommand.add("--remote_tmp_dir");
expectedCommand.add("/tmp");
expectedCommand.add("--node_uuid");
expectedCommand.add(params.nodeUuid.toString());
expectedCommand.add(params.nodeName);
if (addGflags) {
expectedCommand.add("--gflags");
Expand Down Expand Up @@ -1590,7 +1590,7 @@ private void runAndTestProvisionWithAccessKeyAndSG(String sgId) {
addValidDeviceInfo(t, params);

// Set up expected command
int accessKeyIndexOffset = 9;
int accessKeyIndexOffset = 11;
if (t.cloudType.equals(Common.CloudType.aws)
&& params.deviceInfo.storageType.equals(PublicCloudConstants.StorageType.IO1)) {
accessKeyIndexOffset += 2;
Expand Down Expand Up @@ -1809,7 +1809,7 @@ private void runAndTestCreateWithAccessKeyAndSG(String sgId) {
addValidDeviceInfo(t, params);

// Set up expected command
int accessKeyIndexOffset = 8;
int accessKeyIndexOffset = 10;
if (t.cloudType.equals(Common.CloudType.aws)) {
accessKeyIndexOffset += 2;
if (params.deviceInfo.storageType.equals(PublicCloudConstants.StorageType.IO1)) {
Expand Down Expand Up @@ -2071,7 +2071,7 @@ public void testConfigureNodeCommandWithAccessKey() {
"/path/to/private.key",
"--custom_ssh_port",
"3333");
expectedCommand.addAll(expectedCommand.size() - 9, accessKeyCommand);
expectedCommand.addAll(expectedCommand.size() - 11, accessKeyCommand);
reset(shellProcessHandler);
nodeManager.nodeCommand(NodeManager.NodeCommandType.Configure, params);
verify(shellProcessHandler, times(1))
Expand Down Expand Up @@ -3872,7 +3872,7 @@ public void testCronCheckWithAccessKey() {
"/path/to/private.key"));
accessKeyCommands.add("--custom_ssh_port");
accessKeyCommands.add("3333");
expectedCommand.addAll(expectedCommand.size() - 3, accessKeyCommands);
expectedCommand.addAll(expectedCommand.size() - 5, accessKeyCommands);
reset(shellProcessHandler);
nodeManager.nodeCommand(NodeManager.NodeCommandType.CronCheck, params);
verify(shellProcessHandler, times(1))
Expand Down

0 comments on commit fbfe739

Please sign in to comment.