From 2ba818062174f874e0b8ec2bcadc9dd607d122e5 Mon Sep 17 00:00:00 2001 From: Yury Shchetinin Date: Wed, 28 Aug 2024 21:40:09 +0300 Subject: [PATCH] [PLAT-14883][Azure] OS upgrades seems to be leaving over unattached disks Summary: Used the same approach of deleting disks after OS upgrade for azure. Removed duplicate code in VMImageUpgrade. Test Plan: 1) run OS upgrade with the same image and force=true and force it to fail while deleting disk 2) retry that task (without force=true) - verify that old disk is deleted Reviewers: svarshney, muthu, nsingh Reviewed By: muthu Subscribers: yugaware Differential Revision: https://phorge.dev.yugabyte.com/D37680 --- .../devops/opscli/ybops/cloud/azure/cloud.py | 4 + .../devops/opscli/ybops/cloud/azure/method.py | 6 +- .../devops/opscli/ybops/cloud/azure/utils.py | 20 +++++ .../tasks/upgrade/VMImageUpgrade.java | 85 ++++++++++--------- 4 files changed, 69 insertions(+), 46 deletions(-) diff --git a/managed/devops/opscli/ybops/cloud/azure/cloud.py b/managed/devops/opscli/ybops/cloud/azure/cloud.py index 002cd56f4031..9ba3506c1d65 100644 --- a/managed/devops/opscli/ybops/cloud/azure/cloud.py +++ b/managed/devops/opscli/ybops/cloud/azure/cloud.py @@ -237,6 +237,10 @@ def edit_dns_record_set(self, dns_zone_id, domain_name_prefix, ip_list): def delete_dns_record_set(self, dns_zone_id, domain_name_prefix): return self.get_admin().delete_dns_record_set(dns_zone_id, domain_name_prefix) + def delete_volumes(self, args): + tags = json.loads(args.instance_tags) if args.instance_tags is not None else {} + return self.get_admin().delete_disks(tags) + def modify_tags(self, args): instance = self.get_host_info(args) if not instance: diff --git a/managed/devops/opscli/ybops/cloud/azure/method.py b/managed/devops/opscli/ybops/cloud/azure/method.py index 63133753756d..3cf06efed7ff 100644 --- a/managed/devops/opscli/ybops/cloud/azure/method.py +++ b/managed/devops/opscli/ybops/cloud/azure/method.py @@ -131,11 +131,7 @@ def __init__(self, base_command): super(AzureReplaceRootVolumeMethod, self).__init__(base_command) def _mount_root_volume(self, host_info, volume): - curr_root_vol = host_info["root_volume"] self.cloud.mount_disk(host_info, volume) - disk_del = self.cloud.get_admin().delete_disk(curr_root_vol) - disk_del.wait() - logging.info("[app] Successfully deleted old OS disk {}".format(curr_root_vol)) def _host_info_with_current_root_volume(self, args, host_info): return (host_info, host_info.get("root_volume")) @@ -312,7 +308,7 @@ def __init__(self, base_command): super(AzureDeleteRootVolumesMethod, self).__init__(base_command) def delete_volumes(self, args): - pass + self.cloud.delete_volumes(args) class AzurePauseInstancesMethod(AbstractInstancesMethod): diff --git a/managed/devops/opscli/ybops/cloud/azure/utils.py b/managed/devops/opscli/ybops/cloud/azure/utils.py index 57a9822658c0..31d7330e604f 100644 --- a/managed/devops/opscli/ybops/cloud/azure/utils.py +++ b/managed/devops/opscli/ybops/cloud/azure/utils.py @@ -546,6 +546,26 @@ def update_os_disk(self, vm_name, os_disk): def delete_disk(self, disk_name): return self.compute_client.disks.begin_delete(RESOURCE_GROUP, os.path.basename(disk_name)) + def delete_disks(self, tags): + if not tags: + raise YBOpsRuntimeError('Tags must be specified') + universe_uuid = tags.get('universe-uuid') + if universe_uuid is None: + raise YBOpsRuntimeError('Universe UUID must be specified') + node_uuid = tags.get('node-uuid') + if node_uuid is None: + raise YBOpsRuntimeError('Node UUID must be specified') + disk_list = self.compute_client.disks.list_by_resource_group(RESOURCE_GROUP) + if disk_list: + for disk in disk_list: + if (disk.disk_state == "Unattached" and disk.tags + and disk.tags.get('universe-uuid') == universe_uuid + and disk.tags.get('node-uuid') == node_uuid): + logging.info("[app] Deleting disk {}".format(disk.name)) + disk_del = self.delete_disk(disk.name) + disk_del.wait() + logging.info("[app] Deleted disk {}".format(disk.name)) + def tag_disks(self, vm, tags): # Updating requires Disk as input rather than OSDisk. Retrieve Disk class with OSDisk name. disk = self.compute_client.disks.get( diff --git a/managed/src/main/java/com/yugabyte/yw/commissioner/tasks/upgrade/VMImageUpgrade.java b/managed/src/main/java/com/yugabyte/yw/commissioner/tasks/upgrade/VMImageUpgrade.java index 6fba0048361e..932210035024 100644 --- a/managed/src/main/java/com/yugabyte/yw/commissioner/tasks/upgrade/VMImageUpgrade.java +++ b/managed/src/main/java/com/yugabyte/yw/commissioner/tasks/upgrade/VMImageUpgrade.java @@ -30,10 +30,10 @@ import com.yugabyte.yw.models.helpers.NodeDetails.NodeState; import java.util.ArrayList; import java.util.Arrays; -import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; +import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; @@ -145,11 +145,26 @@ public void run() { }); } - private void createVMImageUpgradeTasks(Set nodes) { - createRootVolumeCreationTasks(nodes).setSubTaskGroupType(getTaskSubGroupType()); + private static class ImageSettings { + final String machineImage; + final String sshUserOverride; + final Integer sshPortOverride; + final UUID imageBundleUUID; + + private ImageSettings( + String machineImage, + String sshUserOverride, + Integer sshPortOverride, + UUID imageBundleUUID) { + this.machineImage = machineImage; + this.sshUserOverride = sshUserOverride; + this.sshPortOverride = sshPortOverride; + this.imageBundleUUID = imageBundleUUID; + } + } - Map clusterToImageBundleMap = new HashMap<>(); - Universe universe = getUniverse(); + private Map getImageSettingsForNodes(Set nodes) { + Map result = new LinkedHashMap<>(); UUID imageBundleUUID; for (NodeDetails node : nodes) { UUID region = taskParams().nodeToRegion.get(node.nodeUuid); @@ -188,6 +203,25 @@ private void createVMImageUpgradeTasks(Set nodes) { machineImage); continue; } + result.put( + node, new ImageSettings(machineImage, sshUserOverride, sshPortOverride, imageBundleUUID)); + } + return result; + } + + private void createVMImageUpgradeTasks(Set nodes) { + Map imageSettingsMap = getImageSettingsForNodes(nodes); + + createRootVolumeCreationTasks(imageSettingsMap).setSubTaskGroupType(getTaskSubGroupType()); + + Map clusterToImageBundleMap = new HashMap<>(); + Universe universe = getUniverse(); + for (NodeDetails node : imageSettingsMap.keySet()) { + ImageSettings imageSettings = imageSettingsMap.get(node); + final UUID imageBundleUUID = imageSettings.imageBundleUUID; + final String sshUserOverride = imageSettings.sshUserOverride; + final Integer sshPortOverride = imageSettings.sshPortOverride; + final String machineImage = imageSettings.machineImage; Set processTypes = new LinkedHashSet<>(); if (node.isMaster) { processTypes.add(ServerType.MASTER); @@ -310,49 +344,18 @@ private void createVMImageUpgradeTasks(Set nodes) { .setSubTaskGroupType(getTaskSubGroupType()); } - private SubTaskGroup createRootVolumeCreationTasks(Collection nodes) { + private SubTaskGroup createRootVolumeCreationTasks(Map settingsMap) { Map> rootVolumesPerAZ = - nodes.stream().collect(Collectors.groupingBy(n -> n.azUuid)); + settingsMap.keySet().stream().collect(Collectors.groupingBy(n -> n.azUuid)); SubTaskGroup subTaskGroup = createSubTaskGroup("CreateRootVolumes"); - Universe universe = getUniverse(); rootVolumesPerAZ.forEach( (key, value) -> { NodeDetails node = value.get(0); - UUID region = taskParams().nodeToRegion.get(node.nodeUuid); - String updatedMachineImage = ""; - if (taskParams().imageBundles != null && taskParams().imageBundles.size() > 0) { - UUID imageBundleUUID = retrieveImageBundleUUID(taskParams().imageBundles, node); - ImageBundle.NodeProperties toOverwriteNodeProperties = - imageBundleUtil.getNodePropertiesOrFail( - imageBundleUUID, node.cloudInfo.region, node.cloudInfo.cloud); - updatedMachineImage = toOverwriteNodeProperties.getMachineImage(); - } else { - // Backward compatiblity. - updatedMachineImage = taskParams().machineImages.get(region); - } - final String machineImage = updatedMachineImage; - int numVolumes = value.size(); - - if (!taskParams().forceVMImageUpgrade) { - numVolumes = - (int) - value.stream() - .filter( - n -> { - String existingMachineImage = n.machineImage; - if (StringUtils.isBlank(existingMachineImage)) { - existingMachineImage = retreiveMachineImageForNode(n); - } - return !machineImage.equals(existingMachineImage); - }) - .count(); - } + ImageSettings imageSettings = settingsMap.get(node); - if (numVolumes == 0) { - log.info("Nothing to upgrade in AZ {}", node.cloudInfo.az); - return; - } + final String machineImage = imageSettings.machineImage; + int numVolumes = value.size(); CreateRootVolumes.Params params = new CreateRootVolumes.Params(); Cluster cluster = taskParams().getClusterByUuid(node.placementUuid);