From 0af1783228864c5a801e5ec72e67bfc11a2bc4c1 Mon Sep 17 00:00:00 2001 From: nikhil1697 Date: Wed, 5 Jun 2024 13:03:52 +0530 Subject: [PATCH 1/8] release_calico_leaked_ips --- kubemarine/procedures/upgrade.py | 50 ++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/kubemarine/procedures/upgrade.py b/kubemarine/procedures/upgrade.py index b24c4f854..527572a3d 100755 --- a/kubemarine/procedures/upgrade.py +++ b/kubemarine/procedures/upgrade.py @@ -15,6 +15,7 @@ import itertools from collections import OrderedDict from typing import List, Callable, Dict +import uuid from kubemarine import kubernetes, plugins, admission, jinja from kubemarine.core import flow, log, resources as res @@ -160,6 +161,54 @@ def upgrade_plugins(cluster: KubernetesCluster) -> None: plugins.install(cluster, upgrade_candidates) +def release_calico_leaked_ips(cluster): + """ + During drain command we ignore daemon sets, as result this such pods as ingress-nginx-controller arent't deleted before migration. + For this reason their ips can stay in calico ipam despite they aren't used. You can check this, if you run "calicoctl ipam check --show-problem-ips" right after apply_new_cri task. + Those ips are cleaned by calico garbage collector, but it can take about 20 minutes. + This task releases problem ips with force. + """ + first_control_plane = cluster.nodes['control-plane'].get_first_member() + cluster.log.debug("Getting leaked ips...") + random_report_name = "/tmp/%s.json" % uuid.uuid4().hex + result = first_control_plane.sudo(f"calicoctl ipam check --show-problem-ips -o {random_report_name} | grep 'leaked' || true", is_async=False, hide=False) + leaked_ips = result.get_simple_out() + leaked_ips_count = leaked_ips.count('leaked') + cluster.log.debug(f"Found {leaked_ips_count} leaked ips") + + # Initialize lists to store IPs with missing handles and handles with no matching IPs + ips_with_missing_handles = [] + handles_with_no_matching_ips = [] + + if leaked_ips_count != 0: + # Parse the leaked IPAM report to identify IPs with missing handles and handles with no matching IPs + with open(random_report_name, 'r') as report_file: + for line in report_file: + if 'IPs that are allocated but not actually in use' in line: + continue + elif 'IPAM handles with no matching IPs' in line: + break + elif 'affinity' in line: + ip_address = line.split()[2] + ips_with_missing_handles.append(ip_address) + elif 'affinity=host' in line: + handle = line.split()[3][:-1] # Remove trailing colon + handles_with_no_matching_ips.append(handle) + + # Release IPs with missing handles + if ips_with_missing_handles: + cluster.log.debug("Releasing IPs with missing handles...") + for ip in ips_with_missing_handles: + first_control_plane.sudo(f"calicoctl ipam release --ip={ip} --force", is_async=False, hide=False) + + # Release handles with no matching IPs + if handles_with_no_matching_ips: + cluster.log.debug("Releasing handles with no matching IPs...") + for handle in handles_with_no_matching_ips: + first_control_plane.sudo(f"calicoctl ipam release --handle={handle} --force", is_async=False, hide=False) + + # Clean up the temporary report file + first_control_plane.sudo(f"rm {random_report_name}", is_async=False, hide=False) tasks = OrderedDict({ "cleanup_tmp_dir": cleanup_tmp_dir, @@ -170,6 +219,7 @@ def upgrade_plugins(cluster: KubernetesCluster) -> None: "kubernetes_cleanup": kubernetes_cleanup_nodes_versions, "packages": upgrade_packages, "plugins": upgrade_plugins, + "release_calico_leaked_ips": release_calico_leaked_ips, # Added here "overview": install.overview }) From 4cc9a6580c58f1a9e16635319357e40d8c24f0de Mon Sep 17 00:00:00 2001 From: nikhil1697 Date: Mon, 24 Jun 2024 13:12:14 +0530 Subject: [PATCH 2/8] missing a type annotation --- kubemarine/procedures/upgrade.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kubemarine/procedures/upgrade.py b/kubemarine/procedures/upgrade.py index 527572a3d..a55fa4c37 100755 --- a/kubemarine/procedures/upgrade.py +++ b/kubemarine/procedures/upgrade.py @@ -161,7 +161,7 @@ def upgrade_plugins(cluster: KubernetesCluster) -> None: plugins.install(cluster, upgrade_candidates) -def release_calico_leaked_ips(cluster): +def release_calico_leaked_ips(cluster) -> None: """ During drain command we ignore daemon sets, as result this such pods as ingress-nginx-controller arent't deleted before migration. For this reason their ips can stay in calico ipam despite they aren't used. You can check this, if you run "calicoctl ipam check --show-problem-ips" right after apply_new_cri task. From 93d228789db67b934ef6f9f3dc602bc8d704f37c Mon Sep 17 00:00:00 2001 From: nikhil1697 Date: Mon, 24 Jun 2024 13:32:34 +0530 Subject: [PATCH 3/8] missing a type annotation-1 --- kubemarine/procedures/upgrade.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kubemarine/procedures/upgrade.py b/kubemarine/procedures/upgrade.py index a55fa4c37..88938ff77 100755 --- a/kubemarine/procedures/upgrade.py +++ b/kubemarine/procedures/upgrade.py @@ -161,7 +161,7 @@ def upgrade_plugins(cluster: KubernetesCluster) -> None: plugins.install(cluster, upgrade_candidates) -def release_calico_leaked_ips(cluster) -> None: +def release_calico_leaked_ips(cluster: KubernetesCluster) -> None: """ During drain command we ignore daemon sets, as result this such pods as ingress-nginx-controller arent't deleted before migration. For this reason their ips can stay in calico ipam despite they aren't used. You can check this, if you run "calicoctl ipam check --show-problem-ips" right after apply_new_cri task. From 5ada6b68cc8e2f99c1bb1aa81464ae9ebffd7cd3 Mon Sep 17 00:00:00 2001 From: nikhil1697 Date: Mon, 24 Jun 2024 13:45:46 +0530 Subject: [PATCH 4/8] missing a type annotation-2 --- kubemarine/procedures/upgrade.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/kubemarine/procedures/upgrade.py b/kubemarine/procedures/upgrade.py index 88938ff77..87a145ea8 100755 --- a/kubemarine/procedures/upgrade.py +++ b/kubemarine/procedures/upgrade.py @@ -12,6 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + import itertools from collections import OrderedDict from typing import List, Callable, Dict @@ -161,17 +162,15 @@ def upgrade_plugins(cluster: KubernetesCluster) -> None: plugins.install(cluster, upgrade_candidates) + def release_calico_leaked_ips(cluster: KubernetesCluster) -> None: """ - During drain command we ignore daemon sets, as result this such pods as ingress-nginx-controller arent't deleted before migration. - For this reason their ips can stay in calico ipam despite they aren't used. You can check this, if you run "calicoctl ipam check --show-problem-ips" right after apply_new_cri task. - Those ips are cleaned by calico garbage collector, but it can take about 20 minutes. - This task releases problem ips with force. + Releases leaked IPs with force to handle IPAM issues caused by leftover IPs from pods not properly cleaned up. """ first_control_plane = cluster.nodes['control-plane'].get_first_member() cluster.log.debug("Getting leaked ips...") random_report_name = "/tmp/%s.json" % uuid.uuid4().hex - result = first_control_plane.sudo(f"calicoctl ipam check --show-problem-ips -o {random_report_name} | grep 'leaked' || true", is_async=False, hide=False) + result = first_control_plane.sudo(f"calicoctl ipam check --show-problem-ips -o {random_report_name} | grep 'leaked' || true", hide=False) leaked_ips = result.get_simple_out() leaked_ips_count = leaked_ips.count('leaked') cluster.log.debug(f"Found {leaked_ips_count} leaked ips") @@ -188,27 +187,29 @@ def release_calico_leaked_ips(cluster: KubernetesCluster) -> None: continue elif 'IPAM handles with no matching IPs' in line: break - elif 'affinity' in line: - ip_address = line.split()[2] - ips_with_missing_handles.append(ip_address) - elif 'affinity=host' in line: - handle = line.split()[3][:-1] # Remove trailing colon - handles_with_no_matching_ips.append(handle) + else: + ip = line.split()[0] + ips_with_missing_handles.append(ip) + + for line in report_file: + handle = line.split()[3][:-1] # Remove trailing colon + handles_with_no_matching_ips.append(handle) # Release IPs with missing handles if ips_with_missing_handles: cluster.log.debug("Releasing IPs with missing handles...") for ip in ips_with_missing_handles: - first_control_plane.sudo(f"calicoctl ipam release --ip={ip} --force", is_async=False, hide=False) + first_control_plane.sudo(f"calicoctl ipam release --ip={ip} --force", hide=False) # Release handles with no matching IPs if handles_with_no_matching_ips: cluster.log.debug("Releasing handles with no matching IPs...") for handle in handles_with_no_matching_ips: - first_control_plane.sudo(f"calicoctl ipam release --handle={handle} --force", is_async=False, hide=False) + first_control_plane.sudo(f"calicoctl ipam release --handle={handle} --force", hide=False) # Clean up the temporary report file - first_control_plane.sudo(f"rm {random_report_name}", is_async=False, hide=False) + first_control_plane.sudo(f"rm {random_report_name}", hide=False) + tasks = OrderedDict({ "cleanup_tmp_dir": cleanup_tmp_dir, From 651ed74449e7aa937da44652670bb6a478b5a123 Mon Sep 17 00:00:00 2001 From: nikhil1697 Date: Mon, 24 Jun 2024 13:55:46 +0530 Subject: [PATCH 5/8] fix pylinter error --- kubemarine/procedures/upgrade.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/kubemarine/procedures/upgrade.py b/kubemarine/procedures/upgrade.py index 87a145ea8..39a355ad7 100755 --- a/kubemarine/procedures/upgrade.py +++ b/kubemarine/procedures/upgrade.py @@ -170,7 +170,10 @@ def release_calico_leaked_ips(cluster: KubernetesCluster) -> None: first_control_plane = cluster.nodes['control-plane'].get_first_member() cluster.log.debug("Getting leaked ips...") random_report_name = "/tmp/%s.json" % uuid.uuid4().hex - result = first_control_plane.sudo(f"calicoctl ipam check --show-problem-ips -o {random_report_name} | grep 'leaked' || true", hide=False) + result = first_control_plane.sudo( + "calicoctl ipam check --show-problem-ips -o {random_report_name} | grep 'leaked' || true", + hide=False + ) leaked_ips = result.get_simple_out() leaked_ips_count = leaked_ips.count('leaked') cluster.log.debug(f"Found {leaked_ips_count} leaked ips") @@ -179,21 +182,17 @@ def release_calico_leaked_ips(cluster: KubernetesCluster) -> None: ips_with_missing_handles = [] handles_with_no_matching_ips = [] - if leaked_ips_count != 0: - # Parse the leaked IPAM report to identify IPs with missing handles and handles with no matching IPs - with open(random_report_name, 'r') as report_file: + if leaked_ips_count > 0: + cluster.log.debug("Collecting IPs with missing handles and handles with no matching IPs...") + with open(random_report_name, 'r', encoding='utf-8') as report_file: for line in report_file: - if 'IPs that are allocated but not actually in use' in line: - continue - elif 'IPAM handles with no matching IPs' in line: - break - else: + if 'no matching handle' in line: ip = line.split()[0] ips_with_missing_handles.append(ip) - - for line in report_file: - handle = line.split()[3][:-1] # Remove trailing colon - handles_with_no_matching_ips.append(handle) + continue + if 'no matching IP' in line: + handle = line.split()[3][:-1] # Remove trailing colon + handles_with_no_matching_ips.append(handle) # Release IPs with missing handles if ips_with_missing_handles: From 32a90edee8aa886f3f47f22700ed1be8cdcef397 Mon Sep 17 00:00:00 2001 From: niam0522 Date: Fri, 12 Jul 2024 02:10:50 +0530 Subject: [PATCH 6/8] fix --- kubemarine/procedures/upgrade.py | 64 +++++++++++--------------------- 1 file changed, 21 insertions(+), 43 deletions(-) diff --git a/kubemarine/procedures/upgrade.py b/kubemarine/procedures/upgrade.py index 39a355ad7..bfa742b20 100755 --- a/kubemarine/procedures/upgrade.py +++ b/kubemarine/procedures/upgrade.py @@ -17,7 +17,6 @@ from collections import OrderedDict from typing import List, Callable, Dict import uuid - from kubemarine import kubernetes, plugins, admission, jinja from kubemarine.core import flow, log, resources as res from kubemarine.core import utils @@ -163,52 +162,31 @@ def upgrade_plugins(cluster: KubernetesCluster) -> None: plugins.install(cluster, upgrade_candidates) -def release_calico_leaked_ips(cluster: KubernetesCluster) -> None: +def release_calico_leaked_ips(cluster): """ - Releases leaked IPs with force to handle IPAM issues caused by leftover IPs from pods not properly cleaned up. + Sometimes ips can stay in calico ipam despite they aren't used. You can check this, if you run "calicoctl ipam check --show-problem-ips". + Those ips are cleaned by calico garbage collector, but it can take about 20 minutes. + This task releases problem ips with force. """ + # Identify the first control plane node first_control_plane = cluster.nodes['control-plane'].get_first_member() - cluster.log.debug("Getting leaked ips...") - random_report_name = "/tmp/%s.json" % uuid.uuid4().hex - result = first_control_plane.sudo( - "calicoctl ipam check --show-problem-ips -o {random_report_name} | grep 'leaked' || true", - hide=False - ) - leaked_ips = result.get_simple_out() - leaked_ips_count = leaked_ips.count('leaked') - cluster.log.debug(f"Found {leaked_ips_count} leaked ips") - - # Initialize lists to store IPs with missing handles and handles with no matching IPs - ips_with_missing_handles = [] - handles_with_no_matching_ips = [] - - if leaked_ips_count > 0: - cluster.log.debug("Collecting IPs with missing handles and handles with no matching IPs...") - with open(random_report_name, 'r', encoding='utf-8') as report_file: - for line in report_file: - if 'no matching handle' in line: - ip = line.split()[0] - ips_with_missing_handles.append(ip) - continue - if 'no matching IP' in line: - handle = line.split()[3][:-1] # Remove trailing colon - handles_with_no_matching_ips.append(handle) - - # Release IPs with missing handles - if ips_with_missing_handles: - cluster.log.debug("Releasing IPs with missing handles...") - for ip in ips_with_missing_handles: - first_control_plane.sudo(f"calicoctl ipam release --ip={ip} --force", hide=False) - - # Release handles with no matching IPs - if handles_with_no_matching_ips: - cluster.log.debug("Releasing handles with no matching IPs...") - for handle in handles_with_no_matching_ips: - first_control_plane.sudo(f"calicoctl ipam release --handle={handle} --force", hide=False) - - # Clean up the temporary report file - first_control_plane.sudo(f"rm {random_report_name}", hide=False) + cluster.log.debug("Getting leaked IPs...") + # Generate a unique report name + random_report_name = "/tmp/%s.json" % uuid.uuid4().hex + try: + # Run calicoctl ipam check and save the results + result = first_control_plane.sudo(f"calicoctl ipam check --show-problem-ips -o {random_report_name} | grep 'leaked' || true", hide=False) + cluster.log.debug(f"IPAM check completed and results saved to {random_report_name}") + + # Release the leaked IPs + release_command = f"calicoctl ipam release --from-report={random_report_name} --force" + release_output = first_control_plane.sudo(release_command, hide=False) + + finally: + # Clean up the temporary report file + first_control_plane.sudo(f"rm {random_report_name}", hide=False) + cluster.log.debug(f"Cleaned up report file: {random_report_name}") tasks = OrderedDict({ "cleanup_tmp_dir": cleanup_tmp_dir, From 1757045a3efb6f3c54088e093028d14653205775 Mon Sep 17 00:00:00 2001 From: niam0522 Date: Fri, 12 Jul 2024 02:13:38 +0530 Subject: [PATCH 7/8] fix-1 --- kubemarine/procedures/upgrade.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kubemarine/procedures/upgrade.py b/kubemarine/procedures/upgrade.py index bfa742b20..b1a4e8b36 100755 --- a/kubemarine/procedures/upgrade.py +++ b/kubemarine/procedures/upgrade.py @@ -162,7 +162,7 @@ def upgrade_plugins(cluster: KubernetesCluster) -> None: plugins.install(cluster, upgrade_candidates) -def release_calico_leaked_ips(cluster): +def release_calico_leaked_ips(cluster: KubernetesCluster) -> None: """ Sometimes ips can stay in calico ipam despite they aren't used. You can check this, if you run "calicoctl ipam check --show-problem-ips". Those ips are cleaned by calico garbage collector, but it can take about 20 minutes. From d93b486157e6ae253d602dfb47ea98cf45dae5c9 Mon Sep 17 00:00:00 2001 From: niam0522 Date: Fri, 12 Jul 2024 02:22:31 +0530 Subject: [PATCH 8/8] fix-2 --- kubemarine/procedures/upgrade.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/kubemarine/procedures/upgrade.py b/kubemarine/procedures/upgrade.py index b1a4e8b36..5bb6ed472 100755 --- a/kubemarine/procedures/upgrade.py +++ b/kubemarine/procedures/upgrade.py @@ -164,9 +164,10 @@ def upgrade_plugins(cluster: KubernetesCluster) -> None: def release_calico_leaked_ips(cluster: KubernetesCluster) -> None: """ - Sometimes ips can stay in calico ipam despite they aren't used. You can check this, if you run "calicoctl ipam check --show-problem-ips". - Those ips are cleaned by calico garbage collector, but it can take about 20 minutes. - This task releases problem ips with force. + Sometimes IPs can stay in Calico IPAM despite not being used. + You can check this by running "calicoctl ipam check --show-problem-ips". + Those IPs are cleaned by Calico garbage collector, but it can take about 20 minutes. + This task releases problem IPs with force. """ # Identify the first control plane node first_control_plane = cluster.nodes['control-plane'].get_first_member() @@ -176,18 +177,23 @@ def release_calico_leaked_ips(cluster: KubernetesCluster) -> None: random_report_name = "/tmp/%s.json" % uuid.uuid4().hex try: # Run calicoctl ipam check and save the results - result = first_control_plane.sudo(f"calicoctl ipam check --show-problem-ips -o {random_report_name} | grep 'leaked' || true", hide=False) + first_control_plane.sudo( + f"calicoctl ipam check --show-problem-ips -o {random_report_name} " + "| grep 'leaked' || true", hide=False + ) cluster.log.debug(f"IPAM check completed and results saved to {random_report_name}") # Release the leaked IPs - release_command = f"calicoctl ipam release --from-report={random_report_name} --force" - release_output = first_control_plane.sudo(release_command, hide=False) - + first_control_plane.sudo( + f"calicoctl ipam release --from-report={random_report_name} --force", + hide=False + ) finally: # Clean up the temporary report file first_control_plane.sudo(f"rm {random_report_name}", hide=False) cluster.log.debug(f"Cleaned up report file: {random_report_name}") + tasks = OrderedDict({ "cleanup_tmp_dir": cleanup_tmp_dir, "verify_upgrade_versions": kubernetes.verify_upgrade_versions,