From f3662dc547e027869590e58eef558e987abfeadb Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Wed, 24 Jul 2024 10:23:11 -0500 Subject: [PATCH] fix heap evaporation bug --- smartsim/_core/launcher/dragon/pqueue.py | 15 ++++++++------ tests/test_node_prioritizer.py | 25 ++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/smartsim/_core/launcher/dragon/pqueue.py b/smartsim/_core/launcher/dragon/pqueue.py index c50b280d45..5205c6882c 100644 --- a/smartsim/_core/launcher/dragon/pqueue.py +++ b/smartsim/_core/launcher/dragon/pqueue.py @@ -187,15 +187,18 @@ def _get_next_available_node( tracking_info = heapq.heappop(heap) is_dirty = tracking_info[2] + ref_count = tracking_info[0] + if ref_count == 0: + # increment the ref count before putting back onto heap + tracking_info[0] = ref_count + 1 + + if tracking_info is not None: + heapq.heappush(heap, tracking_info) + # next available must enforce only "open" return nodes - if tracking_info[0] > 0: + if ref_count > 0: return None - # increment the ref count and put back into heap - tracking_info[0] += 1 - heapq.heappush(heap, tracking_info) - is_dirty = tracking_info[2] - return tracking_info def _get_next_n_available_nodes( diff --git a/tests/test_node_prioritizer.py b/tests/test_node_prioritizer.py index ef4bbf7e2f..092e20f9cb 100644 --- a/tests/test_node_prioritizer.py +++ b/tests/test_node_prioritizer.py @@ -214,6 +214,31 @@ def test_node_prioritizer_indirect_increment() -> None: assert tracking_info is None # <--- get_next shouldn't have any nodes to give +def test_node_prioritizer_indirect_decrement_availability() -> None: + """Verify that a node who is decremented (dirty) is made assignable + on a subsequent request""" + + num_cpu_nodes, num_gpu_nodes = 1, 0 + cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes) + nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes) + + lock = threading.RLock() + p = NodePrioritizer(nodes, lock) + + # increment our only node... + p.increment(cpu_hosts[0]) + + tracking_info = p.next() + assert tracking_info is None, "No nodes should be assignable" + + # perform a decrement... + p.decrement(cpu_hosts[0]) + + # ... and confirm that the node is available again + tracking_info = p.next() + assert tracking_info is not None, "A node should be assignable" + + def test_node_prioritizer_multi_increment() -> None: """Verify that retrieving multiple nodes via `next_n` API correctly increments reference counts and returns appropriate results"""