Skip to content

Commit

Permalink
fix heap evaporation bug
Browse files Browse the repository at this point in the history
  • Loading branch information
ankona committed Jul 24, 2024
1 parent 9a32a4b commit f3662dc
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 6 deletions.
15 changes: 9 additions & 6 deletions smartsim/_core/launcher/dragon/pqueue.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,15 +187,18 @@ def _get_next_available_node(
tracking_info = heapq.heappop(heap)
is_dirty = tracking_info[2]

ref_count = tracking_info[0]
if ref_count == 0:
# increment the ref count before putting back onto heap
tracking_info[0] = ref_count + 1

if tracking_info is not None:
heapq.heappush(heap, tracking_info)

# next available must enforce only "open" return nodes
if tracking_info[0] > 0:
if ref_count > 0:
return None

# increment the ref count and put back into heap
tracking_info[0] += 1
heapq.heappush(heap, tracking_info)
is_dirty = tracking_info[2]

return tracking_info

def _get_next_n_available_nodes(
Expand Down
25 changes: 25 additions & 0 deletions tests/test_node_prioritizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,31 @@ def test_node_prioritizer_indirect_increment() -> None:
assert tracking_info is None # <--- get_next shouldn't have any nodes to give


def test_node_prioritizer_indirect_decrement_availability() -> None:
"""Verify that a node who is decremented (dirty) is made assignable
on a subsequent request"""

num_cpu_nodes, num_gpu_nodes = 1, 0
cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes)
nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes)

lock = threading.RLock()
p = NodePrioritizer(nodes, lock)

# increment our only node...
p.increment(cpu_hosts[0])

tracking_info = p.next()
assert tracking_info is None, "No nodes should be assignable"

# perform a decrement...
p.decrement(cpu_hosts[0])

# ... and confirm that the node is available again
tracking_info = p.next()
assert tracking_info is not None, "A node should be assignable"


def test_node_prioritizer_multi_increment() -> None:
"""Verify that retrieving multiple nodes via `next_n` API correctly
increments reference counts and returns appropriate results"""
Expand Down

0 comments on commit f3662dc

Please sign in to comment.