Skip to content

Commit 81f5894

Browse files
dulinrileyfacebook-github-bot
authored andcommitted
Fix panic in RemoteProcessAlloc select_all (#281)
Summary: Pull Request resolved: #281 There was a bug in the remote process allocator, where if all hosts became closed, the loop would hit a panic in `select_all` given an empty list of handles. This change exits out of the loop instead, completing the task. Added some asserts en route to the allocator to ensure it doesn't get a list of zero hosts to query. And some more `__repr__`'s which were helpful when debugging with logs. Reviewed By: shayne-fletcher Differential Revision: D76750206 fbshipit-source-id: 6c24d0a46a2205ba472aeefd1e806a7a3d94093f
1 parent 984b168 commit 81f5894

File tree

3 files changed

+25
-0
lines changed

3 files changed

+25
-0
lines changed

hyperactor_extension/src/alloc.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,17 @@ impl PyAlloc {
4949
}
5050
}
5151

52+
#[pymethods]
53+
impl PyAlloc {
54+
fn __repr__(&self) -> PyResult<String> {
55+
let data = self.inner.lock().unwrap();
56+
match &*data {
57+
None => Ok("Alloc(None)".to_string()),
58+
Some(wrapper) => Ok(format!("Alloc({})", wrapper.shape())),
59+
}
60+
}
61+
}
62+
5263
/// Internal wrapper to translate from a dyn Alloc to an impl Alloc. Used
5364
/// to support polymorphism in the Python bindings.
5465
pub struct PyAllocWrapper {

hyperactor_mesh/src/alloc/remoteprocess.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -529,6 +529,7 @@ impl RemoteProcessAlloc {
529529
let watcher = WatchStream::new(tx_status);
530530
tx_watchers.push((watcher, host.id.clone()));
531531
}
532+
assert!(!tx_watchers.is_empty());
532533
let tx = self.comm_watcher_tx.clone();
533534
tokio::spawn(async move {
534535
loop {
@@ -558,6 +559,10 @@ impl RemoteProcessAlloc {
558559
break;
559560
}
560561
tx_watchers.remove(index);
562+
if tx_watchers.is_empty() {
563+
// All of the statuses have been closed, exit the loop.
564+
break;
565+
}
561566
}
562567
}
563568
}
@@ -579,6 +584,9 @@ impl RemoteProcessAlloc {
579584
.initialize_alloc()
580585
.await
581586
.context("alloc initializer error")?;
587+
if hosts.is_empty() {
588+
anyhow::bail!("Initializer returned empty list of hosts");
589+
}
582590
// prepare a list of host names in this allocation to be sent
583591
// to remote allocators.
584592
let hostnames: Vec<_> = hosts.iter().map(|e| e.hostname.clone()).collect();

python/monarch/actor_mesh.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -337,6 +337,9 @@ def __iter__(self):
337337
def __len__(self) -> int:
338338
return len(self._shape)
339339

340+
def __repr__(self) -> str:
341+
return f"ValueMesh({self._shape})"
342+
340343
@property
341344
def _ndslice(self) -> NDSlice:
342345
return self._shape.ndslice
@@ -702,6 +705,9 @@ def _new_with_shape(self, shape: Shape) -> "ActorMeshRef":
702705
self._mailbox,
703706
)
704707

708+
def __repr__(self) -> str:
709+
return f"ActorMeshRef(class={self._class}, shape={self._actor_mesh_ref._shape})"
710+
705711

706712
class ActorError(Exception):
707713
"""

0 commit comments

Comments
 (0)