Skip to content

Commit 066212f

Browse files
dulinriley and facebook-github-bot
authored and committed
Expand workspace to cover all crates and enable rust testing (#1684)
Summary: Expand GitHub Rust testing to the whole workspace of crates in monarch. Tests that do not pass in GitHub are marked as fb-only for now. Many of them can be fixed easily, but we can turn on the majority of tests right away. Reviewed By: colin2328 Differential Revision: D85676520
1 parent c932809 commit 066212f

File tree

23 files changed

+88
-13
lines changed

23 files changed

+88
-13
lines changed

.config/nextest.toml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
[profile.ci]
2+
# Do not cancel the test run on the first failure.
3+
fail-fast = false
4+
5+
[profile.ci.junit]
6+
path = "junit.xml"

.github/workflows/test-gpu-rust.yml

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,18 @@ jobs:
5656
5757
# Run GPU Rust tests
5858
echo "Running OSS Rust tests..."
59-
# TODO: fix broken tests, then update to `cargo test --no-fail-fast`
60-
cargo test -p monarch_rdma
6159
# Uses cargo nextest to run tests in separate processes, which better matches
6260
# internal buck test behavior.
63-
# TODO: increase coverage to more crates.
64-
cargo nextest run -p hyperactor --no-fail-fast
61+
# The CI profile is configured in .config/nextest.toml
62+
# Exclude filter is for packages that don't build in GitHub Actions yet.
63+
# * monarch_messages: monarch/target/debug/deps/monarch_messages-...:
64+
# /lib64/libm.so.6: version `GLIBC_2.29' not found
65+
# (required by /meta-pytorch/monarch/libtorch/lib/libtorch_cpu.so)
66+
cargo nextest run --workspace --profile ci \
67+
--exclude monarch_messages \
68+
--exclude monarch_tensor_worker \
69+
--exclude monarch_simulator_lib \
70+
--exclude torch-sys \
71+
--exclude torch-sys-cuda
72+
# Copy the test results to the expected location
73+
cp target/nextest/ci/junit.xml "${RUNNER_TEST_RESULTS_DIR:-test-results}/junit.xml"

Cargo.toml

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
[workspace]
22
resolver = "2"
33
members = [
4+
"build_utils",
45
"controller",
56
"cuda-sys",
67
"erased_lifetime",
@@ -10,11 +11,22 @@ members = [
1011
"hyperactor_multiprocess",
1112
"hyperactor_mesh",
1213
"hyperactor_mesh_macros",
13-
"ndslice",
14+
"hyperactor_telemetry",
15+
"monarch_conda",
1416
"monarch_extension",
15-
"monarch_tensor_worker",
17+
"monarch_hyperactor",
18+
"monarch_messages",
19+
"monarch_perfetto_trace",
1620
"monarch_rdma",
21+
"monarch_simulator",
22+
"monarch_tensor_worker",
23+
"monarch_types",
1724
"nccl-sys",
25+
"ndslice",
26+
"preempt_rwlock",
1827
"rdmaxcel-sys",
28+
"serde_multipart",
29+
"timed_test",
1930
"torch-sys",
31+
"torch-sys-cuda",
2032
]

controller/src/lib.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -660,6 +660,7 @@ mod tests {
660660
use monarch_messages::worker::CallFunctionParams;
661661
use monarch_messages::worker::WorkerMessage;
662662
use monarch_types::PyTree;
663+
use timed_test::async_timed_test;
663664
use torch_sys::RValue;
664665

665666
use super::*;
@@ -855,7 +856,7 @@ mod tests {
855856
);
856857
}
857858

858-
#[tokio::test]
859+
#[async_timed_test(timeout_secs = 30)]
859860
async fn worker_timeout() {
860861
tokio::time::pause();
861862
let timeout_secs = 3;
@@ -1838,7 +1839,7 @@ mod tests {
18381839

18391840
hyperactor::remote!(PanickingActor);
18401841

1841-
#[tokio::test]
1842+
#[async_timed_test(timeout_secs = 30)]
18421843
async fn test_supervision_fault() {
18431844
// Start system actor.
18441845
let timeout: Duration = Duration::from_secs(6);

hyperactor_mesh/src/actor_mesh.rs

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1482,18 +1482,19 @@ mod tests {
14821482

14831483
use crate::alloc::process::ProcessAllocator;
14841484

1485+
#[cfg(feature = "fb")]
14851486
fn process_allocator() -> ProcessAllocator {
14861487
ProcessAllocator::new(Command::new(crate::testresource::get(
14871488
"monarch/hyperactor_mesh/bootstrap",
14881489
)))
14891490
}
14901491

1491-
#[cfg(fbcode_build)] // we use an external binary, produced by buck
1492+
#[cfg(feature = "fb")] // we use an external binary, produced by buck
14921493
actor_mesh_test_suite!(process_allocator());
14931494

14941495
// This test is concerned with correctly reporting failures
14951496
// when message sizes exceed configured limits.
1496-
#[cfg(fbcode_build)]
1497+
#[cfg(feature = "fb")]
14971498
//#[tracing_test::traced_test]
14981499
#[async_timed_test(timeout_secs = 30)]
14991500
async fn test_oversized_frames() {
@@ -1603,7 +1604,7 @@ mod tests {
16031604
// Set this test only for `mod process` because it relies on a
16041605
// trick to emulate router failure that only works when using
16051606
// non-local allocators.
1606-
#[cfg(fbcode_build)]
1607+
#[cfg(feature = "fb")]
16071608
#[tokio::test]
16081609
async fn test_router_undeliverable_return() {
16091610
// Test that an undeliverable message received by a
@@ -1928,6 +1929,7 @@ mod tests {
19281929
use crate::sel;
19291930

19301931
#[tokio::test]
1932+
#[cfg(feature = "fb")]
19311933
async fn test_basic() {
19321934
let instance = v1::testing::instance().await;
19331935
let host_mesh = v1::testing::host_mesh(extent!(host = 4)).await;

hyperactor_mesh/src/alloc.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -914,6 +914,7 @@ pub(crate) mod testing {
914914
/// a proc that does not time out when it is asked to wait for
915915
/// a stuck actor.
916916
#[tokio::test]
917+
#[cfg(feature = "fb")]
917918
async fn test_allocator_stuck_task() {
918919
// Override config.
919920
// Use temporary config for this test

hyperactor_mesh/src/alloc/process.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -644,11 +644,12 @@ impl Drop for ProcessAlloc {
644644
mod tests {
645645
use super::*;
646646

647-
#[cfg(fbcode_build)] // we use an external binary, produced by buck
647+
#[cfg(feature = "fb")] // we use an external binary, produced by buck
648648
crate::alloc_test_suite!(ProcessAllocator::new(Command::new(
649649
crate::testresource::get("monarch/hyperactor_mesh/bootstrap")
650650
)));
651651

652+
#[cfg(feature = "fb")]
652653
#[tokio::test]
653654
async fn test_sigterm_on_group_fail() {
654655
let bootstrap_binary = crate::testresource::get("monarch/hyperactor_mesh/bootstrap");

hyperactor_mesh/src/alloc/remoteprocess.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2055,6 +2055,7 @@ mod test_alloc {
20552055
use super::*;
20562056

20572057
#[async_timed_test(timeout_secs = 60)]
2058+
#[cfg(feature = "fb")]
20582059
async fn test_alloc_simple() {
20592060
// Use temporary config for this test
20602061
let config = hyperactor::config::global::lock();
@@ -2184,6 +2185,7 @@ mod test_alloc {
21842185
}
21852186

21862187
#[async_timed_test(timeout_secs = 60)]
2188+
#[cfg(feature = "fb")]
21872189
async fn test_alloc_host_failure() {
21882190
// Use temporary config for this test
21892191
let config = hyperactor::config::global::lock();
@@ -2315,6 +2317,7 @@ mod test_alloc {
23152317
}
23162318

23172319
#[async_timed_test(timeout_secs = 15)]
2320+
#[cfg(feature = "fb")]
23182321
async fn test_alloc_inner_alloc_failure() {
23192322
// SAFETY: Test happens in single-threaded code.
23202323
unsafe {
@@ -2450,6 +2453,7 @@ mod test_alloc {
24502453

24512454
#[tracing_test::traced_test]
24522455
#[async_timed_test(timeout_secs = 60)]
2456+
#[cfg(feature = "fb")]
24532457
async fn test_remote_process_alloc_signal_handler() {
24542458
let num_proc_meshes = 5;
24552459
let hosts_per_proc_mesh = 5;

hyperactor_mesh/src/bootstrap.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1382,6 +1382,7 @@ impl BootstrapCommand {
13821382
/// bootstrap processes under proc manager control. Not available
13831383
/// outside of test builds.
13841384
#[cfg(test)]
1385+
#[cfg(feature = "fb")]
13851386
pub(crate) fn test() -> Self {
13861387
Self {
13871388
program: crate::testresource::get("monarch/hyperactor_mesh/bootstrap"),
@@ -3218,6 +3219,7 @@ mod tests {
32183219
}
32193220

32203221
#[tokio::test]
3222+
#[cfg(feature = "fb")]
32213223
async fn bootstrap_handle_terminate_graceful() {
32223224
// Create a root direct-addressed proc + client instance.
32233225
let root = hyperactor::Proc::direct(ChannelTransport::Unix.any(), "root".to_string())
@@ -3281,6 +3283,7 @@ mod tests {
32813283
}
32823284

32833285
#[tokio::test]
3286+
#[cfg(feature = "fb")]
32843287
async fn bootstrap_handle_kill_forced() {
32853288
// Root proc + client instance (so the child can dial back).
32863289
let root = hyperactor::Proc::direct(ChannelTransport::Unix.any(), "root".to_string())
@@ -3330,6 +3333,7 @@ mod tests {
33303333
}
33313334

33323335
#[tokio::test]
3336+
#[cfg(feature = "fb")]
33333337
async fn bootstrap_cannonical_simple() {
33343338
// SAFETY: unit-test scoped
33353339
unsafe {

hyperactor_mesh/src/proc_mesh.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1264,6 +1264,7 @@ mod tests {
12641264
use crate::sel;
12651265

12661266
#[tokio::test]
1267+
#[cfg(feature = "fb")]
12671268
async fn test_basic() {
12681269
let instance = v1::testing::instance().await;
12691270
let ext = extent!(host = 4);

0 commit comments

Comments
 (0)