Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .config/nextest.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# nextest profile selected with `--profile ci` by the GPU Rust test workflow.
[profile.ci]
# Do not cancel the test run on the first failure.
fail-fast = false

# JUnit XML report for the `ci` profile. The workflow expects to find it at
# target/nextest/ci/junit.xml, so this path is presumably resolved relative to
# the profile's directory under target/nextest/ (confirm against nextest docs).
[profile.ci.junit]
path = "junit.xml"
20 changes: 16 additions & 4 deletions .github/workflows/test-gpu-rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,21 @@ jobs:

# Run GPU Rust tests
echo "Running OSS Rust tests..."
# TODO: fix broken tests, then update to `cargo test --no-fail-fast`
cargo test -p monarch_rdma
# Uses cargo nextest to run tests in separate processes, which better matches
# internal buck test behavior.
# TODO: increase coverage to more crates.
cargo nextest run -p hyperactor --no-fail-fast
# The CI profile is configured in .config/nextest.toml
# The --exclude flags below skip packages that don't build in GitHub Actions yet.
# * monarch_messages: monarch/target/debug/deps/monarch_messages-...:
# /lib64/libm.so.6: version `GLIBC_2.29' not found
# (required by /meta-pytorch/monarch/libtorch/lib/libtorch_cpu.so)
cargo nextest run --workspace --profile ci \
--exclude monarch_messages \
--exclude monarch_tensor_worker \
--exclude monarch_simulator_lib \
--exclude torch-sys \
--exclude torch-sys-cuda
# Copy the test results to the expected location
# TODO: error in pytest-results-action, TypeError: results.testsuites.testsuite.testcase is not iterable
# Don't try to parse these results for now.
# mkdir -p "${RUNNER_TEST_RESULTS_DIR:-test-results}"
# cp target/nextest/ci/junit.xml "${RUNNER_TEST_RESULTS_DIR:-test-results}/junit.xml"
16 changes: 14 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
[workspace]
resolver = "2"
members = [
"build_utils",
"controller",
"cuda-sys",
"erased_lifetime",
Expand All @@ -10,11 +11,22 @@ members = [
"hyperactor_multiprocess",
"hyperactor_mesh",
"hyperactor_mesh_macros",
"ndslice",
"hyperactor_telemetry",
"monarch_conda",
"monarch_extension",
"monarch_tensor_worker",
"monarch_hyperactor",
"monarch_messages",
"monarch_perfetto_trace",
"monarch_rdma",
"monarch_simulator",
"monarch_tensor_worker",
"monarch_types",
"nccl-sys",
"ndslice",
"preempt_rwlock",
"rdmaxcel-sys",
"serde_multipart",
"timed_test",
"torch-sys",
"torch-sys-cuda",
]
4 changes: 4 additions & 0 deletions controller/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,8 @@ tracing = { version = "0.1.41", features = ["attributes", "valuable"] }

[dev-dependencies]
monarch_types = { version = "0.0.0", path = "../monarch_types" }
timed_test = { version = "0.0.0", path = "../timed_test" }
torch-sys = { version = "0.0.0", path = "../torch-sys" }

[lints]
rust = { unexpected_cfgs = { check-cfg = ["cfg(fbcode_build)"], level = "warn" } }
5 changes: 4 additions & 1 deletion controller/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -660,6 +660,7 @@ mod tests {
use monarch_messages::worker::CallFunctionParams;
use monarch_messages::worker::WorkerMessage;
use monarch_types::PyTree;
use timed_test::async_timed_test;
use torch_sys::RValue;

use super::*;
Expand Down Expand Up @@ -1838,7 +1839,9 @@ mod tests {

hyperactor::remote!(PanickingActor);

#[tokio::test]
#[async_timed_test(timeout_secs = 30)]
// times out (both internal and external).
#[cfg_attr(not(fbcode_build), ignore)]
async fn test_supervision_fault() {
// Start system actor.
let timeout: Duration = Duration::from_secs(6);
Expand Down
3 changes: 3 additions & 0 deletions hyperactor/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -100,3 +100,6 @@ tracing-test = { version = "0.2.3", features = ["no-env-filter"] }
[features]
default = []
stdio-write-probe = []

[lints]
rust = { unexpected_cfgs = { check-cfg = ["cfg(fbcode_build)"], level = "warn" } }
4 changes: 2 additions & 2 deletions hyperactor/src/channel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1143,7 +1143,7 @@ mod tests {

#[tokio::test]
// TODO: OSS: called `Result::unwrap()` on an `Err` value: Server(Listen(Tcp([::1]:0), Os { code: 99, kind: AddrNotAvailable, message: "Cannot assign requested address" }))
#[cfg_attr(not(feature = "fb"), ignore)]
#[cfg_attr(not(fbcode_build), ignore)]
async fn test_dial_serve() {
for addr in addrs() {
let (listen_addr, mut rx) = crate::channel::serve::<i32>(addr).unwrap();
Expand All @@ -1155,7 +1155,7 @@ mod tests {

#[tokio::test]
// TODO: OSS: called `Result::unwrap()` on an `Err` value: Server(Listen(Tcp([::1]:0), Os { code: 99, kind: AddrNotAvailable, message: "Cannot assign requested address" }))
#[cfg_attr(not(feature = "fb"), ignore)]
#[cfg_attr(not(fbcode_build), ignore)]
async fn test_send() {
let config = crate::config::global::lock();

Expand Down
18 changes: 9 additions & 9 deletions hyperactor/src/channel/net.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2572,7 +2572,7 @@ mod tests {
#[tracing_test::traced_test]
#[async_timed_test(timeout_secs = 60)]
// TODO: OSS: called `Result::unwrap()` on an `Err` value: Listen(Tcp([::1]:0), Os { code: 99, kind: AddrNotAvailable, message: "Cannot assign requested address" })
#[cfg_attr(not(feature = "fb"), ignore)]
#[cfg_attr(not(fbcode_build), ignore)]
async fn test_tcp_basic() {
let (addr, mut rx) = tcp::serve::<u64>("[::1]:0".parse().unwrap()).unwrap();
{
Expand Down Expand Up @@ -2605,7 +2605,7 @@ mod tests {
// The message size is limited by CODEC_MAX_FRAME_LENGTH.
#[async_timed_test(timeout_secs = 5)]
// TODO: OSS: called `Result::unwrap()` on an `Err` value: Listen(Tcp([::1]:0), Os { code: 99, kind: AddrNotAvailable, message: "Cannot assign requested address" })
#[cfg_attr(not(feature = "fb"), ignore)]
#[cfg_attr(not(fbcode_build), ignore)]
async fn test_tcp_message_size() {
let default_size_in_bytes = 100 * 1024 * 1024;
// Use temporary config for this test
Expand Down Expand Up @@ -2635,7 +2635,7 @@ mod tests {

#[async_timed_test(timeout_secs = 30)]
// TODO: OSS: called `Result::unwrap()` on an `Err` value: Listen(Tcp([::1]:0), Os { code: 99, kind: AddrNotAvailable, message: "Cannot assign requested address" })
#[cfg_attr(not(feature = "fb"), ignore)]
#[cfg_attr(not(fbcode_build), ignore)]
async fn test_ack_flush() {
let config = config::global::lock();
// Set a large value to effectively prevent acks from being sent except
Expand All @@ -2659,7 +2659,7 @@ mod tests {
#[tracing_test::traced_test]
#[tokio::test]
// TODO: OSS: failed to retrieve ipv6 address
#[cfg_attr(not(feature = "fb"), ignore)]
#[cfg_attr(not(fbcode_build), ignore)]
async fn test_meta_tls_basic() {
let addr = ChannelAddr::any(ChannelTransport::MetaTls(TlsMode::IpV6));
let meta_addr = match addr {
Expand Down Expand Up @@ -3273,7 +3273,7 @@ mod tests {
#[tracing_test::traced_test]
#[tokio::test]
// TODO: OSS: The logs_assert function returned an error: expected log not found
#[cfg_attr(not(feature = "fb"), ignore)]
#[cfg_attr(not(fbcode_build), ignore)]
async fn test_tcp_tx_delivery_timeout() {
// This link always fails to connect.
let link = MockLink::<u64>::fail_connects();
Expand Down Expand Up @@ -3699,15 +3699,15 @@ mod tests {
#[tracing_test::traced_test]
#[async_timed_test(timeout_secs = 30)]
// TODO: OSS: The logs_assert function returned an error: expected log not found
#[cfg_attr(not(feature = "fb"), ignore)]
#[cfg_attr(not(fbcode_build), ignore)]
async fn test_ack_exceeded_limit_with_connected_link() {
verify_ack_exceeded_limit(false).await;
}

#[tracing_test::traced_test]
#[async_timed_test(timeout_secs = 30)]
// TODO: OSS: The logs_assert function returned an error: expected log not found
#[cfg_attr(not(feature = "fb"), ignore)]
#[cfg_attr(not(fbcode_build), ignore)]
async fn test_ack_exceeded_limit_with_broken_link() {
verify_ack_exceeded_limit(true).await;
}
Expand Down Expand Up @@ -3878,7 +3878,7 @@ mod tests {

#[async_timed_test(timeout_secs = 300)]
// TODO: OSS: called `Result::unwrap()` on an `Err` value: Listen(Tcp([::1]:0), Os { code: 99, kind: AddrNotAvailable, message: "Cannot assign requested address" })
#[cfg_attr(not(feature = "fb"), ignore)]
#[cfg_attr(not(fbcode_build), ignore)]
async fn test_tcp_throughput() {
let config = config::global::lock();
let _guard =
Expand Down Expand Up @@ -3930,7 +3930,7 @@ mod tests {
#[tracing_test::traced_test]
#[async_timed_test(timeout_secs = 60)]
// TODO: OSS: The logs_assert function returned an error: expected log not found
#[cfg_attr(not(feature = "fb"), ignore)]
#[cfg_attr(not(fbcode_build), ignore)]
async fn test_net_tx_closed_on_server_reject() {
let link = MockLink::<u64>::new();
let receiver_storage = link.receiver_storage();
Expand Down
2 changes: 1 addition & 1 deletion hyperactor/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,7 @@ mod tests {
#[tracing_test::traced_test]
#[test]
// TODO: OSS: The logs_assert function returned an error: missing log lines: {"# export HYPERACTOR_DEFAULT_ENCODING=serde_multipart", ...}
#[cfg_attr(not(feature = "fb"), ignore)]
#[cfg_attr(not(fbcode_build), ignore)]
fn test_from_env() {
// Set environment variables
// SAFETY: TODO: Audit that the environment access only happens in single-threaded code.
Expand Down
2 changes: 1 addition & 1 deletion hyperactor/src/host.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1280,7 +1280,7 @@ mod tests {

#[tokio::test]
// TODO: OSS: called `Result::unwrap()` on an `Err` value: ReadFailed { manifest_path: "/meta-pytorch/monarch/target/debug/deps/hyperactor-0e1fe83af739d976.resources.json", source: Os { code: 2, kind: NotFound, message: "No such file or directory" } }
#[cfg_attr(not(feature = "fb"), ignore)]
#[cfg_attr(not(fbcode_build), ignore)]
async fn test_process_proc_manager() {
hyperactor_telemetry::initialize_logging(crate::clock::ClockKind::default());

Expand Down
4 changes: 2 additions & 2 deletions hyperactor/src/mailbox.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3371,7 +3371,7 @@ mod tests {

#[async_timed_test(timeout_secs = 30)]
// TODO: OSS: this test is flaky in OSS. Need to repro and fix it.
#[cfg_attr(not(feature = "fb"), ignore)]
#[cfg_attr(not(fbcode_build), ignore)]
async fn test_split_port_id_no_reducer() {
let Setup {
mut receiver,
Expand Down Expand Up @@ -3457,7 +3457,7 @@ mod tests {

#[async_timed_test(timeout_secs = 30)]
// TODO: OSS: this test is flaky in OSS. Need to repro and fix it.
#[cfg_attr(not(feature = "fb"), ignore)]
#[cfg_attr(not(fbcode_build), ignore)]
async fn test_split_port_id_every_n_messages() {
let config = crate::config::global::lock();
let _config_guard = config.override_key(
Expand Down
2 changes: 2 additions & 0 deletions hyperactor_mesh/src/actor_mesh.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1496,6 +1496,7 @@ mod tests {

use crate::alloc::process::ProcessAllocator;

#[cfg(fbcode_build)]
fn process_allocator() -> ProcessAllocator {
ProcessAllocator::new(Command::new(crate::testresource::get(
"monarch/hyperactor_mesh/bootstrap",
Expand Down Expand Up @@ -1947,6 +1948,7 @@ mod tests {
use crate::sel;

#[tokio::test]
#[cfg(fbcode_build)]
async fn test_basic() {
let instance = v1::testing::instance().await;
let host_mesh = v1::testing::host_mesh(extent!(host = 4)).await;
Expand Down
1 change: 1 addition & 0 deletions hyperactor_mesh/src/alloc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -931,6 +931,7 @@ pub(crate) mod testing {
/// a proc that does not time out when it is asked to wait for
/// a stuck actor.
#[tokio::test]
#[cfg(fbcode_build)]
async fn test_allocator_stuck_task() {
// Override config.
// Use temporary config for this test
Expand Down
1 change: 1 addition & 0 deletions hyperactor_mesh/src/alloc/process.rs
Original file line number Diff line number Diff line change
Expand Up @@ -699,6 +699,7 @@ mod tests {
crate::testresource::get("monarch/hyperactor_mesh/bootstrap")
)));

#[cfg(fbcode_build)]
#[tokio::test]
async fn test_sigterm_on_group_fail() {
let bootstrap_binary = crate::testresource::get("monarch/hyperactor_mesh/bootstrap");
Expand Down
4 changes: 4 additions & 0 deletions hyperactor_mesh/src/alloc/remoteprocess.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2194,6 +2194,7 @@ mod test_alloc {
use super::*;

#[async_timed_test(timeout_secs = 60)]
#[cfg(fbcode_build)]
async fn test_alloc_simple() {
// Use temporary config for this test
let config = hyperactor::config::global::lock();
Expand Down Expand Up @@ -2324,6 +2325,7 @@ mod test_alloc {
}

#[async_timed_test(timeout_secs = 60)]
#[cfg(fbcode_build)]
async fn test_alloc_host_failure() {
// Use temporary config for this test
let config = hyperactor::config::global::lock();
Expand Down Expand Up @@ -2456,6 +2458,7 @@ mod test_alloc {
}

#[async_timed_test(timeout_secs = 15)]
#[cfg(fbcode_build)]
async fn test_alloc_inner_alloc_failure() {
// SAFETY: Test happens in single-threaded code.
unsafe {
Expand Down Expand Up @@ -2592,6 +2595,7 @@ mod test_alloc {

#[tracing_test::traced_test]
#[async_timed_test(timeout_secs = 60)]
#[cfg(fbcode_build)]
async fn test_remote_process_alloc_signal_handler() {
let num_proc_meshes = 5;
let hosts_per_proc_mesh = 5;
Expand Down
4 changes: 4 additions & 0 deletions hyperactor_mesh/src/bootstrap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1506,6 +1506,7 @@ impl BootstrapCommand {
/// bootstrap processes under proc manager control. Not available
/// outside of test builds.
#[cfg(test)]
#[cfg(fbcode_build)]
pub(crate) fn test() -> Self {
Self {
program: crate::testresource::get("monarch/hyperactor_mesh/bootstrap"),
Expand Down Expand Up @@ -3422,6 +3423,7 @@ mod tests {
}

#[tokio::test]
#[cfg(fbcode_build)]
async fn bootstrap_handle_terminate_graceful() {
// Create a root direct-addressed proc + client instance.
let root = hyperactor::Proc::direct(ChannelTransport::Unix.any(), "root".to_string())
Expand Down Expand Up @@ -3485,6 +3487,7 @@ mod tests {
}

#[tokio::test]
#[cfg(fbcode_build)]
async fn bootstrap_handle_kill_forced() {
// Root proc + client instance (so the child can dial back).
let root = hyperactor::Proc::direct(ChannelTransport::Unix.any(), "root".to_string())
Expand Down Expand Up @@ -3534,6 +3537,7 @@ mod tests {
}

#[tokio::test]
#[cfg(fbcode_build)]
async fn bootstrap_canonical_simple() {
// SAFETY: unit-test scoped
unsafe {
Expand Down
1 change: 1 addition & 0 deletions hyperactor_mesh/src/proc_mesh.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1265,6 +1265,7 @@ mod tests {
use crate::sel;

#[tokio::test]
#[cfg(fbcode_build)]
async fn test_basic() {
let instance = v1::testing::instance().await;
let ext = extent!(host = 4);
Expand Down
1 change: 1 addition & 0 deletions hyperactor_mesh/src/testresource.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ use std::path::PathBuf;
///
/// We should convert these tests to integration tests, so that cargo can
/// also manage the binaries.
#[cfg(fbcode_build)]
pub fn get<S>(name: S) -> PathBuf
where
S: AsRef<str>,
Expand Down
5 changes: 5 additions & 0 deletions hyperactor_mesh/src/v1/actor_mesh.rs
Original file line number Diff line number Diff line change
Expand Up @@ -467,6 +467,7 @@ mod tests {
use crate::v1::testing;

#[tokio::test]
#[cfg(fbcode_build)]
async fn test_actor_mesh_ref_lazy_materialization() {
// 1) Bring up procs and spawn actors.
let instance = testing::instance().await;
Expand Down Expand Up @@ -566,6 +567,7 @@ mod tests {
}

#[async_timed_test(timeout_secs = 30)]
#[cfg(fbcode_build)]
async fn test_actor_states_with_panic() {
hyperactor_telemetry::initialize_logging_for_test();

Expand Down Expand Up @@ -632,6 +634,7 @@ mod tests {
}

#[async_timed_test(timeout_secs = 30)]
#[cfg(fbcode_build)]
async fn test_actor_states_with_process_exit() {
hyperactor_telemetry::initialize_logging_for_test();

Expand Down Expand Up @@ -699,6 +702,7 @@ mod tests {
}

#[async_timed_test(timeout_secs = 30)]
#[cfg(fbcode_build)]
async fn test_actor_states_on_sliced_mesh() {
hyperactor_telemetry::initialize_logging_for_test();

Expand Down Expand Up @@ -772,6 +776,7 @@ mod tests {
}

#[async_timed_test(timeout_secs = 30)]
#[cfg(fbcode_build)]
async fn test_cast() {
let config = hyperactor::config::global::lock();
let _guard = config.override_key(crate::bootstrap::MESH_BOOTSTRAP_ENABLE_PDEATHSIG, false);
Expand Down
Loading
Loading