From bb256dcbb1ff6a36ca994329ab514393bcd91fb1 Mon Sep 17 00:00:00 2001 From: Kiuk Chung Date: Wed, 26 Mar 2025 15:06:21 -0700 Subject: [PATCH] Fix broken component integration test due to compute_world_size app not respecting env vars set by torchrun (#1029) Summary: `compute_world_size` is run as an integration test in a `-j 2x2` configuration using `torchrun` which sets `MASTER_ADDR` and `MASTER_PORT`. However, it was ignoring those env vars and overriding them with the ones in the hydra config (added to make `compute_world_size` work as a single process without `torchrun`). Integ tests are failing in CI because `localhost:0` (pick random free port) is used as the `MASTER_ADDR:MASTER_PORT` on all 4 workers, hence all 4 workers are deadlocked waiting for each other to join the job. This diff fixes this by only setting the env vars if one is not already set. Differential Revision: D71919903 --- torchx/examples/apps/compute_world_size/module/util.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/torchx/examples/apps/compute_world_size/module/util.py b/torchx/examples/apps/compute_world_size/module/util.py index 0b20414dd..6ef5c0fa1 100644 --- a/torchx/examples/apps/compute_world_size/module/util.py +++ b/torchx/examples/apps/compute_world_size/module/util.py @@ -17,10 +17,12 @@ def compute_world_size(cfg: DictConfig) -> int: # required env vars for initializing pg with the default init_method (env://) - os.environ["RANK"] = str(cfg.main.rank) - os.environ["WORLD_SIZE"] = str(cfg.main.world_size) - os.environ["MASTER_ADDR"] = cfg.main.master_addr - os.environ["MASTER_PORT"] = str(cfg.main.master_port) + # read from hydra config in config/defaults.yaml if not set already + # this can happen if compute_world_size is run directly (not with torchrun) + os.environ.setdefault("RANK", str(cfg.main.rank)) + os.environ.setdefault("WORLD_SIZE", str(cfg.main.world_size)) + os.environ.setdefault("MASTER_ADDR", cfg.main.master_addr) + 
os.environ.setdefault("MASTER_PORT", str(cfg.main.master_port)) backend = cfg.main.backend