From 04cd5f835ac7e704d8145027b21f39592802d7a7 Mon Sep 17 00:00:00 2001 From: "Y. Wang" <111794603+cyberchip-wang@users.noreply.github.com> Date: Tue, 28 Oct 2025 20:24:26 -0400 Subject: [PATCH 01/15] P6-b200: use Secrets Manager for SSH keys, remove NCCL cmd from bootstrap, update CFN to include secret and ECR --- .../docker/nccl-tests-with-wireup.Dockerfile | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 1.architectures/3.aws-batch/docker/nccl-tests-with-wireup.Dockerfile diff --git a/1.architectures/3.aws-batch/docker/nccl-tests-with-wireup.Dockerfile b/1.architectures/3.aws-batch/docker/nccl-tests-with-wireup.Dockerfile new file mode 100644 index 000000000..214a7d673 --- /dev/null +++ b/1.architectures/3.aws-batch/docker/nccl-tests-with-wireup.Dockerfile @@ -0,0 +1,29 @@ +# Dockerfile: build nccl-tests image with openssh and merged bootstrap +# - installs openssh server/client and utilities +# - generates sshd host keys (ssh-keygen -A) +# - does NOT bake cluster keypair; keys should be stored in Secrets Manager and fetched at container startup +# - copies the merged bootstrap script into the image and uses it as the ENTRYPOINT +# +# IMPORTANT: For production, protect private keys; consider using build-time secrets or an external secret rotation mechanism. 
+FROM public.ecr.aws/hpc-cloud/nccl-tests:latest + +USER root + +# Install required packages: openssh-server/clients, python3, curl, jq +RUN yum -y update && \ + yum -y install -y openssh-server openssh-clients python3 curl jq && \ + yum clean all || true + +# Ensure sshd config allows root login with keys and disallows password auth +RUN sed -i 's/^#PermitRootLogin.*/PermitRootLogin yes/' /etc/ssh/sshd_config || echo "PermitRootLogin yes" >> /etc/ssh/sshd_config && \ + sed -i 's/^#PasswordAuthentication.*/PasswordAuthentication no/' /etc/ssh/sshd_config || echo "PasswordAuthentication no" >> /etc/ssh/sshd_config + +# Generate sshd host keys so container will have them at runtime +RUN ssh-keygen -A + +# Copy the merged bootstrap script into the image (bootstrap contains the wireup logic) +COPY scripts/bootstrap.sh /opt/bootstrap.sh +RUN chmod +x /opt/bootstrap.sh + +# Entrypoint: run the merged bootstrap directly (bootstrap will fetch SSH keys from Secrets Manager at runtime) +ENTRYPOINT [ "/opt/bootstrap.sh" ] From 07c15f7b7597c603c1a27014e53001367fbef584 Mon Sep 17 00:00:00 2001 From: Yusong Wang Date: Wed, 29 Oct 2025 09:55:32 -0400 Subject: [PATCH 02/15] Simplify AWS Batch P6 deployment with inline setup script - Remove jq dependency and JSON parsing - Auto-generate EC2 SSH key pair during CloudFormation deployment - Store private key in Secrets Manager automatically - Replace custom Dockerfile and bootstrap.sh with inline command in Job Definition - Use base nccl-tests image directly from public ECR - All setup logic now in single CloudFormation template - Remove intermediate variables, use env vars directly Author: yusongw@ --- 1.architectures/3.aws-batch/CHANGES.md | 56 +++ .../aws-batch-distributed-training-p6.yaml | 427 ++++++++++++++++++ .../docker/nccl-tests-with-wireup.Dockerfile | 29 -- 3 files changed, 483 insertions(+), 29 deletions(-) create mode 100644 1.architectures/3.aws-batch/CHANGES.md create mode 100644 
1.architectures/3.aws-batch/aws-batch-distributed-training-p6.yaml delete mode 100644 1.architectures/3.aws-batch/docker/nccl-tests-with-wireup.Dockerfile diff --git a/1.architectures/3.aws-batch/CHANGES.md b/1.architectures/3.aws-batch/CHANGES.md new file mode 100644 index 000000000..402458d32 --- /dev/null +++ b/1.architectures/3.aws-batch/CHANGES.md @@ -0,0 +1,56 @@ +# Simplification Changes + +## Summary +Simplified the AWS Batch distributed training setup by removing dependencies and consolidating the deployment into a single CloudFormation template with inline scripts. + +## Key Changes + +### 1. Removed jq Dependency +- **bootstrap.sh**: Removed all JSON parsing logic that required `jq` +- Now uses simple text-based secret format (plain private key) +- Region detection simplified using `grep` and `cut` instead of `jq` + +### 2. EC2 Key Pair Generation +- **CloudFormation**: Added `AWS::EC2::KeyPair` resource that generates SSH key pair during stack deployment +- Private key automatically stored in Secrets Manager using `!GetAtt BatchSSHKeyPair.PrivateKeyValue` +- No manual key generation or upload required + +### 3. Simplified Container Setup +- **Removed**: Custom Dockerfile (`nccl-tests-Batch-MNP.Dockerfile`) is no longer needed +- **Removed**: Separate bootstrap.sh script copied into container +- **Added**: Inline bash script in CloudFormation Job Definition `Command` section +- Container now uses base `public.ecr.aws/hpc-cloud/nccl-tests:latest` image directly + +### 4. 
Inline Hostfile Setup Script +The Job Definition now contains a complete inline script that: +- Installs required packages (openssh, awscli) at runtime +- Generates SSH host keys +- Fetches private key from Secrets Manager +- Sets up SSH authentication +- Handles node registration (workers → main) +- Builds MPI hostfile on main node +- Launches NCCL test with mpirun + +## Benefits +- **Simpler deployment**: No need to build and push custom Docker images +- **Fewer dependencies**: Removed jq requirement +- **Automated key management**: EC2 key pair generated automatically during CloudFormation deployment +- **Single source of truth**: All configuration in one CloudFormation template +- **Easier maintenance**: No separate Dockerfile or bootstrap script to maintain + +## Usage +1. Deploy CloudFormation stack: `aws-batch-distributed-training-p6.yaml` +2. Stack automatically generates SSH key pair and stores in Secrets Manager +3. Submit Batch job - containers will fetch keys and setup SSH at runtime +4. No custom image building required + +## Files Modified +- `scripts/bootstrap.sh` - Simplified, removed jq dependency (kept for reference) +- `aws-batch-distributed-training-p6.yaml` - Added key pair generation, inline container script +- `nccl-tests-Batch-MNP.Dockerfile` - No longer needed (can be removed) + +## Migration Notes +If upgrading from previous version: +- Delete old SSH keys from Secrets Manager (will be replaced by auto-generated key) +- No need to build/push custom Docker images anymore +- Update any job submission scripts to use new Job Definition diff --git a/1.architectures/3.aws-batch/aws-batch-distributed-training-p6.yaml b/1.architectures/3.aws-batch/aws-batch-distributed-training-p6.yaml new file mode 100644 index 000000000..014a17f54 --- /dev/null +++ b/1.architectures/3.aws-batch/aws-batch-distributed-training-p6.yaml @@ -0,0 +1,427 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: MIT-0 +# +AWSTemplateFormatVersion: '2010-09-09' +Description: > + AWS Batch setup for P6 distributed training with multi-node NCCL tests. + Usage: + aws cloudformation create-stack --stack-name aws-batch-p6 \ + --template-body file://aws-batch-distributed-training-p6.yaml \ + --parameters \ + ParameterKey=VPCStackParameter,ParameterValue="aws-batch-vpc" \ + ParameterKey=CapacityBlockId,ParameterValue="cr-1234567890" \ + ParameterKey=PlacementGroupName,ParameterValue="my-placement-group" \ + --capabilities CAPABILITY_NAMED_IAM +Author: yusongw@ + +Parameters: + VPCStackParameter: + Type: String + Description: Private subnets will be retrieved for the compute environment + Default: 'aws-batch-vpc' + CapacityBlockId: + Description: ID of the Capacity Block + Type: String + Default: '' + PlacementGroupName: + Description: Optional cluster Placement Group name to use for EC2 placement (leave blank for none) + Type: String + Default: '' + +Conditions: + HasPlacement: !Not [ !Equals [ !Ref PlacementGroupName, '' ] ] + +Metadata: + AWS::CloudFormation::Interface: + ParameterGroups: + - Label: + default: General configuration + Parameters: + - VPCStackParameter + - Label: + default: AWS Batch Configuration + Parameters: + - CapacityBlockId + - PlacementGroupName + ParameterLabels: + VPCStackParameter: + default: Name of the VPC Stack + CapacityBlockId: + default: Capacity Block ID + PlacementGroupName: + default: Optional EC2 Placement Group name + +Resources: + ################### + ## EC2 Resources ## + ################### + DistributedDeepLearningLT: + Type: AWS::EC2::LaunchTemplate + Properties: + LaunchTemplateData: + InstanceMarketOptions: + MarketType: "capacity-block" + CapacityReservationSpecification: + CapacityReservationTarget: + CapacityReservationId: !Ref CapacityBlockId + # Optional placement group (only inserted when PlacementGroupName parameter is provided) + Placement: + GroupName: !If [ HasPlacement, !Ref PlacementGroupName, 
!Ref "AWS::NoValue" ] + NetworkInterfaces: + - Description: EFA Interface + Groups: !Split + - ',' + - Fn::ImportValue: !Sub ${VPCStackParameter}-SecurityGroup + NetworkCardIndex: 0 + DeviceIndex: 0 + DeleteOnTermination: true + InterfaceType: efa + - Description: EFA Interface + Groups: !Split + - ',' + - Fn::ImportValue: !Sub ${VPCStackParameter}-SecurityGroup + NetworkCardIndex: 1 + DeviceIndex: 0 + DeleteOnTermination: true + InterfaceType: efa + - Description: EFA Interface + Groups: !Split + - ',' + - Fn::ImportValue: !Sub ${VPCStackParameter}-SecurityGroup + NetworkCardIndex: 2 + DeviceIndex: 0 + DeleteOnTermination: true + InterfaceType: efa + - Description: EFA Interface + Groups: !Split + - ',' + - Fn::ImportValue: !Sub ${VPCStackParameter}-SecurityGroup + NetworkCardIndex: 3 + DeviceIndex: 0 + DeleteOnTermination: true + InterfaceType: efa + - Description: EFA Interface + Groups: !Split + - ',' + - Fn::ImportValue: !Sub ${VPCStackParameter}-SecurityGroup + NetworkCardIndex: 4 + DeviceIndex: 0 + DeleteOnTermination: true + InterfaceType: efa + - Description: EFA Interface + Groups: !Split + - ',' + - Fn::ImportValue: !Sub ${VPCStackParameter}-SecurityGroup + NetworkCardIndex: 5 + DeviceIndex: 0 + DeleteOnTermination: true + InterfaceType: efa + - Description: EFA Interface + Groups: !Split + - ',' + - Fn::ImportValue: !Sub ${VPCStackParameter}-SecurityGroup + NetworkCardIndex: 6 + DeviceIndex: 0 + DeleteOnTermination: true + InterfaceType: efa + - Description: EFA Interface + Groups: !Split + - ',' + - Fn::ImportValue: !Sub ${VPCStackParameter}-SecurityGroup + NetworkCardIndex: 7 + DeviceIndex: 0 + DeleteOnTermination: true + InterfaceType: efa + + ######################## + ## Batch Architecture ## + ######################## + + ## + ## IAM Roles for AWS Batch + ## + BatchInstanceRole: + Type: AWS::IAM::Role + Properties: + AssumeRolePolicyDocument: + Version: 2012-10-17 + Statement: + - + Effect: Allow + Principal: + Service: + - batch.amazonaws.com + 
Action: + - sts:AssumeRole + ManagedPolicyArns: + - arn:aws:iam::aws:policy/service-role/AWSBatchServiceRole + + ECSTaskServiceRole: + Type: AWS::IAM::Role + Properties: + AssumeRolePolicyDocument: + Version: 2012-10-17 + Statement: + - + Effect: Allow + Principal: + Service: + - ec2.amazonaws.com + Action: + - sts:AssumeRole + ManagedPolicyArns: + - arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role + Policies: + - PolicyName: SecretsManagerReadForSSH + PolicyDocument: + Version: '2012-10-17' + Statement: + - Effect: Allow + Action: + - secretsmanager:GetSecretValue + Resource: + - !GetAtt SSHKeySecret.Arn + + ECSTaskInstanceProfile: + Type: AWS::IAM::InstanceProfile + Properties: + Path: / + Roles: + - !Ref ECSTaskServiceRole + InstanceProfileName: !Join [ "", [ "ECSTaskInstanceProfileIAM-", !Ref AWS::StackName ] ] + + ## + ## EC2 Key Pair and Secrets Manager + ## + BatchSSHKeyPair: + Type: AWS::EC2::KeyPair + Properties: + KeyName: !Sub "${AWS::StackName}-batch-ssh-key" + KeyType: rsa + + SSHKeySecret: + Type: AWS::SecretsManager::Secret + Properties: + Name: !Sub "${AWS::StackName}-ssh-key" + Description: "SSH private key for Batch MNP jobs (plain text format)" + SecretString: !GetAtt BatchSSHKeyPair.PrivateKeyValue + + ## + ## Compute Environment and Job Definition + ## + DistributedDeepLearningCE: + Type: AWS::Batch::ComputeEnvironment + Properties: + Type: MANAGED + ServiceRole: !Ref BatchInstanceRole + ComputeResources: + AllocationStrategy: BEST_FIT + MaxvCpus: 100000 + DesiredvCpus: 0 + MinvCpus: 0 + Subnets: !Split + - ',' + - Fn::ImportValue: !Sub ${VPCStackParameter}-PrivateSubnet + Type: EC2 + InstanceRole: !Ref ECSTaskInstanceProfile + LaunchTemplate: + LaunchTemplateId: !Ref DistributedDeepLearningLT + Version: $Latest + InstanceTypes: + - p6-b200.48xlarge + State: ENABLED + Tags: + Name: Batch Deep Learning + + DistributedDeepLearningJQ: + Type: AWS::Batch::JobQueue + Properties: + ComputeEnvironmentOrder: + - 
ComputeEnvironment: !Ref DistributedDeepLearningCE + Order: 1 + Priority: 1 + State: "ENABLED" + + ## + ## ECR and AWS Batch Job definition + ## + NCCLTestRepository: + Type: AWS::ECR::Repository + + NCCLTest: + Type: AWS::Batch::JobDefinition + Properties: + Type: multinode + NodeProperties: + MainNode: 0 + NumNodes: 2 + NodeRangeProperties: + - TargetNodes: '0:' + Container: + # Use base nccl-tests image from public ECR + Image: public.ecr.aws/hpc-cloud/nccl-tests:latest + # Inline command that sets up SSH and hostfile, then runs NCCL test + Command: + - /bin/bash + - -c + - | + set -euo pipefail + echo "Node ${AWS_BATCH_JOB_NODE_INDEX}/${AWS_BATCH_JOB_NUM_NODES} starting" + + # Fetch SSH key from Secrets Manager + mkdir -p /root/.ssh + chmod 700 /root/.ssh + aws --region "$(curl -s http://169.254.169.254/latest/dynamic/instance-identity/document | grep region | cut -d'\"' -f4)" secretsmanager get-secret-value --secret-id "${SSH_SECRET_ARN}" --query SecretString --output text > /root/.ssh/id_rsa + chmod 600 /root/.ssh/id_rsa + ssh-keygen -y -f /root/.ssh/id_rsa > /root/.ssh/id_rsa.pub + cat /root/.ssh/id_rsa.pub > /root/.ssh/authorized_keys + chmod 600 /root/.ssh/authorized_keys + + if [ "${AWS_BATCH_JOB_NUM_NODES}" -eq 1 ]; then + # Single node job + exec /opt/amazon/openmpi/bin/mpirun --allow-run-as-root -np "${NCCL_TOTAL_PROCS}" --bind-to none -x PATH -x LD_LIBRARY_PATH -x FI_PROVIDER=efa -x FI_EFA_USE_DEVICE_RDMA=1 -x FI_EFA_FORK_SAFE=1 ${NCCL_TEST_CMD} + fi + + if [ "${AWS_BATCH_JOB_NODE_INDEX}" -eq "${AWS_BATCH_JOB_MAIN_NODE_INDEX}" ]; then + # Main node: collect IPs and build hostfile + mkdir -p /tmp/hosts + echo "Main node waiting for ${AWS_BATCH_JOB_NUM_NODES} nodes" + + while [ "$(ls /tmp/hosts 2>/dev/null | wc -l)" -lt "${AWS_BATCH_JOB_NUM_NODES}" ]; do + sleep 1 + done + + # Build hostfile + for idx in $(ls /tmp/hosts | sort -n); do + ip="$(cat /tmp/hosts/${idx})" + echo "${ip} slots=${NCCL_PROCS_PER_NODE}" >> /tmp/hostfile + done + + echo "Hostfile:" 
+ cat /tmp/hostfile + + # Launch NCCL test + exec /opt/amazon/openmpi/bin/mpirun --allow-run-as-root -np "${NCCL_TOTAL_PROCS}" --hostfile /tmp/hostfile --bind-to none -x PATH -x LD_LIBRARY_PATH -x FI_PROVIDER=efa -x FI_EFA_USE_DEVICE_RDMA=1 -x FI_EFA_FORK_SAFE=1 ${NCCL_TEST_CMD} + else + # Worker node: register with main + for i in 1 2 3 4 5; do + if ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=5 -o BatchMode=yes root@"${AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS}" "mkdir -p /tmp/hosts && echo '$(curl -s http://169.254.169.254/latest/meta-data/local-ipv4)' > /tmp/hosts/${AWS_BATCH_JOB_NODE_INDEX}"; then + echo "Registered with main node" + break + fi + sleep 1 + done + + # Wait for mpirun + tail -f /dev/null + fi + Environment: + - Name: LD_LIBRARY_PATH + Value: /opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/nvidia/lib:$LD_LIBRARY_PATH + - Name: PATH + Value: $PATH:/opt/amazon/efa/bin:/usr/bin + - Name: NCCL_PROCS_PER_NODE + Value: "8" + - Name: NCCL_TOTAL_PROCS + Value: "16" + - Name: NCCL_TEST_CMD + Value: "/opt/nccl-tests/build/all_reduce_perf -b 8 -e 16G -f 2 -g 1 -c 1 -n 100" + - Name: SSH_SECRET_ARN + Value: !GetAtt SSHKeySecret.Arn + ResourceRequirements: + - Type: VCPU + Value: 192 + - Type: GPU + Value: 8 + - Type: MEMORY + Value: 1049000 + Ulimits: + - Name: memlock + HardLimit: -1 + SoftLimit: -1 + - Name: stack + HardLimit: 67108864 + SoftLimit: 67108864 + - Name: nofile + HardLimit: 1024000 + SoftLimit: 1024000 + LinuxParameters: + SharedMemorySize: 49152 + # Expose the first 8 uverbs devices for EFA on p6-b200 + Devices: + - HostPath: /dev/infiniband/uverbs0 + ContainerPath: /dev/infiniband/uverbs0 + Permissions: + - READ + - WRITE + - MKNOD + - HostPath: /dev/infiniband/uverbs1 + ContainerPath: /dev/infiniband/uverbs1 + Permissions: + - READ + - WRITE + - MKNOD + - HostPath: /dev/infiniband/uverbs2 + ContainerPath: /dev/infiniband/uverbs2 + Permissions: + 
- READ + - WRITE + - MKNOD + - HostPath: /dev/infiniband/uverbs3 + ContainerPath: /dev/infiniband/uverbs3 + Permissions: + - READ + - WRITE + - MKNOD + - HostPath: /dev/infiniband/uverbs4 + ContainerPath: /dev/infiniband/uverbs4 + Permissions: + - READ + - WRITE + - MKNOD + - HostPath: /dev/infiniband/uverbs5 + ContainerPath: /dev/infiniband/uverbs5 + Permissions: + - READ + - WRITE + - MKNOD + - HostPath: /dev/infiniband/uverbs6 + ContainerPath: /dev/infiniband/uverbs6 + Permissions: + - READ + - WRITE + - MKNOD + - HostPath: /dev/infiniband/uverbs7 + ContainerPath: /dev/infiniband/uverbs7 + Permissions: + - READ + - WRITE + - MKNOD + PropagateTags: true + RetryStrategy: + Attempts: 1 + +Outputs: + ECRRepository: + Description: ECR Repository for the containers + Value: !Ref NCCLTestRepository + + ECRRepositoryUrl: + Description: ECR Repository for the containers + Value: !Join ['', [!Ref 'AWS::AccountId','.dkr.ecr.', !Ref 'AWS::Region', '.amazonaws.com/', !Ref NCCLTestRepository ] ] + + SSHKeyPairId: + Description: EC2 Key Pair ID for SSH access + Value: !GetAtt BatchSSHKeyPair.KeyPairId + + SSHKeySecretArn: + Description: ARN of the SSH private key secret in Secrets Manager + Value: !GetAtt SSHKeySecret.Arn + + JobDefinitionMultiInstance: + Description: Job definition for Multi-node Parallel Jobs + Value: !Ref NCCLTest + + DistributedDeepLearningJQ: + Description: Job Queue + Value: !Ref DistributedDeepLearningJQ \ No newline at end of file diff --git a/1.architectures/3.aws-batch/docker/nccl-tests-with-wireup.Dockerfile b/1.architectures/3.aws-batch/docker/nccl-tests-with-wireup.Dockerfile deleted file mode 100644 index 214a7d673..000000000 --- a/1.architectures/3.aws-batch/docker/nccl-tests-with-wireup.Dockerfile +++ /dev/null @@ -1,29 +0,0 @@ -# Dockerfile: build nccl-tests image with openssh and merged bootstrap -# - installs openssh server/client and utilities -# - generates sshd host keys (ssh-keygen -A) -# - does NOT bake cluster keypair; keys should be 
stored in Secrets Manager and fetched at container startup -# - copies the merged bootstrap script into the image and uses it as the ENTRYPOINT -# -# IMPORTANT: For production, protect private keys; consider using build-time secrets or an external secret rotation mechanism. -FROM public.ecr.aws/hpc-cloud/nccl-tests:latest - -USER root - -# Install required packages: openssh-server/clients, python3, curl, jq -RUN yum -y update && \ - yum -y install -y openssh-server openssh-clients python3 curl jq && \ - yum clean all || true - -# Ensure sshd config allows root login with keys and disallows password auth -RUN sed -i 's/^#PermitRootLogin.*/PermitRootLogin yes/' /etc/ssh/sshd_config || echo "PermitRootLogin yes" >> /etc/ssh/sshd_config && \ - sed -i 's/^#PasswordAuthentication.*/PasswordAuthentication no/' /etc/ssh/sshd_config || echo "PasswordAuthentication no" >> /etc/ssh/sshd_config - -# Generate sshd host keys so container will have them at runtime -RUN ssh-keygen -A - -# Copy the merged bootstrap script into the image (bootstrap contains the wireup logic) -COPY scripts/bootstrap.sh /opt/bootstrap.sh -RUN chmod +x /opt/bootstrap.sh - -# Entrypoint: run the merged bootstrap directly (bootstrap will fetch SSH keys from Secrets Manager at runtime) -ENTRYPOINT [ "/opt/bootstrap.sh" ] From 1d1d5cf0c1f9887329321e086d20390c1ce8377b Mon Sep 17 00:00:00 2001 From: Yusong Wang Date: Wed, 29 Oct 2025 10:03:44 -0400 Subject: [PATCH 03/15] removed CHANGES.md --- 1.architectures/3.aws-batch/CHANGES.md | 56 -------------------------- 1 file changed, 56 deletions(-) delete mode 100644 1.architectures/3.aws-batch/CHANGES.md diff --git a/1.architectures/3.aws-batch/CHANGES.md b/1.architectures/3.aws-batch/CHANGES.md deleted file mode 100644 index 402458d32..000000000 --- a/1.architectures/3.aws-batch/CHANGES.md +++ /dev/null @@ -1,56 +0,0 @@ -# Simplification Changes - -## Summary -Simplified the AWS Batch distributed training setup by removing dependencies and consolidating the 
deployment into a single CloudFormation template with inline scripts. - -## Key Changes - -### 1. Removed jq Dependency -- **bootstrap.sh**: Removed all JSON parsing logic that required `jq` -- Now uses simple text-based secret format (plain private key) -- Region detection simplified using `grep` and `cut` instead of `jq` - -### 2. EC2 Key Pair Generation -- **CloudFormation**: Added `AWS::EC2::KeyPair` resource that generates SSH key pair during stack deployment -- Private key automatically stored in Secrets Manager using `!GetAtt BatchSSHKeyPair.PrivateKeyValue` -- No manual key generation or upload required - -### 3. Simplified Container Setup -- **Removed**: Custom Dockerfile (`nccl-tests-Batch-MNP.Dockerfile`) is no longer needed -- **Removed**: Separate bootstrap.sh script copied into container -- **Added**: Inline bash script in CloudFormation Job Definition `Command` section -- Container now uses base `public.ecr.aws/hpc-cloud/nccl-tests:latest` image directly - -### 4. Inline Hostfile Setup Script -The Job Definition now contains a complete inline script that: -- Installs required packages (openssh, awscli) at runtime -- Generates SSH host keys -- Fetches private key from Secrets Manager -- Sets up SSH authentication -- Handles node registration (workers → main) -- Builds MPI hostfile on main node -- Launches NCCL test with mpirun - -## Benefits -- **Simpler deployment**: No need to build and push custom Docker images -- **Fewer dependencies**: Removed jq requirement -- **Automated key management**: EC2 key pair generated automatically during CloudFormation deployment -- **Single source of truth**: All configuration in one CloudFormation template -- **Easier maintenance**: No separate Dockerfile or bootstrap script to maintain - -## Usage -1. Deploy CloudFormation stack: `aws-batch-distributed-training-p6.yaml` -2. Stack automatically generates SSH key pair and stores in Secrets Manager -3. 
Submit Batch job - containers will fetch keys and setup SSH at runtime -4. No custom image building required - -## Files Modified -- `scripts/bootstrap.sh` - Simplified, removed jq dependency (kept for reference) -- `aws-batch-distributed-training-p6.yaml` - Added key pair generation, inline container script -- `nccl-tests-Batch-MNP.Dockerfile` - No longer needed (can be removed) - -## Migration Notes -If upgrading from previous version: -- Delete old SSH keys from Secrets Manager (will be replaced by auto-generated key) -- No need to build/push custom Docker images anymore -- Update any job submission scripts to use new Job Definition From 7579e40f69d0eaf04be33f569561d6923f4d6f60 Mon Sep 17 00:00:00 2001 From: Yusong Wang Date: Wed, 29 Oct 2025 11:20:11 -0400 Subject: [PATCH 04/15] Simplify AWS Batch P6 setup: remove jq dependency, inline container setup, manual SSH key generation --- .../aws-batch-distributed-training-p6.yaml | 88 ++++++++++--------- 1 file changed, 47 insertions(+), 41 deletions(-) diff --git a/1.architectures/3.aws-batch/aws-batch-distributed-training-p6.yaml b/1.architectures/3.aws-batch/aws-batch-distributed-training-p6.yaml index 014a17f54..785b7f3cc 100644 --- a/1.architectures/3.aws-batch/aws-batch-distributed-training-p6.yaml +++ b/1.architectures/3.aws-batch/aws-batch-distributed-training-p6.yaml @@ -1,35 +1,49 @@ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. # SPDX-License-Identifier: MIT-0 # +# AWS Batch setup for P6 distributed training with multi-node NCCL tests +# Author: yusongw@ +# +# Usage: +# 1. 
Create resource group and add Capacity Block Reservation (CBML): +# aws resource-groups create-group --name my-capacity-group \ +# --resource-query '{"Type":"TAG_FILTERS_1_0","Query":"{\"ResourceTypeFilters\":[\"AWS::EC2::CapacityReservation\"],\"TagFilters\":[]}"}' +# aws resource-groups group-resources --group my-capacity-group \ +# --resource-arns arn:aws:ec2:us-east-1:123456789012:capacity-reservation/cr-1234567890 +# +# 2. Deploy stack: +# aws cloudformation create-stack --stack-name aws-batch-p6 \ +# --template-body file://aws-batch-distributed-training-p6.yaml \ +# --parameters \ +# ParameterKey=VPCStackParameter,ParameterValue="aws-batch-vpc" \ +# ParameterKey=CapacityReservationResourceGroupArn,ParameterValue="arn:aws:resource-groups:us-east-1:123456789012:group/my-capacity-group" \ +# --capabilities CAPABILITY_NAMED_IAM +# +# 3. Generate and upload SSH key: +# ssh-keygen -t rsa -b 2048 -N '' -f /tmp/batch_key +# aws secretsmanager put-secret-value --secret-id aws-batch-p6-ssh-key --secret-string file:///tmp/batch_key +# rm /tmp/batch_key /tmp/batch_key.pub +# AWSTemplateFormatVersion: '2010-09-09' -Description: > - AWS Batch setup for P6 distributed training with multi-node NCCL tests. - Usage: - aws cloudformation create-stack --stack-name aws-batch-p6 \ - --template-body file://aws-batch-distributed-training-p6.yaml \ - --parameters \ - ParameterKey=VPCStackParameter,ParameterValue="aws-batch-vpc" \ - ParameterKey=CapacityBlockId,ParameterValue="cr-1234567890" \ - ParameterKey=PlacementGroupName,ParameterValue="my-placement-group" \ - --capabilities CAPABILITY_NAMED_IAM -Author: yusongw@ +Description: AWS Batch setup for P6 distributed training with multi-node NCCL tests. Simplified deployment with inline container setup. 
Parameters: VPCStackParameter: Type: String Description: Private subnets will be retrieved for the compute environment Default: 'aws-batch-vpc' - CapacityBlockId: - Description: ID of the Capacity Block + CapacityReservationResourceGroupArn: + Description: ARN of the Capacity Reservation Resource Group (recommended for managing multiple capacity blocks) Type: String Default: '' - PlacementGroupName: - Description: Optional cluster Placement Group name to use for EC2 placement (leave blank for none) + CapacityBlockId: + Description: ID of a specific Capacity Block (alternative to Resource Group) Type: String Default: '' Conditions: - HasPlacement: !Not [ !Equals [ !Ref PlacementGroupName, '' ] ] + UseResourceGroup: !Not [ !Equals [ !Ref CapacityReservationResourceGroupArn, '' ] ] + UseCapacityBlock: !Not [ !Equals [ !Ref CapacityBlockId, '' ] ] Metadata: AWS::CloudFormation::Interface: @@ -41,15 +55,15 @@ Metadata: - Label: default: AWS Batch Configuration Parameters: + - CapacityReservationResourceGroupArn - CapacityBlockId - - PlacementGroupName ParameterLabels: VPCStackParameter: default: Name of the VPC Stack + CapacityReservationResourceGroupArn: + default: Capacity Reservation Resource Group ARN (recommended) CapacityBlockId: - default: Capacity Block ID - PlacementGroupName: - default: Optional EC2 Placement Group name + default: Capacity Block ID (alternative) Resources: ################### @@ -62,11 +76,13 @@ Resources: InstanceMarketOptions: MarketType: "capacity-block" CapacityReservationSpecification: - CapacityReservationTarget: - CapacityReservationId: !Ref CapacityBlockId - # Optional placement group (only inserted when PlacementGroupName parameter is provided) - Placement: - GroupName: !If [ HasPlacement, !Ref PlacementGroupName, !Ref "AWS::NoValue" ] + CapacityReservationTarget: !If + - UseResourceGroup + - CapacityReservationResourceGroupArn: !Ref CapacityReservationResourceGroupArn + - !If + - UseCapacityBlock + - CapacityReservationId: !Ref 
CapacityBlockId + - !Ref "AWS::NoValue" NetworkInterfaces: - Description: EFA Interface Groups: !Split @@ -180,7 +196,7 @@ Resources: Action: - secretsmanager:GetSecretValue Resource: - - !GetAtt SSHKeySecret.Arn + - !Sub 'arn:aws:secretsmanager:${AWS::Region}:${AWS::AccountId}:secret:${AWS::StackName}-ssh-key-*' ECSTaskInstanceProfile: Type: AWS::IAM::InstanceProfile @@ -191,20 +207,14 @@ Resources: InstanceProfileName: !Join [ "", [ "ECSTaskInstanceProfileIAM-", !Ref AWS::StackName ] ] ## - ## EC2 Key Pair and Secrets Manager + ## Secrets Manager - Placeholder for SSH key (populate manually after stack creation) ## - BatchSSHKeyPair: - Type: AWS::EC2::KeyPair - Properties: - KeyName: !Sub "${AWS::StackName}-batch-ssh-key" - KeyType: rsa - SSHKeySecret: Type: AWS::SecretsManager::Secret Properties: Name: !Sub "${AWS::StackName}-ssh-key" - Description: "SSH private key for Batch MNP jobs (plain text format)" - SecretString: !GetAtt BatchSSHKeyPair.PrivateKeyValue + Description: "SSH private key for Batch MNP jobs - populate with: ssh-keygen -t rsa -b 2048 -N '' -f /tmp/batch_key && aws secretsmanager put-secret-value --secret-id --secret-string file:///tmp/batch_key" + SecretString: "PLACEHOLDER - Run the command in Description to generate and upload SSH key" ## ## Compute Environment and Job Definition @@ -327,7 +337,7 @@ Resources: - Name: NCCL_TEST_CMD Value: "/opt/nccl-tests/build/all_reduce_perf -b 8 -e 16G -f 2 -g 1 -c 1 -n 100" - Name: SSH_SECRET_ARN - Value: !GetAtt SSHKeySecret.Arn + Value: !Sub 'arn:aws:secretsmanager:${AWS::Region}:${AWS::AccountId}:secret:${AWS::StackName}-ssh-key' ResourceRequirements: - Type: VCPU Value: 192 @@ -410,13 +420,9 @@ Outputs: Description: ECR Repository for the containers Value: !Join ['', [!Ref 'AWS::AccountId','.dkr.ecr.', !Ref 'AWS::Region', '.amazonaws.com/', !Ref NCCLTestRepository ] ] - SSHKeyPairId: - Description: EC2 Key Pair ID for SSH access - Value: !GetAtt BatchSSHKeyPair.KeyPairId - SSHKeySecretArn: 
Description: ARN of the SSH private key secret in Secrets Manager - Value: !GetAtt SSHKeySecret.Arn + Value: !Sub 'arn:aws:secretsmanager:${AWS::Region}:${AWS::AccountId}:secret:${AWS::StackName}-ssh-key' JobDefinitionMultiInstance: Description: Job definition for Multi-node Parallel Jobs From b32857ac3c588efe87f2bb9ea3e0a30f4091b81a Mon Sep 17 00:00:00 2001 From: Yusong Wang Date: Wed, 29 Oct 2025 11:54:06 -0400 Subject: [PATCH 05/15] Auto-create resource group in P6 template, simplify deployment to 3 steps --- 1.architectures/3.aws-batch/README.md | 54 +++++++++++++++ .../aws-batch-distributed-training-p6.yaml | 65 +++++++------------ 2 files changed, 78 insertions(+), 41 deletions(-) diff --git a/1.architectures/3.aws-batch/README.md b/1.architectures/3.aws-batch/README.md index af494aee5..6ea45b05a 100644 --- a/1.architectures/3.aws-batch/README.md +++ b/1.architectures/3.aws-batch/README.md @@ -49,6 +49,60 @@ aws cloudformation create-stack --stack-name aws-batch-p5 \ --capabilities CAPABILITY_NAMED_IAM ``` +## P6 Deployment (Simplified) + +For P6 instances (p6-b200.48xlarge), use the simplified template that eliminates the need for custom Docker images and bootstrap scripts. 
+ +- **Template file**: [`aws-batch-distributed-training-p6.yaml`](./aws-batch-distributed-training-p6.yaml) + +### Features + +- Inline container setup - no custom Dockerfile needed +- Uses public NCCL tests image directly: `public.ecr.aws/hpc-cloud/nccl-tests:latest` +- Capacity Reservation Resource Group support for easier capacity management +- Manual SSH key generation stored in Secrets Manager +- Single CloudFormation template deployment + +### Deployment Steps + +```bash +# Step 1: Deploy CloudFormation Stack +aws cloudformation create-stack --stack-name aws-batch-p6 \ + --template-body file://aws-batch-distributed-training-p6.yaml \ + --parameters ParameterKey=VPCStackParameter,ParameterValue="aws-batch-vpc" \ + --capabilities CAPABILITY_NAMED_IAM + +# Step 2: Add Capacity Block to Resource Group +# Get the resource group ARN from stack outputs +RESOURCE_GROUP_ARN=$(aws cloudformation describe-stacks --stack-name aws-batch-p6 \ + --query 'Stacks[0].Outputs[?OutputKey==`CapacityReservationResourceGroupArn`].OutputValue' \ + --output text) + +# Add your capacity reservation(s) to the group +aws resource-groups group-resources --group ${RESOURCE_GROUP_ARN} \ + --resource-arns arn:aws:ec2:us-east-1:123456789012:capacity-reservation/cr-1234567890 + +# Step 3: Generate and Upload SSH Key +ssh-keygen -t rsa -b 2048 -N '' -f /tmp/batch_key +aws secretsmanager put-secret-value \ + --secret-id aws-batch-p6-ssh-key \ + --secret-string file:///tmp/batch_key +rm /tmp/batch_key /tmp/batch_key.pub +``` + +### P6 Template Parameters + +| Name | Type | Details | +|-------------------------|----------|---------------------------------------------| +| `VPCStackParameter` | Required | Name of the VPC stack in CloudFormation | + +### P6 Architecture Notes + +- **8 EFA interfaces** configured for p6-b200.48xlarge instances +- **Inline bash script** in Job Definition handles SSH setup, hostfile generation, and NCCL test execution +- **No custom Docker image required** - uses base 
NCCL tests image with runtime configuration +- **SSH keys** stored in Secrets Manager and fetched at container startup + ## Gotchas There are a few things to know as you evaluate this architecture: diff --git a/1.architectures/3.aws-batch/aws-batch-distributed-training-p6.yaml b/1.architectures/3.aws-batch/aws-batch-distributed-training-p6.yaml index 785b7f3cc..f8271fa89 100644 --- a/1.architectures/3.aws-batch/aws-batch-distributed-training-p6.yaml +++ b/1.architectures/3.aws-batch/aws-batch-distributed-training-p6.yaml @@ -3,26 +3,7 @@ # # AWS Batch setup for P6 distributed training with multi-node NCCL tests # Author: yusongw@ -# -# Usage: -# 1. Create resource group and add Capacity Block Reservation (CBML): -# aws resource-groups create-group --name my-capacity-group \ -# --resource-query '{"Type":"TAG_FILTERS_1_0","Query":"{\"ResourceTypeFilters\":[\"AWS::EC2::CapacityReservation\"],\"TagFilters\":[]}"}' -# aws resource-groups group-resources --group my-capacity-group \ -# --resource-arns arn:aws:ec2:us-east-1:123456789012:capacity-reservation/cr-1234567890 -# -# 2. Deploy stack: -# aws cloudformation create-stack --stack-name aws-batch-p6 \ -# --template-body file://aws-batch-distributed-training-p6.yaml \ -# --parameters \ -# ParameterKey=VPCStackParameter,ParameterValue="aws-batch-vpc" \ -# ParameterKey=CapacityReservationResourceGroupArn,ParameterValue="arn:aws:resource-groups:us-east-1:123456789012:group/my-capacity-group" \ -# --capabilities CAPABILITY_NAMED_IAM -# -# 3. Generate and upload SSH key: -# ssh-keygen -t rsa -b 2048 -N '' -f /tmp/batch_key -# aws secretsmanager put-secret-value --secret-id aws-batch-p6-ssh-key --secret-string file:///tmp/batch_key -# rm /tmp/batch_key /tmp/batch_key.pub +# See README.md for detailed usage instructions # AWSTemplateFormatVersion: '2010-09-09' Description: AWS Batch setup for P6 distributed training with multi-node NCCL tests. Simplified deployment with inline container setup. 
@@ -32,18 +13,9 @@ Parameters: Type: String Description: Private subnets will be retrieved for the compute environment Default: 'aws-batch-vpc' - CapacityReservationResourceGroupArn: - Description: ARN of the Capacity Reservation Resource Group (recommended for managing multiple capacity blocks) - Type: String - Default: '' CapacityBlockId: - Description: ID of a specific Capacity Block (alternative to Resource Group) + Description: ID of the Capacity Block Reservation Type: String - Default: '' - -Conditions: - UseResourceGroup: !Not [ !Equals [ !Ref CapacityReservationResourceGroupArn, '' ] ] - UseCapacityBlock: !Not [ !Equals [ !Ref CapacityBlockId, '' ] ] Metadata: AWS::CloudFormation::Interface: @@ -55,17 +27,29 @@ Metadata: - Label: default: AWS Batch Configuration Parameters: - - CapacityReservationResourceGroupArn - CapacityBlockId ParameterLabels: VPCStackParameter: default: Name of the VPC Stack - CapacityReservationResourceGroupArn: - default: Capacity Reservation Resource Group ARN (recommended) CapacityBlockId: - default: Capacity Block ID (alternative) + default: Capacity Block Reservation ID Resources: + ################### + ## Resource Group ## + ################### + CapacityReservationResourceGroup: + Type: AWS::ResourceGroups::Group + Properties: + Name: !Sub "${AWS::StackName}-capacity-group" + Description: Capacity reservations for AWS Batch P6 + ResourceQuery: + Type: CLOUDFORMATION_STACK_1_0 + Query: + ResourceTypeFilters: + - AWS::AllSupported + StackIdentifier: !Ref AWS::StackId + ################### ## EC2 Resources ## ################### @@ -76,13 +60,8 @@ Resources: InstanceMarketOptions: MarketType: "capacity-block" CapacityReservationSpecification: - CapacityReservationTarget: !If - - UseResourceGroup - - CapacityReservationResourceGroupArn: !Ref CapacityReservationResourceGroupArn - - !If - - UseCapacityBlock - - CapacityReservationId: !Ref CapacityBlockId - - !Ref "AWS::NoValue" + CapacityReservationTarget: + 
CapacityReservationResourceGroupArn: !GetAtt CapacityReservationResourceGroup.Arn NetworkInterfaces: - Description: EFA Interface Groups: !Split @@ -412,6 +391,10 @@ Resources: Attempts: 1 Outputs: + CapacityReservationResourceGroupArn: + Description: ARN of the Capacity Reservation Resource Group (add your CBML to this group) + Value: !GetAtt CapacityReservationResourceGroup.Arn + ECRRepository: Description: ECR Repository for the containers Value: !Ref NCCLTestRepository From 80f3652c8426fb04502e5739aac2c2ecdccb5bf5 Mon Sep 17 00:00:00 2001 From: Yusong Wang Date: Wed, 29 Oct 2025 18:30:38 -0400 Subject: [PATCH 06/15] Fix P6 deployment: use capacity reservation ID directly, add AL2023 ECS image, fix IMDSv2 and PATH issues --- 1.architectures/3.aws-batch/README.md | 61 +++++++++++++------ .../aws-batch-distributed-training-p6.yaml | 35 ++++++----- 2 files changed, 61 insertions(+), 35 deletions(-) diff --git a/1.architectures/3.aws-batch/README.md b/1.architectures/3.aws-batch/README.md index 6ea45b05a..8df427790 100644 --- a/1.architectures/3.aws-batch/README.md +++ b/1.architectures/3.aws-batch/README.md @@ -2,7 +2,7 @@ This architecture serves as an example to run distributed training jobs on p4d.24xlarge instances but can be easily be modified to accommodate other instance kinds (Trn or other P instances). -> **Important**: it is assumed that you deployed the VPC template [`2.vpc-one-az.yaml`](../0.vpc_network/2.vpc-oneaz.yaml) as our Batch template will fetch automatically the EFA Security Group ID (SG) and Subnet ID to setup the AWS Batch Compute Environment. Both the SG and Subnet are exported values from the VPC template. +> **Important**: it is assumed that you deployed the VPC template [`2.vpc-one-az.yaml`](../1.vpc_network/2.vpc-oneaz.yaml) as our Batch template will fetch automatically the EFA Security Group ID (SG) and Subnet ID to setup the AWS Batch Compute Environment. Both the SG and Subnet are exported values from the VPC template. 
This architecture consists of the following resources: @@ -49,7 +49,7 @@ aws cloudformation create-stack --stack-name aws-batch-p5 \ --capabilities CAPABILITY_NAMED_IAM ``` -## P6 Deployment (Simplified) +## P6 Deployment For P6 instances (p6-b200.48xlarge), use the simplified template that eliminates the need for custom Docker images and bootstrap scripts. @@ -67,34 +67,55 @@ For P6 instances (p6-b200.48xlarge), use the simplified template that eliminates ```bash # Step 1: Deploy CloudFormation Stack -aws cloudformation create-stack --stack-name aws-batch-p6 \ +aws cloudformation create-stack --stack-name batch-p6 \ --template-body file://aws-batch-distributed-training-p6.yaml \ - --parameters ParameterKey=VPCStackParameter,ParameterValue="aws-batch-vpc" \ + --parameters \ + ParameterKey=VPCStackParameter,ParameterValue="vpc-stack-ml" \ + ParameterKey=CapacityReservationId,ParameterValue="cr-1234567890" \ --capabilities CAPABILITY_NAMED_IAM -# Step 2: Add Capacity Block to Resource Group -# Get the resource group ARN from stack outputs -RESOURCE_GROUP_ARN=$(aws cloudformation describe-stacks --stack-name aws-batch-p6 \ - --query 'Stacks[0].Outputs[?OutputKey==`CapacityReservationResourceGroupArn`].OutputValue' \ - --output text) - -# Add your capacity reservation(s) to the group -aws resource-groups group-resources --group ${RESOURCE_GROUP_ARN} \ - --resource-arns arn:aws:ec2:us-east-1:123456789012:capacity-reservation/cr-1234567890 - -# Step 3: Generate and Upload SSH Key +# Step 2: Generate and Upload SSH Key ssh-keygen -t rsa -b 2048 -N '' -f /tmp/batch_key aws secretsmanager put-secret-value \ - --secret-id aws-batch-p6-ssh-key \ + --secret-id batch-p6-ssh-key \ --secret-string file:///tmp/batch_key rm /tmp/batch_key /tmp/batch_key.pub ``` ### P6 Template Parameters -| Name | Type | Details | -|-------------------------|----------|---------------------------------------------| -| `VPCStackParameter` | Required | Name of the VPC stack in CloudFormation | +| 
Name | Type | Details | +|---------------------------|----------|---------------------------------------------| +| `VPCStackParameter` | Required | Name of the VPC stack in CloudFormation | +| `CapacityReservationId` | Required | Capacity Reservation ID (e.g., cr-1234567890) | + +### Submitting a Test Job + +After deployment and SSH key setup, submit a multi-node NCCL test job: + +```bash +# Get the job definition and queue from stack outputs +JOB_DEFINITION=$(aws cloudformation describe-stacks --stack-name batch-p6 \ + --query 'Stacks[0].Outputs[?OutputKey==`JobDefinitionMultiInstance`].OutputValue' \ + --output text) + +JOB_QUEUE=$(aws cloudformation describe-stacks --stack-name batch-p6 \ + --query 'Stacks[0].Outputs[?OutputKey==`DistributedDeepLearningJQ`].OutputValue' \ + --output text) + +# Submit a 2-node NCCL test job +aws batch submit-job \ + --job-name nccl-test-2node \ + --job-queue ${JOB_QUEUE} \ + --job-definition ${JOB_DEFINITION} \ + --node-overrides numNodes=2 + +# Check job status +aws batch describe-jobs --jobs <job-id> + +# View job logs in CloudWatch Logs (check the job's logStreamName from describe-jobs) +aws logs tail /aws/batch/job --follow +``` ### P6 Architecture Notes @@ -102,13 +123,13 @@ rm /tmp/batch_key /tmp/batch_key.pub - **Inline bash script** in Job Definition handles SSH setup, hostfile generation, and NCCL test execution - **No custom Docker image required** - uses base NCCL tests image with runtime configuration - **SSH keys** stored in Secrets Manager and fetched at container startup +- **Default NCCL Test**: all_reduce_perf with 8 GPUs per node, 16 total processes ## Gotchas There are a few things to know as you evaluate this architecture: - EFA interfaces need to be declared explicitly in the EC2 Launch Template and you need to provide the security group used for EFA. - The Compute Environment must retrieve the list of private subnets from the VPC template. This list is exported by the VPC template. 
-- The Batch Job Definition assumes you are pushing a container with `stress-ng` and is pre-configured as such. ## Architecture Diagram diff --git a/1.architectures/3.aws-batch/aws-batch-distributed-training-p6.yaml b/1.architectures/3.aws-batch/aws-batch-distributed-training-p6.yaml index f8271fa89..f2a855819 100644 --- a/1.architectures/3.aws-batch/aws-batch-distributed-training-p6.yaml +++ b/1.architectures/3.aws-batch/aws-batch-distributed-training-p6.yaml @@ -13,9 +13,10 @@ Parameters: Type: String Description: Private subnets will be retrieved for the compute environment Default: 'aws-batch-vpc' - CapacityBlockId: - Description: ID of the Capacity Block Reservation + CapacityReservationId: Type: String + Description: Capacity Reservation ID (e.g., cr-1234567890) + Default: '' Metadata: AWS::CloudFormation::Interface: @@ -24,15 +25,9 @@ Metadata: default: General configuration Parameters: - VPCStackParameter - - Label: - default: AWS Batch Configuration - Parameters: - - CapacityBlockId ParameterLabels: VPCStackParameter: default: Name of the VPC Stack - CapacityBlockId: - default: Capacity Block Reservation ID Resources: ################### @@ -41,14 +36,17 @@ Resources: CapacityReservationResourceGroup: Type: AWS::ResourceGroups::Group Properties: - Name: !Sub "${AWS::StackName}-capacity-group" - Description: Capacity reservations for AWS Batch P6 + Name: !Sub "${AWS::StackName}-capacity-reservations" + Description: Capacity reservations for Batch P6 ResourceQuery: - Type: CLOUDFORMATION_STACK_1_0 + Type: TAG_FILTERS_1_0 Query: ResourceTypeFilters: - - AWS::AllSupported - StackIdentifier: !Ref AWS::StackId + - AWS::EC2::CapacityReservation + TagFilters: + - Key: ManagedBy + Values: + - !Ref AWS::StackName ################### ## EC2 Resources ## @@ -61,7 +59,7 @@ Resources: MarketType: "capacity-block" CapacityReservationSpecification: CapacityReservationTarget: - CapacityReservationResourceGroupArn: !GetAtt CapacityReservationResourceGroup.Arn + 
CapacityReservationId: !Ref CapacityReservationId NetworkInterfaces: - Description: EFA Interface Groups: !Split @@ -213,6 +211,8 @@ Resources: - Fn::ImportValue: !Sub ${VPCStackParameter}-PrivateSubnet Type: EC2 InstanceRole: !Ref ECSTaskInstanceProfile + Ec2Configuration: + - ImageType: ECS_AL2023_NVIDIA LaunchTemplate: LaunchTemplateId: !Ref DistributedDeepLearningLT Version: $Latest @@ -255,12 +255,17 @@ Resources: - -c - | set -euo pipefail + export PATH="/usr/local/bin:/usr/sbin:$PATH" echo "Node ${AWS_BATCH_JOB_NODE_INDEX}/${AWS_BATCH_JOB_NUM_NODES} starting" + # Get AWS region from metadata service (IMDSv2) + TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600") + AWS_REGION=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" -s http://169.254.169.254/latest/meta-data/placement/region) + # Fetch SSH key from Secrets Manager mkdir -p /root/.ssh chmod 700 /root/.ssh - aws --region "$(curl -s http://169.254.169.254/latest/dynamic/instance-identity/document | grep region | cut -d'\"' -f4)" secretsmanager get-secret-value --secret-id "${SSH_SECRET_ARN}" --query SecretString --output text > /root/.ssh/id_rsa + /usr/local/bin/aws --region "${AWS_REGION}" secretsmanager get-secret-value --secret-id "${SSH_SECRET_ARN}" --query SecretString --output text > /root/.ssh/id_rsa chmod 600 /root/.ssh/id_rsa ssh-keygen -y -f /root/.ssh/id_rsa > /root/.ssh/id_rsa.pub cat /root/.ssh/id_rsa.pub > /root/.ssh/authorized_keys From a5e7178dd92f92fe54582eb2b11bf97384898601 Mon Sep 17 00:00:00 2001 From: Yusong Wang Date: Wed, 29 Oct 2025 19:49:27 -0400 Subject: [PATCH 07/15] Add SSH key parameter for deployment, start sshd, fix main node self-registration and worker IP passing --- 1.architectures/3.aws-batch/README.md | 2 +- .../3.aws-batch/aws-batch-distributed-training-p6.yaml | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/1.architectures/3.aws-batch/README.md b/1.architectures/3.aws-batch/README.md 
index 8df427790..95fca36d8 100644 --- a/1.architectures/3.aws-batch/README.md +++ b/1.architectures/3.aws-batch/README.md @@ -14,7 +14,7 @@ This architecture consists of the following resources: ## Template -This template deploys AWS Batch and EC2 resources. It can be deployed via the console and the AWS CLI. Regardless of the deployment method it is assumed that you deployed the VPC template [`2.vpc-one-az.yaml`](../0.vpc_network/2.vpc-oneaz.yaml) prior to deploying that one. +This template deploys AWS Batch and EC2 resources. It can be deployed via the console and the AWS CLI. Regardless of the deployment method it is assumed that you deployed the VPC template [`2.vpc-one-az.yaml`](../1.vpc_network/2.vpc-oneaz.yaml) prior to deploying that one. - **Template file**: [`0.aws-batch-distributed-training.yaml`](./0.aws-batch-distributed-training.yaml) diff --git a/1.architectures/3.aws-batch/aws-batch-distributed-training-p6.yaml b/1.architectures/3.aws-batch/aws-batch-distributed-training-p6.yaml index f2a855819..e9fb741be 100644 --- a/1.architectures/3.aws-batch/aws-batch-distributed-training-p6.yaml +++ b/1.architectures/3.aws-batch/aws-batch-distributed-training-p6.yaml @@ -271,14 +271,19 @@ Resources: cat /root/.ssh/id_rsa.pub > /root/.ssh/authorized_keys chmod 600 /root/.ssh/authorized_keys + # Start SSH daemon + /usr/sbin/sshd + if [ "${AWS_BATCH_JOB_NUM_NODES}" -eq 1 ]; then # Single node job exec /opt/amazon/openmpi/bin/mpirun --allow-run-as-root -np "${NCCL_TOTAL_PROCS}" --bind-to none -x PATH -x LD_LIBRARY_PATH -x FI_PROVIDER=efa -x FI_EFA_USE_DEVICE_RDMA=1 -x FI_EFA_FORK_SAFE=1 ${NCCL_TEST_CMD} fi if [ "${AWS_BATCH_JOB_NODE_INDEX}" -eq "${AWS_BATCH_JOB_MAIN_NODE_INDEX}" ]; then - # Main node: collect IPs and build hostfile + # Main node: register own IP and collect worker IPs mkdir -p /tmp/hosts + MY_IP=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" -s http://169.254.169.254/latest/meta-data/local-ipv4) + echo "${MY_IP}" > 
/tmp/hosts/${AWS_BATCH_JOB_NODE_INDEX} echo "Main node waiting for ${AWS_BATCH_JOB_NUM_NODES} nodes" while [ "$(ls /tmp/hosts 2>/dev/null | wc -l)" -lt "${AWS_BATCH_JOB_NUM_NODES}" ]; do @@ -298,8 +303,9 @@ Resources: exec /opt/amazon/openmpi/bin/mpirun --allow-run-as-root -np "${NCCL_TOTAL_PROCS}" --hostfile /tmp/hostfile --bind-to none -x PATH -x LD_LIBRARY_PATH -x FI_PROVIDER=efa -x FI_EFA_USE_DEVICE_RDMA=1 -x FI_EFA_FORK_SAFE=1 ${NCCL_TEST_CMD} else # Worker node: register with main + MY_IP=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" -s http://169.254.169.254/latest/meta-data/local-ipv4) for i in 1 2 3 4 5; do - if ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=5 -o BatchMode=yes root@"${AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS}" "mkdir -p /tmp/hosts && echo '$(curl -s http://169.254.169.254/latest/meta-data/local-ipv4)' > /tmp/hosts/${AWS_BATCH_JOB_NODE_INDEX}"; then + if ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=5 -o BatchMode=yes root@"${AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS}" "mkdir -p /tmp/hosts && echo '${MY_IP}' > /tmp/hosts/${AWS_BATCH_JOB_NODE_INDEX}"; then echo "Registered with main node" break fi From 8e7126fb6e53f38c2aa9e82d362c682c9347805d Mon Sep 17 00:00:00 2001 From: Yusong Wang Date: Thu, 30 Oct 2025 00:07:29 -0400 Subject: [PATCH 08/15] Fix MNP networking: use container IP and exclude bridge interfaces - Use hostname -i for container IP in awsvpc mode - Set NCCL_SOCKET_IFNAME=^lo,docker,ecs to exclude bridge interfaces - Add BatchJobRole with ecs-tasks trust for container credentials - Simplify SSH key generation with runtime generation - Remove debug output and set NCCL_DEBUG=WARN --- .../aws-batch-distributed-training-p6.yaml | 49 +++++++++++++++---- 1 file changed, 39 insertions(+), 10 deletions(-) diff --git a/1.architectures/3.aws-batch/aws-batch-distributed-training-p6.yaml b/1.architectures/3.aws-batch/aws-batch-distributed-training-p6.yaml index 
e9fb741be..0785869b2 100644 --- a/1.architectures/3.aws-batch/aws-batch-distributed-training-p6.yaml +++ b/1.architectures/3.aws-batch/aws-batch-distributed-training-p6.yaml @@ -183,6 +183,29 @@ Resources: - !Ref ECSTaskServiceRole InstanceProfileName: !Join [ "", [ "ECSTaskInstanceProfileIAM-", !Ref AWS::StackName ] ] + BatchJobRole: + Type: AWS::IAM::Role + Properties: + AssumeRolePolicyDocument: + Version: 2012-10-17 + Statement: + - Effect: Allow + Principal: + Service: + - ecs-tasks.amazonaws.com + Action: + - sts:AssumeRole + Policies: + - PolicyName: SecretsManagerReadForSSH + PolicyDocument: + Version: '2012-10-17' + Statement: + - Effect: Allow + Action: + - secretsmanager:GetSecretValue + Resource: + - !Sub 'arn:aws:secretsmanager:${AWS::Region}:${AWS::AccountId}:secret:${AWS::StackName}-ssh-key-*' + ## ## Secrets Manager - Placeholder for SSH key (populate manually after stack creation) ## @@ -225,6 +248,7 @@ Resources: DistributedDeepLearningJQ: Type: AWS::Batch::JobQueue Properties: + JobQueueName: !Sub "${AWS::StackName}-job-queue" ComputeEnvironmentOrder: - ComputeEnvironment: !Ref DistributedDeepLearningCE Order: 1 @@ -240,6 +264,7 @@ Resources: NCCLTest: Type: AWS::Batch::JobDefinition Properties: + JobDefinitionName: !Sub "${AWS::StackName}-nccl-test" Type: multinode NodeProperties: MainNode: 0 @@ -249,6 +274,7 @@ Resources: Container: # Use base nccl-tests image from public ECR Image: public.ecr.aws/hpc-cloud/nccl-tests:latest + JobRoleArn: !GetAtt BatchJobRole.Arn # Inline command that sets up SSH and hostfile, then runs NCCL test Command: - /bin/bash @@ -258,21 +284,22 @@ Resources: export PATH="/usr/local/bin:/usr/sbin:$PATH" echo "Node ${AWS_BATCH_JOB_NODE_INDEX}/${AWS_BATCH_JOB_NUM_NODES} starting" - # Get AWS region from metadata service (IMDSv2) - TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600") - AWS_REGION=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" -s 
http://169.254.169.254/latest/meta-data/placement/region) - # Fetch SSH key from Secrets Manager mkdir -p /root/.ssh chmod 700 /root/.ssh - /usr/local/bin/aws --region "${AWS_REGION}" secretsmanager get-secret-value --secret-id "${SSH_SECRET_ARN}" --query SecretString --output text > /root/.ssh/id_rsa + /usr/local/bin/aws --region "${AWS_DEFAULT_REGION}" secretsmanager get-secret-value --secret-id "${SSH_SECRET_ARN}" --query SecretString --output text > /root/.ssh/id_rsa chmod 600 /root/.ssh/id_rsa ssh-keygen -y -f /root/.ssh/id_rsa > /root/.ssh/id_rsa.pub cat /root/.ssh/id_rsa.pub > /root/.ssh/authorized_keys chmod 600 /root/.ssh/authorized_keys # Start SSH daemon - /usr/sbin/sshd + mkdir -p /run/sshd + /usr/sbin/sshd || echo "ERROR: sshd failed to start" + sleep 1 + + # Get container IP address (not host IP) - in awsvpc mode, container has its own ENI + MY_IP=$(hostname -i | awk '{print $1}') if [ "${AWS_BATCH_JOB_NUM_NODES}" -eq 1 ]; then # Single node job @@ -282,7 +309,6 @@ Resources: if [ "${AWS_BATCH_JOB_NODE_INDEX}" -eq "${AWS_BATCH_JOB_MAIN_NODE_INDEX}" ]; then # Main node: register own IP and collect worker IPs mkdir -p /tmp/hosts - MY_IP=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" -s http://169.254.169.254/latest/meta-data/local-ipv4) echo "${MY_IP}" > /tmp/hosts/${AWS_BATCH_JOB_NODE_INDEX} echo "Main node waiting for ${AWS_BATCH_JOB_NUM_NODES} nodes" @@ -299,11 +325,12 @@ Resources: echo "Hostfile:" cat /tmp/hostfile - # Launch NCCL test - exec /opt/amazon/openmpi/bin/mpirun --allow-run-as-root -np "${NCCL_TOTAL_PROCS}" --hostfile /tmp/hostfile --bind-to none -x PATH -x LD_LIBRARY_PATH -x FI_PROVIDER=efa -x FI_EFA_USE_DEVICE_RDMA=1 -x FI_EFA_FORK_SAFE=1 ${NCCL_TEST_CMD} + # Launch NCCL test - exclude loopback and bridge interfaces + export NCCL_SOCKET_IFNAME=^lo,docker,ecs + export NCCL_DEBUG=WARN + exec /opt/amazon/openmpi/bin/mpirun --allow-run-as-root --mca btl tcp,self -np "${NCCL_TOTAL_PROCS}" --hostfile /tmp/hostfile --bind-to none -x PATH -x 
LD_LIBRARY_PATH -x NCCL_SOCKET_IFNAME -x NCCL_NET_PLUGIN=ofi -x FI_PROVIDER=efa -x FI_EFA_USE_DEVICE_RDMA=1 -x FI_EFA_FORK_SAFE=1 -x FI_EFA_ENABLE_SHM_TRANSFER=1 -x NCCL_DEBUG ${NCCL_TEST_CMD} else # Worker node: register with main - MY_IP=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" -s http://169.254.169.254/latest/meta-data/local-ipv4) for i in 1 2 3 4 5; do if ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=5 -o BatchMode=yes root@"${AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS}" "mkdir -p /tmp/hosts && echo '${MY_IP}' > /tmp/hosts/${AWS_BATCH_JOB_NODE_INDEX}"; then echo "Registered with main node" @@ -316,6 +343,8 @@ Resources: tail -f /dev/null fi Environment: + - Name: AWS_DEFAULT_REGION + Value: !Ref AWS::Region - Name: LD_LIBRARY_PATH Value: /opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/nvidia/lib:$LD_LIBRARY_PATH - Name: PATH From 6d13c0a065bc663a919cec15f034bbeb8cb188da Mon Sep 17 00:00:00 2001 From: Yusong Wang Date: Thu, 30 Oct 2025 15:09:55 -0400 Subject: [PATCH 09/15] updated README.md to have P6 support --- 1.architectures/3.aws-batch/README.md | 184 ++++++++++++++++---------- 1 file changed, 111 insertions(+), 73 deletions(-) diff --git a/1.architectures/3.aws-batch/README.md b/1.architectures/3.aws-batch/README.md index 95fca36d8..e73b00c63 100644 --- a/1.architectures/3.aws-batch/README.md +++ b/1.architectures/3.aws-batch/README.md @@ -1,136 +1,174 @@ -# AWS Batch distributed training architectures +# AWS Batch Distributed Training Architectures -This architecture serves as an example to run distributed training jobs on p4d.24xlarge instances but can be easily be modified to accommodate other instance kinds (Trn or other P instances). +This repository provides CloudFormation templates and examples for running distributed training jobs on AWS Batch using GPU instances. 
The architecture can be easily modified to accommodate different instance types including Trainium (Trn) and other P-series instances. -> **Important**: it is assumed that you deployed the VPC template [`2.vpc-one-az.yaml`](../1.vpc_network/2.vpc-oneaz.yaml) as our Batch template will fetch automatically the EFA Security Group ID (SG) and Subnet ID to setup the AWS Batch Compute Environment. Both the SG and Subnet are exported values from the VPC template. +## Table of Contents -This architecture consists of the following resources: +- [Prerequisites](#prerequisites) +- [Architecture Overview](#architecture-overview) +- [Available Templates](#available-templates) +- [Standard Deployment (P4/P5)](#standard-deployment-p4p5) +- [P6 Deployment](#p6-deployment) +- [Important Considerations](#important-considerations) +- [Architecture Diagram](#architecture-diagram) -- [AWS Batch Compute Environment](https://docs.aws.amazon.com/batch/latest/userguide/compute_environments.html) for [Multi-node parallel jobs](https://docs.aws.amazon.com/batch/latest/userguide/multi-node-parallel-jobs.html). It is similar to a compute cluster. -- [AWS Batch Job Queue](https://docs.aws.amazon.com/batch/latest/userguide/job_queues.html) attached to the compute environment. It is similar to a queue for job schedulers (Slurm, LSF...). -- [EC2 Launch Template](https://docs.aws.amazon.com/autoscaling/ec2/userguide/launch-templates.html) which used to setup 4 EFA cards on our instance. -- [Job Definition](https://docs.aws.amazon.com/batch/latest/userguide/job_definitions.html) serves as a template for our jobs and refers to the container registry to pull containers -- [ECR Container Registry](https://docs.aws.amazon.com/AmazonECR/latest/userguide/what-is-ecr.html) is used to store containers. +## Prerequisites -## Template +> **⚠️ Important**: You must deploy the VPC template [`2.vpc-one-az.yaml`](../../1.vpc_network/2.vpc-oneaz.yaml) before deploying any Batch template. 
The Batch templates automatically fetch the EFA Security Group ID and Subnet ID from the VPC template's exported values. -This template deploys AWS Batch and EC2 resources. It can be deployed via the console and the AWS CLI. Regardless of the deployment method it is assumed that you deployed the VPC template [`2.vpc-one-az.yaml`](../1.vpc_network/2.vpc-oneaz.yaml) prior to deploying that one. +## Architecture Overview -- **Template file**: [`0.aws-batch-distributed-training.yaml`](./0.aws-batch-distributed-training.yaml) +This architecture consists of the following AWS resources: -### Quick Create +| Component | Purpose | Documentation | +|-----------|---------|---------------| +| **AWS Batch Compute Environment** | Manages compute resources for multi-node parallel jobs (similar to a compute cluster) | [AWS Docs](https://docs.aws.amazon.com/batch/latest/userguide/compute_environments.html) | +| **AWS Batch Job Queue** | Queues jobs for execution (similar to Slurm/LSF schedulers) | [AWS Docs](https://docs.aws.amazon.com/batch/latest/userguide/job_queues.html) | +| **EC2 Launch Template** | Configures EFA network interfaces for high-performance networking | [AWS Docs](https://docs.aws.amazon.com/autoscaling/ec2/userguide/launch-templates.html) | +| **Job Definition** | Template for job execution, references container images | [AWS Docs](https://docs.aws.amazon.com/batch/latest/userguide/job_definitions.html) | +| **ECR Container Registry** | Stores Docker container images | [AWS Docs](https://docs.aws.amazon.com/AmazonECR/latest/userguide/what-is-ecr.html) | -[
 1-Click Deploy 🚀 
](https://console.aws.amazon.com/cloudformation/home?#/stacks/quickcreate?templateURL=https://awsome-distributed-training.s3.amazonaws.com/templates/0.aws-batch-distributed-training.yaml&stackName=AWS-Batch) +AWS Batch Architecture Diagram +## Available Templates -## List of Parameters +| Template | Instance Types | Features | +|----------|----------------|----------| +| [`0.aws-batch-distributed-training.yaml`](./0.aws-batch-distributed-training.yaml) | P4d.24xlarge (default) | Standard deployment with 4 EFA interfaces | +| [`0.aws-batch-distributed-training-p5.yaml`](./0.aws-batch-distributed-training-p5.yaml) | P5.48xlarge | Optimized for P5 instances | +| [`aws-batch-distributed-training-p6.yaml`](./aws-batch-distributed-training-p6.yaml) | P6-b200.48xlarge | P6 deployment with sample AWS Batch MNP job setup | -The templates takes parameters that are mandatory and optional, see below for more details. +## P4 Instance Deployment -| Name | Type | Details | -|-------------------------|-------------|-----------------------------------------------------------------------| -| `VPCStackParameter` | Required | Name of the VPC stack in CloudFormation. | -| `AMI` | Optional | ID of the AMI if using a custom one otherwise leave blank | -| `CapacityReservationId` | Optional | Use that or the ResourceGroup to refer to an EC2 reservation | -| `CapacityReservationResourceGroup` | Optional | Use that or the CapacityReservationId. | -| `EC2KeyPair` | Optional | EC2 key pair to use in case you want to connect through ssh for debug.| -| `CreatePlacementGroup` | Optional | Create a placement group for the instances. | +### Quick Deploy +Deploy the standard template with one click: -## Deploy with the AWS CLI +[
1-Click Deploy 🚀
](https://console.aws.amazon.com/cloudformation/home?#/stacks/quickcreate?templateURL=https://awsome-distributed-training.s3.amazonaws.com/templates/0.aws-batch-distributed-training.yaml&stackName=AWS-Batch) -If you'd like to deploy through the AWS CLI instead of the quick create link above, the command to deploy the template is shown below. Please edit the parameters values with your own configuration. +### Parameters -```bash -aws cloudformation create-stack --stack-name aws-batch-p5 \ - --template-body file://0.aws-batch-distributed-training-p5.yaml \ - --parameters ParameterKey=VPCStackParameter,ParameterValue="aws-batch-vpc" \ - ParameterKey=CapacityReservationId,ParameterValue="cr-1234567890" \ - --capabilities CAPABILITY_NAMED_IAM -``` +| Parameter | Type | Description | +|-----------|------|-------------| +| `VPCStackParameter` | **Required** | Name of the VPC CloudFormation stack | +| `AMI` | Optional | Custom AMI ID (leave blank for default) | +| `CapacityReservationId` | Optional | EC2 Capacity Reservation ID | +| `CapacityReservationResourceGroup` | Optional | Alternative to CapacityReservationId | +| `EC2KeyPair` | Optional | EC2 key pair for SSH debugging | +| `CreatePlacementGroup` | Optional | Create placement group for instances | -## P6 Deployment +### P5 Instance Deployment -For P6 instances (p6-b200.48xlarge), use the simplified template that eliminates the need for custom Docker images and bootstrap scripts. 
+```bash +aws cloudformation create-stack \ + --stack-name aws-batch-p5 \ + --template-body file://0.aws-batch-distributed-training-p5.yaml \ + --parameters \ + ParameterKey=VPCStackParameter,ParameterValue="your-vpc-stack-name" \ + ParameterKey=CapacityReservationId,ParameterValue="cr-1234567890" \ + --capabilities CAPABILITY_NAMED_IAM +``` -- **Template file**: [`aws-batch-distributed-training-p6.yaml`](./aws-batch-distributed-training-p6.yaml) +## P6 Instance Deployment -### Features +### Template Parameters -- Inline container setup - no custom Dockerfile needed -- Uses public NCCL tests image directly: `public.ecr.aws/hpc-cloud/nccl-tests:latest` -- Capacity Reservation Resource Group support for easier capacity management -- Manual SSH key generation stored in Secrets Manager -- Single CloudFormation template deployment +| Parameter | Type | Description | +|-----------|------|-------------| +| `VPCStackParameter` | **Required** | Name of the VPC CloudFormation stack | +| `CapacityReservationId` | **Required** | Capacity Reservation ID (e.g., cr-1234567890) | ### Deployment Steps +#### Step 1: Deploy CloudFormation Stack + ```bash -# Step 1: Deploy CloudFormation Stack -aws cloudformation create-stack --stack-name batch-p6 \ +aws cloudformation create-stack \ + --stack-name batch-p6 \ --template-body file://aws-batch-distributed-training-p6.yaml \ --parameters \ - ParameterKey=VPCStackParameter,ParameterValue="vpc-stack-ml" \ + ParameterKey=VPCStackParameter,ParameterValue="your-vpc-stack-name" \ ParameterKey=CapacityReservationId,ParameterValue="cr-1234567890" \ --capabilities CAPABILITY_NAMED_IAM +``` -# Step 2: Generate and Upload SSH Key +#### Step 2: Generate and Store SSH Key + +```bash +# Generate SSH key pair ssh-keygen -t rsa -b 2048 -N '' -f /tmp/batch_key + +# Store private key in Secrets Manager aws secretsmanager put-secret-value \ --secret-id batch-p6-ssh-key \ --secret-string file:///tmp/batch_key + +# Clean up temporary files rm 
/tmp/batch_key /tmp/batch_key.pub ``` -### P6 Template Parameters - -| Name | Type | Details | -|---------------------------|----------|---------------------------------------------| -| `VPCStackParameter` | Required | Name of the VPC stack in CloudFormation | -| `CapacityReservationId` | Required | Capacity Reservation ID (e.g., cr-1234567890) | +### Testing Your Deployment -### Submitting a Test Job - -After deployment and SSH key setup, submit a multi-node NCCL test job: +Submit a multi-node NCCL test job to verify the setup: ```bash -# Get the job definition and queue from stack outputs -JOB_DEFINITION=$(aws cloudformation describe-stacks --stack-name batch-p6 \ +# Retrieve stack outputs +JOB_DEFINITION=$(aws cloudformation describe-stacks \ + --stack-name batch-p6 \ --query 'Stacks[0].Outputs[?OutputKey==`JobDefinitionMultiInstance`].OutputValue' \ --output text) -JOB_QUEUE=$(aws cloudformation describe-stacks --stack-name batch-p6 \ +JOB_QUEUE=$(aws cloudformation describe-stacks \ + --stack-name batch-p6 \ --query 'Stacks[0].Outputs[?OutputKey==`DistributedDeepLearningJQ`].OutputValue' \ --output text) -# Submit a 2-node NCCL test job +# Submit test job aws batch submit-job \ --job-name nccl-test-2node \ --job-queue ${JOB_QUEUE} \ --job-definition ${JOB_DEFINITION} \ --node-overrides numNodes=2 -# Check job status +# Monitor job status aws batch describe-jobs --jobs <job-id> -# View job logs in CloudWatch Logs (check the job's logStreamName from describe-jobs) +# View logs aws logs tail /aws/batch/job --follow ``` -### P6 Architecture Notes +### P6 Architecture Details + +- **Container Image**: `public.ecr.aws/hpc-cloud/nccl-tests:latest` +- **Network Configuration**: 8 EFA interfaces per instance +- **SSH Setup**: Automated via inline bash script in Job Definition +- **Default Test**: `all_reduce_perf` with 8 GPUs per node (16 total processes for 2-node job) +- **Key Management**: SSH keys retrieved from Secrets Manager at container startup + +## Important 
Considerations + +### EFA Network Configuration + +- EFA interfaces must be explicitly declared in the EC2 Launch Template +- The EFA security group must be provided and properly configured +- Network performance is critical for distributed training workloads + +### VPC Dependencies + +- The Compute Environment retrieves private subnet information from the VPC template +- Ensure the VPC template exports the required subnet and security group values +- Both templates must be deployed in the same AWS region -- **8 EFA interfaces** configured for p6-b200.48xlarge instances -- **Inline bash script** in Job Definition handles SSH setup, hostfile generation, and NCCL test execution -- **No custom Docker image required** - uses base NCCL tests image with runtime configuration -- **SSH keys** stored in Secrets Manager and fetched at container startup -- **Default NCCL Test**: all_reduce_perf with 8 GPUs per node, 16 total processes +### Capacity Management -## Gotchas +- Use Capacity Reservations for guaranteed instance availability +- Consider using Capacity Reservation Resource Groups for easier management +- Monitor your EC2 limits and request increases if needed -There are a few things to know as you evaluate this architecture: -- EFA interfaces need to be declared explicitly in the EC2 Launch Template and you need to provide the security group used for EFA. -- The Compute Environment must retrieve the list of private subnets from the VPC template. This list is exported by the VPC template. 
+--- -## Architecture Diagram +## Additional Resources - +- [AWS Batch User Guide](https://docs.aws.amazon.com/batch/latest/userguide/) +- [Multi-node Parallel Jobs](https://docs.aws.amazon.com/batch/latest/userguide/multi-node-parallel-jobs.html) +- [EFA Documentation](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html) From ab955ca3f492746b40d7d056777a0b6089cd2f0e Mon Sep 17 00:00:00 2001 From: Yusong Wang Date: Thu, 30 Oct 2025 15:15:55 -0400 Subject: [PATCH 10/15] Fix table of contents links in README --- 1.architectures/3.aws-batch/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/1.architectures/3.aws-batch/README.md b/1.architectures/3.aws-batch/README.md index e73b00c63..de2e55efb 100644 --- a/1.architectures/3.aws-batch/README.md +++ b/1.architectures/3.aws-batch/README.md @@ -7,10 +7,10 @@ This repository provides CloudFormation templates and examples for running distr - [Prerequisites](#prerequisites) - [Architecture Overview](#architecture-overview) - [Available Templates](#available-templates) -- [Standard Deployment (P4/P5)](#standard-deployment-p4p5) -- [P6 Deployment](#p6-deployment) +- [P4 Instance Deployment](#p4-instance-deployment) +- [P5 Instance Deployment](#p5-instance-deployment) +- [P6 Instance Deployment](#p6-instance-deployment) - [Important Considerations](#important-considerations) -- [Architecture Diagram](#architecture-diagram) ## Prerequisites From bbc0a76dda9049e5c5285ab34facd0d7c1682abd Mon Sep 17 00:00:00 2001 From: Yusong Wang Date: Thu, 30 Oct 2025 15:49:21 -0400 Subject: [PATCH 11/15] fix: correct VPC template filename reference in README --- 1.architectures/3.aws-batch/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/1.architectures/3.aws-batch/README.md b/1.architectures/3.aws-batch/README.md index de2e55efb..bcb4d845c 100644 --- a/1.architectures/3.aws-batch/README.md +++ b/1.architectures/3.aws-batch/README.md @@ -14,7 +14,7 @@ This repository provides 
CloudFormation templates and examples for running distr ## Prerequisites -> **⚠️ Important**: You must deploy the VPC template [`2.vpc-one-az.yaml`](../../1.vpc_network/2.vpc-oneaz.yaml) before deploying any Batch template. The Batch templates automatically fetch the EFA Security Group ID and Subnet ID from the VPC template's exported values. +> **⚠️ Important**: You must deploy the VPC template [`2.vpc-one-az.yaml`](../../1.vpc_network/2.vpc-one-az.yaml) before deploying any Batch template. The Batch templates automatically fetch the EFA Security Group ID and Subnet ID from the VPC template's exported values. ## Architecture Overview From 5614275b694a612e4cabaa2678ec2c587a616e86 Mon Sep 17 00:00:00 2001 From: Yusong Wang Date: Thu, 30 Oct 2025 16:03:30 -0400 Subject: [PATCH 12/15] fix link --- 1.architectures/3.aws-batch/README.md | 2 +- README.md.bak | 112 ++++++++++++++++++++++++++ 2 files changed, 113 insertions(+), 1 deletion(-) create mode 100644 README.md.bak diff --git a/1.architectures/3.aws-batch/README.md b/1.architectures/3.aws-batch/README.md index bcb4d845c..78b3ac563 100644 --- a/1.architectures/3.aws-batch/README.md +++ b/1.architectures/3.aws-batch/README.md @@ -14,7 +14,7 @@ This repository provides CloudFormation templates and examples for running distr ## Prerequisites -> **⚠️ Important**: You must deploy the VPC template [`2.vpc-one-az.yaml`](../../1.vpc_network/2.vpc-one-az.yaml) before deploying any Batch template. The Batch templates automatically fetch the EFA Security Group ID and Subnet ID from the VPC template's exported values. +> **⚠️ Important**: You must deploy the VPC template [`2.vpc-one-az.yaml`](../../1.architectures/1.vpc_network/2.vpc-one-az.yaml) before deploying any Batch template. The Batch templates automatically fetch the EFA Security Group ID and Subnet ID from the VPC template's exported values. 
## Architecture Overview diff --git a/README.md.bak b/README.md.bak new file mode 100644 index 000000000..0aaa71117 --- /dev/null +++ b/README.md.bak @@ -0,0 +1,112 @@ +# ML Training Reference Architectures & Tests + +> **Warning** +> We are currently undergoing a major refactoring of this repository, particularly focused on the test cases section. If you prefer to use the previous directory structure and deprecated test cases, please refer to [v1.1.0](https://github.com/aws-samples/awsome-distributed-training/releases/tag/v1.1.0). + + +This repository contains reference architectures and test cases for distributed model training with [Amazon SageMaker Hyperpod](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod.html), [AWS ParallelCluster](https://docs.aws.amazon.com/parallelcluster/latest/ug/what-is-aws-parallelcluster.html), [AWS Batch](https://docs.aws.amazon.com/batch/latest/userguide/what-is-batch.html), and [Amazon EKS](https://docs.aws.amazon.com/eks/latest/userguide/getting-started-console.html). The test cases cover different types and sizes of models as well as different frameworks and parallel optimizations (Pytorch DDP/FSDP, MegatronLM, NemoMegatron...). + +The major components of this directory are: + +```bash +reference-architectures/ +|-- 1.architectures/ # CloudFormation templates for reference arch +|-- 2.ami_and_containers/ # Scripts to create AMIs and container images +|-- 3.test_cases/ # Reference test cases and/or benchmark scripts +|-- 4.validation_observability/ # Tools to measure performance or troubleshoot +`-- ... +``` + +**NOTE**: the architectures are designed to work with the S3 bucket and VPC created using reference templates `1.architectures/0.s3/` and `1.architectures/1.vpc_network/`. _You're strongly recommended to deploy these two templates **before** deploying any of the reference architectures._ + +## 0. Workshops + +You can follow the workshop below to train models on AWS. 
Each contains examples for several test cases as well as nuggets of information on operating a cluster for LLM training. + +| Name | Comments +| ------------------------------------------------------------------------------ | ------------------------------------------------------------------- | +| [Amazon SageMaker HyperPod](https://catalog.workshops.aws/sagemaker-hyperpod/en-US) | Workshop for SageMaker HyperPod, shows how to deploy and monitor it | +| [AWS ParallelCluster](https://catalog.workshops.aws/ml-on-aws-parallelcluster) | Similar workshop as HyperPod but on ParallelCluster | +| [Amazon SageMaker HyperPod EKS](https://catalog.workshops.aws/sagemaker-hyperpod-eks) | Workshop for SageMaker HyperPod EKS, shows how to deploy and monitor it | + +## 1. Architectures + +Architectures are located in `1.architectures` and consists of utilities and service related architectures. + +| Name | Category | Usage | +| ------------------------------------------------------------------ | -------- | --------------------------------------------------- | +| [`0.s3`](./1.architectures/0.s3) | Storage | Create an S3 bucket | +| [`1.vpc_network`](./1.architectures/1.vpc_network) | Network | Create a VPC with subnets required resources | +| [`2.aws-parallelcluster`](./1.architectures/2.aws-parallelcluster) | Compute | Cluster templates for GPU & custom silicon training | +| [`3.aws-batch`](./1.architectures/3.aws-batch) | Compute | AWS Batch template for distributed training | +| [`4.amazon-eks`](./1.architectures/4.amazon-eks) | Compute | Manifest files to train with Amazon EKS | +| [`5.sagemaker-hyperpod`](./1.architectures/5.sagemaker-hyperpod) | Compute | SageMaker HyperPod template for distributed training| + +More will come, feel free to add new ones (ex. Ray?). You will also find [documentation](./1.architectures/efa-cheatsheet.md) for EFA and the recommended environment variables. + +## 2. 
Custom Amazon Machine Images + +Custom machine images can be built using [Packer](www.packer.io) for AWS ParallelCluster, Amazon EKS and plain EC2. These images are based are on Ansible roles and playbooks. + +## 3. Test cases + +Test cases are organized by framework and cover various distributed training scenarios. Each test case includes the necessary scripts and configurations to run distributed training jobs. + +### PyTorch Test Cases +- [`FSDP/`](./3.test_cases/pytorch/FSDP) - Fully Sharded Data Parallel training examples +- [`megatron-lm/`](./3.test_cases/pytorch/megatron-lm) - Megatron-LM distributed training examples +- [`nemo-launcher/`](./3.test_cases/pytorch/nemo-launcher) - NeMo Launcher examples for distributed training. This test case is for NeMo version 1.0 only. +- [`nemo-run/`](./3.test_cases/pytorch/nemo-run) - NeMo framework distributed training examples. This test case is for NeMo version 2.0+. +- [`neuronx-distributed/`](./3.test_cases/pytorch/neuronx-distributed) - AWS Trainium distributed training examples +- [`mosaicml-composer/`](./3.test_cases/pytorch/mosaicml-composer) - MosaicML Composer examples +- [`picotron/`](./3.test_cases/pytorch/picotron) - PicoTron distributed training examples +- [`torchtitan/`](./3.test_cases/pytorch/torchtitan) - TorchTitan examples +- [`cpu-ddp/`](./3.test_cases/pytorch/cpu-ddp) - CPU-based Distributed Data Parallel examples +- [`bionemo/`](./3.test_cases/pytorch/bionemo) - BioNeMo distributed training examples + +### JAX Test Cases +- [`jax/`](./3.test_cases/jax) - JAX-based distributed training examples using PaxML + +Each test case includes: +- Training scripts and configurations +- Container definitions (where applicable) +- Launch scripts for different cluster types +- Performance monitoring and validation tools + +## 4. Validation scripts + +Utilities scripts and micro-benchmarks examples are set under `4.validation_scripts/`. 
The EFA Prometheus exporter can be found in this [directory](./4.validation_and_observability/3.efa-node-exporter) + + +| Name | Comments | +| --------------------------------------------------------------------------------------- | --------------------------------------------------------------- | +| [`1.pytorch-env-validation`](./4.validation_and_observability/1.pytorch-env-validation) | Validates your PyTorch environment | +| [`3.efa-node-exporter`](./4.validation_and_observability/3.efa-node-exporter) | Node exporter with Amazon EFA monitoring modules | +| [`4.prometheus-grafana`](./4.validation_and_observability/4.prometheus-grafana) | Deployment assets to monitor SageMaker Hyperpod Clusters | +| [`5.nsight`](./4.validation_and_observability/5.nsight) | Shows how to run Nvidia Nsight Systems to profile your workload | +| [`efa-versions.py`](./1.architectures/efa-versions.py) | Get the versions of Nvidia libraries, drivers and EFA drivers | + + +## 5. CI + +Integration tests are written in [pytest](https://docs.pytest.org). Just run: + +```bash +pytest . +``` + +Alternatively you can run tests with out capturing stdout and keeping all docker images an other artifacts. + +```bash +pytest -s --keep-artifacts=t +``` + +## 6. Contributors + +Thanks to all the contributors for building, reviewing and testing. 
+ +[![Contributors](https://contrib.rocks/image?repo=aws-samples/awsome-distributed-training)](https://github.com/aws-samples/awsome-distributed-training/graphs/contributors) + +## 7.Star History + +[![Star History Chart](https://api.star-history.com/svg?repos=aws-samples/awsome-distributed-training&type=Date)](https://star-history.com/#aws-samples/awsome-distributed-training&Date) From c70d0fd98aac3b9ddf6247ed8ee62965d19f8628 Mon Sep 17 00:00:00 2001 From: Yusong Wang Date: Thu, 30 Oct 2025 16:05:45 -0400 Subject: [PATCH 13/15] delete backup file --- README.md.bak | 112 -------------------------------------------------- 1 file changed, 112 deletions(-) delete mode 100644 README.md.bak diff --git a/README.md.bak b/README.md.bak deleted file mode 100644 index 0aaa71117..000000000 --- a/README.md.bak +++ /dev/null @@ -1,112 +0,0 @@ -# ML Training Reference Architectures & Tests - -> **Warning** -> We are currently undergoing a major refactoring of this repository, particularly focused on the test cases section. If you prefer to use the previous directory structure and deprecated test cases, please refer to [v1.1.0](https://github.com/aws-samples/awsome-distributed-training/releases/tag/v1.1.0). - - -This repository contains reference architectures and test cases for distributed model training with [Amazon SageMaker Hyperpod](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod.html), [AWS ParallelCluster](https://docs.aws.amazon.com/parallelcluster/latest/ug/what-is-aws-parallelcluster.html), [AWS Batch](https://docs.aws.amazon.com/batch/latest/userguide/what-is-batch.html), and [Amazon EKS](https://docs.aws.amazon.com/eks/latest/userguide/getting-started-console.html). The test cases cover different types and sizes of models as well as different frameworks and parallel optimizations (Pytorch DDP/FSDP, MegatronLM, NemoMegatron...). 
- -The major components of this directory are: - -```bash -reference-architectures/ -|-- 1.architectures/ # CloudFormation templates for reference arch -|-- 2.ami_and_containers/ # Scripts to create AMIs and container images -|-- 3.test_cases/ # Reference test cases and/or benchmark scripts -|-- 4.validation_observability/ # Tools to measure performance or troubleshoot -`-- ... -``` - -**NOTE**: the architectures are designed to work with the S3 bucket and VPC created using reference templates `1.architectures/0.s3/` and `1.architectures/1.vpc_network/`. _You're strongly recommended to deploy these two templates **before** deploying any of the reference architectures._ - -## 0. Workshops - -You can follow the workshop below to train models on AWS. Each contains examples for several test cases as well as nuggets of information on operating a cluster for LLM training. - -| Name | Comments -| ------------------------------------------------------------------------------ | ------------------------------------------------------------------- | -| [Amazon SageMaker HyperPod](https://catalog.workshops.aws/sagemaker-hyperpod/en-US) | Workshop for SageMaker HyperPod, shows how to deploy and monitor it | -| [AWS ParallelCluster](https://catalog.workshops.aws/ml-on-aws-parallelcluster) | Similar workshop as HyperPod but on ParallelCluster | -| [Amazon SageMaker HyperPod EKS](https://catalog.workshops.aws/sagemaker-hyperpod-eks) | Workshop for SageMaker HyperPod EKS, shows how to deploy and monitor it | - -## 1. Architectures - -Architectures are located in `1.architectures` and consists of utilities and service related architectures. 
- -| Name | Category | Usage | -| ------------------------------------------------------------------ | -------- | --------------------------------------------------- | -| [`0.s3`](./1.architectures/0.s3) | Storage | Create an S3 bucket | -| [`1.vpc_network`](./1.architectures/1.vpc_network) | Network | Create a VPC with subnets required resources | -| [`2.aws-parallelcluster`](./1.architectures/2.aws-parallelcluster) | Compute | Cluster templates for GPU & custom silicon training | -| [`3.aws-batch`](./1.architectures/3.aws-batch) | Compute | AWS Batch template for distributed training | -| [`4.amazon-eks`](./1.architectures/4.amazon-eks) | Compute | Manifest files to train with Amazon EKS | -| [`5.sagemaker-hyperpod`](./1.architectures/5.sagemaker-hyperpod) | Compute | SageMaker HyperPod template for distributed training| - -More will come, feel free to add new ones (ex. Ray?). You will also find [documentation](./1.architectures/efa-cheatsheet.md) for EFA and the recommended environment variables. - -## 2. Custom Amazon Machine Images - -Custom machine images can be built using [Packer](www.packer.io) for AWS ParallelCluster, Amazon EKS and plain EC2. These images are based are on Ansible roles and playbooks. - -## 3. Test cases - -Test cases are organized by framework and cover various distributed training scenarios. Each test case includes the necessary scripts and configurations to run distributed training jobs. - -### PyTorch Test Cases -- [`FSDP/`](./3.test_cases/pytorch/FSDP) - Fully Sharded Data Parallel training examples -- [`megatron-lm/`](./3.test_cases/pytorch/megatron-lm) - Megatron-LM distributed training examples -- [`nemo-launcher/`](./3.test_cases/pytorch/nemo-launcher) - NeMo Launcher examples for distributed training. This test case is for NeMo version 1.0 only. -- [`nemo-run/`](./3.test_cases/pytorch/nemo-run) - NeMo framework distributed training examples. This test case is for NeMo version 2.0+. 
-- [`neuronx-distributed/`](./3.test_cases/pytorch/neuronx-distributed) - AWS Trainium distributed training examples -- [`mosaicml-composer/`](./3.test_cases/pytorch/mosaicml-composer) - MosaicML Composer examples -- [`picotron/`](./3.test_cases/pytorch/picotron) - PicoTron distributed training examples -- [`torchtitan/`](./3.test_cases/pytorch/torchtitan) - TorchTitan examples -- [`cpu-ddp/`](./3.test_cases/pytorch/cpu-ddp) - CPU-based Distributed Data Parallel examples -- [`bionemo/`](./3.test_cases/pytorch/bionemo) - BioNeMo distributed training examples - -### JAX Test Cases -- [`jax/`](./3.test_cases/jax) - JAX-based distributed training examples using PaxML - -Each test case includes: -- Training scripts and configurations -- Container definitions (where applicable) -- Launch scripts for different cluster types -- Performance monitoring and validation tools - -## 4. Validation scripts - -Utilities scripts and micro-benchmarks examples are set under `4.validation_scripts/`. The EFA Prometheus exporter can be found in this [directory](./4.validation_and_observability/3.efa-node-exporter) - - -| Name | Comments | -| --------------------------------------------------------------------------------------- | --------------------------------------------------------------- | -| [`1.pytorch-env-validation`](./4.validation_and_observability/1.pytorch-env-validation) | Validates your PyTorch environment | -| [`3.efa-node-exporter`](./4.validation_and_observability/3.efa-node-exporter) | Node exporter with Amazon EFA monitoring modules | -| [`4.prometheus-grafana`](./4.validation_and_observability/4.prometheus-grafana) | Deployment assets to monitor SageMaker Hyperpod Clusters | -| [`5.nsight`](./4.validation_and_observability/5.nsight) | Shows how to run Nvidia Nsight Systems to profile your workload | -| [`efa-versions.py`](./1.architectures/efa-versions.py) | Get the versions of Nvidia libraries, drivers and EFA drivers | - - -## 5. 
CI - -Integration tests are written in [pytest](https://docs.pytest.org). Just run: - -```bash -pytest . -``` - -Alternatively you can run tests with out capturing stdout and keeping all docker images an other artifacts. - -```bash -pytest -s --keep-artifacts=t -``` - -## 6. Contributors - -Thanks to all the contributors for building, reviewing and testing. - -[![Contributors](https://contrib.rocks/image?repo=aws-samples/awsome-distributed-training)](https://github.com/aws-samples/awsome-distributed-training/graphs/contributors) - -## 7.Star History - -[![Star History Chart](https://api.star-history.com/svg?repos=aws-samples/awsome-distributed-training&type=Date)](https://star-history.com/#aws-samples/awsome-distributed-training&Date) From db999282d7b3bdcdfd74d6fc241e2e0fd2a51d97 Mon Sep 17 00:00:00 2001 From: Yusong Wang Date: Fri, 31 Oct 2025 17:57:00 -0400 Subject: [PATCH 14/15] fix: address security scan findings - Remove ECR repository (using public ECR image) - Add KMS encryption with key rotation for Secrets Manager - Convert inline IAM policies to managed policies - Remove explicit resource names for auto-generation - Enforce IMDSv2 on Launch Template - Add suppression for SSH key rotation (not applicable) --- .../aws-batch-distributed-training-p6.yaml | 113 ++++++++++-------- 1 file changed, 63 insertions(+), 50 deletions(-) diff --git a/1.architectures/3.aws-batch/aws-batch-distributed-training-p6.yaml b/1.architectures/3.aws-batch/aws-batch-distributed-training-p6.yaml index 0785869b2..2ccfc5895 100644 --- a/1.architectures/3.aws-batch/aws-batch-distributed-training-p6.yaml +++ b/1.architectures/3.aws-batch/aws-batch-distributed-training-p6.yaml @@ -31,22 +31,36 @@ Metadata: Resources: ################### - ## Resource Group ## + ## KMS Key for Secrets Manager ## ################### - CapacityReservationResourceGroup: - Type: AWS::ResourceGroups::Group + SecretsKMSKey: + Type: AWS::KMS::Key Properties: - Name: !Sub 
"${AWS::StackName}-capacity-reservations" - Description: Capacity reservations for Batch P6 - ResourceQuery: - Type: TAG_FILTERS_1_0 - Query: - ResourceTypeFilters: - - AWS::EC2::CapacityReservation - TagFilters: - - Key: ManagedBy - Values: - - !Ref AWS::StackName + Description: KMS key for encrypting Batch SSH secrets + EnableKeyRotation: true + KeyPolicy: + Version: '2012-10-17' + Statement: + - Sid: Enable IAM User Permissions + Effect: Allow + Principal: + AWS: !Sub 'arn:aws:iam::${AWS::AccountId}:root' + Action: 'kms:*' + Resource: '*' + - Sid: Allow Secrets Manager to use the key + Effect: Allow + Principal: + Service: secretsmanager.amazonaws.com + Action: + - 'kms:Decrypt' + - 'kms:GenerateDataKey' + Resource: '*' + + SecretsKMSKeyAlias: + Type: AWS::KMS::Alias + Properties: + AliasName: !Sub 'alias/${AWS::StackName}-secrets' + TargetKeyId: !Ref SecretsKMSKey ################### ## EC2 Resources ## @@ -55,6 +69,10 @@ Resources: Type: AWS::EC2::LaunchTemplate Properties: LaunchTemplateData: + MetadataOptions: + HttpTokens: required # Enforce IMDSv2 + HttpEndpoint: enabled + HttpPutResponseHopLimit: 1 InstanceMarketOptions: MarketType: "capacity-block" CapacityReservationSpecification: @@ -149,6 +167,24 @@ Resources: ManagedPolicyArns: - arn:aws:iam::aws:policy/service-role/AWSBatchServiceRole + SecretsManagerPolicy: + Type: AWS::IAM::ManagedPolicy + Properties: + Description: Policy for reading SSH secrets from Secrets Manager + PolicyDocument: + Version: '2012-10-17' + Statement: + - Effect: Allow + Action: + - secretsmanager:GetSecretValue + Resource: + - !Sub 'arn:aws:secretsmanager:${AWS::Region}:${AWS::AccountId}:secret:${AWS::StackName}-ssh-key-*' + - Effect: Allow + Action: + - kms:Decrypt + Resource: + - !GetAtt SecretsKMSKey.Arn + ECSTaskServiceRole: Type: AWS::IAM::Role Properties: @@ -164,16 +200,7 @@ Resources: - sts:AssumeRole ManagedPolicyArns: - arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role - Policies: - - 
PolicyName: SecretsManagerReadForSSH - PolicyDocument: - Version: '2012-10-17' - Statement: - - Effect: Allow - Action: - - secretsmanager:GetSecretValue - Resource: - - !Sub 'arn:aws:secretsmanager:${AWS::Region}:${AWS::AccountId}:secret:${AWS::StackName}-ssh-key-*' + - !Ref SecretsManagerPolicy ECSTaskInstanceProfile: Type: AWS::IAM::InstanceProfile @@ -195,26 +222,27 @@ Resources: - ecs-tasks.amazonaws.com Action: - sts:AssumeRole - Policies: - - PolicyName: SecretsManagerReadForSSH - PolicyDocument: - Version: '2012-10-17' - Statement: - - Effect: Allow - Action: - - secretsmanager:GetSecretValue - Resource: - - !Sub 'arn:aws:secretsmanager:${AWS::Region}:${AWS::AccountId}:secret:${AWS::StackName}-ssh-key-*' + ManagedPolicyArns: + - !Ref SecretsManagerPolicy ## ## Secrets Manager - Placeholder for SSH key (populate manually after stack creation) ## SSHKeySecret: Type: AWS::SecretsManager::Secret + Metadata: + cfn_nag: + rules_to_suppress: + - id: W77 + reason: "SSH keys for Batch MNP do not require automatic rotation" + guard: + SuppressedRules: + - SECRETSMANAGER_ROTATION_ENABLED_CHECK Properties: Name: !Sub "${AWS::StackName}-ssh-key" Description: "SSH private key for Batch MNP jobs - populate with: ssh-keygen -t rsa -b 2048 -N '' -f /tmp/batch_key && aws secretsmanager put-secret-value --secret-id --secret-string file:///tmp/batch_key" SecretString: "PLACEHOLDER - Run the command in Description to generate and upload SSH key" + KmsKeyId: !Ref SecretsKMSKey ## ## Compute Environment and Job Definition @@ -256,11 +284,8 @@ Resources: State: "ENABLED" ## - ## ECR and AWS Batch Job definition + ## AWS Batch Job definition ## - NCCLTestRepository: - Type: AWS::ECR::Repository - NCCLTest: Type: AWS::Batch::JobDefinition Properties: @@ -431,18 +456,6 @@ Resources: Attempts: 1 Outputs: - CapacityReservationResourceGroupArn: - Description: ARN of the Capacity Reservation Resource Group (add your CBML to this group) - Value: !GetAtt 
CapacityReservationResourceGroup.Arn - - ECRRepository: - Description: ECR Repository for the containers - Value: !Ref NCCLTestRepository - - ECRRepositoryUrl: - Description: ECR Repository for the containers - Value: !Join ['', [!Ref 'AWS::AccountId','.dkr.ecr.', !Ref 'AWS::Region', '.amazonaws.com/', !Ref NCCLTestRepository ] ] - SSHKeySecretArn: Description: ARN of the SSH private key secret in Secrets Manager Value: !Sub 'arn:aws:secretsmanager:${AWS::Region}:${AWS::AccountId}:secret:${AWS::StackName}-ssh-key' From f0bd3b30634a470e4c2f67617cb8aa33b36c362e Mon Sep 17 00:00:00 2001 From: Yusong Wang Date: Sun, 9 Nov 2025 11:50:44 -0500 Subject: [PATCH 15/15] feat: update NCCL tests image to specific version for better P6 performance Use public.ecr.aws/hpc-cloud/nccl-tests:cuda12.8.1-efa1.42.0-ofiv1.16.0-ncclv2.27.5-1-testsv2.16.4 - CUDA 12.8.1 - EFA 1.42.0 - OFI (libfabric) 1.16.0 - NCCL 2.27.5 - NCCL tests 2.16.4 --- .../3.aws-batch/aws-batch-distributed-training-p6.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/1.architectures/3.aws-batch/aws-batch-distributed-training-p6.yaml b/1.architectures/3.aws-batch/aws-batch-distributed-training-p6.yaml index 2ccfc5895..f9eaf89c7 100644 --- a/1.architectures/3.aws-batch/aws-batch-distributed-training-p6.yaml +++ b/1.architectures/3.aws-batch/aws-batch-distributed-training-p6.yaml @@ -297,8 +297,8 @@ Resources: NodeRangeProperties: - TargetNodes: '0:' Container: - # Use base nccl-tests image from public ECR - Image: public.ecr.aws/hpc-cloud/nccl-tests:latest + # Use specific nccl-tests image with CUDA 12.8.1, EFA 1.42.0, OFI 1.16.0, NCCL 2.27.5 + Image: public.ecr.aws/hpc-cloud/nccl-tests:cuda12.8.1-efa1.42.0-ofiv1.16.0-ncclv2.27.5-1-testsv2.16.4 JobRoleArn: !GetAtt BatchJobRole.Arn # Inline command that sets up SSH and hostfile, then runs NCCL test Command: