diff --git a/.gitignore b/.gitignore index 7ce80fb95..3dbba6003 100644 --- a/.gitignore +++ b/.gitignore @@ -120,6 +120,10 @@ ENV/ env.bak/ venv.bak/ +# User-specific environment variables (keep .example files) +**/env_vars +!**/env_vars.example + # IDE .c9/ .idea/ diff --git a/3.test_cases/pytorch/verl/rlvr/Dockerfile b/3.test_cases/pytorch/verl/rlvr/Dockerfile new file mode 100644 index 000000000..849b9b9df --- /dev/null +++ b/3.test_cases/pytorch/verl/rlvr/Dockerfile @@ -0,0 +1,126 @@ +# Dockerfile for VERL with EFA support +# Using hiyouga/verl base image and adding EFA capabilities +FROM hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.4-flashinfer0.2.2-cxx11abi0 + +# EFA configuration +ARG OPEN_MPI_PATH=/opt/amazon/openmpi/ +ENV EFA_VERSION=1.43.3 + +# Install system dependencies including EFA requirements +RUN apt-get update && apt-get install -y \ + python3.11 \ + python3.11-dev \ + python3-pip \ + git \ + wget \ + curl \ + ninja-build \ + autoconf \ + build-essential \ + pciutils \ + environment-modules \ + tcl \ + tcl-dev \ + libnl-3-dev \ + libnl-route-3-dev \ + dmidecode \ + ethtool \ + iproute2 \ + libevent-dev \ + libhwloc-dev \ + openssh-server \ + openssh-client \ + && rm -rf /var/lib/apt/lists/* + +RUN mkdir -p /var/run/sshd +RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ + echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ + sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config + +# Install udev in a container-safe way +RUN apt-get update && apt-get install -y \ + systemd \ + udev \ + && rm -rf /var/lib/apt/lists/* + + +# Upgrade pip +RUN python3 -m pip install --upgrade pip setuptools wheel + +################################################# +## Clean up HPC-X to avoid conflicts with EFA +RUN rm -rf /opt/hpcx \ + && rm -rf /usr/local/mpi \ + && rm -f /etc/ld.so.conf.d/hpcx.conf \ + && ldconfig + +################################################# +## EFA SETUP - Install EFA with all dependencies +RUN cd $HOME \ + && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_VERSION}.tar.gz \ + && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \ + && cd aws-efa-installer \ + && ./efa_installer.sh -y --skip-kmod --skip-limit-conf --no-verify + +# Set environment paths for EFA components (order matters!) 
+ENV PATH="/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/cuda/bin:$PATH" +ENV LD_LIBRARY_PATH="/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu:/usr/local/cuda/lib64:$LD_LIBRARY_PATH" + +# OpenMPI configuration to use EFA and avoid conflicts +ENV OMPI_MCA_pml=^ucx +ENV OMPI_MCA_btl=tcp,self +ENV OMPI_MCA_btl_tcp_if_exclude=lo,docker0,veth_def_agent +ENV OPAL_PREFIX=/opt/amazon/openmpi + +# EFA/NCCL configuration for optimal performance +ENV FI_PROVIDER=efa +ENV FI_EFA_USE_DEVICE_RDMA=1 +ENV FI_EFA_FORK_SAFE=1 +ENV FI_EFA_ENABLE_SHM_TRANSFER=1 +ENV NCCL_PROTO=simple +ENV NCCL_NET_GDR_LEVEL=LOC +ENV NCCL_SOCKET_IFNAME=^docker,lo,veth +ENV NCCL_TUNER_PLUGIN=/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu/libnccl-ofi-tuner.so +ENV PMIX_MCA_gds=hash + +################################################# +## Optional: Install NCCL tests for verification +RUN git clone https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \ + && cd /opt/nccl-tests \ + && make -j $(nproc) MPI=1 MPI_HOME=/opt/amazon/openmpi CUDA_HOME=/usr/local/cuda NCCL_HOME=/opt/nccl/build + + +# Install core ML libraries +RUN pip install \ + transformers>=4.45.0 \ + datasets \ + accelerate \ + tokenizers \ + numpy \ + scipy \ + scikit-learn \ + vllm>=0.7.0 \ + hydra-core \ + omegaconf \ + wandb \ + tensorboard \ + boto3 \ + botocore \ + tenacity \ + s3torchconnector + +# Clone and install VERL +WORKDIR /workspace +RUN git clone https://github.com/volcengine/verl.git && \ + cd verl && \ + git checkout v0.6.1 +WORKDIR /workspace/verl + +# Install VERL in development mode +RUN pip install -e . + +# Set working directory +WORKDIR /workspace + +# Expose Ray ports +EXPOSE 8265 10001 6379 diff --git a/3.test_cases/pytorch/verl/rlvr/README.md b/3.test_cases/pytorch/verl/rlvr/README.md new file mode 100644 index 000000000..7a9afdbbe --- /dev/null +++ b/3.test_cases/pytorch/verl/rlvr/README.md @@ -0,0 +1,106 @@ +# rlvr-recipe + +This repository provides a complete setup for running reinforcement learning from verifiable rewards (RLVR) on EKS clusters using Ray and verl. RLVR trains language models using verifiable rewards from math and coding tasks, where correctness can be automatically verified. The project uses verl, an efficient RL training framework from ByteDance, to run algorithms like GRPO (Group Relative Policy Optimization) and DAPO (Direct Advantage Policy Optimization) on distributed GPU clusters. + +## What is verl? + +[verl (Volcano Engine Reinforcement Learning)](https://github.com/volcengine/verl) is a flexible, production-ready RL training library for large language models. It provides seamless integration with popular frameworks like FSDP, Megatron-LM, vLLM, and Ray, enabling efficient distributed training with state-of-the-art throughput. This repo includes the full verl codebase with custom run scripts optimized for HyperPod. + +## What is RLVR? + +[Reinforcement Learning from Verifiable Rewards (RLVR)](https://arxiv.org/abs/2506.14245) is a training approach where models learn from tasks with objectively verifiable outcomes, such as math problems or code execution. Unlike human preference-based RL, RLVR uses ground-truth correctness as the reward signal, making it particularly effective for reasoning tasks. + +## Getting started + +### Prerequisites + +**Cluster**: +From here on out, we will assume you have an EKS cluster with GPU nodes (e.g., p5en.48xlarge). This example can be run on an EKS or HyperPod EKS cluster. 
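+
+Before going further, you can optionally confirm that the expected GPU capacity is visible to Kubernetes (a minimal check, assuming your kubeconfig already points at the target EKS or HyperPod EKS cluster):
+
+```bash
+# Show each node's instance type and advertised GPU count
+kubectl get nodes -o custom-columns='NAME:.metadata.name,TYPE:.metadata.labels.node\.kubernetes\.io/instance-type,GPU:.status.allocatable.nvidia\.com/gpu'
+```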
+
+This example was tested on 4 p5en.48xlarge nodes (8x H200 GPUs each). If you are using different node types, modify the cluster environment variables in `env_vars`. Feel free to change the model type/size and training parameters to accommodate smaller or larger node types.
+
+**Storage**:
+- This example uses an FSx for Lustre file system that is mounted to the pods via a PVC called `fsx-claim`. We store the dataset, as well as model checkpoints, here. Feel free to substitute this claim with your own.
+
+**Versions**:
+The example was tested with the following versions:
+- EKS: 1.33
+- KubeRay: 1.4.2
+- VERL: v0.6.1
+
+### Clone this repo
+```bash
+git clone https://github.com/aws-samples/awsome-distributed-training.git
+cd awsome-distributed-training/3.test_cases/pytorch/verl/rlvr
+```
+
+### Clone the verl repository
+This repository contains the verl framework and scripts needed for RLVR training. We clone it to get access to the distributed RL training algorithms (GRPO, DAPO, and more) and the integration code that connects verl with EKS/Ray clusters for scalable language model fine-tuning on math and coding tasks.
+
+```bash
+git clone https://github.com/volcengine/verl.git
+cd verl
+git checkout v0.6.1
+cd ..
+```
+
+### Create RayCluster
+
+Install the KubeRay operator to manage Ray clusters on Kubernetes:
+```bash
+./setup/install-kuberay.sh
+```
+
+Configure your cluster settings (AWS region, cluster name, GPU counts, model paths):
+```bash
+# Copy the example file and customize it with your values
+cp setup/env_vars.example setup/env_vars
+vim setup/env_vars
+```
+
+> **Important**: The `env_vars` file contains sensitive information like your HuggingFace token, AWS account details, and cluster IDs. This file is gitignored to prevent accidentally committing credentials. Always use `env_vars.example` as your template.
+
+Load the environment variables into your shell session:
+```bash
+source setup/env_vars
+```
+
+Build a Docker image with verl and EFA networking support, and push it to ECR:
+```bash
+./setup/build-push.sh
+```
+
+Deploy the Ray cluster with head and worker pods configured for distributed training:
+```bash
+envsubst < setup/raycluster.yaml | kubectl apply -f -
+```
+
+> **Note**: Considerations before applying `raycluster.yaml`:
+> - Ensure you have a file system before applying the RayCluster. This `raycluster.yaml` assumes you have a PVC in place called `fsx-claim`. Feel free to modify the configuration depending on your file system setup.
+> - This RayCluster assumes you have 4 p5en.48xlarge instances. Modify your `setup/env_vars` and the `nodeSelector` in the YAML to adjust for your cluster.
+
+
+Download the GSM8K math dataset and prepare it for GRPO training:
+```bash
+./setup/load_data_grpo.sh
+```
+
+Forward the Ray dashboard to localhost for monitoring training progress:
+```bash
+./ray-expose.sh
+```
+
+Submit a GRPO training job to the Ray cluster. This trains a language model on math reasoning using Group Relative Policy Optimization:
+```bash
+./recipe/run_grpo_configurable.sh
+```
+
+The `verl/` directory contains the official verl framework, and `recipe/` includes custom run scripts (`run_grpo_configurable.sh`, `run_dapo_configurable.sh`) that integrate with your environment variables for easy configuration.
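+
+Once the job is submitted, you can track it from your workstation with the Ray Job CLI (a quick sketch; it assumes the dashboard is still port-forwarded to `localhost:8265` via `./ray-expose.sh`, and `<submission_id>` is a placeholder for the ID printed at submission time):
+
+```bash
+# List submitted jobs, then follow the logs of a specific one
+ray job list --address http://localhost:8265
+ray job logs --address http://localhost:8265 --follow <submission_id>
+```
+
+To stop a running job, use the provided `./job-stop.sh` helper with the same submission ID.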
+ +### Observability + +For EKS: +Please see this documentation to set up Prometheus and Grafana dashboards for Ray clusters: [Using Prometheus & Grafana](https://docs.ray.io/en/latest/cluster/kubernetes/k8s-ecosystem/prometheus-grafana.html) + +For HyperPod EKS: +Check out the `observability/` directory to integrate Ray's native metrics dashboards with HyperPod's Amazon Managed Prometheus and Grafana \ No newline at end of file diff --git a/3.test_cases/pytorch/verl/rlvr/img/ray-dashboard.png b/3.test_cases/pytorch/verl/rlvr/img/ray-dashboard.png new file mode 100644 index 000000000..4c52ff25e Binary files /dev/null and b/3.test_cases/pytorch/verl/rlvr/img/ray-dashboard.png differ diff --git a/3.test_cases/pytorch/verl/rlvr/job-stop.sh b/3.test_cases/pytorch/verl/rlvr/job-stop.sh new file mode 100755 index 000000000..b72ec37d8 --- /dev/null +++ b/3.test_cases/pytorch/verl/rlvr/job-stop.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +./ray-expose.sh + +# Use RAY_ADDRESS from env_vars if set, otherwise default +RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} + +# Check if the submission_id is passed as an argument +if [ -z "$1" ]; then + echo "Error: No submission_id provided." + echo "Usage: ./job-stop.sh " + echo "List of jobs to choose from:" + echo "" + ray job list --address ${RAY_ADDRESS} | sed -n "s/.*submission_id='\([^']*\)'.*entrypoint='\([^']*\)'.*/submission_id: \1, entrypoint: \2/p" + echo -e "\n" + exit 1 +fi + +# Assign the user's input to a variable +submission_id=$(ray job list --address ${RAY_ADDRESS} | sed -n "s/.*submission_id='\([^']*\)'.*entrypoint='\([^']*\)'.*/submission_id: \1, entrypoint: \2/p" | grep $1 | head -n 1 | cut -d ' ' -f 2 | cut -d ',' -f 1) + +# submission_id=$1 + +CMD="ray job stop --address ${RAY_ADDRESS} $submission_id" + +if [ ! "$VERBOSE" == "false" ]; then echo -e "\n${CMD}\n"; fi +eval "$CMD" + +echo -e "\n" \ No newline at end of file diff --git a/3.test_cases/pytorch/verl/rlvr/observability/README.md b/3.test_cases/pytorch/verl/rlvr/observability/README.md new file mode 100644 index 000000000..93287e15e --- /dev/null +++ b/3.test_cases/pytorch/verl/rlvr/observability/README.md @@ -0,0 +1,105 @@ +# Ray Observability for HyperPod EKS + +Set up Ray metrics monitoring using your existing Amazon Managed Prometheus and Grafana. + +## Setup + +### 1. Update OTEL Collector to Scrape Ray Metrics + +```bash +# Load environment variables +source setup/env_vars + +# Apply OTEL config with Ray scrape jobs +envsubst < observability/otel-collector-config-updated.yaml | kubectl apply -f - + +# Restart collector to pick up new config +kubectl rollout restart deployment hyperpod-observability-central-collector -n hyperpod-observability +kubectl rollout status deployment hyperpod-observability-central-collector -n hyperpod-observability +``` + +### 2. Deploy RayCluster with Grafana/Prometheus Config + +Your `setup/raycluster.yaml` already includes the necessary Grafana/Prometheus environment variables. + +```bash +# Apply RayCluster (if not already running) +source setup/env_vars +envsubst < raycluster-observability.yaml | kubectl apply -f - +``` + +### 3. Download Grafana Dashboards + +1. You can download them directly from your cluster head pod: +``` +HEAD_POD=$(kubectl get pods --selector ray.io/node-type=head,ray.io/cluster=rayml-efa -o jsonpath='{.items[0].metadata.name}') + +# Copy dashboard files from the pod +kubectl cp $HEAD_POD:/tmp/ray/session_latest/metrics/grafana/dashboards/ ./dashboards/ +``` + +2. 
Or you can download them directly from [KubeRay GitHub](https://github.com/ray-project/kuberay/tree/master/config/grafana): +``` +# Clone the repo +git clone https://github.com/ray-project/kuberay.git --depth 1 +cd kuberay/config/grafana +ls *_grafana_dashboard.json +``` + +### 4. Import Dashboards to Grafana + +- `default_grafana_dashboard.json` - Main Ray Dashboard +- `data_grafana_dashboard.json` - Ray Data metrics +- `serve_grafana_dashboard.json` - Ray Serve metrics +- `serve_deployment_grafana_dashboard.json` - Per-deployment metrics + +To import: +1. Open your Grafana workspace (check `$GRAFANA_ENDPOINT` in `setup/env_vars`) +2. Click "+" → "Import" +3. Upload each JSON file +4. Select your AMP data source +5. Use the "Cluster" dropdown to filter by `rayml-efa` + +## Verify It's Working + +```bash +# Check OTEL collector is scraping Ray (should see ray-head-metrics and ray-worker-metrics) +kubectl logs -n hyperpod-observability deployment/hyperpod-observability-central-collector --tail=50 | grep ray + +# Check Ray metrics endpoint is responding +HEAD_POD=$(kubectl get pods --selector ray.io/node-type=head,ray.io/cluster=rayml-efa -o jsonpath='{.items[0].metadata.name}') +kubectl exec $HEAD_POD -- curl -s http://localhost:8080 | head -n 20 +``` + +Wait 2-3 minutes for metrics to flow to AMP, then check your Grafana dashboards. + +## Troubleshooting + +**No metrics in Grafana?** +- Wait 2-3 minutes for data to propagate +- Check time range in Grafana (set to "Last 15 minutes") +- Verify "Cluster" dropdown shows `rayml-efa` +- Check OTEL collector logs for errors + +**OTEL collector not scraping Ray?** +- Verify the collector restarted: `kubectl get pods -n hyperpod-observability` +- Check for scrape errors: `kubectl logs -n hyperpod-observability deployment/hyperpod-observability-central-collector --tail=100` + +**Ray scrape configs disappeared?** + +HyperPod's ObservabilityConfig controller manages the OTEL collector ConfigMap. If you update the ObservabilityConfig (e.g., change scrape intervals), it will regenerate the ConfigMap and remove your Ray scrape configs. + +If this happens, simply reapply: +```bash +source setup/env_vars +envsubst < observability/otel-collector-config-updated.yaml | kubectl apply -f - +kubectl rollout restart deployment hyperpod-observability-central-collector -n hyperpod-observability +``` + +The configs will persist as long as you don't modify the ObservabilityConfig resource. 
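+
+If you want to confirm end to end that Ray metrics are landing in AMP before opening Grafana, you can query the workspace directly (a quick check, assuming `awscurl` is installed locally and `AMP_ENDPOINT`/`AWS_REGION` are set from `setup/env_vars`; `ray_node_cpu_utilization` is one of the node metrics Ray exports by default):
+
+```bash
+# Query AMP's Prometheus-compatible API for a Ray metric
+source setup/env_vars
+awscurl --service aps --region ${AWS_REGION} \
+  "${AMP_ENDPOINT}/api/v1/query?query=ray_node_cpu_utilization"
+```
+
+A non-empty `result` array in the JSON response means the OTEL collector is scraping the Ray pods and remote-writing successfully.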
+ + +Results: +## Dashboard Preview + +![Ray Dashboard](../img/ray-dashboard.png) diff --git a/3.test_cases/pytorch/verl/rlvr/observability/otel-collector-config-updated.yaml b/3.test_cases/pytorch/verl/rlvr/observability/otel-collector-config-updated.yaml new file mode 100644 index 000000000..a08d4f90d --- /dev/null +++ b/3.test_cases/pytorch/verl/rlvr/observability/otel-collector-config-updated.yaml @@ -0,0 +1,219 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: hyperpod-observability-central-collector-config + namespace: hyperpod-observability +data: + collector.yaml: | + receivers: + prometheus: + config: + scrape_configs: + - job_name: kubernetes-apiservers + scrape_interval: 30s + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: default;kubernetes;https + source_labels: + - __meta_kubernetes_namespace + - __meta_kubernetes_service_name + - __meta_kubernetes_endpoint_port_name + # Add cluster name and id as label + - target_label: cluster_id + replacement: ${CLUSTER_ID} + - target_label: cluster_name + replacement: ${CLUSTER_NAME} + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + + - job_name: kube-state-metrics + scrape_interval: 30s + kubernetes_sd_configs: + - role: service + relabel_configs: + - action: keep + regex: hyp-obs-kube-state-metrics + source_labels: + - __meta_kubernetes_service_name + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: + - __meta_kubernetes_namespace + target_label: kubernetes_namespace + - source_labels: + - __meta_kubernetes_service_name + target_label: kubernetes_name + # Add cluster name and id as label + - target_label: cluster_id + replacement: ${CLUSTER_ID} + - target_label: cluster_name + replacement: ${CLUSTER_NAME} + + - job_name: kubeflow-trainer-metrics + scrape_interval: 30s + kubernetes_sd_configs: + - role: service + relabel_configs: + - action: drop + regex: hp-training-operator-controller-manager-metrics-service + source_labels: + - __meta_kubernetes_service_name + - action: keep + regex: (.*kubeflow.*-controller-manager|training-operator) + source_labels: + - __meta_kubernetes_service_name + - action: keep + regex: monitoring-port + source_labels: + - __meta_kubernetes_service_port_name + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: + - __meta_kubernetes_namespace + target_label: kubernetes_namespace + - source_labels: + - __meta_kubernetes_service_name + target_label: kubernetes_name + # Add cluster name and id as label + - target_label: cluster_id + replacement: ${CLUSTER_ID} + - target_label: cluster_name + replacement: ${CLUSTER_NAME} + metrics_path: /metrics + + - job_name: hp-training-operator-metrics + scrape_interval: 30s + kubernetes_sd_configs: + - role: service + relabel_configs: + - action: keep + regex: hp-training-operator-controller-manager-metrics-service + source_labels: + - __meta_kubernetes_service_name + - action: keep + regex: metrics-port + source_labels: + - __meta_kubernetes_service_port_name + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: + - __meta_kubernetes_namespace + target_label: kubernetes_namespace + - source_labels: + - __meta_kubernetes_service_name + target_label: kubernetes_name + # Add cluster name and id as label + - target_label: cluster_id + replacement: ${CLUSTER_ID} + - target_label: cluster_name 
+ replacement: ${CLUSTER_NAME} + metrics_path: /metrics + scheme: http + + # ===== RAY METRICS SCRAPING - NEW SECTION ===== + - job_name: ray-head-metrics + scrape_interval: 30s + kubernetes_sd_configs: + - role: pod + namespaces: + names: + - ${RAY_NAMESPACE} + relabel_configs: + # Keep only Ray head pods + - source_labels: [__meta_kubernetes_pod_label_ray_io_node_type] + action: keep + regex: head + # Keep only port 8080 (metrics endpoint) + - source_labels: [__meta_kubernetes_pod_container_port_number] + action: keep + regex: "8080" + # Add ray cluster name as label + - source_labels: [__meta_kubernetes_pod_label_ray_io_cluster] + target_label: ray_io_cluster + # Add pod name + - source_labels: [__meta_kubernetes_pod_name] + target_label: pod + # Add namespace + - source_labels: [__meta_kubernetes_namespace] + target_label: namespace + # Add container name + - source_labels: [__meta_kubernetes_pod_container_name] + target_label: container + # Add cluster name and id + - target_label: cluster_id + replacement: ${CLUSTER_ID} + - target_label: cluster_name + replacement: ${CLUSTER_NAME} + + - job_name: ray-worker-metrics + scrape_interval: 30s + kubernetes_sd_configs: + - role: pod + namespaces: + names: + - ${RAY_NAMESPACE} + relabel_configs: + # Keep only Ray worker pods + - source_labels: [__meta_kubernetes_pod_label_ray_io_node_type] + action: keep + regex: worker + # Keep only port 8080 (metrics endpoint) + - source_labels: [__meta_kubernetes_pod_container_port_number] + action: keep + regex: "8080" + # Add ray cluster name as label + - source_labels: [__meta_kubernetes_pod_label_ray_io_cluster] + target_label: ray_io_cluster + # Add pod name + - source_labels: [__meta_kubernetes_pod_name] + target_label: pod + # Add namespace + - source_labels: [__meta_kubernetes_namespace] + target_label: namespace + # Add container name + - source_labels: [__meta_kubernetes_pod_container_name] + target_label: container + # Add cluster name and id + - target_label: cluster_id + replacement: ${CLUSTER_ID} + - target_label: cluster_name + replacement: ${CLUSTER_NAME} + # ===== END RAY METRICS SCRAPING ===== + + processors: + batch: + timeout: 5s + send_batch_size: 5000 + send_batch_max_size: 6000 + + exporters: + debug: + verbosity: detailed + prometheusremotewrite: + add_metric_suffixes: false + endpoint: ${AMP_ENDPOINT}/api/v1/remote_write + auth: + authenticator: sigv4auth + retry_on_failure: + enabled: true + initial_interval: 1s + max_interval: 10s + max_elapsed_time: 60s + + extensions: + sigv4auth: + region: ${AWS_REGION} + service: "aps" + + service: + extensions: [sigv4auth] + pipelines: + metrics: + receivers: [prometheus] + processors: [batch] + exporters: [prometheusremotewrite] diff --git a/3.test_cases/pytorch/verl/rlvr/observability/raycluster-observability.yaml b/3.test_cases/pytorch/verl/rlvr/observability/raycluster-observability.yaml new file mode 100644 index 000000000..a0d751b19 --- /dev/null +++ b/3.test_cases/pytorch/verl/rlvr/observability/raycluster-observability.yaml @@ -0,0 +1,173 @@ +apiVersion: ray.io/v1alpha1 +kind: RayCluster +metadata: + name: rayml-efa + labels: + controller-tools.k8s.io: "1.0" + annotations: + karpenter.sh/do-not-disrupt: "true" +spec: + # Ray head pod template + headGroupSpec: + # The `rayStartParams` are used to configure the `ray start` command. + # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay. 
+ # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`. + rayStartParams: + dashboard-host: '0.0.0.0' + metrics-export-port: '8080' # Explicitly set metrics port + #pod template + template: + spec: + nodeSelector: + node.kubernetes.io/instance-type: "ml.p5en.48xlarge" + sagemaker.amazonaws.com/node-health-status: Schedulable + securityContext: + runAsUser: 0 + runAsGroup: 0 + fsGroup: 0 + containers: + - name: ray-head + image: ${REGISTRY}${IMAGE}:${TAG} ## IMAGE: Here you may choose which image your head pod will run + env: ## ENV: Here is where you can send stuff to the head pod + ## PROMETHEUS AND GRAFANA - AWS MANAGED SERVICES + - name: RAY_GRAFANA_IFRAME_HOST + value: ${GRAFANA_ENDPOINT} + - name: RAY_GRAFANA_HOST + value: ${GRAFANA_ENDPOINT} + - name: RAY_PROMETHEUS_HOST + value: ${AMP_ENDPOINT} + ## EFA AND NCCL CONFIGURATION + - name: FI_PROVIDER + value: "efa" + - name: FI_EFA_USE_DEVICE_RDMA + value: "1" + - name: FI_EFA_FORK_SAFE + value: "1" + - name: NCCL_PROTO + value: "simple" + - name: NCCL_SOCKET_IFNAME + value: "^docker,lo,veth" + - name: NCCL_DEBUG + value: "INFO" + - name: TORCH_NCCL_DUMP_ON_TIMEOUT + value: "1" + - name: TORCH_NCCL_ASYNC_ERROR_HANDLING + value: "1" + - name: HF_TOKEN + value: ${HF_TOKEN} + lifecycle: + preStop: + exec: + command: ["/bin/sh","-c","ray stop"] + resources: + limits: ## LIMITS: Set resource limits for your head pod + cpu: 8 + memory: 32Gi + requests: ## REQUESTS: Set resource requests for your head pod + cpu: 8 + memory: 32Gi + ports: + - containerPort: 6379 + name: gcs-server + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + - containerPort: 8080 + name: metrics + volumeMounts: ## VOLUMEMOUNTS: Mount your S3 CSI EKS Add-On to head pod + - name: fsx-storage + mountPath: /fsx + - name: ray-logs + mountPath: /tmp/ray + - name: checkpoint-logs + mountPath: /var/log/sagemaker_checkpointing + volumes: + - name: ray-logs + emptyDir: {} + - name: fsx-storage + persistentVolumeClaim: + claimName: fsx-claim + - name: checkpoint-logs + hostPath: + path: /var/logs/sagemaker_checkpointing + type: DirectoryOrCreate + workerGroupSpecs: + # the pod replicas in this group typed worker + - replicas: $NUM_NODES ## REPLICAS: How many worker pods you want + minReplicas: 1 + maxReplicas: 10 + # logical group name, for this called small-group, also can be functional + groupName: gpu-group + rayStartParams: + num-gpus: "$NUM_GPU_PER_NODE" + metrics-export-port: '8080' # Explicitly set metrics port for workers + #pod template + template: + spec: + nodeSelector: + node.kubernetes.io/instance-type: "ml.p5en.48xlarge" + sagemaker.amazonaws.com/node-health-status: Schedulable + securityContext: + runAsUser: 0 + runAsGroup: 0 + fsGroup: 0 + containers: + - name: ray-worker + image: ${REGISTRY}${IMAGE}:${TAG} ## IMAGE: Here you may choose which image your head node will run + env: + - name: FI_PROVIDER + value: "efa" + - name: FI_EFA_USE_DEVICE_RDMA + value: "1" + - name: FI_EFA_FORK_SAFE + value: "1" + - name: NCCL_PROTO + value: "simple" + - name: NCCL_SOCKET_IFNAME + value: "^docker,lo,veth" + - name: NCCL_DEBUG + value: "INFO" + - name: TORCH_NCCL_DUMP_ON_TIMEOUT + value: "1" + - name: TORCH_NCCL_ASYNC_ERROR_HANDLING + value: "1" + - name: HF_TOKEN + value: ${HF_TOKEN} + lifecycle: + preStop: + exec: + command: ["/bin/sh","-c","ray stop"] + resources: + limits: ## LIMITS: Set resource limits for your worker pods + cpu: 16 + memory: 200Gi + 
nvidia.com/gpu: $NUM_GPU_PER_NODE + vpc.amazonaws.com/efa: $NUM_EFA_PER_NODE + requests: ## REQUESTS: Set resource requests for your worker pods + cpu: 16 + memory: 200Gi + nvidia.com/gpu: $NUM_GPU_PER_NODE + vpc.amazonaws.com/efa: $NUM_EFA_PER_NODE + ports: + - containerPort: 8080 + name: metrics + volumeMounts: ## VOLUMEMOUNTS: Mount your S3 CSI EKS Add-On to worker pods + - name: ray-logs + mountPath: /tmp/ray + - name: fsx-storage + mountPath: /fsx + - name: checkpoint-logs + mountPath: /var/log/sagemaker_checkpointing + volumes: + - name: fsx-storage + persistentVolumeClaim: + claimName: fsx-claim + - name: ray-logs + emptyDir: {} + - name: checkpoint-logs + hostPath: + path: /var/logs/sagemaker_checkpointing + type: DirectoryOrCreate diff --git a/3.test_cases/pytorch/verl/rlvr/ray-expose.sh b/3.test_cases/pytorch/verl/rlvr/ray-expose.sh new file mode 100755 index 000000000..d2f9e2a01 --- /dev/null +++ b/3.test_cases/pytorch/verl/rlvr/ray-expose.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# Use RAY_DASHBOARD_PORT from env_vars, default to 8265 if not set +RAY_DASHBOARD_PORT=${RAY_DASHBOARD_PORT:-8265} + +EXPOSED=$(lsof -i :${RAY_DASHBOARD_PORT}) +if [ "$?" == 0 ]; then + echo "Ray is exposed on port ${RAY_DASHBOARD_PORT}" +else + PID_FILE="$HOME/port-forward.pid" + export SERVICEHEAD=$(kubectl get service | grep head-svc | awk '{print $1}' | head -n 1) + + kubectl port-forward --address 0.0.0.0 service/${SERVICEHEAD} ${RAY_DASHBOARD_PORT}:8265 > /dev/null 2>&1 & + echo $! > "$PID_FILE" + echo "Port-forward started, PID $! saved in $PID_FILE" + sleep 1 + echo "Port forwarded, visit http://localhost:${RAY_DASHBOARD_PORT}" +fi \ No newline at end of file diff --git a/3.test_cases/pytorch/verl/rlvr/ray-hide.sh b/3.test_cases/pytorch/verl/rlvr/ray-hide.sh new file mode 100755 index 000000000..a0012e5af --- /dev/null +++ b/3.test_cases/pytorch/verl/rlvr/ray-hide.sh @@ -0,0 +1,19 @@ +#!/bin/bash +PID_FILE="$HOME/port-forward.pid" +if [ -f "$PID_FILE" ]; then + PID=$(cat "$PID_FILE") + if [ -z "$PID" ]; then + echo "PID file is empty." + rm -f "$PID_FILE" + exit 1 + fi + if ps -p $PID > /dev/null; then + kill $PID + echo "Process $PID stopped." + else + echo "No process found with PID $PID." + fi + rm -f "$PID_FILE" +else + echo "PID file not found." +fi \ No newline at end of file diff --git a/3.test_cases/pytorch/verl/rlvr/recipe/run_dapo_configurable.sh b/3.test_cases/pytorch/verl/rlvr/recipe/run_dapo_configurable.sh new file mode 100755 index 000000000..1a0c17d30 --- /dev/null +++ b/3.test_cases/pytorch/verl/rlvr/recipe/run_dapo_configurable.sh @@ -0,0 +1,162 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +# Source environment variables if not already loaded +if [ -z "${MODEL_NAME:-}" ]; then + echo "Loading environment variables from setup/env_vars..." 
+ source ../../setup/env_vars +fi + +# Project configuration +project_name='DAPO' +exp_name="DAPO-${MODEL_NAME}" + +# DAPO Algorithm parameters +adv_estimator=grpo +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +# Token length configuration +# max_prompt_length=$((1024 * 2)) +max_prompt_length=512 +# max_response_length=$((1024 * 20)) +max_response_length=1024 +enable_overlong_buffer=True +overlong_buffer_len=512 # Must be less than max_response_length +overlong_penalty_factor=1.0 + +loss_agg_mode="token-mean" + +# Training configuration +enable_filter_groups=True +filter_groups_metric=acc +max_num_gen_batches=0 # 0 = unlimited retries, or set to higher value like 10 +train_prompt_bsz=${TRAIN_BATCH_SIZE:-512} +gen_prompt_bsz=${GEN_BATCH_SIZE:-$((train_prompt_bsz * 3))} +n_resp_per_prompt=${N_RESP_PER_PROMPT:-16} +train_prompt_mini_bsz=32 + +# Ray configuration from env_vars +RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +WORKING_DIR=${WORKING_DIR:-"${PWD}"} +# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} +RUNTIME_ENV="${WORKING_DIR}/recipe/dapo/runtime_env.yaml" + + +# Cluster configuration from env_vars +NNODES=${NUM_NODES:-2} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} + +# Model and data paths from env_vars +MODEL_NAME=${MODEL_NAME:-"Qwen3-8B"} +MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-8B"} +RAY_DATA_HOME=${RAY_DATA_HOME:-"/fsx/verl"} +CKPTS_DIR="${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}" +TRAIN_FILE="${RAY_DATA_HOME}/data/dapo-math-17k.parquet" +TEST_FILE="${RAY_DATA_HOME}/data/aime-2024.parquet" + +# Algorithm parameters +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + +# Performance parameters (adjust based on your cluster size) +sp_size=8 +use_dynamic_bsz=True +actor_ppo_max_token_len=$((max_prompt_length + max_response_length)) +infer_ppo_max_token_len=$((max_prompt_length + max_response_length)) +offload=True +gen_tp=4 + +# Print configuration for verification +echo "=== DAPO Training Configuration ===" +echo "Project: ${project_name}" +echo "Experiment: ${exp_name}" +echo "Model: ${MODEL_NAME} (${MODEL_PATH})" +echo "Nodes: ${NNODES}" +echo "GPUs per node: ${GPUS_PER_NODE}" +echo "Total GPUs: $((NNODES * GPUS_PER_NODE))" +echo "Data home: ${RAY_DATA_HOME}" +echo "Checkpoints: ${CKPTS_DIR}" +echo "Ray address: ${RAY_ADDRESS}" +echo "==================================" + +# Submit Ray job +ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \ + --working-dir "${WORKING_DIR}" \ + -- python3 -m recipe.dapo.main_dapo \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.gen_batch_size=${gen_prompt_bsz} \ + data.train_batch_size=${train_prompt_bsz} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + algorithm.filter_groups.enable=${enable_filter_groups} \ + algorithm.filter_groups.max_num_gen_batches=${max_num_gen_batches} \ + 
algorithm.filter_groups.metric=${filter_groups_metric} \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k="${top_k}" \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=-1 \ + reward_model.reward_manager=dapo \ + reward_model.overlong_buffer.enable=${enable_overlong_buffer} \ + reward_model.overlong_buffer.len=${overlong_buffer_len} \ + reward_model.overlong_buffer.penalty_factor=${overlong_penalty_factor} \ + trainer.logger='["console"]' \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.n_gpus_per_node=${GPUS_PER_NODE} \ + trainer.nnodes="${NNODES}" \ + trainer.val_before_train=True \ + trainer.test_freq=5 \ + trainer.save_freq=5 \ + trainer.total_epochs=1 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ No newline at end of file diff --git a/3.test_cases/pytorch/verl/rlvr/recipe/run_grpo_configurable.sh b/3.test_cases/pytorch/verl/rlvr/recipe/run_grpo_configurable.sh new file mode 100755 index 000000000..e3992e176 --- /dev/null +++ b/3.test_cases/pytorch/verl/rlvr/recipe/run_grpo_configurable.sh @@ -0,0 +1,122 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +# Project configuration +project_name='GRPO' +exp_name="GRPO-${MODEL_NAME}" + +# GRPO Algorithm parameters +adv_estimator=grpo +use_kl_in_reward=False +use_kl_loss=True +kl_loss_coef=0.001 +kl_loss_type=low_var_kl +entropy_coeff=0 + +# Token length configuration +max_prompt_length=512 +max_response_length=1024 +filter_overlong_prompts=True +truncation='error' + +# Training configuration 
+train_prompt_bsz=${TRAIN_BATCH_SIZE:-32} # Reduced from 256 for faster testing +gen_prompt_bsz=${GEN_BATCH_SIZE:-$train_prompt_bsz} +n_resp_per_prompt=${N_RESP_PER_PROMPT:-2} # Reduced from 5 for faster testing +train_prompt_mini_bsz=16 # Must be <= train_prompt_bsz +train_prompt_micro_bsz_per_gpu=1 + +# Ray configuration from env_vars +RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +WORKING_DIR=${WORKING_DIR:-"${PWD}"} +# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} + +# Cluster configuration from env_vars +NNODES=${NUM_NODES:-4} +GPUS_PER_NODE=${NUM_GPU_PER_NODE:-8} + +# Model and data paths from env_vars +MODEL_NAME=${MODEL_NAME:-"Qwen3-8B"} +MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-8B"} +RAY_DATA_HOME=${RAY_DATA_HOME:-"/fsx/verl"} +CKPTS_DIR="${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}" + +# Data files - using GSM8K dataset +TRAIN_FILE="${RAY_DATA_HOME}/data/gsm8k/train.parquet" +TEST_FILE="${RAY_DATA_HOME}/data/gsm8k/test.parquet" + +# S3 checkpoint configuration (for managed tiered checkpointing) +S3_CHECKPOINT_BASE=${S3_CHECKPOINT_BASE:-"s3://sagemaker-mvincig-rlvr-e66849d3-bucket/checkpoints"} +CHECKPOINT_NAMESPACE="${exp_name}-$(date +%s)" + +# Checkpoint configuration +CHECKPOINT_ASYNC_SAVE=True # Enable async checkpointing +CHECKPOINT_SAVE_TO_S3_FREQ=5 # Save to S3 every N steps (in addition to in-memory) + +# Performance parameters +gen_tp=2 +log_prob_micro_bsz_per_gpu=32 +gpu_memory_utilization=0.6 + +# Memory optimization +param_offload=False +optimizer_offload=False +ref_param_offload=True + +# Print configuration for verification +echo "=== GRPO Training Configuration ===" +echo "Project: ${project_name}" +echo "Experiment: ${exp_name}" +echo "Model: ${MODEL_NAME} (${MODEL_PATH})" +echo "Nodes: ${NNODES}" +echo "GPUs per node: ${GPUS_PER_NODE}" +echo "Total GPUs: $((NNODES * GPUS_PER_NODE))" +echo "Data home: ${RAY_DATA_HOME}" +echo "Checkpoints: ${CKPTS_DIR}" +echo "S3 Checkpoints: ${S3_CHECKPOINT_BASE}" +echo "Ray address: ${RAY_ADDRESS}" +echo "==================================" + +# Submit Ray job +ray job submit --no-wait \ + --working-dir "${WORKING_DIR}" \ + -- python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=${adv_estimator} \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=question \ + data.train_batch_size=${train_prompt_bsz} \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.filter_overlong_prompts=${filter_overlong_prompts} \ + data.truncation=${truncation} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=${train_prompt_micro_bsz_per_gpu} \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.kl_loss_type=${kl_loss_type} \ + actor_rollout_ref.actor.entropy_coeff=${entropy_coeff} \ + actor_rollout_ref.actor.fsdp_config.param_offload=${param_offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${optimizer_offload} \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=${log_prob_micro_bsz_per_gpu} \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.name=vllm \ + 
actor_rollout_ref.rollout.gpu_memory_utilization=${gpu_memory_utilization} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=${log_prob_micro_bsz_per_gpu} \ + actor_rollout_ref.ref.fsdp_config.param_offload=${ref_param_offload} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + trainer.critic_warmup=0 \ + trainer.logger='["console"]' \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.n_gpus_per_node=${GPUS_PER_NODE} \ + trainer.nnodes=${NNODES} \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.save_freq=1 \ + trainer.test_freq=2 \ + trainer.total_epochs=2 diff --git a/3.test_cases/pytorch/verl/rlvr/setup/build-push.sh b/3.test_cases/pytorch/verl/rlvr/setup/build-push.sh new file mode 100755 index 000000000..5ceec5caf --- /dev/null +++ b/3.test_cases/pytorch/verl/rlvr/setup/build-push.sh @@ -0,0 +1,37 @@ +#!/bin/bash +set -e # Exit on any error + +echo "Building image ${REGISTRY}${IMAGE}:${TAG}" +if DOCKER_BUILDKIT=1 docker build --platform linux/amd64 -f Dockerfile -t ${REGISTRY}${IMAGE}:${TAG} .; then + echo "Done building image!" + echo "" +else + echo "Build failed!" + exit 1 +fi + +echo "Pushing image to ECR..." +# Create registry if needed +REGISTRY_COUNT=$(aws ecr describe-repositories | grep "${IMAGE}" | wc -l) +if [ "$REGISTRY_COUNT" == "0" ]; then + aws ecr create-repository --repository-name ${IMAGE} +fi + +# Login to registry +echo "Logging in to $REGISTRY ..." +if aws ecr get-login-password | docker login --username AWS --password-stdin $REGISTRY; then + echo "Login successful" +else + echo "Login failed!" + exit 1 +fi + +# Push image to registry +echo "Pushing image ${REGISTRY}${IMAGE}:${TAG}" +if docker image push ${REGISTRY}${IMAGE}:${TAG}; then + echo "Done pushing image!" + echo "" +else + echo "Push failed!" 
+ exit 1
+fi
\ No newline at end of file
diff --git a/3.test_cases/pytorch/verl/rlvr/setup/env_vars.example b/3.test_cases/pytorch/verl/rlvr/setup/env_vars.example
new file mode 100644
index 000000000..8e2bc92ef
--- /dev/null
+++ b/3.test_cases/pytorch/verl/rlvr/setup/env_vars.example
@@ -0,0 +1,46 @@
+# Docker file
+export AWS_REGION=$(aws ec2 describe-availability-zones --output text --query 'AvailabilityZones[0].[RegionName]')
+export ACCOUNT=$(aws sts get-caller-identity --query Account --output text)
+export REGISTRY=${ACCOUNT}.dkr.ecr.${AWS_REGION}.amazonaws.com/
+export IMAGE=rlvr
+export TAG=ngc-th2.6.0-cu126-vllm0.8.4-flashinfer0.2.2-cxx11abi0
+
+# Cluster Details
+export EKS_CLUSTER_NAME=""
+export INSTANCE_TYPE="" # Example: "p5en.48xlarge"
+export NUM_NODES=4 # Single source of truth for number of nodes
+export NUM_GPU_PER_NODE=8
+export NUM_EFA_PER_NODE=16
+export PRIVATE_SUBNET_ID="subnet-xxxxxxxxxxxxxxxxx"
+export SECURITY_GROUP_ID="sg-xxxxxxxxxxxxxxxxx"
+
+
+# Ray configs
+export MODEL_NAME="Qwen3-8B"
+export MODEL_PATH="Qwen/Qwen3-8B" # Set this to load model from HuggingFace
+export RAY_DATA_HOME="/fsx/verl" # Shared storage path for checkpoints and data
+export VERL_HOME="/fsx/verl"
+export RAY_DASHBOARD_PORT=8265 # Local port for Ray dashboard (forwarded from cluster)
+export RAY_ADDRESS="http://localhost:${RAY_DASHBOARD_PORT}"
+export WORKING_DIR="$(pwd)/verl"
+
+# Job Env Vars (using NUM_NODES for consistency)
+export HF_TOKEN=
+export GPUS_PER_NODE=8
+export NCCL_DEBUG=INFO
+# Memory optimization settings
+export RAY_memory_usage_threshold=0.85
+
+# Add these to reduce I/O pressure
+export TRAIN_BATCH_SIZE=32 # Reduced from 512
+export GEN_BATCH_SIZE=384 # Reduced from 1536
+export N_RESP_PER_PROMPT=2 # Reduced from 16
+
+# Observability with HyperPod - Amazon Managed Prometheus & Grafana
+export AMP_WORKSPACE_ID="ws-xxxxxxxxxxxxxxxxx"
+export AMP_ENDPOINT="https://aps-workspaces.${AWS_REGION}.amazonaws.com/workspaces/${AMP_WORKSPACE_ID}"
+export GRAFANA_WORKSPACE_ID="g-xxxxxxxxxxxxxxxxx"
+export GRAFANA_ENDPOINT="https://${GRAFANA_WORKSPACE_ID}.grafana-workspace.${AWS_REGION}.amazonaws.com"
+export CLUSTER_ID=""
+export CLUSTER_NAME=""
+export RAY_NAMESPACE="default" # Namespace where RayCluster runs
diff --git a/3.test_cases/pytorch/verl/rlvr/setup/install-kuberay.sh b/3.test_cases/pytorch/verl/rlvr/setup/install-kuberay.sh
new file mode 100755
index 000000000..cb7b4e89c
--- /dev/null
+++ b/3.test_cases/pytorch/verl/rlvr/setup/install-kuberay.sh
@@ -0,0 +1,14 @@
+# Deploy KubeRay
+NS_COUNT=$(kubectl get namespace kuberay | grep kuberay | wc -l)
+if [ "$NS_COUNT" == "1" ]; then
+ echo "Namespace kuberay already exists"
+else
+ kubectl create namespace kuberay
+fi
+# Deploy the KubeRay operator with the Helm chart repository
+helm repo add kuberay https://ray-project.github.io/kuberay-helm/
+helm repo update
+# Install both the CRDs and the KubeRay operator v1.4.2
+helm install kuberay-operator kuberay/kuberay-operator --version 1.4.2 --namespace kuberay
+# Verify that the KubeRay operator pod is running
+kubectl get pods --namespace kuberay
\ No newline at end of file
diff --git a/3.test_cases/pytorch/verl/rlvr/setup/load_data_dapo.sh b/3.test_cases/pytorch/verl/rlvr/setup/load_data_dapo.sh
new file mode 100755
index 000000000..22836d4b5
--- /dev/null
+++ b/3.test_cases/pytorch/verl/rlvr/setup/load_data_dapo.sh
@@ -0,0 +1,18 @@
+# First, get the head pod name
+export HEAD_POD=$(kubectl get pods --selector=ray.io/node-type=head -o custom-columns=POD:metadata.name
--no-headers) + +# Use RAY_DASHBOARD_PORT from env_vars, default to 8265 if not set +RAY_DASHBOARD_PORT=${RAY_DASHBOARD_PORT:-8265} + +# Then run the commands +kubectl exec -it $HEAD_POD -- /bin/bash -c " +if [ ! -d \"verl\" ]; then + git clone https://github.com/volcengine/verl +fi +cd verl && \ +export VERL_HOME=/fsx/verl && \ +export RAY_ADDRESS=\"http://localhost:${RAY_DASHBOARD_PORT}\" && \ +export RAY_DATA_HOME=\"/fsx/verl\" && \ +export OVERWRITE=1 && \ +bash recipe/dapo/prepare_dapo_data.sh +" \ No newline at end of file diff --git a/3.test_cases/pytorch/verl/rlvr/setup/load_data_grpo.sh b/3.test_cases/pytorch/verl/rlvr/setup/load_data_grpo.sh new file mode 100755 index 000000000..70cbff367 --- /dev/null +++ b/3.test_cases/pytorch/verl/rlvr/setup/load_data_grpo.sh @@ -0,0 +1,100 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +# Create data directory +DATA_DIR="${RAY_DATA_HOME}/data/gsm8k" +echo "Creating data directory: ${DATA_DIR}" + +# Get the head pod name +HEAD_POD=$(kubectl get pods -l ray.io/node-type=head -o jsonpath='{.items[0].metadata.name}') + +if [ -z "$HEAD_POD" ]; then + echo "Error: Could not find Ray head pod. Is your cluster running?" + exit 1 +fi + +echo "Using Ray head pod: ${HEAD_POD}" + +# Create Python script to download GSM8K data +cat > /tmp/download_gsm8k.py << 'EOF' +from datasets import load_dataset +import os +import sys +import re + +# Get data directory from environment or use default +data_dir = os.environ.get('DATA_DIR', '/fsx/verl/data/gsm8k') + +print(f"Creating directory: {data_dir}") +os.makedirs(data_dir, exist_ok=True) + +def extract_answer(answer_str): + """Extract the final numerical answer from GSM8K format (e.g., '#### 18')""" + match = re.search(r'####\s*(-?[\d,\.]+)', answer_str) + if match: + return match.group(1).replace(',', '') + return None + +print("Loading GSM8K dataset from HuggingFace...") +try: + dataset = load_dataset("openai/gsm8k", "main") +except Exception as e: + print(f"Error loading dataset: {e}") + sys.exit(1) + +print("Adding VERL-required columns (data_source and reward_model)...") + +def add_verl_columns(example): + """Add required columns for VERL reward computation""" + ground_truth = extract_answer(example['answer']) + return { + **example, + 'data_source': 'openai/gsm8k', + 'reward_model': {'ground_truth': ground_truth} + } + +# Process both splits +train_dataset = dataset['train'].map(add_verl_columns) +test_dataset = dataset['test'].map(add_verl_columns) + +print("Saving train split to parquet...") +train_path = os.path.join(data_dir, 'train.parquet') +train_dataset.to_parquet(train_path) +print(f"Saved {len(train_dataset)} training samples to {train_path}") + +print("Saving test split to parquet...") +test_path = os.path.join(data_dir, 'test.parquet') +test_dataset.to_parquet(test_path) +print(f"Saved {len(test_dataset)} test samples to {test_path}") + +print("\nDataset info:") +print(f"Train samples: {len(train_dataset)}") +print(f"Test samples: {len(test_dataset)}") +print(f"\nSample train example:") +sample = train_dataset[0] +print(f" question: {sample['question'][:100]}...") +print(f" answer: {sample['answer'][:100]}...") +print(f" data_source: {sample['data_source']}") +print(f" reward_model: {sample['reward_model']}") + +print("\nGSM8K data successfully downloaded and preprocessed for VERL!") +EOF + +# Copy script to pod +echo "Copying download script to pod..." +kubectl cp /tmp/download_gsm8k.py ${HEAD_POD}:/tmp/download_gsm8k.py + +# Execute the script in the pod +echo "Downloading GSM8K data..." 
+kubectl exec ${HEAD_POD} -- bash -c "export DATA_DIR=${DATA_DIR}" +kubectl exec ${HEAD_POD} -- python3 /tmp/download_gsm8k.py + + +# Verify the files exist +echo "Verifying downloaded files..." +kubectl exec ${HEAD_POD} -- ls -lh ${DATA_DIR}/ + +echo "GSM8K data download complete!" +echo "Data location: ${DATA_DIR}" +echo " - train.parquet: ~7.5K samples" +echo " - test.parquet: ~1.3K samples" diff --git a/3.test_cases/pytorch/verl/rlvr/setup/raycluster.yaml b/3.test_cases/pytorch/verl/rlvr/setup/raycluster.yaml new file mode 100644 index 000000000..73b1ef555 --- /dev/null +++ b/3.test_cases/pytorch/verl/rlvr/setup/raycluster.yaml @@ -0,0 +1,173 @@ +apiVersion: ray.io/v1alpha1 +kind: RayCluster +metadata: + name: rayml-efa + labels: + controller-tools.k8s.io: "1.0" + annotations: + karpenter.sh/do-not-disrupt: "true" +spec: + # Ray head pod template + headGroupSpec: + # The `rayStartParams` are used to configure the `ray start` command. + # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay. + # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`. + rayStartParams: + dashboard-host: '0.0.0.0' + metrics-export-port: '8080' # Explicitly set metrics port + #pod template + template: + spec: + nodeSelector: + node.kubernetes.io/instance-type: ${INSTANCE_TYPE} + sagemaker.amazonaws.com/node-health-status: Schedulable + securityContext: + runAsUser: 0 + runAsGroup: 0 + fsGroup: 0 + containers: + - name: ray-head + image: ${REGISTRY}${IMAGE}:${TAG} ## IMAGE: Here you may choose which image your head pod will run + env: ## ENV: Here is where you can send stuff to the head pod + ## PROMETHEUS AND GRAFANA - AWS MANAGED SERVICES + - name: RAY_GRAFANA_IFRAME_HOST + value: http://localhost:3000 + - name: RAY_GRAFANA_HOST + value: http://prometheus-grafana.prometheus-system.svc:80 + - name: RAY_PROMETHEUS_HOST + value: http://prometheus-kube-prometheus-prometheus.prometheus-system.svc:9090 + ## EFA AND NCCL CONFIGURATION + - name: FI_PROVIDER + value: "efa" + - name: FI_EFA_USE_DEVICE_RDMA + value: "1" + - name: FI_EFA_FORK_SAFE + value: "1" + - name: NCCL_PROTO + value: "simple" + - name: NCCL_SOCKET_IFNAME + value: "^docker,lo,veth" + - name: NCCL_DEBUG + value: "INFO" + - name: TORCH_NCCL_DUMP_ON_TIMEOUT + value: "1" + - name: TORCH_NCCL_ASYNC_ERROR_HANDLING + value: "1" + - name: HF_TOKEN + value: ${HF_TOKEN} + lifecycle: + preStop: + exec: + command: ["/bin/sh","-c","ray stop"] + resources: + limits: ## LIMITS: Set resource limits for your head pod + cpu: 8 + memory: 32Gi + requests: ## REQUESTS: Set resource requests for your head pod + cpu: 8 + memory: 32Gi + ports: + - containerPort: 6379 + name: gcs-server + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + - containerPort: 8080 + name: metrics + volumeMounts: ## VOLUMEMOUNTS: Mount your S3 CSI EKS Add-On to head pod + - name: fsx-storage + mountPath: /fsx + - name: ray-logs + mountPath: /tmp/ray + # - name: checkpoint-logs + # mountPath: /var/log/sagemaker_checkpointing + volumes: + - name: ray-logs + emptyDir: {} + - name: fsx-storage + persistentVolumeClaim: + claimName: fsx-claim + # - name: checkpoint-logs + # hostPath: + # path: /var/logs/sagemaker_checkpointing + # type: DirectoryOrCreate + workerGroupSpecs: + # the pod replicas in this group typed worker + - replicas: $NUM_NODES ## REPLICAS: How many worker pods you want + 
minReplicas: 1 + maxReplicas: 10 + # logical group name, for this called small-group, also can be functional + groupName: gpu-group + rayStartParams: + num-gpus: "$NUM_GPU_PER_NODE" + metrics-export-port: '8080' # Explicitly set metrics port for workers + #pod template + template: + spec: + nodeSelector: + node.kubernetes.io/instance-type: ${INSTANCE_TYPE} + sagemaker.amazonaws.com/node-health-status: Schedulable + securityContext: + runAsUser: 0 + runAsGroup: 0 + fsGroup: 0 + containers: + - name: ray-worker + image: ${REGISTRY}${IMAGE}:${TAG} ## IMAGE: Here you may choose which image your head node will run + env: + - name: FI_PROVIDER + value: "efa" + - name: FI_EFA_USE_DEVICE_RDMA + value: "1" + - name: FI_EFA_FORK_SAFE + value: "1" + - name: NCCL_PROTO + value: "simple" + - name: NCCL_SOCKET_IFNAME + value: "^docker,lo,veth" + - name: NCCL_DEBUG + value: "INFO" + - name: TORCH_NCCL_DUMP_ON_TIMEOUT + value: "1" + - name: TORCH_NCCL_ASYNC_ERROR_HANDLING + value: "1" + - name: HF_TOKEN + value: ${HF_TOKEN} + lifecycle: + preStop: + exec: + command: ["/bin/sh","-c","ray stop"] + resources: + limits: ## LIMITS: Set resource limits for your worker pods + cpu: 16 + memory: 200Gi + nvidia.com/gpu: $NUM_GPU_PER_NODE + vpc.amazonaws.com/efa: $NUM_EFA_PER_NODE + requests: ## REQUESTS: Set resource requests for your worker pods + cpu: 16 + memory: 200Gi + nvidia.com/gpu: $NUM_GPU_PER_NODE + vpc.amazonaws.com/efa: $NUM_EFA_PER_NODE + ports: + - containerPort: 8080 + name: metrics + volumeMounts: ## VOLUMEMOUNTS: Mount your S3 CSI EKS Add-On to worker pods + - name: ray-logs + mountPath: /tmp/ray + - name: fsx-storage + mountPath: /fsx + # - name: checkpoint-logs + # mountPath: /var/log/sagemaker_checkpointing + volumes: + - name: fsx-storage + persistentVolumeClaim: + claimName: fsx-claim + - name: ray-logs + emptyDir: {} + # - name: checkpoint-logs + # hostPath: + # path: /var/logs/sagemaker_checkpointing + # type: DirectoryOrCreate