From 2f83fbde19374ea10f4e3e41b2e0a235eac9bc90 Mon Sep 17 00:00:00 2001
From: Muhammad Awad <MuhammadAbdelghaffar.Awad@amd.com>
Date: Thu, 10 Jul 2025 13:33:44 -0500
Subject: [PATCH 1/8] Add initial CI

Signed-off-by: Muhammad Awad <MuhammadAbdelghaffar.Awad@amd.com>
---
 .github/workflows/examples-ci.yml     | 206 ++++++++++++++++++++++++++
 .github/workflows/scripts/examples.sh |  29 ++++
 docs/DEVCLOUD.md                      |  66 +++++++++
 examples/README.md                    |   3 +-
 4 files changed, 303 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/examples-ci.yml
 create mode 100755 .github/workflows/scripts/examples.sh
 create mode 100644 docs/DEVCLOUD.md

diff --git a/.github/workflows/examples-ci.yml b/.github/workflows/examples-ci.yml
new file mode 100644
index 0000000..c0f68e3
--- /dev/null
+++ b/.github/workflows/examples-ci.yml
@@ -0,0 +1,206 @@
+name: Iris examples CI
+
+on:
+  pull_request:
+    branches: [ main ]
+  push:
+    branches: [ main ]
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  test-iris:
+    runs-on: ubuntu-latest
+    
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v4
+
+    - name: Install doctl
+      uses: digitalocean/action-doctl@v2
+      with:
+        token: ${{ secrets.DEV_CLOUD_KEY }}
+
+    - name: Install jq
+      timeout-minutes: 20
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y jq
+
+    - name: Create Droplet
+      id: create
+      run: |
+        DROPLET_NAME="iris-$(date +%s)"
+        
+        # Create droplet and capture JSON output
+        DROPLET_JSON=$(doctl compute droplet create \
+          --image 188571990 \
+          --size gpu-mi300x8-1536gb \
+          --region atl1 \
+          --ssh-keys ${{ secrets.SSH_KEY_ID }} \
+          "$DROPLET_NAME" \
+          -o json \
+          --wait)
+        
+        # Check if droplet creation was successful
+        if [ $? -ne 0 ]; then
+          echo "❌ Failed to create droplet"
+          exit 1
+        fi
+        
+        # Extract droplet ID and IP
+        DROPLET_ID=$(echo "$DROPLET_JSON" | jq -r '.[0].id')
+        PUBLIC_IP=$(echo "$DROPLET_JSON" | jq -r '.[0].networks.v4[] | select(.type=="public") | .ip_address')
+        
+        # Set outputs for other steps
+        echo "droplet_id=$DROPLET_ID" >> $GITHUB_OUTPUT
+        echo "public_ip=$PUBLIC_IP" >> $GITHUB_OUTPUT
+        
+        echo "✅ Droplet created successfully!"
+
+    - name: Setup SSH key
+      run: |
+        mkdir -p ~/.ssh
+        echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_rsa
+        chmod 600 ~/.ssh/id_rsa
+        ssh-keyscan -H ${{ steps.create.outputs.public_ip }} >> ~/.ssh/known_hosts 2>/dev/null || true
+
+    - name: Wait for SSH to be ready
+      run: |
+        # Poll SSH up to 30 times, 10 s apart; fail the step if the host never answers so later steps do not run blind.
+        echo "⏳ Waiting for SSH to be ready..."
+        for i in {1..30}; do
+          if ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no root@${{ steps.create.outputs.public_ip }} "echo 'SSH ready'" 2>/dev/null; then
+            echo "✅ SSH is ready!"; exit 0
+          fi
+          echo "Attempt $i/30: SSH not ready yet, waiting 10 seconds..."; sleep 10
+        done
+        echo "::error::SSH did not become ready after 30 attempts"; exit 1
+
+    - name: Determine commit hash
+      id: commit_hash
+      run: |
+        if [ "${{ github.event_name }}" == "pull_request" ]; then  # PR event: publish the PR head commit
+          echo "commit_hash=${{ github.event.pull_request.head.sha }}" >> $GITHUB_OUTPUT
+        else  # any other trigger: publish the SHA that triggered this run
+          echo "commit_hash=${{ github.sha }}" >> $GITHUB_OUTPUT
+        fi
+
+    - name: Install Iris and run tests
+      run: |
+        echo "🚀Iris installation..."
+        
+        # Setup SSH, clone repo, and install dependencies
+        ssh -o StrictHostKeyChecking=no root@${{ steps.create.outputs.public_ip }} "
+          set -e
+          
+          # Remove any stale dpkg locks
+          sudo rm -f /var/lib/apt/lists/lock
+          sudo rm -f /var/cache/apt/archives/lock
+          sudo rm -f /var/lib/dpkg/lock*
+          
+          # Setup SSH key for git access
+          mkdir -p ~/.ssh
+          echo '${{ secrets.SSH_PRIVATE_KEY }}' > ~/.ssh/id_rsa
+          chmod 600 ~/.ssh/id_rsa
+          ssh-keyscan -H github.com >> ~/.ssh/known_hosts
+          
+          # Set environment variables
+          export ROCM_PATH=/opt/rocm
+          export PATH=\$ROCM_PATH/bin:\$PATH
+          export LD_LIBRARY_PATH=\$ROCM_PATH/lib:\$LD_LIBRARY_PATH
+          
+          # Install system dependencies
+          sudo apt-get update && sudo apt-get install -y python3-venv cmake openmpi-bin libopenmpi-dev
+
+          # Clone the repository
+          git clone git@github.com:ROCm/iris.git
+          cd iris
+          echo 'Checking out commit ${{ steps.commit_hash.outputs.commit_hash }}'
+          git checkout ${{ steps.commit_hash.outputs.commit_hash }}
+          
+          # Setup Python environment
+          python3 -m venv iris_env
+          source iris_env/bin/activate
+
+          pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.4
+          pip3 install -e .
+          
+          # Create results directory
+          mkdir -p /iris_results
+          
+          # Run the Iris examples
+          ./.github/workflows/scripts/examples.sh
+        "
+
+    - name: Download test outputs
+      if: always()
+      run: |
+        echo "📥 Downloading test outputs..."
+        mkdir -p test_outputs
+        scp -r -o StrictHostKeyChecking=no root@${{ steps.create.outputs.public_ip }}:/iris_results/ ./test_outputs/ || echo "No results directory found"
+        
+        # Create tar artifact
+        tar -czf iris_test_outputs.tar.gz -C test_outputs .
+        echo "✅ Test outputs archived as iris_test_outputs.tar.gz"
+        
+        # Print test results summary with GitHub Actions annotations
+        echo "📊 Iris Test Results Summary:"
+        
+        # Check each test result by parsing the success field
+        check_test_result() {
+          local file="$1"
+          local test_name="$2"
+          if [ -f "$file" ]; then
+            if jq -e '.success == true' "$file" >/dev/null 2>&1; then
+              echo "::notice::✅ $test_name: PASSED"
+              return 0
+            else
+              echo "::warning::❌ $test_name: FAILED"
+              return 1
+            fi
+          else
+            echo "::warning::❌ $test_name: FAILED (file not found)"
+            return 1
+          fi
+        }
+        
+        # Track overall success
+        overall_success=true
+        
+        check_test_result "./test_outputs/iris_results/load_bench.json" "Load" || overall_success=false
+        check_test_result "./test_outputs/iris_results/store_bench.json" "Store" || overall_success=false
+        check_test_result "./test_outputs/iris_results/all_load_bench.json" "All Load" || overall_success=false
+        check_test_result "./test_outputs/iris_results/all_store_bench.json" "All Store" || overall_success=false
+        check_test_result "./test_outputs/iris_results/atomic_add_bench.json" "Atomic Add" || overall_success=false
+        check_test_result "./test_outputs/iris_results/atomic_xchg_bench.json" "Atomic Xchg" || overall_success=false
+        check_test_result "./test_outputs/iris_results/message_passing_load_store.json" "Message Passing Load Store" || overall_success=false
+        check_test_result "./test_outputs/iris_results/message_passing_put.json" "Message Passing Put" || overall_success=false
+        check_test_result "./test_outputs/iris_results/gemm_all_scatter_bench.json" "GEMM All Scatter" || overall_success=false
+        check_test_result "./test_outputs/iris_results/gemm_atomics_all_reduce_bench.json" "GEMM Atomics All Reduce" || overall_success=false
+        check_test_result "./test_outputs/iris_results/gemm_one_shot_all_reduce_bench.json" "GEMM One Shot All Reduce" || overall_success=false
+        
+        echo ""
+        if [ "$overall_success" = true ]; then
+          echo "::notice::🎯 All Iris tests PASSED! ✅"
+        else
+          echo "::error::⚠️ Some Iris tests FAILED! ❌"
+        fi
+
+    - name: Upload test outputs as artifact
+      if: always()
+      uses: actions/upload-artifact@v4
+      with:
+        name: iris-test-outputs
+        path: iris_test_outputs.tar.gz
+        retention-days: 15
+
+    - name: Auto-destroy droplet after use
+      if: always()
+      run: |
+        echo "🗑️ Auto-destroying droplet ${{ steps.create.outputs.droplet_id }}..."
+        doctl compute droplet delete ${{ steps.create.outputs.droplet_id }} --force
+        echo "✅ Droplet auto-destroyed successfully!"
diff --git a/.github/workflows/scripts/examples.sh b/.github/workflows/scripts/examples.sh
new file mode 100755
index 0000000..486d48e
--- /dev/null
+++ b/.github/workflows/scripts/examples.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+set -e
+
+export OMPI_ALLOW_RUN_AS_ROOT=1
+export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
+
+# Run examples and store outputs
+echo 'Running Iris examples...'
+
+mkdir -p /iris_results
+
+# Examples
+mpirun -np 8 python examples/00_load/load_bench.py -o /iris_results/load_bench.json
+mpirun -np 8 python examples/01_store/store_bench.py -o /iris_results/store_bench.json
+
+
+mpirun -np 8 python examples/02_all_load/all_load_bench.py -o /iris_results/all_load_bench.json
+mpirun -np 8 python examples/03_all_store/all_store_bench.py -o /iris_results/all_store_bench.json
+
+
+mpirun -np 8 python examples/04_atomic_add/atomic_add_bench.py  -o /iris_results/atomic_add_bench.json
+mpirun -np 8 python examples/05_atomic_xchg/atomic_xchg_bench.py -o /iris_results/atomic_xchg_bench.json
+
+mpirun -np 2 python examples/06_message_passing/message_passing_load_store.py 
+mpirun -np 2 python examples/06_message_passing/message_passing_put.py
+
+mpirun -np 8 python examples/07_gemm_all_scatter/benchmark.py --benchmark --validate -o /iris_results/gemm_all_scatter_bench.json
+mpirun -np 8 python examples/08_gemm_atomics_all_reduce/benchmark.py --benchmark --validate -o /iris_results/gemm_atomics_all_reduce_bench.json
+mpirun -np 8 python examples/09_gemm_one_shot_all_reduce/benchmark.py --benchmark --validate -o /iris_results/gemm_one_shot_all_reduce_bench.json
diff --git a/docs/DEVCLOUD.md b/docs/DEVCLOUD.md
new file mode 100644
index 0000000..b6dce8b
--- /dev/null
+++ b/docs/DEVCLOUD.md
@@ -0,0 +1,66 @@
+# AMD Developer Cloud Setup Guide
+
+This guide provides step-by-step instructions for setting up Iris on the AMD Developer Cloud environment.
+
+## Prerequisites
+
+Before starting, ensure you have access to the AMD Developer Cloud and have created a GPU Droplet.
+
+## Environment Setup
+
+### 1. Set ROCm Environment Variables
+
+First, set up the ROCm environment variables:
+
+```bash
+export ROCM_PATH=/opt/rocm
+export PATH=$ROCM_PATH/bin:$PATH
+export LD_LIBRARY_PATH=$ROCM_PATH/lib:$LD_LIBRARY_PATH
+```
+
+**Note**: You may want to add these to your shell profile (`.bashrc`, `.zshrc`, etc.) for persistence across sessions.
+
+### 2. Install System Dependencies
+
+Install the required system packages:
+
+```bash
+sudo apt-get update && sudo apt-get install -y python3-venv cmake openmpi-bin libopenmpi-dev
+```
+
+### 1. Create and Activate Virtual Environment
+
+Create a Python virtual environment to isolate Iris dependencies:
+
+```bash
+# Create virtual environment
+python3 -m venv iris_env
+
+# Activate virtual environment
+source iris_env/bin/activate
+```
+
+### Install Python Dependencies
+```bash
+pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.4
+```
+
+
+## Iris Installation
+
+### 1. Clone the Repository
+
+```bash
+git clone git@github.com:ROCm/iris.git
+cd iris
+```
+
+### 2. Install Iris
+
+Install Iris in development mode:
+
+```bash
+pip install -e .
+```
+
+Next, you can run the examples! See the [Examples README](../examples/README.md) for detailed information about available examples and how to run them.
diff --git a/examples/README.md b/examples/README.md
index e626558..0b6a8b9 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -42,7 +42,8 @@ mpirun -np 8 python examples/04_atomic_add/atomic_add_bench.py  # Atomic add acr
 mpirun -np 8 python examples/05_atomic_xchg/atomic_xchg_bench.py  # Atomic exchange across GPUs
 
 # Example command to run message passing
-python examples/06_message_passing/message_passing.py
+mpirun -np 2 python examples/06_message_passing/message_passing_load_store.py
+mpirun -np 2 python examples/06_message_passing/message_passing_put.py
 ```
 
 ### GEMM Operations

From 4d63f23fbb7596b7f9cc0016c6233a435d1ad1be Mon Sep 17 00:00:00 2001
From: Muhammad Awad <MuhammadAbdelghaffar.Awad@amd.com>
Date: Thu, 10 Jul 2025 13:38:30 -0500
Subject: [PATCH 2/8] Address copilot comments

Signed-off-by: Muhammad Awad <MuhammadAbdelghaffar.Awad@amd.com>
---
 .github/workflows/examples-ci.yml | 4 ++--
 docs/DEVCLOUD.md                  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/examples-ci.yml b/.github/workflows/examples-ci.yml
index c0f68e3..5e263ba 100644
--- a/.github/workflows/examples-ci.yml
+++ b/.github/workflows/examples-ci.yml
@@ -177,8 +177,8 @@ jobs:
         check_test_result "./test_outputs/iris_results/all_store_bench.json" "All Store" || overall_success=false
         check_test_result "./test_outputs/iris_results/atomic_add_bench.json" "Atomic Add" || overall_success=false
         check_test_result "./test_outputs/iris_results/atomic_xchg_bench.json" "Atomic Xchg" || overall_success=false
-        check_test_result "./test_outputs/iris_results/message_passing_load_store.json" "Message Passing Load Store" || overall_success=false
-        check_test_result "./test_outputs/iris_results/message_passing_put.json" "Message Passing Put" || overall_success=false
+        #check_test_result "./test_outputs/iris_results/message_passing_load_store.json" "Message Passing Load Store" || overall_success=false
+        #check_test_result "./test_outputs/iris_results/message_passing_put.json" "Message Passing Put" || overall_success=false
         check_test_result "./test_outputs/iris_results/gemm_all_scatter_bench.json" "GEMM All Scatter" || overall_success=false
         check_test_result "./test_outputs/iris_results/gemm_atomics_all_reduce_bench.json" "GEMM Atomics All Reduce" || overall_success=false
         check_test_result "./test_outputs/iris_results/gemm_one_shot_all_reduce_bench.json" "GEMM One Shot All Reduce" || overall_success=false
diff --git a/docs/DEVCLOUD.md b/docs/DEVCLOUD.md
index b6dce8b..d5f3590 100644
--- a/docs/DEVCLOUD.md
+++ b/docs/DEVCLOUD.md
@@ -28,7 +28,7 @@ Install the required system packages:
 sudo apt-get update && sudo apt-get install -y python3-venv cmake openmpi-bin libopenmpi-dev
 ```
 
-### 1. Create and Activate Virtual Environment
+### 3. Create and Activate Virtual Environment
 
 Create a Python virtual environment to isolate Iris dependencies:
 
@@ -40,7 +40,7 @@ python3 -m venv iris_env
 source iris_env/bin/activate
 ```
 
-### Install Python Dependencies
+### 4. Install Python Dependencies
 ```bash
 pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.4
 ```

From 057460ba1432e78535765ca72dff19d59e37b2df Mon Sep 17 00:00:00 2001
From: Muhammad Awad <muhaawad@amd.com>
Date: Thu, 17 Jul 2025 18:37:29 +0000
Subject: [PATCH 3/8] Use variables for allocating runner

Signed-off-by: Muhammad Awad <muhaawad@amd.com>
---
 .github/workflows/examples-ci.yml | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/examples-ci.yml b/.github/workflows/examples-ci.yml
index 5e263ba..c38f4db 100644
--- a/.github/workflows/examples-ci.yml
+++ b/.github/workflows/examples-ci.yml
@@ -14,6 +14,8 @@ concurrency:
 jobs:
   test-iris:
     runs-on: ubuntu-latest
+    env:
+      DIGITALOCEAN_API_URL: ${{ secrets.DIGITALOCEAN_API_URL }}
     
     steps:
     - name: Checkout code
@@ -25,29 +27,32 @@ jobs:
         token: ${{ secrets.DEV_CLOUD_KEY }}
 
     - name: Install jq
-      timeout-minutes: 20
       run: |
         sudo apt-get update
         sudo apt-get install -y jq
 
     - name: Create Droplet
       id: create
+      env:
+        DIGITALOCEAN_API_URL: ${{ secrets.DIGITALOCEAN_API_URL }}
       run: |
         DROPLET_NAME="iris-$(date +%s)"
         
         # Create droplet and capture JSON output
         DROPLET_JSON=$(doctl compute droplet create \
           --image 188571990 \
-          --size gpu-mi300x8-1536gb \
+          --size ${{ secrets.DIGITALOCEAN_SIZE }} \
           --region atl1 \
           --ssh-keys ${{ secrets.SSH_KEY_ID }} \
           "$DROPLET_NAME" \
           -o json \
-          --wait)
+          --wait 2>&1)
         
         # Check if droplet creation was successful
         if [ $? -ne 0 ]; then
           echo "❌ Failed to create droplet"
+          echo "Error details:"
+          echo "$DROPLET_JSON"
           exit 1
         fi
         
@@ -200,6 +205,8 @@ jobs:
 
     - name: Auto-destroy droplet after use
       if: always()
+      env:
+        DIGITALOCEAN_API_URL: ${{ secrets.DIGITALOCEAN_API_URL }}
       run: |
         echo "🗑️ Auto-destroying droplet ${{ steps.create.outputs.droplet_id }}..."
         doctl compute droplet delete ${{ steps.create.outputs.droplet_id }} --force

From 2ce9fd8028daa579c840095ccc6d18d75a85283f Mon Sep 17 00:00:00 2001
From: Muhammad Awad <muhaawad@amd.com>
Date: Thu, 17 Jul 2025 20:36:36 +0000
Subject: [PATCH 4/8] Use PyTest

Signed-off-by: Muhammad Awad <muhaawad@amd.com>
---
 .github/workflows/examples-ci.yml | 57 ++++++-------------------------
 1 file changed, 10 insertions(+), 47 deletions(-)

diff --git a/.github/workflows/examples-ci.yml b/.github/workflows/examples-ci.yml
index c38f4db..fdc3828 100644
--- a/.github/workflows/examples-ci.yml
+++ b/.github/workflows/examples-ci.yml
@@ -41,16 +41,18 @@ jobs:
         # Create droplet and capture JSON output
         DROPLET_JSON=$(doctl compute droplet create \
           --image 188571990 \
-          --size ${{ secrets.DIGITALOCEAN_SIZE }} \
+          --size "${{ secrets.DIGITALOCEAN_SIZE }}" \
           --region atl1 \
-          --ssh-keys ${{ secrets.SSH_KEY_ID }} \
+          --ssh-keys "${{ secrets.SSH_KEY_ID }}" \
           "$DROPLET_NAME" \
           -o json \
           --wait 2>&1)
         
+        DROPLET_EXIT_CODE=$?
+        
         # Check if droplet creation was successful
-        if [ $? -ne 0 ]; then
-          echo "❌ Failed to create droplet"
+        if [ $DROPLET_EXIT_CODE -ne 0 ]; then
+          echo "❌ Failed to create droplet (exit code: $DROPLET_EXIT_CODE)"
           echo "Error details:"
           echo "$DROPLET_JSON"
           exit 1
@@ -137,8 +139,8 @@ jobs:
           # Create results directory
           mkdir -p /iris_results
           
-          # Run the Iris examples
-          ./.github/workflows/scripts/examples.sh
+          # Run pytest tests
+          pytest tests/ -v
         "
 
     - name: Download test outputs
@@ -152,48 +154,9 @@ jobs:
         tar -czf iris_test_outputs.tar.gz -C test_outputs .
         echo "✅ Test outputs archived as iris_test_outputs.tar.gz"
         
-        # Print test results summary with GitHub Actions annotations
+        # Print test results summary
         echo "📊 Iris Test Results Summary:"
-        
-        # Check each test result by parsing the success field
-        check_test_result() {
-          local file="$1"
-          local test_name="$2"
-          if [ -f "$file" ]; then
-            if jq -e '.success == true' "$file" >/dev/null 2>&1; then
-              echo "::notice::✅ $test_name: PASSED"
-              return 0
-            else
-              echo "::warning::❌ $test_name: FAILED"
-              return 1
-            fi
-          else
-            echo "::warning::❌ $test_name: FAILED (file not found)"
-            return 1
-          fi
-        }
-        
-        # Track overall success
-        overall_success=true
-        
-        check_test_result "./test_outputs/iris_results/load_bench.json" "Load" || overall_success=false
-        check_test_result "./test_outputs/iris_results/store_bench.json" "Store" || overall_success=false
-        check_test_result "./test_outputs/iris_results/all_load_bench.json" "All Load" || overall_success=false
-        check_test_result "./test_outputs/iris_results/all_store_bench.json" "All Store" || overall_success=false
-        check_test_result "./test_outputs/iris_results/atomic_add_bench.json" "Atomic Add" || overall_success=false
-        check_test_result "./test_outputs/iris_results/atomic_xchg_bench.json" "Atomic Xchg" || overall_success=false
-        #check_test_result "./test_outputs/iris_results/message_passing_load_store.json" "Message Passing Load Store" || overall_success=false
-        #check_test_result "./test_outputs/iris_results/message_passing_put.json" "Message Passing Put" || overall_success=false
-        check_test_result "./test_outputs/iris_results/gemm_all_scatter_bench.json" "GEMM All Scatter" || overall_success=false
-        check_test_result "./test_outputs/iris_results/gemm_atomics_all_reduce_bench.json" "GEMM Atomics All Reduce" || overall_success=false
-        check_test_result "./test_outputs/iris_results/gemm_one_shot_all_reduce_bench.json" "GEMM One Shot All Reduce" || overall_success=false
-        
-        echo ""
-        if [ "$overall_success" = true ]; then
-          echo "::notice::🎯 All Iris tests PASSED! ✅"
-        else
-          echo "::error::⚠️ Some Iris tests FAILED! ❌"
-        fi
+        echo "Pytest tests completed. Check the logs above for detailed results."
 
     - name: Upload test outputs as artifact
       if: always()

From aa5ac5d31659bea6f86981992acad822d9f3ce85 Mon Sep 17 00:00:00 2001
From: Muhammad Awad <muhaawad@amd.com>
Date: Fri, 18 Jul 2025 16:44:36 +0000
Subject: [PATCH 5/8] Install dev reqs

Signed-off-by: Muhammad Awad <muhaawad@amd.com>
---
 .github/workflows/examples-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/examples-ci.yml b/.github/workflows/examples-ci.yml
index fdc3828..bce82e8 100644
--- a/.github/workflows/examples-ci.yml
+++ b/.github/workflows/examples-ci.yml
@@ -134,7 +134,7 @@ jobs:
           source iris_env/bin/activate
 
           pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.4
-          pip3 install -e .
+          pip install -e '.[dev]'
           
           # Create results directory
           mkdir -p /iris_results

From c0f47004f89307a17fb6b07ff38c9db75f9f73a6 Mon Sep 17 00:00:00 2001
From: Muhammad Awad <muhaawad@amd.com>
Date: Fri, 18 Jul 2025 16:50:54 +0000
Subject: [PATCH 6/8] Log droplet creation output

Signed-off-by: Muhammad Awad <muhaawad@amd.com>
---
 .github/workflows/examples-ci.yml | 32 +++++++++++++++++++++++++------
 1 file changed, 26 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/examples-ci.yml b/.github/workflows/examples-ci.yml
index bce82e8..f6e3ffb 100644
--- a/.github/workflows/examples-ci.yml
+++ b/.github/workflows/examples-ci.yml
@@ -38,7 +38,7 @@ jobs:
       run: |
         DROPLET_NAME="iris-$(date +%s)"
         
-        # Create droplet and capture JSON output
+        # Create droplet and capture error output
         DROPLET_JSON=$(doctl compute droplet create \
           --image 188571990 \
           --size "${{ secrets.DIGITALOCEAN_SIZE }}" \
@@ -52,15 +52,35 @@ jobs:
         
         # Check if droplet creation was successful
         if [ $DROPLET_EXIT_CODE -ne 0 ]; then
-          echo "❌ Failed to create droplet (exit code: $DROPLET_EXIT_CODE)"
-          echo "Error details:"
           echo "$DROPLET_JSON"
           exit 1
         fi
         
-        # Extract droplet ID and IP
-        DROPLET_ID=$(echo "$DROPLET_JSON" | jq -r '.[0].id')
-        PUBLIC_IP=$(echo "$DROPLET_JSON" | jq -r '.[0].networks.v4[] | select(.type=="public") | .ip_address')
+        # Validate that we got valid JSON
+        if ! echo "$DROPLET_JSON" | jq . >/dev/null 2>&1; then
+          echo "❌ Invalid JSON response from doctl"
+          echo "Raw response:"
+          echo "$DROPLET_JSON"
+          exit 1
+        fi
+        
+        # Extract droplet ID and IP with error checking
+        DROPLET_ID=$(echo "$DROPLET_JSON" | jq -r '.[0].id // empty')
+        PUBLIC_IP=$(echo "$DROPLET_JSON" | jq -r '.[0].networks.v4[] | select(.type=="public") | .ip_address // empty')
+        
+        if [ -z "$DROPLET_ID" ]; then
+          echo "❌ Failed to extract droplet ID from response"
+          echo "JSON response:"
+          echo "$DROPLET_JSON"
+          exit 1
+        fi
+        
+        if [ -z "$PUBLIC_IP" ]; then
+          echo "❌ Failed to extract public IP from response"
+          echo "JSON response:"
+          echo "$DROPLET_JSON"
+          exit 1
+        fi
         
         # Set outputs for other steps
         echo "droplet_id=$DROPLET_ID" >> $GITHUB_OUTPUT

From dc52b1ad809526ea998641be23ca1254a7a2074e Mon Sep 17 00:00:00 2001
From: Muhammad Awad <muhaawad@amd.com>
Date: Fri, 18 Jul 2025 17:10:23 +0000
Subject: [PATCH 7/8] Attempt multiple times

Signed-off-by: Muhammad Awad <muhaawad@amd.com>
---
 .github/workflows/examples-ci.yml | 97 ++++++++++++++-----------------
 1 file changed, 45 insertions(+), 52 deletions(-)

diff --git a/.github/workflows/examples-ci.yml b/.github/workflows/examples-ci.yml
index f6e3ffb..ddc214a 100644
--- a/.github/workflows/examples-ci.yml
+++ b/.github/workflows/examples-ci.yml
@@ -33,60 +33,53 @@ jobs:
 
     - name: Create Droplet
       id: create
+      uses: nick-fields/retry@v3
       env:
         DIGITALOCEAN_API_URL: ${{ secrets.DIGITALOCEAN_API_URL }}
-      run: |
-        DROPLET_NAME="iris-$(date +%s)"
-        
-        # Create droplet and capture error output
-        DROPLET_JSON=$(doctl compute droplet create \
-          --image 188571990 \
-          --size "${{ secrets.DIGITALOCEAN_SIZE }}" \
-          --region atl1 \
-          --ssh-keys "${{ secrets.SSH_KEY_ID }}" \
-          "$DROPLET_NAME" \
-          -o json \
-          --wait 2>&1)
-        
-        DROPLET_EXIT_CODE=$?
-        
-        # Check if droplet creation was successful
-        if [ $DROPLET_EXIT_CODE -ne 0 ]; then
-          echo "$DROPLET_JSON"
-          exit 1
-        fi
-        
-        # Validate that we got valid JSON
-        if ! echo "$DROPLET_JSON" | jq . >/dev/null 2>&1; then
-          echo "❌ Invalid JSON response from doctl"
-          echo "Raw response:"
-          echo "$DROPLET_JSON"
-          exit 1
-        fi
-        
-        # Extract droplet ID and IP with error checking
-        DROPLET_ID=$(echo "$DROPLET_JSON" | jq -r '.[0].id // empty')
-        PUBLIC_IP=$(echo "$DROPLET_JSON" | jq -r '.[0].networks.v4[] | select(.type=="public") | .ip_address // empty')
-        
-        if [ -z "$DROPLET_ID" ]; then
-          echo "❌ Failed to extract droplet ID from response"
-          echo "JSON response:"
-          echo "$DROPLET_JSON"
-          exit 1
-        fi
-        
-        if [ -z "$PUBLIC_IP" ]; then
-          echo "❌ Failed to extract public IP from response"
-          echo "JSON response:"
-          echo "$DROPLET_JSON"
-          exit 1
-        fi
-        
-        # Set outputs for other steps
-        echo "droplet_id=$DROPLET_ID" >> $GITHUB_OUTPUT
-        echo "public_ip=$PUBLIC_IP" >> $GITHUB_OUTPUT
-        
-        echo "✅ Droplet created successfully!"
+      with:
+        max_attempts: 10
+        retry_wait_seconds: 60
+        command: |
+          # Name is stable for the whole workflow run so a retry can find what an earlier attempt left behind
+          DROPLET_NAME="iris-${GITHUB_RUN_ID}"
+
+          # Print doctl's output, delete anything this attempt created, then fail the attempt
+          abort() {
+            echo "$DROPLET_JSON"
+            doctl compute droplet delete "$DROPLET_NAME" --force >/dev/null 2>&1 || true
+            exit 1
+          }
+
+          # An attempt that timed out cannot clean up after itself, so clear leftovers first ("not found" is ignored)
+          doctl compute droplet delete "$DROPLET_NAME" --force >/dev/null 2>&1 || true
+
+          # Create droplet; stderr is merged into the capture so abort can print the error text
+          DROPLET_JSON=$(doctl compute droplet create \
+            --image 188571990 \
+            --size "${{ secrets.DIGITALOCEAN_SIZE }}" \
+            --region atl1 \
+            --ssh-keys "${{ secrets.SSH_KEY_ID }}" \
+            "$DROPLET_NAME" \
+            -o json \
+            --wait 2>&1)
+          DROPLET_EXIT_CODE=$?
+
+          # Check if droplet creation was successful
+          [ "$DROPLET_EXIT_CODE" -eq 0 ] || abort
+
+          # Validate that we got valid JSON
+          echo "$DROPLET_JSON" | jq . >/dev/null 2>&1 || abort
+
+          # Extract droplet ID and IP; both must be present or the droplet is unusable
+          DROPLET_ID=$(echo "$DROPLET_JSON" | jq -r '.[0].id // empty')
+          PUBLIC_IP=$(echo "$DROPLET_JSON" | jq -r '.[0].networks.v4[] | select(.type=="public") | .ip_address // empty')
+          { [ -n "$DROPLET_ID" ] && [ -n "$PUBLIC_IP" ]; } || abort
+
+          # Set outputs for other steps
+          echo "droplet_id=$DROPLET_ID" >> "$GITHUB_OUTPUT"
+          echo "public_ip=$PUBLIC_IP" >> "$GITHUB_OUTPUT"
+
+          echo "✅ Droplet created successfully!"
 
     - name: Setup SSH key
       run: |

From 51036aeb7dff3fe7b240f234be83f50916def2c1 Mon Sep 17 00:00:00 2001
From: Muhammad Awad <muhaawad@amd.com>
Date: Fri, 18 Jul 2025 17:11:46 +0000
Subject: [PATCH 8/8] Add timeout

Signed-off-by: Muhammad Awad <muhaawad@amd.com>
---
 .github/workflows/examples-ci.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/examples-ci.yml b/.github/workflows/examples-ci.yml
index ddc214a..b13e9b9 100644
--- a/.github/workflows/examples-ci.yml
+++ b/.github/workflows/examples-ci.yml
@@ -37,6 +37,7 @@ jobs:
       env:
         DIGITALOCEAN_API_URL: ${{ secrets.DIGITALOCEAN_API_URL }}
       with:
+        timeout_minutes: 5
         max_attempts: 10
         retry_wait_seconds: 60
         command: |