From 2f83fbde19374ea10f4e3e41b2e0a235eac9bc90 Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Thu, 10 Jul 2025 13:33:44 -0500 Subject: [PATCH 1/8] Add initial CI Signed-off-by: Muhammad Awad --- .github/workflows/examples-ci.yml | 206 ++++++++++++++++++++++++++ .github/workflows/scripts/examples.sh | 29 ++++ docs/DEVCLOUD.md | 66 +++++++++ examples/README.md | 3 +- 4 files changed, 303 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/examples-ci.yml create mode 100755 .github/workflows/scripts/examples.sh create mode 100644 docs/DEVCLOUD.md diff --git a/.github/workflows/examples-ci.yml b/.github/workflows/examples-ci.yml new file mode 100644 index 0000000..c0f68e3 --- /dev/null +++ b/.github/workflows/examples-ci.yml @@ -0,0 +1,206 @@ +name: Iris examples CI + +on: + pull_request: + branches: [ main ] + push: + branches: [ main ] + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + test-iris: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Install doctl + uses: digitalocean/action-doctl@v2 + with: + token: ${{ secrets.DEV_CLOUD_KEY }} + + - name: Install jq + timeout-minutes: 20 + run: | + sudo apt-get update + sudo apt-get install -y jq + + - name: Create Droplet + id: create + run: | + DROPLET_NAME="iris-$(date +%s)" + + # Create droplet and capture JSON output + DROPLET_JSON=$(doctl compute droplet create \ + --image 188571990 \ + --size gpu-mi300x8-1536gb \ + --region atl1 \ + --ssh-keys ${{ secrets.SSH_KEY_ID }} \ + "$DROPLET_NAME" \ + -o json \ + --wait) + + # Check if droplet creation was successful + if [ $? 
-ne 0 ]; then + echo "❌ Failed to create droplet" + exit 1 + fi + + # Extract droplet ID and IP + DROPLET_ID=$(echo "$DROPLET_JSON" | jq -r '.[0].id') + PUBLIC_IP=$(echo "$DROPLET_JSON" | jq -r '.[0].networks.v4[] | select(.type=="public") | .ip_address') + + # Set outputs for other steps + echo "droplet_id=$DROPLET_ID" >> $GITHUB_OUTPUT + echo "public_ip=$PUBLIC_IP" >> $GITHUB_OUTPUT + + echo "βœ… Droplet created successfully!" + + - name: Setup SSH key + run: | + mkdir -p ~/.ssh + echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_rsa + chmod 600 ~/.ssh/id_rsa + ssh-keyscan -H ${{ steps.create.outputs.public_ip }} >> ~/.ssh/known_hosts 2>/dev/null || true + + - name: Wait for SSH to be ready + run: | + echo "⏳ Waiting for SSH to be ready..." + for i in {1..30}; do + if ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no root@${{ steps.create.outputs.public_ip }} "echo 'SSH ready'" 2>/dev/null; then + echo "βœ… SSH is ready!" + break + fi + echo "Attempt $i/30: SSH not ready yet, waiting 10 seconds..." + sleep 10 + done + + - name: Determine commit hash + id: commit_hash + run: | + if [ "${{ github.event_name }}" == "pull_request" ]; then + echo "commit_hash=${{ github.event.pull_request.head.sha }}" >> $GITHUB_OUTPUT + else + echo "commit_hash=${{ github.sha }}" >> $GITHUB_OUTPUT + fi + + - name: Install Iris and run tests + run: | + echo "πŸš€Iris installation..." 
+ + # Setup SSH, clone repo, and install dependencies + ssh -o StrictHostKeyChecking=no root@${{ steps.create.outputs.public_ip }} " + set -e + + # Remove any stale dpkg locks + sudo rm -f /var/lib/apt/lists/lock + sudo rm -f /var/cache/apt/archives/lock + sudo rm -f /var/lib/dpkg/lock* + + # Setup SSH key for git access + mkdir -p ~/.ssh + echo '${{ secrets.SSH_PRIVATE_KEY }}' > ~/.ssh/id_rsa + chmod 600 ~/.ssh/id_rsa + ssh-keyscan -H github.com >> ~/.ssh/known_hosts + + # Set environment variables + export ROCM_PATH=/opt/rocm + export PATH=\$ROCM_PATH/bin:\$PATH + export LD_LIBRARY_PATH=\$ROCM_PATH/lib:\$LD_LIBRARY_PATH + + # Install system dependencies + sudo apt-get update && sudo apt-get install -y python3-venv cmake openmpi-bin libopenmpi-dev + + # Clone the repository + git clone git@github.com:ROCm/iris.git + cd iris + echo 'Checking out commit ${{ steps.commit_hash.outputs.commit_hash }}' + git checkout ${{ steps.commit_hash.outputs.commit_hash }} + + # Setup Python environment + python3 -m venv iris_env + source iris_env/bin/activate + + pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.4 + pip3 install -e . + + # Create results directory + mkdir -p /iris_results + + # Run the Iris examples + ./.github/workflows/scripts/examples.sh + " + + - name: Download test outputs + if: always() + run: | + echo "πŸ“₯ Downloading test outputs..." + mkdir -p test_outputs + scp -r -o StrictHostKeyChecking=no root@${{ steps.create.outputs.public_ip }}:/iris_results/ ./test_outputs/ || echo "No results directory found" + + # Create tar artifact + tar -czf iris_test_outputs.tar.gz -C test_outputs . 
+ echo "βœ… Test outputs archived as iris_test_outputs.tar.gz" + + # Print test results summary with GitHub Actions annotations + echo "πŸ“Š Iris Test Results Summary:" + + # Check each test result by parsing the success field + check_test_result() { + local file="$1" + local test_name="$2" + if [ -f "$file" ]; then + if jq -e '.success == true' "$file" >/dev/null 2>&1; then + echo "::notice::βœ… $test_name: PASSED" + return 0 + else + echo "::warning::❌ $test_name: FAILED" + return 1 + fi + else + echo "::warning::❌ $test_name: FAILED (file not found)" + return 1 + fi + } + + # Track overall success + overall_success=true + + check_test_result "./test_outputs/iris_results/load_bench.json" "Load" || overall_success=false + check_test_result "./test_outputs/iris_results/store_bench.json" "Store" || overall_success=false + check_test_result "./test_outputs/iris_results/all_load_bench.json" "All Load" || overall_success=false + check_test_result "./test_outputs/iris_results/all_store_bench.json" "All Store" || overall_success=false + check_test_result "./test_outputs/iris_results/atomic_add_bench.json" "Atomic Add" || overall_success=false + check_test_result "./test_outputs/iris_results/atomic_xchg_bench.json" "Atomic Xchg" || overall_success=false + check_test_result "./test_outputs/iris_results/message_passing_load_store.json" "Message Passing Load Store" || overall_success=false + check_test_result "./test_outputs/iris_results/message_passing_put.json" "Message Passing Put" || overall_success=false + check_test_result "./test_outputs/iris_results/gemm_all_scatter_bench.json" "GEMM All Scatter" || overall_success=false + check_test_result "./test_outputs/iris_results/gemm_atomics_all_reduce_bench.json" "GEMM Atomics All Reduce" || overall_success=false + check_test_result "./test_outputs/iris_results/gemm_one_shot_all_reduce_bench.json" "GEMM One Shot All Reduce" || overall_success=false + + echo "" + if [ "$overall_success" = true ]; then + echo "::notice::🎯 All 
Iris tests PASSED! βœ…" + else + echo "::error::⚠️ Some Iris tests FAILED! ❌" + fi + + - name: Upload test outputs as artifact + if: always() + uses: actions/upload-artifact@v4 + with: + name: iris-test-outputs + path: iris_test_outputs.tar.gz + retention-days: 15 + + - name: Auto-destroy droplet after use + if: always() + run: | + echo "πŸ—‘οΈ Auto-destroying droplet ${{ steps.create.outputs.droplet_id }}..." + doctl compute droplet delete ${{ steps.create.outputs.droplet_id }} --force + echo "βœ… Droplet auto-destroyed successfully!" diff --git a/.github/workflows/scripts/examples.sh b/.github/workflows/scripts/examples.sh new file mode 100755 index 0000000..486d48e --- /dev/null +++ b/.github/workflows/scripts/examples.sh @@ -0,0 +1,29 @@ +#!/bin/bash +set -e + +export OMPI_ALLOW_RUN_AS_ROOT=1 +export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 + +# Run examples and store outputs +echo 'Running Iris examples...' + +mkdir -p /iris_results + +# Examples +mpirun -np 8 python examples/00_load/load_bench.py -o /iris_results/load_bench.json +mpirun -np 8 python examples/01_store/store_bench.py -o /iris_results/store_bench.json + + +mpirun -np 8 python examples/02_all_load/all_load_bench.py -o /iris_results/all_load_bench.json +mpirun -np 8 python examples/03_all_store/all_store_bench.py -o /iris_results/all_store_bench.json + + +mpirun -np 8 python examples/04_atomic_add/atomic_add_bench.py -o /iris_results/atomic_add_bench.json +mpirun -np 8 python examples/05_atomic_xchg/atomic_xchg_bench.py -o /iris_results/atomic_xchg_bench.json + +mpirun -np 2 python examples/06_message_passing/message_passing_load_store.py +mpirun -np 2 python examples/06_message_passing/message_passing_put.py + +mpirun -np 8 python examples/07_gemm_all_scatter/benchmark.py --benchmark --validate -o /iris_results/gemm_all_scatter_bench.json +mpirun -np 8 python examples/08_gemm_atomics_all_reduce/benchmark.py --benchmark --validate -o /iris_results/gemm_atomics_all_reduce_bench.json +mpirun -np 8 python 
examples/09_gemm_one_shot_all_reduce/benchmark.py --benchmark --validate -o /iris_results/gemm_one_shot_all_reduce_bench.json diff --git a/docs/DEVCLOUD.md b/docs/DEVCLOUD.md new file mode 100644 index 0000000..b6dce8b --- /dev/null +++ b/docs/DEVCLOUD.md @@ -0,0 +1,66 @@ +# AMD Developer Cloud Setup Guide + +This guide provides step-by-step instructions for setting up Iris on the AMD Developer Cloud environment. + +## Prerequisites + +Before starting, ensure you have access to an AMD Developer Cloud and create a GPU Droplet. + +## Environment Setup + +### 1. Set ROCm Environment Variables + +First, set up the ROCm environment variables: + +```bash +export ROCM_PATH=/opt/rocm +export PATH=$ROCM_PATH/bin:$PATH +export LD_LIBRARY_PATH=$ROCM_PATH/lib +``` + +**Note**: You may want to add these to your shell profile (`.bashrc`, `.zshrc`, etc.) for persistence across sessions. + +### 2. Install System Dependencies + +Install the required system packages: + +```bash +sudo apt-get update && sudo apt-get install -y python3-venv cmake openmpi-bin libopenmpi-dev +``` + +### 1. Create and Activate Virtual Environment + +Create a Python virtual environment to isolate Iris dependencies: + +```bash +# Create virtual environment +python3 -m venv iris_env + +# Activate virtual environment +source iris_env/bin/activate +``` + +### Install Python Dependencies +```bash +pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.4 +``` + + +## Iris Installation + +### 1. Clone the Repository + +```bash +git clone git@github.com:ROCm/iris.git +cd iris +``` + +### 2. Install Iris + +Install Iris in development mode: + +```bash +pip install -e . +``` + +Next, you can run the examples! See the [Examples README](../examples/README.md) for detailed information about available examples and how to run them. 
diff --git a/examples/README.md b/examples/README.md index e626558..0b6a8b9 100644 --- a/examples/README.md +++ b/examples/README.md @@ -42,7 +42,8 @@ mpirun -np 8 python examples/04_atomic_add/atomic_add_bench.py # Atomic add acr mpirun -np 8 python examples/05_atomic_xchg/atomic_xchg_bench.py # Atomic exchange across GPUs # Example command to run message passing -python examples/06_message_passing/message_passing.py +mpirun -np 2 python examples/06_message_passing/message_passing_load_store.py +mpirun -np 2 python examples/06_message_passing/message_passing_put.py ``` ### GEMM Operations From 4d63f23fbb7596b7f9cc0016c6233a435d1ad1be Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Thu, 10 Jul 2025 13:38:30 -0500 Subject: [PATCH 2/8] Address copilot comments Signed-off-by: Muhammad Awad --- .github/workflows/examples-ci.yml | 4 ++-- docs/DEVCLOUD.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/examples-ci.yml b/.github/workflows/examples-ci.yml index c0f68e3..5e263ba 100644 --- a/.github/workflows/examples-ci.yml +++ b/.github/workflows/examples-ci.yml @@ -177,8 +177,8 @@ jobs: check_test_result "./test_outputs/iris_results/all_store_bench.json" "All Store" || overall_success=false check_test_result "./test_outputs/iris_results/atomic_add_bench.json" "Atomic Add" || overall_success=false check_test_result "./test_outputs/iris_results/atomic_xchg_bench.json" "Atomic Xchg" || overall_success=false - check_test_result "./test_outputs/iris_results/message_passing_load_store.json" "Message Passing Load Store" || overall_success=false - check_test_result "./test_outputs/iris_results/message_passing_put.json" "Message Passing Put" || overall_success=false + #check_test_result "./test_outputs/iris_results/message_passing_load_store.json" "Message Passing Load Store" || overall_success=false + #check_test_result "./test_outputs/iris_results/message_passing_put.json" "Message Passing Put" || overall_success=false 
check_test_result "./test_outputs/iris_results/gemm_all_scatter_bench.json" "GEMM All Scatter" || overall_success=false check_test_result "./test_outputs/iris_results/gemm_atomics_all_reduce_bench.json" "GEMM Atomics All Reduce" || overall_success=false check_test_result "./test_outputs/iris_results/gemm_one_shot_all_reduce_bench.json" "GEMM One Shot All Reduce" || overall_success=false diff --git a/docs/DEVCLOUD.md b/docs/DEVCLOUD.md index b6dce8b..d5f3590 100644 --- a/docs/DEVCLOUD.md +++ b/docs/DEVCLOUD.md @@ -28,7 +28,7 @@ Install the required system packages: sudo apt-get update && sudo apt-get install -y python3-venv cmake openmpi-bin libopenmpi-dev ``` -### 1. Create and Activate Virtual Environment +### 3. Create and Activate Virtual Environment Create a Python virtual environment to isolate Iris dependencies: @@ -40,7 +40,7 @@ python3 -m venv iris_env source iris_env/bin/activate ``` -### Install Python Dependencies +### 4. Install Python Dependencies ```bash pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.4 ``` From 057460ba1432e78535765ca72dff19d59e37b2df Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Thu, 17 Jul 2025 18:37:29 +0000 Subject: [PATCH 3/8] Use variables for allocating runner Signed-off-by: Muhammad Awad --- .github/workflows/examples-ci.yml | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/.github/workflows/examples-ci.yml b/.github/workflows/examples-ci.yml index 5e263ba..c38f4db 100644 --- a/.github/workflows/examples-ci.yml +++ b/.github/workflows/examples-ci.yml @@ -14,6 +14,8 @@ concurrency: jobs: test-iris: runs-on: ubuntu-latest + env: + DIGITALOCEAN_API_URL: ${{ secrets.DIGITALOCEAN_API_URL }} steps: - name: Checkout code @@ -25,29 +27,32 @@ jobs: token: ${{ secrets.DEV_CLOUD_KEY }} - name: Install jq - timeout-minutes: 20 run: | sudo apt-get update sudo apt-get install -y jq - name: Create Droplet id: create + env: + DIGITALOCEAN_API_URL: 
${{ secrets.DIGITALOCEAN_API_URL }} run: | DROPLET_NAME="iris-$(date +%s)" # Create droplet and capture JSON output DROPLET_JSON=$(doctl compute droplet create \ --image 188571990 \ - --size gpu-mi300x8-1536gb \ + --size ${{ secrets.DIGITALOCEAN_SIZE }} \ --region atl1 \ --ssh-keys ${{ secrets.SSH_KEY_ID }} \ "$DROPLET_NAME" \ -o json \ - --wait) + --wait 2>&1) # Check if droplet creation was successful if [ $? -ne 0 ]; then echo "❌ Failed to create droplet" + echo "Error details:" + echo "$DROPLET_JSON" exit 1 fi @@ -200,6 +205,8 @@ jobs: - name: Auto-destroy droplet after use if: always() + env: + DIGITALOCEAN_API_URL: ${{ secrets.DIGITALOCEAN_API_URL }} run: | echo "πŸ—‘οΈ Auto-destroying droplet ${{ steps.create.outputs.droplet_id }}..." doctl compute droplet delete ${{ steps.create.outputs.droplet_id }} --force From 2ce9fd8028daa579c840095ccc6d18d75a85283f Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Thu, 17 Jul 2025 20:36:36 +0000 Subject: [PATCH 4/8] Use PyTest Signed-off-by: Muhammad Awad --- .github/workflows/examples-ci.yml | 57 ++++++------------------------- 1 file changed, 10 insertions(+), 47 deletions(-) diff --git a/.github/workflows/examples-ci.yml b/.github/workflows/examples-ci.yml index c38f4db..fdc3828 100644 --- a/.github/workflows/examples-ci.yml +++ b/.github/workflows/examples-ci.yml @@ -41,16 +41,18 @@ jobs: # Create droplet and capture JSON output DROPLET_JSON=$(doctl compute droplet create \ --image 188571990 \ - --size ${{ secrets.DIGITALOCEAN_SIZE }} \ + --size "${{ secrets.DIGITALOCEAN_SIZE }}" \ --region atl1 \ - --ssh-keys ${{ secrets.SSH_KEY_ID }} \ + --ssh-keys "${{ secrets.SSH_KEY_ID }}" \ "$DROPLET_NAME" \ -o json \ --wait 2>&1) + DROPLET_EXIT_CODE=$? + # Check if droplet creation was successful - if [ $? 
-ne 0 ]; then - echo "❌ Failed to create droplet" + if [ $DROPLET_EXIT_CODE -ne 0 ]; then + echo "❌ Failed to create droplet (exit code: $DROPLET_EXIT_CODE)" echo "Error details:" echo "$DROPLET_JSON" exit 1 @@ -137,8 +139,8 @@ jobs: # Create results directory mkdir -p /iris_results - # Run the Iris examples - ./.github/workflows/scripts/examples.sh + # Run pytest tests + pytest tests/ -v " - name: Download test outputs @@ -152,48 +154,9 @@ jobs: tar -czf iris_test_outputs.tar.gz -C test_outputs . echo "βœ… Test outputs archived as iris_test_outputs.tar.gz" - # Print test results summary with GitHub Actions annotations + # Print test results summary echo "πŸ“Š Iris Test Results Summary:" - - # Check each test result by parsing the success field - check_test_result() { - local file="$1" - local test_name="$2" - if [ -f "$file" ]; then - if jq -e '.success == true' "$file" >/dev/null 2>&1; then - echo "::notice::βœ… $test_name: PASSED" - return 0 - else - echo "::warning::❌ $test_name: FAILED" - return 1 - fi - else - echo "::warning::❌ $test_name: FAILED (file not found)" - return 1 - fi - } - - # Track overall success - overall_success=true - - check_test_result "./test_outputs/iris_results/load_bench.json" "Load" || overall_success=false - check_test_result "./test_outputs/iris_results/store_bench.json" "Store" || overall_success=false - check_test_result "./test_outputs/iris_results/all_load_bench.json" "All Load" || overall_success=false - check_test_result "./test_outputs/iris_results/all_store_bench.json" "All Store" || overall_success=false - check_test_result "./test_outputs/iris_results/atomic_add_bench.json" "Atomic Add" || overall_success=false - check_test_result "./test_outputs/iris_results/atomic_xchg_bench.json" "Atomic Xchg" || overall_success=false - #check_test_result "./test_outputs/iris_results/message_passing_load_store.json" "Message Passing Load Store" || overall_success=false - #check_test_result 
"./test_outputs/iris_results/message_passing_put.json" "Message Passing Put" || overall_success=false - check_test_result "./test_outputs/iris_results/gemm_all_scatter_bench.json" "GEMM All Scatter" || overall_success=false - check_test_result "./test_outputs/iris_results/gemm_atomics_all_reduce_bench.json" "GEMM Atomics All Reduce" || overall_success=false - check_test_result "./test_outputs/iris_results/gemm_one_shot_all_reduce_bench.json" "GEMM One Shot All Reduce" || overall_success=false - - echo "" - if [ "$overall_success" = true ]; then - echo "::notice::🎯 All Iris tests PASSED! βœ…" - else - echo "::error::⚠️ Some Iris tests FAILED! ❌" - fi + echo "Pytest tests completed. Check the logs above for detailed results." - name: Upload test outputs as artifact if: always() From aa5ac5d31659bea6f86981992acad822d9f3ce85 Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Fri, 18 Jul 2025 16:44:36 +0000 Subject: [PATCH 5/8] Install dev reqs Signed-off-by: Muhammad Awad --- .github/workflows/examples-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/examples-ci.yml b/.github/workflows/examples-ci.yml index fdc3828..bce82e8 100644 --- a/.github/workflows/examples-ci.yml +++ b/.github/workflows/examples-ci.yml @@ -134,7 +134,7 @@ jobs: source iris_env/bin/activate pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.4 - pip3 install -e . 
+ pip install -e '.[dev]' # Create results directory mkdir -p /iris_results From c0f47004f89307a17fb6b07ff38c9db75f9f73a6 Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Fri, 18 Jul 2025 16:50:54 +0000 Subject: [PATCH 6/8] Log droplet creation output Signed-off-by: Muhammad Awad --- .github/workflows/examples-ci.yml | 32 +++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/.github/workflows/examples-ci.yml b/.github/workflows/examples-ci.yml index bce82e8..f6e3ffb 100644 --- a/.github/workflows/examples-ci.yml +++ b/.github/workflows/examples-ci.yml @@ -38,7 +38,7 @@ jobs: run: | DROPLET_NAME="iris-$(date +%s)" - # Create droplet and capture JSON output + # Create droplet and capture error output DROPLET_JSON=$(doctl compute droplet create \ --image 188571990 \ --size "${{ secrets.DIGITALOCEAN_SIZE }}" \ @@ -52,15 +52,35 @@ jobs: # Check if droplet creation was successful if [ $DROPLET_EXIT_CODE -ne 0 ]; then - echo "❌ Failed to create droplet (exit code: $DROPLET_EXIT_CODE)" - echo "Error details:" echo "$DROPLET_JSON" exit 1 fi - # Extract droplet ID and IP - DROPLET_ID=$(echo "$DROPLET_JSON" | jq -r '.[0].id') - PUBLIC_IP=$(echo "$DROPLET_JSON" | jq -r '.[0].networks.v4[] | select(.type=="public") | .ip_address') + # Validate that we got valid JSON + if ! echo "$DROPLET_JSON" | jq . 
>/dev/null 2>&1; then + echo "❌ Invalid JSON response from doctl" + echo "Raw response:" + echo "$DROPLET_JSON" + exit 1 + fi + + # Extract droplet ID and IP with error checking + DROPLET_ID=$(echo "$DROPLET_JSON" | jq -r '.[0].id // empty') + PUBLIC_IP=$(echo "$DROPLET_JSON" | jq -r '.[0].networks.v4[] | select(.type=="public") | .ip_address // empty') + + if [ -z "$DROPLET_ID" ]; then + echo "❌ Failed to extract droplet ID from response" + echo "JSON response:" + echo "$DROPLET_JSON" + exit 1 + fi + + if [ -z "$PUBLIC_IP" ]; then + echo "❌ Failed to extract public IP from response" + echo "JSON response:" + echo "$DROPLET_JSON" + exit 1 + fi # Set outputs for other steps echo "droplet_id=$DROPLET_ID" >> $GITHUB_OUTPUT From dc52b1ad809526ea998641be23ca1254a7a2074e Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Fri, 18 Jul 2025 17:10:23 +0000 Subject: [PATCH 7/8] Attempt multiple times Signed-off-by: Muhammad Awad --- .github/workflows/examples-ci.yml | 97 ++++++++++++++----------------- 1 file changed, 45 insertions(+), 52 deletions(-) diff --git a/.github/workflows/examples-ci.yml b/.github/workflows/examples-ci.yml index f6e3ffb..ddc214a 100644 --- a/.github/workflows/examples-ci.yml +++ b/.github/workflows/examples-ci.yml @@ -33,60 +33,53 @@ jobs: - name: Create Droplet id: create + uses: nick-fields/retry@v3 env: DIGITALOCEAN_API_URL: ${{ secrets.DIGITALOCEAN_API_URL }} - run: | - DROPLET_NAME="iris-$(date +%s)" - - # Create droplet and capture error output - DROPLET_JSON=$(doctl compute droplet create \ - --image 188571990 \ - --size "${{ secrets.DIGITALOCEAN_SIZE }}" \ - --region atl1 \ - --ssh-keys "${{ secrets.SSH_KEY_ID }}" \ - "$DROPLET_NAME" \ - -o json \ - --wait 2>&1) - - DROPLET_EXIT_CODE=$? - - # Check if droplet creation was successful - if [ $DROPLET_EXIT_CODE -ne 0 ]; then - echo "$DROPLET_JSON" - exit 1 - fi - - # Validate that we got valid JSON - if ! echo "$DROPLET_JSON" | jq . 
>/dev/null 2>&1; then - echo "❌ Invalid JSON response from doctl" - echo "Raw response:" - echo "$DROPLET_JSON" - exit 1 - fi - - # Extract droplet ID and IP with error checking - DROPLET_ID=$(echo "$DROPLET_JSON" | jq -r '.[0].id // empty') - PUBLIC_IP=$(echo "$DROPLET_JSON" | jq -r '.[0].networks.v4[] | select(.type=="public") | .ip_address // empty') - - if [ -z "$DROPLET_ID" ]; then - echo "❌ Failed to extract droplet ID from response" - echo "JSON response:" - echo "$DROPLET_JSON" - exit 1 - fi - - if [ -z "$PUBLIC_IP" ]; then - echo "❌ Failed to extract public IP from response" - echo "JSON response:" - echo "$DROPLET_JSON" - exit 1 - fi - - # Set outputs for other steps - echo "droplet_id=$DROPLET_ID" >> $GITHUB_OUTPUT - echo "public_ip=$PUBLIC_IP" >> $GITHUB_OUTPUT - - echo "βœ… Droplet created successfully!" + with: + max_attempts: 10 + retry_wait_seconds: 60 + command: | + DROPLET_NAME="iris-$(date +%s)" + + # Create droplet and capture error output + DROPLET_JSON=$(doctl compute droplet create \ + --image 188571990 \ + --size "${{ secrets.DIGITALOCEAN_SIZE }}" \ + --region atl1 \ + --ssh-keys "${{ secrets.SSH_KEY_ID }}" \ + "$DROPLET_NAME" \ + -o json \ + --wait 2>&1) + + DROPLET_EXIT_CODE=$? + + # Check if droplet creation was successful + if [ $DROPLET_EXIT_CODE -ne 0 ]; then + echo "$DROPLET_JSON" + exit 1 + fi + + # Validate that we got valid JSON + if ! echo "$DROPLET_JSON" | jq . >/dev/null 2>&1; then + echo "$DROPLET_JSON" + exit 1 + fi + + # Extract droplet ID and IP with error checking + DROPLET_ID=$(echo "$DROPLET_JSON" | jq -r '.[0].id // empty') + PUBLIC_IP=$(echo "$DROPLET_JSON" | jq -r '.[0].networks.v4[] | select(.type=="public") | .ip_address // empty') + + if [ -z "$DROPLET_ID" ] || [ -z "$PUBLIC_IP" ]; then + echo "$DROPLET_JSON" + exit 1 + fi + + # Set outputs for other steps + echo "droplet_id=$DROPLET_ID" >> $GITHUB_OUTPUT + echo "public_ip=$PUBLIC_IP" >> $GITHUB_OUTPUT + + echo "βœ… Droplet created successfully!" 
- name: Setup SSH key run: | From 51036aeb7dff3fe7b240f234be83f50916def2c1 Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Fri, 18 Jul 2025 17:11:46 +0000 Subject: [PATCH 8/8] Add timeout Signed-off-by: Muhammad Awad --- .github/workflows/examples-ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/examples-ci.yml b/.github/workflows/examples-ci.yml index ddc214a..b13e9b9 100644 --- a/.github/workflows/examples-ci.yml +++ b/.github/workflows/examples-ci.yml @@ -37,6 +37,7 @@ jobs: env: DIGITALOCEAN_API_URL: ${{ secrets.DIGITALOCEAN_API_URL }} with: + timeout_minutes: 5 max_attempts: 10 retry_wait_seconds: 60 command: |