Skip to content

Commit 123dbb7

Browse files
committed
update workflow
1 parent 40ba831 commit 123dbb7

File tree

4 files changed

+29
-17
lines changed

4 files changed

+29
-17
lines changed

.github/workflows/01A_Deployment_Train_And_Store.yaml

Lines changed: 25 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -81,17 +81,17 @@ jobs:
8181
run: |
8282
echo "Latest commit ID is: ${{ steps.get_commit_id.outputs.commit_id }}"
8383
84-
# - name: Build and Push Docker Image
85-
# id: build-image
86-
# env:
87-
# ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
88-
# IMAGE_TAG: latest
89-
# run: |
90-
# # Build development container image and push to ECR
91-
# echo "Building and pushing image to $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG"
92-
# docker build -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG .
93-
# docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG
94-
# echo "image=$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" >> $GITHUB_OUTPUT
84+
- name: Build and Push Docker Image
85+
id: build-image
86+
env:
87+
ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
88+
IMAGE_TAG: latest
89+
run: |
90+
# Build development container image and push to ECR
91+
echo "Building and pushing image to $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG"
92+
docker build -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG .
93+
docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG
94+
echo "image=$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" >> $GITHUB_OUTPUT
9595
9696
#----------------------------------------
9797
# JOB 2: Launch EC2 instance with GPU for training
@@ -156,7 +156,7 @@ jobs:
156156
runs-on: ${{ needs.launch-runner.outputs.label }}
157157
outputs:
158158
commit_id: ${{ steps.get_commit_id_ec2.outputs.commit_id }}
159-
timeout-minutes: 45
159+
timeout-minutes: 20
160160

161161
steps:
162162
- name: Checkout Code
@@ -190,9 +190,21 @@ jobs:
190190

191191
# 306093656765.dkr.ecr.ap-south-1.amazonaws.com/emlo-session-10-image
192192

193+
- name: Verify GPU Access via torch and lightning
194+
run: |
195+
docker run --gpus all \
196+
--privileged --ipc=host \
197+
-e NVIDIA_DRIVER_CAPABILITIES=compute,utility,graphics \
198+
-e NVIDIA_VISIBLE_DEVICES=all \
199+
--rm ${{ secrets.AWS_ECR_LOGIN_URI }}/${{ secrets.ECR_REPOSITORY_NAME }}:latest \
200+
python -c "import torch; from lightning.pytorch.accelerators import CUDAAccelerator; print(f'PyTorch CUDA: {torch.cuda.is_available()}'); print(f'Lightning GPUs: {CUDAAccelerator.auto_device_count()}')"
201+
193202
- name: Run DVC commands in container
194203
run: |
195-
docker run --gpus=all \
204+
docker run --gpus all \
205+
--privileged --ipc=host \
206+
-e NVIDIA_DRIVER_CAPABILITIES=compute,utility,graphics \
207+
-e NVIDIA_VISIBLE_DEVICES=all \
196208
--name session-18-container \
197209
--shm-size=8g \
198210
-v "$(pwd):/workspace" \

.github/workflows/01D_Deployment_PR_Train_And_Store.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
name: Deployment 01E - PR - Train and Store
1+
name: Deployment 01D - PR - Train and Store
22

33
on:
44
workflow_call:
@@ -158,7 +158,7 @@ jobs:
158158
runs-on: ${{ needs.launch-runner.outputs.label }}
159159
# outputs:
160160
# commit_id: ${{ steps.get_commit_id_ec2.outputs.commit_id }}
161-
timeout-minutes: 30
161+
timeout-minutes: 23
162162

163163
steps:
164164
# - uses: iterative/setup-cml@v2

.github/workflows/01E_Deployment_PR_Push_Image.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
name: Deployment 01D - PR - Build Image
1+
name: Deployment 01E - PR - Build Image
22

33
on:
44
workflow_call:

.github/workflows/01E_Deployment_PR_Train_And_Store.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
name: Deployment 01D - PR - Train and Store
1+
name: Deployment 01E - PR - Train and Store
22

33
on:
44
workflow_call:

0 commit comments

Comments
 (0)