@@ -81,17 +81,17 @@ jobs:
8181 run : |
8282 echo "Latest commit ID is: ${{ steps.get_commit_id.outputs.commit_id }}"
8383
84- # - name: Build and Push Docker Image
85- # id: build-image
86- # env:
87- # ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
88- # IMAGE_TAG: latest
89- # run: |
90- # # Build development container image and push to ECR
91- # echo "Building and pushing image to $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG"
92- # docker build -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG .
93- # docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG
94- # echo "image=$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" >> $GITHUB_OUTPUT
# Build the container image from the repository root, push it to ECR, and
# publish the full image reference as this step's `image` output.
- name: Build and Push Docker Image
  id: build-image
  env:
    # Registry host comes from the output of the `login-ecr` step.
    ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
    IMAGE_TAG: latest
  run: |
    # Build development container image and push to ECR.
    # NOTE(review): ECR_REPOSITORY is not set in this step's env; it is
    # presumably provided by an enclosing env block — confirm.
    image_ref="$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG"
    echo "Building and pushing image to $image_ref"
    # Quote every expansion so an unexpected space or glob character in a
    # value cannot split the argument.
    docker build -t "$image_ref" .
    docker push "$image_ref"
    echo "image=$image_ref" >> "$GITHUB_OUTPUT"
9595
9696 # ----------------------------------------
9797 # JOB 2: Launch EC2 instance with GPU for training
@@ -156,7 +156,7 @@ jobs:
156156 runs-on : ${{ needs.launch-runner.outputs.label }}
157157 outputs :
158158 commit_id : ${{ steps.get_commit_id_ec2.outputs.commit_id }}
159- timeout-minutes : 45
159+ timeout-minutes : 20
160160
161161 steps :
162162 - name : Checkout Code
@@ -190,9 +190,21 @@ jobs:
190190
191191 # 306093656765.dkr.ecr.ap-south-1.amazonaws.com/emlo-session-10-image
192192
# Run a throwaway container from the `latest` image and report what PyTorch
# and Lightning detect on the runner. The trailing assert makes the step fail
# when CUDA is not visible, instead of only printing `False` and passing.
- name: Verify GPU Access via torch and lightning
  run: |
    docker run --gpus all \
      --privileged --ipc=host \
      -e NVIDIA_DRIVER_CAPABILITIES=compute,utility,graphics \
      -e NVIDIA_VISIBLE_DEVICES=all \
      --rm ${{ secrets.AWS_ECR_LOGIN_URI }}/${{ secrets.ECR_REPOSITORY_NAME }}:latest \
      python -c "import torch; from lightning.pytorch.accelerators import CUDAAccelerator; print(f'PyTorch CUDA: {torch.cuda.is_available()}'); print(f'Lightning GPUs: {CUDAAccelerator.auto_device_count()}'); assert torch.cuda.is_available(), 'CUDA is not available inside the container'"
201+
193202 - name : Run DVC commands in container
194203 run : |
195- docker run --gpus=all \
204+ docker run --gpus all \
205+ --privileged --ipc=host \
206+ -e NVIDIA_DRIVER_CAPABILITIES=compute,utility,graphics \
207+ -e NVIDIA_VISIBLE_DEVICES=all \
196208 --name session-18-container \
197209 --shm-size=8g \
198210 -v "$(pwd):/workspace" \
0 commit comments