'''
Use one process to launch two torch matmul kernels, each on a separate GPU.
A HAPI callback is registered on each kernel's stream, and each callback
fulfills a different future when its kernel finishes.
Requires at least 2 GPUs.
'''
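
# Example invocation (assumes the charm4py charmrun launcher is available;
# the script name below is hypothetical):
#   python3 -m charmrun.start +p1 hapi_multigpu.py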

from charm4py import charm
import torch

def main(args):

    N = 10000

    if not torch.cuda.is_available():
        print("Error: no GPU detected")
        charm.exit()
    if torch.cuda.device_count() < 2:
        print(f"Error: need at least 2 GPUs, found {torch.cuda.device_count()}")
        charm.exit()

    cuda0 = torch.device('cuda:0')  # first device
    cuda1 = torch.device('cuda:1')  # second device

    stream0 = torch.cuda.Stream(device=cuda0)
    stream1 = torch.cuda.Stream(device=cuda1)

    # allocate tensors and launch a matmul on device 0, enqueued on stream0
    # so that the HAPI callback below fires only after this kernel completes
    with torch.cuda.stream(stream0):
        a0 = torch.randn(N, N, device=cuda0)
        b0 = torch.randn(N, N, device=cuda0)
        c0 = torch.mm(a0, b0)

    # allocate tensors and launch a matmul on device 1, enqueued on stream1
    with torch.cuda.stream(stream1):
        a1 = torch.randn(N, N, device=cuda1)
        b1 = torch.randn(N, N, device=cuda1)
        c1 = torch.mm(a1, b1)

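    # at this point both matmuls are enqueued; CUDA kernels launch
    # asynchronously, so the host reaches this line without blocking
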
    # create a future for each stream and register a HAPI callback on it;
    # the runtime fulfills the future once all work enqueued on the stream
    # (here, one matmul) has completed
    future0 = charm.Future()
    future1 = charm.Future()
    print("Future 0 id:", future0.fid)
    print("Future 1 id:", future1.fid)
    futures = [future0, future1]
    # Stream.cuda_stream is the raw CUDA stream handle expected by HAPI
    charm.hapiAddCudaCallback(stream0.cuda_stream, future0)
    charm.hapiAddCudaCallback(stream1.cuda_stream, future1)

    # iwait yields each future as it is fulfilled, in completion order
    for fut_object in charm.iwait(futures):
        print('One device kernel complete, id:', fut_object.fid)

    charm.exit()


charm.start(main)
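
# Expected behavior: one completion line prints per GPU, in whichever order
# the devices finish, after which the program exits.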