'''
Use one process to launch two torch matmul kernels, each on a separate GPU.
A HAPI callback is registered on each kernel's stream, and each callback
fulfills a different future when its kernel finishes.
Requires at least 2 GPUs.
'''
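
# Example invocation (assumes the charm4py charmrun launcher is available;
# the script name below is hypothetical):
#   python3 -m charmrun.start +p1 hapi_multigpu.py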

from charm4py import charm
import torch

def main(args):

    N = 10000

    if not torch.cuda.is_available():
        print("Error: no GPU detected")
        charm.exit()
    if torch.cuda.device_count() < 2:
        print(f"Error: need at least 2 GPUs, found {torch.cuda.device_count()}")
        charm.exit()

    cuda0 = torch.device('cuda:0')  # first device
    cuda1 = torch.device('cuda:1')  # second device

    stream0 = torch.cuda.Stream(device=cuda0)
    stream1 = torch.cuda.Stream(device=cuda1)

    # allocate tensors and launch a matmul on device 0, enqueued on stream0
    # so that the HAPI callback below fires only after this kernel completes
    with torch.cuda.stream(stream0):
        a0 = torch.randn(N, N, device=cuda0)
        b0 = torch.randn(N, N, device=cuda0)
        c0 = torch.mm(a0, b0)

    # allocate tensors and launch a matmul on device 1, enqueued on stream1
    with torch.cuda.stream(stream1):
        a1 = torch.randn(N, N, device=cuda1)
        b1 = torch.randn(N, N, device=cuda1)
        c1 = torch.mm(a1, b1)

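    # at this point both matmuls are enqueued; CUDA kernels launch
    # asynchronously, so the host reaches this line without blocking
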
    # create a future for each stream and register a HAPI callback on it;
    # the runtime fulfills the future once all work enqueued on the stream
    # (here, one matmul) has completed
    future0 = charm.Future()
    future1 = charm.Future()
    print("Future 0 id:", future0.fid)
    print("Future 1 id:", future1.fid)
    futures = [future0, future1]
    # Stream.cuda_stream is the raw CUDA stream handle expected by HAPI
    charm.hapiAddCudaCallback(stream0.cuda_stream, future0)
    charm.hapiAddCudaCallback(stream1.cuda_stream, future1)

    # iwait yields each future as it is fulfilled, in completion order
    for fut_object in charm.iwait(futures):
        print('One device kernel complete, id:', fut_object.fid)

    charm.exit()


charm.start(main)
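
# Expected behavior: one completion line prints per GPU, in whichever order
# the devices finish, after which the program exits.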