allgather causes SEGFAULT

## Summary

Calling `torch.distributed.all_gather()` when using the `'ccl'` backend results in a SEGFAULT if the tensors being gathered are larger than a few megabytes.

This problem also seems to occur with `gather()`.

## Steps to Reproduce

See my minimal reproducible example repo here: <https://github.com/Iain-S/torch-ccl-segfault/tree/main>

Using a tensor of around 11MiB is enough to cause a segfault.

## Expected Behaviour

I would expect `all_gather()` to complete without a SEGFAULT.

## Actual Behaviour

I get the following output:

```text
Caught signal 11 (Segmentation fault: address not mapped to object at address 0xff00000002600000)

LIBXSMM_VERSION: main_stable-1.17-3651 (25693763)==== backtrace (tid:  43554) ====
 0 0x0000000000012cf0 __funlockfile()  :0
 1 0x00000000000cee73 __memmove_avx_unaligned_erms()  :0
 2 0x00000000004b80c3 zeKernelSetIndirectAccessTracing()  ???:0
 3 0x0000000000106cb8 zetGetMetricGroupExpProcAddrTable()  ???:0
 4 0x00000000000fc2da zetGetMetricGroupExpProcAddrTable()  ???:0
 5 0x00000000001967a4 zetGetMetricGroupExpProcAddrTable()  ???:0
 6 0x00000000001a5aa8 zetGetMetricGroupExpProcAddrTable()  ???:0
 7 0x00000000000f1f34 ???()  /lib64/libze_intel_gpu.so.1:0
 8 0x0000000000046103 std::vector<void*, std::allocator<void*> >::resize()  ???:0
 9 0x0000000000012975 zeGetFabricVertexExpProcAddrTable()  ???:0
10 0x000000000052cce9 ze_cmd_memory_copy::ze_call()  :0
11 0x000000000052d933 ze_copy_entry::init_ze_hook()  :0
12 0x000000000050d14d ze_base_entry::init()  :0
13 0x000000000050df3a ze_base_entry::init_entries()  :0
14 0x000000000050e25f ze_base_entry::start()  :0
15 0x0000000000470578 sched_entry::do_progress()  :0
16 0x0000000000482985 ccl_sched::do_progress()  :0
17 0x00000000003fe969 ccl_worker::process_sched_bin()  :0
18 0x00000000003fe540 ccl_worker::process_sched_queue()  :0
19 0x00000000003fd31b ccl_worker::do_work()  :0
20 0x00000000003f84bd ccl_executor::wait()  :0
21 0x00000000003061fb ccl_coll_create()  coll-f53f59.cpp:0
22 0x00000000003056a6 ccl_allgatherv_impl()  :0
23 0x000000000035aa1e ccl_comm::allgatherv_impl()  :0
24 0x000000000036d5cb ccl_comm::allgatherv()  :0
25 0x00000000004c4be1 ccl::v1::allgatherv()  ???:0
26 0x0000000000035fdd oneccl_bindings_for_pytorch::CollectiveAsyncWorkCCL<oneccl_bindings_for_pytorch::XPUCCLStubs::allgather_(std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > >&, std::vector<at::Tensor, std::allocator<at::Tensor> >&, c10d::AllgatherOptions const&, c10d::ProcessGroupCCL&)::{lambda(at::Tensor, std::vector<at::Tensor, std::allocator<at::Tensor> > const&, ccl::v1::allgatherv_attr, c
31 0x0000000000030aee c10d::ProcessGroupCCL::allgather()  ???:0
32 0x000000000002a3fa c10d::ops::allgather_xpu_()  ???:0
33 0x000000000003cecf c10::impl::make_boxed_from_unboxed_functor<c10::impl::detail::WrapFunctionIntoRuntimeFunctor_<std::tuple<std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > >, c10::intrusive_ptr<c10d::Work, c10::detail::intrusive_target_default_null_type<c10d::Work> > > (*)(std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > > const&, c10::ArrayRef<at::Tensor>, c10::intrusive_ptr<c10d::ProcessGroup, c10::detail::intrusive_target_default_null_type<c10d::ProcessGroup> > const&, long), std::tuple<std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > >, c10::intrusive_ptr<c10d::Work, c10::detail::intrusive_target_default_null_type<c10d::Work> > >, c10::guts::typelist::typelist<std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > > const&, c10::ArrayRef<at::Tensor>, c10::intrusive_ptr<c10d::ProcessGroup, c10::detail::intrusive_target_default_null_type<c10d::ProcessGroup> > const&, long> >, false>::call()  ???:0
34 0x0000000004c8d308 c10::impl::BoxedKernelWrapper<std::tuple<std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > >, c10::intrusive_ptr<c10d::Work, c10::detail::intrusive_target_default_null_type<c10d::Work> > > (std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > > const&, c10::ArrayRef<at::Tensor>, c10::intrusive_ptr<c10d::ProcessGroup, c10::detail::intrusive_target_default_null_type<c10d::ProcessGroup> > const&, long), void>::call()  :0
35 0x0000000004c98b45 c10d::ProcessGroup::allgather()  :0
36 0x0000000000b7e8b0 pybind11::cpp_function::initialize<pybind11::cpp_function::initialize<c10::intrusive_ptr<c10d::Work, c10::detail::intrusive_target_default_null_type<c10d::Work> >, c10d::ProcessGroup, std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > >&, std::vector<at::Tensor, std::allocator<at::Tensor> >&, c10d::AllgatherOptions const&, pybind11::name, pybind11::is_method, pybind11::sibling, pybind11::arg, pybind11::arg, pybind11::arg_v, pybind11::call_guard<pybind11::gil_scoped_release> >(c10::intrusive_ptr<c10d::Work, c10::detail::intrusive_target_default_null_type<c10d::Work> > (c10d::ProcessGroup::*)(std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > >&, std::vector<at::Tensor, std::alloca
::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::T
ensor> > > >&, std::vector<at::Tensor, std::allocator<at::Tensor> >&, c10d::AllgatherOptions const&), pybind11::name const&, pybind11::is_method const&, pybind11::sibling const&, pybind11::arg const&, pybind11::arg const&, pybind11::arg_v const&, pybind11::call_guard<pybind11::gil_scoped_release> const&)::{lambda(c10d::ProcessGroup*, std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > >&, std::vector<at::Tensor, std::allocator<at::Tensor> >&, c10d::AllgatherOptions const&)#1}&&, c10::intrusive_ptr<c10d::Work, c10::detail::intrusive_target_default_null_type<c10d::Work> > (*)(c10d::ProcessGroup*, std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > >&, std::vector<at::Tensor, std::allocator<at::Tensor> >&, c10d::AllgatherOptions const&), pybind11::name const&, pybind11::is_method const&, pybind11::sibling const&, pybind11::arg const&, pybind11::arg const&, pybind11::arg_v const&, pybind11::call_guard<pybind11::gil_scoped_release> const&)::{lambda(pybind11::detail::function_call&)#3}::_FUN()  :0
37 0x00000000003975c7 pybind11::cpp_function::dispatcher()  :0
38 0x00000000001639fb PyCFunction_Call()  ???:0
39 0x000000000019046b _PyObject_MakeTpCall()  ???:0
40 0x00000000001c8c17 PyEval_EvalCodeEx()  ???:0
41 0x000000000020bcb1 _PyEval_EvalFrameDefault()  ???:0
42 0x00000000001cb465 _PyFunction_Vectorcall()  ???:0
43 0x00000000001638bb PyObject_Call()  ???:0
44 0x0000000000209030 _PyEval_EvalFrameDefault()  ???:0
45 0x00000000001cb465 _PyFunction_Vectorcall()  ???:0
46 0x000000000020bcb1 _PyEval_EvalFrameDefault()  ???:0
47 0x00000000001cb465 _PyFunction_Vectorcall()  ???:0
48 0x0000000000206f4d _PyEval_EvalFrameDefault()  ???:0
49 0x00000000001c7713 PyList_SetSlice()  ???:0
50 0x00000000001c870f _PyEval_EvalCodeWithName()  ???:0
51 0x00000000001c8743 PyEval_EvalCode()  ???:0
52 0x0000000000279dad _PyImport_FixupBuiltin()  ???:0
53 0x000000000028db0a PyAST_CompileObject()  ???:0
54 0x000000000011d2f6 PyRun_String()  ???:0
55 0x000000000028e325 PyRun_SimpleFileExFlags()  ???:0
56 0x000000000028e7d2 Py_RunMain()  ???:0
57 0x000000000028e919 Py_BytesMain()  ???:0
```

## Versions

* Python 3.9
* For Python package versions, see the README in my repo.
* CCL v2021.11.2
* Running on an Intel(R) Data Center GPU Max 1550


Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

allgather causes SEGFAULT #56

Summary

Steps to Reproduce

Expected Behaviour

Actual Behaviour

Versions

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

allgather causes SEGFAULT #56

Description

Summary

Steps to Reproduce

Expected Behaviour

Actual Behaviour

Versions

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions