-
Notifications
You must be signed in to change notification settings - Fork 32
Open
Description
Summary
Calling torch.distributed.all_gather() when using the 'ccl' backend results in a SEGFAULT if the tensors being gathered are larger than a few megabytes.
This problem also seems to occur with gather().
Steps to Reproduce
See my minimal reproducible example repo here: https://github.com/Iain-S/torch-ccl-segfault/tree/main
Using a tensor of around 11MiB is enough to cause a segfault.
Expected Behaviour
I would not expect a SEGFAULT to be raised.
Actual Behaviour
I get the following output:
Caught signal 11 (Segmentation fault: address not mapped to object at address 0xff00000002600000)
LIBXSMM_VERSION: main_stable-1.17-3651 (25693763)==== backtrace (tid: 43554) ====
0 0x0000000000012cf0 __funlockfile() :0
1 0x00000000000cee73 __memmove_avx_unaligned_erms() :0
2 0x00000000004b80c3 zeKernelSetIndirectAccessTracing() ???:0
3 0x0000000000106cb8 zetGetMetricGroupExpProcAddrTable() ???:0
4 0x00000000000fc2da zetGetMetricGroupExpProcAddrTable() ???:0
5 0x00000000001967a4 zetGetMetricGroupExpProcAddrTable() ???:0
6 0x00000000001a5aa8 zetGetMetricGroupExpProcAddrTable() ???:0
7 0x00000000000f1f34 ???() /lib64/libze_intel_gpu.so.1:0
8 0x0000000000046103 std::vector<void*, std::allocator<void*> >::resize() ???:0
9 0x0000000000012975 zeGetFabricVertexExpProcAddrTable() ???:0
10 0x000000000052cce9 ze_cmd_memory_copy::ze_call() :0
11 0x000000000052d933 ze_copy_entry::init_ze_hook() :0
12 0x000000000050d14d ze_base_entry::init() :0
13 0x000000000050df3a ze_base_entry::init_entries() :0
14 0x000000000050e25f ze_base_entry::start() :0
15 0x0000000000470578 sched_entry::do_progress() :0
16 0x0000000000482985 ccl_sched::do_progress() :0
17 0x00000000003fe969 ccl_worker::process_sched_bin() :0
18 0x00000000003fe540 ccl_worker::process_sched_queue() :0
19 0x00000000003fd31b ccl_worker::do_work() :0
20 0x00000000003f84bd ccl_executor::wait() :0
21 0x00000000003061fb ccl_coll_create() coll-f53f59.cpp:0
22 0x00000000003056a6 ccl_allgatherv_impl() :0
23 0x000000000035aa1e ccl_comm::allgatherv_impl() :0
24 0x000000000036d5cb ccl_comm::allgatherv() :0
25 0x00000000004c4be1 ccl::v1::allgatherv() ???:0
26 0x0000000000035fdd oneccl_bindings_for_pytorch::CollectiveAsyncWorkCCL<oneccl_bindings_for_pytorch::XPUCCLStubs::allgather_(std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > >&, std::vector<at::Tensor, std::allocator<at::Tensor> >&, c10d::AllgatherOptions const&, c10d::ProcessGroupCCL&)::{lambda(at::Tensor, std::vector<at::Tensor, std::allocator<at::Tensor> > const&, ccl::v1::allgatherv_attr, c
31 0x0000000000030aee c10d::ProcessGroupCCL::allgather() ???:0
32 0x000000000002a3fa c10d::ops::allgather_xpu_() ???:0
33 0x000000000003cecf c10::impl::make_boxed_from_unboxed_functor<c10::impl::detail::WrapFunctionIntoRuntimeFunctor_<std::tuple<std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > >, c10::intrusive_ptr<c10d::Work, c10::detail::intrusive_target_default_null_type<c10d::Work> > > (*)(std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > > const&, c10::ArrayRef<at::Tensor>, c10::intrusive_ptr<c10d::ProcessGroup, c10::detail::intrusive_target_default_null_type<c10d::ProcessGroup> > const&, long), std::tuple<std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > >, c10::intrusive_ptr<c10d::Work, c10::detail::intrusive_target_default_null_type<c10d::Work> > >, c10::guts::typelist::typelist<std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > > const&, c10::ArrayRef<at::Tensor>, c10::intrusive_ptr<c10d::ProcessGroup, c10::detail::intrusive_target_default_null_type<c10d::ProcessGroup> > const&, long> >, false>::call() ???:0
34 0x0000000004c8d308 c10::impl::BoxedKernelWrapper<std::tuple<std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > >, c10::intrusive_ptr<c10d::Work, c10::detail::intrusive_target_default_null_type<c10d::Work> > > (std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > > const&, c10::ArrayRef<at::Tensor>, c10::intrusive_ptr<c10d::ProcessGroup, c10::detail::intrusive_target_default_null_type<c10d::ProcessGroup> > const&, long), void>::call() :0
35 0x0000000004c98b45 c10d::ProcessGroup::allgather() :0
36 0x0000000000b7e8b0 pybind11::cpp_function::initialize<pybind11::cpp_function::initialize<c10::intrusive_ptr<c10d::Work, c10::detail::intrusive_target_default_null_type<c10d::Work> >, c10d::ProcessGroup, std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > >&, std::vector<at::Tensor, std::allocator<at::Tensor> >&, c10d::AllgatherOptions const&, pybind11::name, pybind11::is_method, pybind11::sibling, pybind11::arg, pybind11::arg, pybind11::arg_v, pybind11::call_guard<pybind11::gil_scoped_release> >(c10::intrusive_ptr<c10d::Work, c10::detail::intrusive_target_default_null_type<c10d::Work> > (c10d::ProcessGroup::*)(std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > >&, std::vector<at::Tensor, std::alloca
::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::T
ensor> > > >&, std::vector<at::Tensor, std::allocator<at::Tensor> >&, c10d::AllgatherOptions const&), pybind11::name const&, pybind11::is_method const&, pybind11::sibling const&, pybind11::arg const&, pybind11::arg const&, pybind11::arg_v const&, pybind11::call_guard<pybind11::gil_scoped_release> const&)::{lambda(c10d::ProcessGroup*, std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > >&, std::vector<at::Tensor, std::allocator<at::Tensor> >&, c10d::AllgatherOptions const&)#1}&&, c10::intrusive_ptr<c10d::Work, c10::detail::intrusive_target_default_null_type<c10d::Work> > (*)(c10d::ProcessGroup*, std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > >&, std::vector<at::Tensor, std::allocator<at::Tensor> >&, c10d::AllgatherOptions const&), pybind11::name const&, pybind11::is_method const&, pybind11::sibling const&, pybind11::arg const&, pybind11::arg const&, pybind11::arg_v const&, pybind11::call_guard<pybind11::gil_scoped_release> const&)::{lambda(pybind11::detail::function_call&)#3}::_FUN() :0
37 0x00000000003975c7 pybind11::cpp_function::dispatcher() :0
38 0x00000000001639fb PyCFunction_Call() ???:0
39 0x000000000019046b _PyObject_MakeTpCall() ???:0
40 0x00000000001c8c17 PyEval_EvalCodeEx() ???:0
41 0x000000000020bcb1 _PyEval_EvalFrameDefault() ???:0
42 0x00000000001cb465 _PyFunction_Vectorcall() ???:0
43 0x00000000001638bb PyObject_Call() ???:0
44 0x0000000000209030 _PyEval_EvalFrameDefault() ???:0
45 0x00000000001cb465 _PyFunction_Vectorcall() ???:0
46 0x000000000020bcb1 _PyEval_EvalFrameDefault() ???:0
47 0x00000000001cb465 _PyFunction_Vectorcall() ???:0
48 0x0000000000206f4d _PyEval_EvalFrameDefault() ???:0
49 0x00000000001c7713 PyList_SetSlice() ???:0
50 0x00000000001c870f _PyEval_EvalCodeWithName() ???:0
51 0x00000000001c8743 PyEval_EvalCode() ???:0
52 0x0000000000279dad _PyImport_FixupBuiltin() ???:0
53 0x000000000028db0a PyAST_CompileObject() ???:0
54 0x000000000011d2f6 PyRun_String() ???:0
55 0x000000000028e325 PyRun_SimpleFileExFlags() ???:0
56 0x000000000028e7d2 Py_RunMain() ???:0
57 0x000000000028e919 Py_BytesMain() ???:0
Versions
- Python 3.9
- For Python package versions, see README in my repo.
- CCL v2021.11.2
- Running on an Intel(R) Data Center GPU Max 1550
Metadata
Metadata
Assignees
Labels
No labels