-
Notifications
You must be signed in to change notification settings - Fork 31
Description
Dear domainslib maintainers,
While working on a repro case for an unrelated data race, I observed that the fib_par.ml
example in the README exhibits a TSAN data race!
(cli) ntaylor@smgrp ~/c/p/re_fun> opam switch
# switch compiler description
→ semgrep-tsan ocaml-option-tsan.1,ocaml-variants.5.3.0+options
ocaml-variants = 5.3.0+options & ocaml-option-tsan
(* Short alias for the Domainslib task API used throughout this example. *)
module T = Domainslib.Task
(* Domain count; [main] passes [num_domains - 1] to [T.setup_pool]. *)
let num_domains = 4
(* Fibonacci index computed by [main]. *)
let n = 10
(* Sequential Fibonacci: returns 1 when n < 2, otherwise the sum of the
   results for n - 1 and n - 2 (so fib 10 = 89, matching the program
   output shown below). *)
let rec fib n =
if n < 2 then 1 else fib (n - 1) + fib (n - 2)
(* Parallel Fibonacci on [pool]: when n > 20, submits the two recursive
   calls with [T.async] and sums the values obtained with [T.await];
   otherwise it delegates to the sequential [fib].
   NOTE: with [n = 10] as bound above, the [n > 20] branch is never taken,
   so this function makes no [T.async] call in the run shown here. *)
let rec fib_par pool n =
if n > 20 then begin
let a = T.async pool (fun _ -> fib_par pool (n-1)) in
let b = T.async pool (fun _ -> fib_par pool (n-2)) in
T.await pool a + T.await pool b
end else
(* Call sequential Fibonacci if the available work is small *)
fib n
(* Entry point: creates a pool with [~num_domains:(num_domains - 1)], runs
   [fib_par pool n] inside [T.run], tears the pool down, and then prints
   the result as "fib(<n>) = <res>". *)
let main () =
let pool = T.setup_pool ~num_domains:(num_domains - 1) () in
let res = T.run pool (fun _ -> fib_par pool n) in
T.teardown_pool pool;
Printf.printf "fib(%d) = %d\n" n res
(* Run [main] when the program starts. *)
let _ = main ()
(cli) ntaylor@smgrp ~/c/p/re_fun> export TSAN_OPTIONS="exitcode=0 history_size=7"
(cli) ntaylor@smgrp ~/c/p/re_fun> dune exec --profile tsan re_fun
[...]
==================
WARNING: ThreadSanitizer: data race (pid=79645)
Read of size 8 at 0x000100e1b908 by thread T4 (mutexes: write M0):
#0 camlSaturn__Ws_deque.steal_as_578 <null> (main.exe:arm64+0x100011798)
#1 camlDomainslib__Multi_channel.recv_poll_loop_648 <null> (main.exe:arm64+0x100006e74)
#2 camlDomainslib__Multi_channel.recv_poll_repeated_665 <null> (main.exe:arm64+0x100007684)
#3 camlDomainslib__Multi_channel.recv_669 <null> (main.exe:arm64+0x10000791c)
#4 camlDomainslib__Task.worker_628 <null> (main.exe:arm64+0x1000091f8)
#5 camlStdlib__Fun.protect_326 <null> (main.exe:arm64+0x100099b94)
#6 camlStdlib__Domain.body_741 <null> (main.exe:arm64+0x100071820)
#7 caml_start_program <null> (main.exe:arm64+0x100106000)
#8 caml_callback_exn callback.c:206 (main.exe:arm64+0x1000c0b64)
#9 caml_callback_res callback.c:321 (main.exe:arm64+0x1000c1444)
#10 domain_thread_func domain.c:1245 (main.exe:arm64+0x1000c4f88)
Previous atomic write of size 8 at 0x000100e1b908 by thread T6 (mutexes: write M1):
#0 caml_atomic_cas memory.c:388 (main.exe:arm64+0x1000e87ac)
#1 camlSaturn__Ws_deque.steal_as_578 <null> (main.exe:arm64+0x100011860)
#2 camlDomainslib__Multi_channel.recv_poll_loop_648 <null> (main.exe:arm64+0x100006e74)
#3 camlDomainslib__Multi_channel.recv_poll_repeated_665 <null> (main.exe:arm64+0x100007684)
#4 camlDomainslib__Multi_channel.recv_669 <null> (main.exe:arm64+0x10000791c)
#5 camlDomainslib__Task.worker_628 <null> (main.exe:arm64+0x1000091f8)
#6 camlStdlib__Fun.protect_326 <null> (main.exe:arm64+0x100099b94)
#7 camlStdlib__Domain.body_741 <null> (main.exe:arm64+0x100071820)
#8 caml_start_program <null> (main.exe:arm64+0x100106000)
#9 caml_callback_exn callback.c:206 (main.exe:arm64+0x1000c0b64)
#10 caml_callback_res callback.c:321 (main.exe:arm64+0x1000c1444)
#11 domain_thread_func domain.c:1245 (main.exe:arm64+0x1000c4f88)
Mutex M0 (0x000105600328) created at:
#0 pthread_mutex_init <null> (libclang_rt.tsan_osx_dynamic.dylib:arm64e+0x34594)
#1 caml_plat_mutex_init platform.c:59 (main.exe:arm64+0x1000f27bc)
#2 caml_init_domains domain.c:974 (main.exe:arm64+0x1000c43fc)
#3 caml_init_gc gc_ctrl.c:348 (main.exe:arm64+0x1000d4168)
#4 caml_startup_common startup_nat.c:106 (main.exe:arm64+0x100104c98)
#5 caml_main startup_nat.c:146 (main.exe:arm64+0x100104e68)
#6 main main.c:37 (main.exe:arm64+0x1000e251c)
[...]
SUMMARY: ThreadSanitizer: data race (main.exe:arm64+0x100011798) in camlSaturn__Ws_deque.steal_as_578+0x68
==================
fib(10) = 89
ThreadSanitizer: reported 1 warnings
ThreadSanitizer: reported 1 warnings
In fact, I can trigger this with a far simpler repro case:
(cli) ntaylor@smgrp ~/c/p/re_fun> cat bin/main.ml
(* Minimal repro: set up a pool with [~num_domains:4], run a single task
   that just returns 42, print it, and tear the pool down. No [T.async]
   call is made anywhere in this program. *)
module T = Domainslib.Task
let () =
let pool = T.setup_pool ~num_domains:4 () in
Printf.printf "%d\n" (T.run pool (fun () -> 42));
T.teardown_pool pool;
(cli) ntaylor@smgrp ~/c/p/re_fun>
This appears to be similar to ocaml-multicore/saturn#39, so perhaps this issue should also be filed with the Saturn maintainers; however, that Saturn issue was closed without a fix.
It creates friction when TSAN warnings that originate in libraries, even if they are de facto benign, are emitted in application code. Given how simple the repro case is, I would expect every domainslib program to report a race under TSAN, so IMHO a fix to squelch this warning (either by removing the race altogether or by adding some sort of annotation for TSAN) should be reconsidered.
Thanks for your consideration,
Nathan