Skip to content

Commit e0ba28b

Browse files
committed
Optimize mutex and semaphore for speed on low contention
1 parent 8eeef6f commit e0ba28b

File tree

3 files changed

+82
-20
lines changed

3 files changed

+82
-20
lines changed

lib/picos_std.sync/mutex.ml

Lines changed: 61 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -9,18 +9,21 @@ type _ tdt =
99
| Nothing : [> `Nothing ] tdt
1010

1111
type state =
12+
| Locked
1213
| Unlocked
13-
| Locked of { fiber : Fiber.Maybe.t; waiters : [ `Entry ] tdt Q.t }
14+
| Queued of { fiber : Fiber.Maybe.t; waiters : [ `Entry ] tdt Q.t }
1415

1516
type t = state Atomic.t
1617

1718
let create ?padded () = Multicore_magic.copy_as ?padded @@ Atomic.make Unlocked
18-
let locked_nothing = Locked { fiber = Fiber.Maybe.nothing; waiters = T Zero }
1919

2020
let rec unlock_as owner t backoff =
2121
match Atomic.get t with
2222
| Unlocked -> unlocked ()
23-
| Locked r as before ->
23+
| Locked as before ->
24+
if not (Atomic.compare_and_set t before Unlocked) then
25+
unlock_as owner t (Backoff.once backoff)
26+
| Queued r as before ->
2427
if Fiber.Maybe.equal r.fiber owner then
2528
match r.waiters with
2629
| T Zero ->
@@ -29,42 +32,72 @@ let rec unlock_as owner t backoff =
2932
| T (One _ as q) ->
3033
let (Entry { trigger; fiber }) = Q.head q in
3134
let waiters = Q.tail q in
32-
let after = Locked { fiber; waiters } in
35+
let after = Queued { fiber; waiters } in
3336
if Atomic.compare_and_set t before after then Trigger.signal trigger
3437
else unlock_as owner t (Backoff.once backoff)
3538
else not_owner ()
3639

3740
let[@inline] unlock ?checked t =
38-
let owner = Fiber.Maybe.current_if checked in
39-
unlock_as owner t Backoff.default
41+
match checked with
42+
| Some false ->
43+
if
44+
(* The unlock operation will mutate the atomic location and will be
45+
sequentially consistent. The fenceless get potentially allows us to
46+
avoid performing a failed mutation attempt causing cache coherency
47+
traffic and fenceless get here performs better on ARM. *)
48+
Multicore_magic.fenceless_get t != Locked
49+
|| not (Atomic.compare_and_set t Locked Unlocked)
50+
then unlock_as Fiber.Maybe.nothing t Backoff.default
51+
| None | Some true ->
52+
let owner = Fiber.Maybe.of_fiber (Fiber.current ()) in
53+
unlock_as owner t Backoff.default
4054

4155
let rec cleanup_as (Entry entry_r as entry : [ `Entry ] tdt) t backoff =
4256
(* We have been canceled. If we are the owner, we must unlock the mutex.
4357
Otherwise we must remove our entry from the queue. *)
4458
match Atomic.get t with
45-
| Locked r as before -> begin
59+
| Queued r as before -> begin
4660
match r.waiters with
4761
| T Zero -> unlock_as entry_r.fiber t backoff
4862
| T (One _ as q) ->
4963
let waiters = Q.remove q entry in
5064
if r.waiters == waiters then unlock_as entry_r.fiber t backoff
5165
else
52-
let after = Locked { fiber = r.fiber; waiters } in
66+
let after = Queued { fiber = r.fiber; waiters } in
5367
if not (Atomic.compare_and_set t before after) then
5468
cleanup_as entry t (Backoff.once backoff)
5569
end
70+
| Locked -> unlock_as entry_r.fiber t backoff
5671
| Unlocked -> unlocked ()
5772

5873
let rec lock_as fiber t entry backoff =
5974
match Atomic.get t with
6075
| Unlocked as before ->
6176
let after =
62-
if fiber == Fiber.Maybe.nothing then locked_nothing
63-
else Locked { fiber; waiters = T Zero }
77+
if fiber == Fiber.Maybe.nothing then Locked
78+
else Queued { fiber; waiters = T Zero }
6479
in
6580
if not (Atomic.compare_and_set t before after) then
6681
lock_as fiber t entry (Backoff.once backoff)
67-
| Locked r as before ->
82+
| Locked as before ->
83+
let (Entry entry_r as entry : [ `Entry ] tdt) =
84+
match entry with
85+
| Nothing ->
86+
let trigger = Trigger.create () in
87+
Entry { trigger; fiber }
88+
| Entry _ as entry -> entry
89+
in
90+
let waiters = Q.singleton entry in
91+
let after = Queued { fiber = Fiber.Maybe.nothing; waiters } in
92+
if Atomic.compare_and_set t before after then begin
93+
match Trigger.await entry_r.trigger with
94+
| None -> ()
95+
| Some (exn, bt) ->
96+
cleanup_as entry t Backoff.default;
97+
Printexc.raise_with_backtrace exn bt
98+
end
99+
else lock_as fiber t entry (Backoff.once backoff)
100+
| Queued r as before ->
68101
if Fiber.Maybe.unequal r.fiber fiber then
69102
let (Entry entry_r as entry : [ `Entry ] tdt) =
70103
match entry with
@@ -74,7 +107,7 @@ let rec lock_as fiber t entry backoff =
74107
| Entry _ as entry -> entry
75108
in
76109
let waiters = Q.add r.waiters entry in
77-
let after = Locked { fiber = r.fiber; waiters } in
110+
let after = Queued { fiber = r.fiber; waiters } in
78111
if Atomic.compare_and_set t before after then begin
79112
match Trigger.await entry_r.trigger with
80113
| None -> ()
@@ -86,15 +119,27 @@ let rec lock_as fiber t entry backoff =
86119
else owner ()
87120

88121
let[@inline] lock ?checked t =
89-
let fiber = Fiber.Maybe.current_and_check_if checked in
90-
lock_as fiber t Nothing Backoff.default
122+
match checked with
123+
| Some false ->
124+
if
125+
(* The lock operation will mutate the atomic location and will be
126+
sequentially consistent. The fenceless get potentially allows us to
127+
avoid performing a failed mutation attempt causing cache coherency
128+
traffic and fenceless get here performs better on ARM. *)
129+
Multicore_magic.fenceless_get t != Unlocked
130+
|| not (Atomic.compare_and_set t Unlocked Locked)
131+
then lock_as Fiber.Maybe.nothing t Nothing Backoff.default
132+
| None | Some true ->
133+
let fiber = Fiber.current () in
134+
Fiber.check fiber;
135+
lock_as (Fiber.Maybe.of_fiber fiber) t Nothing Backoff.default
91136

92137
let try_lock ?checked t =
93138
let fiber = Fiber.Maybe.current_and_check_if checked in
94139
Atomic.get t == Unlocked
95140
&& Atomic.compare_and_set t Unlocked
96-
(if fiber == Fiber.Maybe.nothing then locked_nothing
97-
else Locked { fiber; waiters = T Zero })
141+
(if fiber == Fiber.Maybe.nothing then Locked
142+
else Queued { fiber; waiters = T Zero })
98143

99144
let protect ?checked t body =
100145
let fiber = Fiber.Maybe.current_and_check_if checked in

lib/picos_std.sync/picos_std_sync.mli

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -525,4 +525,12 @@ end
525525
false sharing}. For relatively long lived objects this can improve
526526
performance and make performance more stable at the cost of using more
527527
memory. It is not recommended to use [~padded:true] for short lived
528-
objects. *)
528+
objects.
529+
530+
The primitives provided by this library are generally optimized for low
531+
contention scenariors and size. Generally speaking, for best performance
532+
and scalability, you should try to avoid high contention scenarios by
533+
architecting your program to distribute processing such that sequential
534+
bottlenecks are avoided. If high contention is unavoidable then other
535+
communication and synchronization primitive implementations may provide
536+
better performance. *)

lib/picos_std.sync/semaphore.ml

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,10 @@ module Counting = struct
1111
Atomic.make (Obj.repr count) |> Multicore_magic.copy_as ?padded
1212

1313
let rec release t backoff =
14-
let before = Atomic.get t in
14+
(* The release operation will mutate the atomic location and will be
15+
sequentially consistent. The fenceless get here performs better on
16+
ARM. *)
17+
let before = Multicore_magic.fenceless_get t in
1518
if Obj.is_int before then begin
1619
let count = Obj.obj before in
1720
if count < count + 1 then begin
@@ -39,7 +42,10 @@ module Counting = struct
3942
then cleanup t trigger (Backoff.once backoff)
4043

4144
let rec acquire t backoff =
42-
let before = Atomic.get t in
45+
(* The acquire operation will mutate the atomic location and will be
46+
sequentially consistent. The fenceless get here performs better on
47+
ARM. *)
48+
let before = Multicore_magic.fenceless_get t in
4349
if Obj.is_int before then
4450
let count = Obj.obj before in
4551
if 0 < count then begin
@@ -96,7 +102,10 @@ module Binary = struct
96102
let make ?padded initial = Counting.make ?padded (Bool.to_int initial)
97103

98104
let rec release t backoff =
99-
let before = Atomic.get t in
105+
(* The release operation will mutate the atomic location and will be
106+
sequentially consistent. The fenceless get here performs better on
107+
ARM. *)
108+
let before = Multicore_magic.fenceless_get t in
100109
if Obj.is_int before then begin
101110
let count = Obj.obj before in
102111
if count = 0 then

0 commit comments

Comments
 (0)