Skip to content

Commit 44fb57a

Browse files
authored
CA-401650: reduce open connections between pool members and the coordinator (#6110)
The GC timers in the stunnel cache were ineffective because we hardly ever run this GC code, and also because the maximums were very large (70+).
2 parents c69162b + f9a523d commit 44fb57a

File tree

5 files changed

+36
-4
lines changed

5 files changed

+36
-4
lines changed

ocaml/libs/stunnel/stunnel_cache.ml

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,15 +40,19 @@ let debug = if debug_enabled then debug else ignore_log
4040
type endpoint = {host: string; port: int}
4141

4242
(* Need to limit the absolute number of stunnels as well as the maximum age *)
43-
let max_stunnel = 70
43+
(** Upper bound on the number of stunnels kept in the cache (initially 70).
    Held in an [Atomic.t] so the limit can be replaced at runtime and read
    with [Atomic.get] wherever it is enforced. *)
let max_stunnel = Atomic.make 70
4444

45-
let max_age = 180. *. 60. (* seconds *)
45+
(** [set_max_stunnel n] logs the new limit at info level and then atomically
    stores [n] in [max_stunnel]. No validation of [n] is performed here. *)
let set_max_stunnel n =
46+
D.info "Setting max_stunnel = %d" n ;
47+
Atomic.set max_stunnel n
4648

47-
let max_idle = 5. *. 60. (* seconds *)
49+
(** Maximum age of a cached stunnel, in seconds (default: 180 minutes).
    A mutable [ref] so the value can be overridden, e.g. from configuration. *)
let max_age = ref (180. *. 60.) (* seconds *)
50+
51+
(** Maximum idle time of a cached stunnel, in seconds (default: 5 minutes).
    A mutable [ref] so the value can be overridden, e.g. from configuration. *)
let max_idle = ref (5. *. 60.) (* seconds *)
4852

4953
(* The add function adds the new stunnel before doing gc, so the cache *)
5054
(* can briefly contain one more than maximum. *)
51-
let capacity = max_stunnel + 1
55+
(* NOTE: this is a top-level value, so it is computed once when the module is
   initialised from the value [max_stunnel] holds at that moment; a later
   change to [max_stunnel] does not update [capacity]. *)
let capacity = Atomic.get max_stunnel + 1
5256

5357
(** An index of endpoints to stunnel IDs *)
5458
let index : (endpoint, int list) Hashtbl.t ref = ref (Hashtbl.create capacity)
@@ -104,6 +108,7 @@ let unlocked_gc () =
104108
let to_gc = ref [] in
105109
(* Find the ones which are too old *)
106110
let now = Unix.gettimeofday () in
111+
let max_age = !max_age and max_idle = !max_idle in
107112
Tbl.iter !stunnels (fun idx stunnel ->
108113
match Hashtbl.find_opt !times idx with
109114
| Some time ->
@@ -122,6 +127,7 @@ let unlocked_gc () =
122127
debug "%s: found no entry for idx=%d" __FUNCTION__ idx
123128
) ;
124129
let num_remaining = List.length all_ids - List.length !to_gc in
130+
let max_stunnel = Atomic.get max_stunnel in
125131
if num_remaining > max_stunnel then (
126132
let times' = Hashtbl.fold (fun k v acc -> (k, v) :: acc) !times [] in
127133
let times' =

ocaml/libs/stunnel/stunnel_cache.mli

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,11 @@
1919
HTTP 1.1 should be used and the connection should be kept-alive.
2020
*)
2121

22+
val set_max_stunnel : int -> unit
23+
(** [set_max_stunnel n] sets the maximum number of unused, but cached, client stunnel connections to [n].
24+
This should be a low number on pool members, to avoid hitting limits on the coordinator with large pools.
25+
*)
26+
2227
val with_connect :
2328
?use_fork_exec_helper:bool
2429
-> ?write_to_log:(string -> unit)
@@ -46,3 +51,9 @@ val flush : unit -> unit
4651

4752
val gc : unit -> unit
4853
(** GCs old stunnels *)
54+
55+
val max_age : float ref
56+
(** Maximum time, in seconds, that a connection is kept in the stunnel cache, counted from the time it was first added to the cache. *)
57+
58+
val max_idle : float ref
59+
(** Maximum time, in seconds, that a connection is kept in the stunnel cache, counted from the most recent time it was added (or re-added) to the cache. *)

ocaml/xapi/xapi.ml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1143,6 +1143,8 @@ let server_init () =
11431143
] ;
11441144
( match Pool_role.get_role () with
11451145
| Pool_role.Master ->
1146+
Stunnel_cache.set_max_stunnel
1147+
!Xapi_globs.coordinator_max_stunnel_cache ;
11461148
()
11471149
| Pool_role.Broken ->
11481150
info "This node is broken; moving straight to emergency mode" ;
@@ -1151,6 +1153,7 @@ let server_init () =
11511153
server_run_in_emergency_mode ()
11521154
| Pool_role.Slave _ ->
11531155
info "Running in 'Pool Slave' mode" ;
1156+
Stunnel_cache.set_max_stunnel !Xapi_globs.member_max_stunnel_cache ;
11541157
(* Set emergency mode until we actually talk to the master *)
11551158
Xapi_globs.slave_emergency_mode := true ;
11561159
(* signal the init script that it should succeed even though we're bust *)

ocaml/xapi/xapi_globs.ml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1011,6 +1011,10 @@ let header_total_timeout_tcp = ref 60.
10111011
let max_header_length_tcp = ref 1024
10121012
(* Maximum accepted size of HTTP headers in bytes (on TCP only) *)
10131013

1014+
(* Stunnel cache size limit passed to [Stunnel_cache.set_max_stunnel] when
   this host starts in the [Pool_role.Master] role. Default: 70. *)
let coordinator_max_stunnel_cache = ref 70
1015+
1016+
(* Stunnel cache size limit passed to [Stunnel_cache.set_max_stunnel] when
   this host starts in the [Pool_role.Slave] role. Default: 70. *)
let member_max_stunnel_cache = ref 70
1017+
10141018
let conn_limit_tcp = ref 800
10151019

10161020
let conn_limit_unix = ref 1024
@@ -1142,9 +1146,13 @@ let xapi_globs_spec =
11421146
; ("header_read_timeout_tcp", Float header_read_timeout_tcp)
11431147
; ("header_total_timeout_tcp", Float header_total_timeout_tcp)
11441148
; ("max_header_length_tcp", Int max_header_length_tcp)
1149+
; ("coordinator_max_stunnel_cache", Int coordinator_max_stunnel_cache)
1150+
; ("member_max_stunnel_cache", Int member_max_stunnel_cache)
11451151
; ("conn_limit_tcp", Int conn_limit_tcp)
11461152
; ("conn_limit_unix", Int conn_limit_unix)
11471153
; ("conn_limit_clientcert", Int conn_limit_clientcert)
1154+
; ("stunnel_cache_max_age", Float Stunnel_cache.max_age)
1155+
; ("stunnel_cache_max_idle", Float Stunnel_cache.max_idle)
11481156
; ("export_interval", Float export_interval)
11491157
; ("max_spans", Int max_spans)
11501158
; ("max_traces", Int max_traces)

ocaml/xapi/xapi_periodic_scheduler_init.ml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,10 @@ let register ~__context =
114114
Xapi_host.alert_if_tls_verification_was_emergency_disabled ~__context
115115
)
116116
) ;
117+
let stunnel_period = !Stunnel_cache.max_idle /. 2. in
118+
Xapi_periodic_scheduler.add_to_queue "Check stunnel cache expiry"
119+
(Xapi_periodic_scheduler.Periodic stunnel_period) stunnel_period
120+
Stunnel_cache.gc ;
117121
if
118122
master
119123
&& Db.Pool.get_update_sync_enabled ~__context

0 commit comments

Comments
 (0)