From f30f1948b5eff52a6b84c79c3126743a89ca7735 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wojciech=20Przytu=C5=82a?= Date: Thu, 5 Jun 2025 08:13:09 +0200 Subject: [PATCH 1/6] default LBP: extract method with sanity checks `DefaultPolicy::pick()` begins with some sanity checks, whose goal is to warn the user about misconfiguration. Those checks are extracted into a separate method, `DefaultPolicy::pick_sanity_checks()`, so that `pick()` is decluttered. The next commits introduce more sanity checks. --- scylla/src/policies/load_balancing/default.rs | 38 +++++++++++-------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/scylla/src/policies/load_balancing/default.rs b/scylla/src/policies/load_balancing/default.rs index 9f47bed244..3f613e5ea0 100644 --- a/scylla/src/policies/load_balancing/default.rs +++ b/scylla/src/policies/load_balancing/default.rs @@ -167,22 +167,8 @@ impl LoadBalancingPolicy for DefaultPolicy { * for the statement, so that we can pick one of them. */ let routing_info = self.routing_info(query, cluster); - if let Some(ref token_with_strategy) = routing_info.token_with_strategy { - if self.preferences.datacenter().is_some() - && !self.permit_dc_failover - && matches!( - token_with_strategy.strategy, - Strategy::SimpleStrategy { .. } - ) - { - warn!("\ -Combining SimpleStrategy with preferred_datacenter set to Some and disabled datacenter failover may lead to empty query plans for some tokens.\ -It is better to give up using one of them: either operate in a keyspace with NetworkTopologyStrategy, which explicitly states\ -how many replicas there are in each datacenter (you probably want at least 1 to avoid empty plans while preferring that datacenter), \ -or refrain from preferring datacenters (which may ban all other datacenters, if datacenter failover happens to be not possible)." - ); - } - } + /* Check for misconfiguration and warn if any is discovered. */ + self.pick_sanity_checks(&routing_info, cluster); /* LWT statements need to be routed differently: always to the same replica, to avoid Paxos contention. */ let statement_type = if query.is_confirmed_lwt { @@ -602,6 +588,26 @@ impl DefaultPolicy { routing_info } + /// Checks for misconfiguration and warns if any is discovered. + fn pick_sanity_checks(&self, routing_info: &ProcessedRoutingInfo, _cluster: &ClusterState) { + if let Some(ref token_with_strategy) = routing_info.token_with_strategy { + if self.preferences.datacenter().is_some() + && !self.permit_dc_failover + && matches!( + token_with_strategy.strategy, + Strategy::SimpleStrategy { .. } + ) + { + warn!("\ +Combining SimpleStrategy with preferred_datacenter set to Some and disabled datacenter failover may lead to empty query plans for some tokens.\ +It is better to give up using one of them: either operate in a keyspace with NetworkTopologyStrategy, which explicitly states\ +how many replicas there are in each datacenter (you probably want at least 1 to avoid empty plans while preferring that datacenter), \ +or refrain from preferring datacenters (which may ban all other datacenters, if datacenter failover happens to be not possible)." + ); + } + } + } + /// Returns all nodes in the local datacenter if one is given, /// or else all nodes in the cluster. 
fn preferred_node_set<'a>(&'a self, cluster: &'a ClusterState) -> &'a [Arc] { From c6edc7477bcccea99b5078f1d72d0e9bb29c3b4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wojciech=20Przytu=C5=82a?= Date: Thu, 5 Jun 2025 08:18:31 +0200 Subject: [PATCH 2/6] default LBP: pick_sanity_checks() slight refactor This makes the next commit cleaner. --- scylla/src/policies/load_balancing/default.rs | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/scylla/src/policies/load_balancing/default.rs b/scylla/src/policies/load_balancing/default.rs index 3f613e5ea0..7160b1bee0 100644 --- a/scylla/src/policies/load_balancing/default.rs +++ b/scylla/src/policies/load_balancing/default.rs @@ -590,20 +590,21 @@ impl DefaultPolicy { /// Checks for misconfiguration and warns if any is discovered. fn pick_sanity_checks(&self, routing_info: &ProcessedRoutingInfo, _cluster: &ClusterState) { - if let Some(ref token_with_strategy) = routing_info.token_with_strategy { - if self.preferences.datacenter().is_some() - && !self.permit_dc_failover - && matches!( - token_with_strategy.strategy, - Strategy::SimpleStrategy { .. } - ) - { - warn!("\ + if let Some(_preferred_dc) = self.preferences.datacenter() { + if let Some(ref token_with_strategy) = routing_info.token_with_strategy { + if !self.permit_dc_failover + && matches!( + token_with_strategy.strategy, + Strategy::SimpleStrategy { .. } + ) + { + warn!("\ Combining SimpleStrategy with preferred_datacenter set to Some and disabled datacenter failover may lead to empty query plans for some tokens.\ It is better to give up using one of them: either operate in a keyspace with NetworkTopologyStrategy, which explicitly states\ how many replicas there are in each datacenter (you probably want at least 1 to avoid empty plans while preferring that datacenter), \ or refrain from preferring datacenters (which may ban all other datacenters, if datacenter failover happens to be not possible)." - ); + ); + } } } } From e2ba54b8ad95f851d28c9f56071443a737664a15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wojciech=20Przytu=C5=82a?= Date: Thu, 5 Jun 2025 08:21:37 +0200 Subject: [PATCH 3/6] default LBP: warn on preferred DC misconfiguration DefaultPolicy now issues respective warnings: 1. if the preferred datacenter is not present in the cluster; 2. if all nodes in the preferred datacenter are disabled by the HostFilter. This helps avoid confusion when the user expects requests to be sent to a specific datacenter, but it is not available. As warnings are going to be emitted on every request (because the configuration is most likely session-wide), the messages will quickly flood the logs and the user should notice. Alternatively, we could implement some frequency limiting of those warnings, but we rather prefer to: - keep it simple for now, - let the user notice the misconfiguration as quickly as possible. --- scylla/src/policies/load_balancing/default.rs | 55 ++++++++++++++++++- 1 file changed, 53 insertions(+), 2 deletions(-) diff --git a/scylla/src/policies/load_balancing/default.rs b/scylla/src/policies/load_balancing/default.rs index 7160b1bee0..6e037887ea 100644 --- a/scylla/src/policies/load_balancing/default.rs +++ b/scylla/src/policies/load_balancing/default.rs @@ -589,8 +589,9 @@ impl DefaultPolicy { } /// Checks for misconfiguration and warns if any is discovered. 
- fn pick_sanity_checks(&self, routing_info: &ProcessedRoutingInfo, _cluster: &ClusterState) { - if let Some(_preferred_dc) = self.preferences.datacenter() { + fn pick_sanity_checks(&self, routing_info: &ProcessedRoutingInfo, cluster: &ClusterState) { + if let Some(preferred_dc) = self.preferences.datacenter() { + // Preferred DC + no datacenter failover + SimpleStrategy is an anti-pattern. if let Some(ref token_with_strategy) = routing_info.token_with_strategy { if !self.permit_dc_failover && matches!( @@ -606,6 +607,56 @@ or refrain from preferring datacenters (which may ban all other datacenters, if ); } } + + // Verify that the preferred datacenter is actually present in the cluster. + // If not, shout a warning. + if cluster + .replica_locator() + .unique_nodes_in_datacenter_ring(preferred_dc) + .unwrap_or(&[]) + .is_empty() + { + if self.permit_dc_failover { + warn!( + "\ +The preferred datacenter (\"{preferred_dc}\") is not present in the cluster! \ +Datacenter failover is enabled, so the request will be always sent to remote DCs. \ +This is most likely not what you want!" + ); + } else { + warn!( + "\ +The preferred datacenter (\"{preferred_dc}\") is not present in the cluster! \ +Datacenter failover is disabled, so the query plans will be empty! \ +You won't be able to execute any requests!" + ); + } + } + // Verify that there exist any enabled nodes in the preferred datacenter. + // If not, shout a warning. + else if !cluster + .replica_locator() + .unique_nodes_in_datacenter_ring(preferred_dc) + .unwrap_or(&[]) + .iter() + .any(|node| node.is_enabled()) + { + if self.permit_dc_failover { + warn!( + "\ +All nodes in the preferred datacenter (\"{preferred_dc}\") are disabled by the HostFilter! \ +Datacenter failover is enabled, so the request will be always sent to remote DCs. \ +This is most likely a misconfiguration!" + ); + } else { + warn!( + "\ +All nodes in the preferred datacenter (\"{preferred_dc}\") are disabled by the HostFilter! \ +Datacenter failover is disabled, so the query plans will be empty! \ +You won't be able to execute any requests! This is most likely a misconfiguration!" + ); + } + } } } From c8f9a2ca3abf958be533f6b82db41e2b12501c5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wojciech=20Przytu=C5=82a?= Date: Sun, 8 Jun 2025 19:46:48 +0200 Subject: [PATCH 4/6] observability: Introduce RateLimiter & macros RateLimiter is a new utility that allows rate-limiting arbitrary actions. It provides one central method, `try_acquire`, which checks if the action should be performed based on a specified interval. If the action has not been performed: 1. ever, or 2. in the last `interval`, it updates the last-performed timestamp and returns `true`, allowing the action to proceed. Otherwise, it returns `false`, indicating the action should be skipped due to rate limiting. This utility is designed to be efficient and thread-safe, using atomic operations to ensure that multiple threads can safely check and update the last-performed timestamp without locking. The `RateLimiter` is intended for use in scenarios where actions need to be performed at a controlled rate, such as logging, warning messages, or other operations that should not be executed too frequently to avoid spamming logs or overwhelming resources. For now, it's used to rate-limit warning messages emitted by the driver, in the implementation of `warn_rate_limited!` macro. NOTE to reviewers: this code has been generated with major help of the Claude Sonnet 4 AI model. Beware of common pitfalls of AI-generated code. 
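A minimal usage sketch for reviewers, based only on the API introduced in this commit
(`RateLimiter::new()`, `try_acquire()`, and the `warn_rate_limited!` macro); the call
site and the warning message are made up for illustration:

```rust
use std::time::Duration;

fn on_request() {
    // Each call site keeps its own static limiter, which is what the
    // `rate_limited!` / `warn_rate_limited!` macros expand to under the hood.
    static LIMITER: RateLimiter = RateLimiter::new();

    // The first call always succeeds; later calls within the same second are skipped.
    if LIMITER.try_acquire(Duration::from_secs(1)) {
        tracing::warn!("possible misconfiguration detected");
    }

    // Equivalent one-liner using the new macro:
    // warn_rate_limited!(Duration::from_secs(1), "possible misconfiguration detected");
}
```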
--- scylla/src/observability/driver_tracing.rs | 6 +- scylla/src/observability/mod.rs | 3 + scylla/src/observability/rate_limiting.rs | 279 +++++++++++++++++++++ 3 files changed, 284 insertions(+), 4 deletions(-) create mode 100644 scylla/src/observability/rate_limiting.rs diff --git a/scylla/src/observability/driver_tracing.rs b/scylla/src/observability/driver_tracing.rs index 47b9c3a04d..9e0a549673 100644 --- a/scylla/src/observability/driver_tracing.rs +++ b/scylla/src/observability/driver_tracing.rs @@ -3,13 +3,11 @@ use crate::network::Connection; use crate::response::query_result::QueryResult; use crate::routing::{Shard, Token}; use itertools::{Either, Itertools}; -use scylla_cql::frame::response::result::ColumnSpec; -use scylla_cql::frame::response::result::RawMetadataAndRawRows; +use scylla_cql::frame::response::result::{ColumnSpec, RawMetadataAndRawRows}; use scylla_cql::value::deser_cql_value; use std::borrow::Borrow; use std::fmt::Display; -use std::sync::atomic::AtomicUsize; -use std::sync::atomic::Ordering; +use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; use tracing::trace_span; diff --git a/scylla/src/observability/mod.rs b/scylla/src/observability/mod.rs index f5b16a4c70..4edaba9e9d 100644 --- a/scylla/src/observability/mod.rs +++ b/scylla/src/observability/mod.rs @@ -9,4 +9,7 @@ pub(crate) mod driver_tracing; pub mod history; #[cfg(feature = "metrics")] pub mod metrics; +mod rate_limiting; pub mod tracing; + +pub(crate) use rate_limiting::{rate_limited, warn_rate_limited, RateLimiter}; diff --git a/scylla/src/observability/rate_limiting.rs b/scylla/src/observability/rate_limiting.rs new file mode 100644 index 0000000000..a9dc6c8720 --- /dev/null +++ b/scylla/src/observability/rate_limiting.rs @@ -0,0 +1,279 @@ +//! Utilities for rate limiting actions in a lock-free manner. + +use std::{ + sync::{ + atomic::{AtomicU64, Ordering}, + OnceLock, + }, + time::{Duration, Instant}, +}; + +/// `RateLimiter` can be used to control the rate of permit acquisition. +/// It uses atomic operations to ensure that permits are acquired at most once per specified interval. +/// +/// # Example +/// +/// Ignored because `RateLimiter` is not public API. +/// ```ignore +/// let rate_limiter = RateLimiter::new(); +/// // Try to acquire a permit with a 1 second interval. +/// if rate_limiter.try_acquire(Duration::from_secs(1)) { +/// println!("Permit acquired!"); +/// } else { +/// println!("Rate limit exceeded, try again later."); +/// } +/// ``` +pub(crate) struct RateLimiter { + last_permit_nanos: AtomicU64, +} + +impl RateLimiter { + /// Creates a new `RateLimiter` instance, which is used to control the rate of permit acquisition. + pub(crate) const fn new() -> Self { + Self { + last_permit_nanos: AtomicU64::new(0), + } + } + + /// Attempts to acquire a permit, which is subject of rate limiting, + /// given the specified permit interval. + /// + /// If enough time has passed since the last permit was acquired, + /// updates the last permit time and returns `true`. Otherwise, + /// the rate is limited and `false` is returned. + /// + /// The first call to `try_acquire` will always succeed, allowing the first permit to be acquired immediately. + pub(crate) fn try_acquire(&self, interval: Duration) -> bool { + // Single global reference point for all rate limiters. + // Used as a necessary absolute reference point for comparing [std::time::Instant]. 
+ static GLOBAL_EPOCH: OnceLock = OnceLock::new(); + + let now = Instant::now(); + let epoch = *GLOBAL_EPOCH.get_or_init(|| now); + + let now_nanos = now.duration_since(epoch).as_nanos() as u64; + let interval_nanos = interval.as_nanos() as u64; + + let last_permit = self.last_permit_nanos.load(Ordering::Relaxed); + + // Special case: if `last_permit` is 0, this is the first call, so allow it. + if last_permit == 0 { + // Try to update from 0 to current time. + return self + .last_permit_nanos + .compare_exchange( + 0, + now_nanos.max(1), // Ensure we never store 0 again. + Ordering::Relaxed, + Ordering::Relaxed, + ) + .is_ok(); + } + + // Normal case: check if enough time has passed. + if now_nanos.saturating_sub(last_permit) >= interval_nanos { + self.last_permit_nanos + .compare_exchange(last_permit, now_nanos, Ordering::Relaxed, Ordering::Relaxed) + .is_ok() + } else { + false + } + } +} + +/// A macro to perform an action if the rate limit allows it. +/// +/// This macro ensures that the given action is performed at most once per +/// specified interval, using efficient atomic operations for synchronization. +/// Each unique call site has its own independent rate limiting state. +/// +/// # Arguments +/// +/// * `$interval` - A `Duration` specifying the minimum time between actions. +/// * `$action` - A closure (of type `FnOnce()`) specifying an action executed +/// if the rate limit allows it. +/// +/// # Example +/// +/// Ignored because `rate_limited!` is not public API. +/// ```ignore +/// use std::time::Duration; +/// +/// // This will only print a message once per second, no matter how many times it's called. +/// rate_limited!(Duration::from_secs(1), || println!("This is a rate-limited print")); +/// ``` +/// +/// # Implementation Details +/// +/// - Uses `std::time::Instant` for monotonic time measurements immune to clock adjustments +/// - Uses `AtomicU64` for lock-free synchronization between threads +/// - Each macro call site gets its own static rate limiting state for independence +/// - Uses relaxed memory ordering for optimal performance +/// - Minimal macro expansion - just static definition + function call +/// +/// # Thread Safety +/// +/// This macro is fully thread-safe. Multiple threads can call the same rate-limited action +/// concurrently, and at most one thread will perform the action per interval. +macro_rules! rate_limited { + ($interval:expr, $action:expr) => {{ + use $crate::observability::RateLimiter; + + // Each call site gets its own static rate limiting state. + static RATE_LIMIT_STATE: RateLimiter = RateLimiter::new(); + + // Check if we should warn and emit if so + if RATE_LIMIT_STATE.try_acquire($interval) { + $action(); + } + }}; +} + +/// A rate-limited version of the `warn!()` macro that prevents spamming logs. +/// +/// This macro ensures that warning messages are only emitted at most once per +/// specified interval, using efficient atomic operations for synchronization. +/// Each unique call site has its own independent rate limiting state. +/// +/// # Arguments +/// +/// * `$interval` - A `Duration` specifying the minimum time between warnings +/// * `$args` - Arguments passed to the `warn!` macro (format string and values) +/// +/// # Examples +/// +/// Ignored because `warn_rate_limited!` is not public API. 
+/// ```ignore +/// use std::time::Duration; +/// +/// // This will only warn once per second, no matter how many times it's called +/// warn_rate_limited!(Duration::from_secs(1), "This is a rate-limited warning"); +/// +/// // With format arguments +/// warn_rate_limited!( +/// Duration::from_secs(5), +/// "Connection failed to {}: {}", +/// address, +/// error +/// ); +/// ``` +/// +/// # Implementation Details +/// +/// - Uses `std::time::Instant` for monotonic time measurements immune to clock adjustments +/// - Uses `AtomicU64` for lock-free synchronization between threads +/// - Each macro call site gets its own static rate limiting state for independence +/// - Uses relaxed memory ordering for optimal performance +/// - Minimal macro expansion - just static definition + function call +/// +/// # Thread Safety +/// +/// This macro is fully thread-safe. Multiple threads can call the same rate-limited warning +/// concurrently, and at most one thread will emit the warning per interval. +macro_rules! warn_rate_limited { + ($interval:expr $(, $args:expr)* $(,)?) => { + $crate::observability::rate_limited!($interval, || { + // Use the tracing crate to log the warning. + tracing::warn!($($args),*); + }); + }; +} + +pub(crate) use rate_limited; +pub(crate) use warn_rate_limited; + +#[cfg(test)] +mod tests { + use super::*; + use std::sync::atomic::{AtomicUsize, Ordering}; + use std::sync::Arc; + use std::thread; + use std::time::Duration; + + #[test] + fn test_rate_limit_state_basic() { + let state = RateLimiter::new(); + + // First call should return true + assert!(state.try_acquire(Duration::from_millis(100))); + + // Immediate second call should return false + assert!(!state.try_acquire(Duration::from_millis(100))); + + // After waiting, should return true again + std::thread::sleep(Duration::from_millis(150)); + assert!(state.try_acquire(Duration::from_millis(100))); + } + + #[test] + fn test_rate_limit_state_concurrent() { + let state = Arc::new(RateLimiter::new()); + let success_count = Arc::new(AtomicUsize::new(0)); + + let threads: Vec<_> = (0..10) + .map(|_| { + let state_clone = state.clone(); + let success_count_clone = success_count.clone(); + thread::spawn(move || { + for _ in 0..100 { + if state_clone.try_acquire(Duration::from_millis(10)) { + success_count_clone.fetch_add(1, Ordering::Relaxed); + } + } + }) + }) + .collect(); + + for t in threads { + t.join().unwrap(); + } + + // Should have significantly fewer successes than total attempts due to rate limiting + let successes = success_count.load(Ordering::Relaxed); + assert!( + successes > 0, + "Should have at least some successes (at least the first attempt)" + ); + assert!( + successes < 1000, + "Should be rate limited, got {} successes", + successes + ); + } + + #[test] + fn test_different_states_independent() { + let state1 = RateLimiter::new(); + let state2 = RateLimiter::new(); + + // Both should be able to acquire initially. + assert!(state1.try_acquire(Duration::from_millis(100))); + assert!(state2.try_acquire(Duration::from_millis(100))); + + // Both should be rate limited independently. 
+ assert!(!state1.try_acquire(Duration::from_millis(100))); + assert!(!state2.try_acquire(Duration::from_millis(100))); + } + + #[test] + fn test_zero_interval() { + let state = RateLimiter::new(); + + // With zero interval, every call should succeed + assert!(state.try_acquire(Duration::ZERO)); + assert!(state.try_acquire(Duration::ZERO)); + assert!(state.try_acquire(Duration::ZERO)); + } + + #[test] + fn test_very_long_interval() { + let state = RateLimiter::new(); + + // First call succeeds + assert!(state.try_acquire(Duration::from_secs(3600))); // 1 hour + + // Subsequent calls should fail + assert!(!state.try_acquire(Duration::from_secs(3600))); + assert!(!state.try_acquire(Duration::from_secs(3600))); + } +} From a3923e406448c7ef41b696c27ca52a628b1d2445 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wojciech=20Przytu=C5=82a?= Date: Sun, 8 Jun 2025 20:01:51 +0200 Subject: [PATCH 5/6] default LBP: rate limit sanity checks warnings In case of misconfiguration, the default load balancing policy can throw a lot of warnings, which flood the logs and overwhelm the machine. To prevent this, we rate limit the warnings there to one per second. NOTE: each warning is rate limited separately, so if there are multiple misconfigurations, each will still be logged. --- scylla/src/policies/load_balancing/default.rs | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/scylla/src/policies/load_balancing/default.rs b/scylla/src/policies/load_balancing/default.rs index 6e037887ea..72626c9bfd 100644 --- a/scylla/src/policies/load_balancing/default.rs +++ b/scylla/src/policies/load_balancing/default.rs @@ -3,6 +3,7 @@ pub use self::latency_awareness::LatencyAwarenessBuilder; use super::{FallbackPlan, LoadBalancingPolicy, NodeRef, RoutingInfo}; use crate::cluster::ClusterState; +use crate::observability::warn_rate_limited; use crate::{ cluster::metadata::Strategy, cluster::node::Node, @@ -16,7 +17,7 @@ use rand_pcg::Pcg32; use scylla_cql::frame::response::result::TableSpec; use std::hash::{Hash, Hasher}; use std::{fmt, sync::Arc, time::Duration}; -use tracing::{debug, warn}; +use tracing::debug; use uuid::Uuid; #[derive(Clone, Copy)] @@ -590,6 +591,8 @@ impl DefaultPolicy { /// Checks for misconfiguration and warns if any is discovered. fn pick_sanity_checks(&self, routing_info: &ProcessedRoutingInfo, cluster: &ClusterState) { + const SANITY_CHECKS_WARNING_INTERVAL: std::time::Duration = + std::time::Duration::from_secs(1); if let Some(preferred_dc) = self.preferences.datacenter() { // Preferred DC + no datacenter failover + SimpleStrategy is an anti-pattern. if let Some(ref token_with_strategy) = routing_info.token_with_strategy { @@ -599,7 +602,7 @@ impl DefaultPolicy { Strategy::SimpleStrategy { .. } ) { - warn!("\ + warn_rate_limited!(SANITY_CHECKS_WARNING_INTERVAL, "\ Combining SimpleStrategy with preferred_datacenter set to Some and disabled datacenter failover may lead to empty query plans for some tokens.\ It is better to give up using one of them: either operate in a keyspace with NetworkTopologyStrategy, which explicitly states\ how many replicas there are in each datacenter (you probably want at least 1 to avoid empty plans while preferring that datacenter), \ @@ -617,14 +620,16 @@ or refrain from preferring datacenters (which may ban all other datacenters, if .is_empty() { if self.permit_dc_failover { - warn!( + warn_rate_limited!( + SANITY_CHECKS_WARNING_INTERVAL, "\ The preferred datacenter (\"{preferred_dc}\") is not present in the cluster! 
\ Datacenter failover is enabled, so the request will be always sent to remote DCs. \ This is most likely not what you want!" ); } else { - warn!( + warn_rate_limited!( + SANITY_CHECKS_WARNING_INTERVAL, "\ The preferred datacenter (\"{preferred_dc}\") is not present in the cluster! \ Datacenter failover is disabled, so the query plans will be empty! \ @@ -642,14 +647,16 @@ You won't be able to execute any requests!" .any(|node| node.is_enabled()) { if self.permit_dc_failover { - warn!( + warn_rate_limited!( + SANITY_CHECKS_WARNING_INTERVAL, "\ -All nodes in the preferred datacenter (\"{preferred_dc}\") are disabled by the HostFilter! \ -Datacenter failover is enabled, so the request will be always sent to remote DCs. \ -This is most likely a misconfiguration!" + All nodes in the preferred datacenter (\"{preferred_dc}\") are disabled by the HostFilter! \ + Datacenter failover is enabled, so the request will be always sent to remote DCs. \ + This is most likely a misconfiguration!" ); } else { - warn!( + warn_rate_limited!( + SANITY_CHECKS_WARNING_INTERVAL, "\ All nodes in the preferred datacenter (\"{preferred_dc}\") are disabled by the HostFilter! \ Datacenter failover is disabled, so the query plans will be empty! \ From 1458c5ab3cf2c33da39214c55b4c16ebdd15ff82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wojciech=20Przytu=C5=82a?= Date: Sun, 8 Jun 2025 20:06:47 +0200 Subject: [PATCH 6/6] LBPs: rate limit various warnings There were some more warnings that were not rate-limited, which could flood the logs in case of misconfiguration. This commit adds rate limiting there, too. --- scylla/src/policies/load_balancing/default.rs | 7 ++++--- scylla/src/policies/load_balancing/single_target.rs | 4 +++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/scylla/src/policies/load_balancing/default.rs b/scylla/src/policies/load_balancing/default.rs index 72626c9bfd..bb2cea0fc8 100644 --- a/scylla/src/policies/load_balancing/default.rs +++ b/scylla/src/policies/load_balancing/default.rs @@ -20,6 +20,8 @@ use std::{fmt, sync::Arc, time::Duration}; use tracing::debug; use uuid::Uuid; +const SANITY_CHECKS_WARNING_INTERVAL: std::time::Duration = std::time::Duration::from_secs(1); + #[derive(Clone, Copy)] enum NodeLocationCriteria<'a> { Any, @@ -591,8 +593,6 @@ impl DefaultPolicy { /// Checks for misconfiguration and warns if any is discovered. fn pick_sanity_checks(&self, routing_info: &ProcessedRoutingInfo, cluster: &ClusterState) { - const SANITY_CHECKS_WARNING_INTERVAL: std::time::Duration = - std::time::Duration::from_secs(1); if let Some(preferred_dc) = self.preferences.datacenter() { // Preferred DC + no datacenter failover + SimpleStrategy is an anti-pattern. if let Some(ref token_with_strategy) = routing_info.token_with_strategy { @@ -677,7 +677,8 @@ You won't be able to execute any requests! 
This is most likely a misconfiguratio { nodes } else { - tracing::warn!( + warn_rate_limited!( + SANITY_CHECKS_WARNING_INTERVAL, "Datacenter specified as the preferred one ({}) does not exist!", preferred_datacenter ); diff --git a/scylla/src/policies/load_balancing/single_target.rs b/scylla/src/policies/load_balancing/single_target.rs index f9e770722c..3470c437cd 100644 --- a/scylla/src/policies/load_balancing/single_target.rs +++ b/scylla/src/policies/load_balancing/single_target.rs @@ -4,6 +4,7 @@ use std::sync::Arc; use uuid::Uuid; use crate::cluster::{ClusterState, Node, NodeRef}; +use crate::observability::warn_rate_limited; use crate::routing::Shard; use super::{LoadBalancingPolicy, RoutingInfo}; @@ -69,7 +70,8 @@ impl LoadBalancingPolicy for SingleTargetLoadBalancingPolicy { match node { Some(node) => Some((node, self.shard)), None => { - tracing::warn!( + warn_rate_limited!( + std::time::Duration::from_secs(1), "SingleTargetLoadBalancingPolicy failed to find requested node {:?} in cluster metadata.", self.node_identifier );