From 36e06cfb66c365da3fdc0f4df92ba9e1d2f146a4 Mon Sep 17 00:00:00 2001 From: Devesh Dama Date: Wed, 4 Jun 2025 05:15:12 +0000 Subject: [PATCH 1/2] retain valid certs on fetch failures --- src/identity/manager.rs | 88 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 84 insertions(+), 4 deletions(-) diff --git a/src/identity/manager.rs b/src/identity/manager.rs index 4c7a73a26..9e13a2eb2 100644 --- a/src/identity/manager.rs +++ b/src/identity/manager.rs @@ -362,6 +362,10 @@ impl Worker { // Note that we are using a backoff-per-unique-identity-request. This is to prevent issues // when a cert cannot be fetched for Pod A, but that should not stall retries for // pods B, C, and D. + + // Check if we should retain the existing valid certificate + let existing_cert_info = self.get_existing_cert_info(&id).await; + let mut keyed_backoff = match pending_backoffs_by_id.remove(&id) { Some(backoff) => { backoff @@ -382,12 +386,24 @@ impl Worker { } } }; - let retry = keyed_backoff.next_backoff().unwrap_or(CERT_REFRESH_FAILURE_RETRY_DELAY_MAX_INTERVAL); + let retry_delay = keyed_backoff.next_backoff().unwrap_or(CERT_REFRESH_FAILURE_RETRY_DELAY_MAX_INTERVAL); // Store the per-key backoff, we're gonna retry. pending_backoffs_by_id.insert(id.clone(), keyed_backoff); - tracing::debug!(%id, "certificate fetch failed ({err}), retrying in {retry:?}"); - let refresh_at = Instant::now() + retry; - (CertState::Unavailable(err), refresh_at) + tracing::debug!(%id, "certificate fetch failed ({err}), retrying in {retry_delay:?}"); + let refresh_at = Instant::now() + retry_delay; + + match existing_cert_info { + // we do have a valid existing certificate, schedule retry + Some((valid_cert, cert_expiry_instant)) => { + let effective_refresh_at = std::cmp::min(refresh_at, cert_expiry_instant); + (CertState::Available(valid_cert), effective_refresh_at) + }, + // we don't have a valid existing certificate + None => { + tracing::debug!(%id, "certificate fetch failed ({err}) and no valid existing certificate, retrying in {retry_delay:?}"); + (CertState::Unavailable(err), refresh_at) + } + } }, Ok(certs) => { tracing::debug!(%id, "certificate fetch succeeded"); @@ -444,6 +460,40 @@ impl Worker { None => false, } } + + /// Returns existing valid certificate and its expiry time, or None if unavailable/expired + async fn get_existing_cert_info( + &self, + id: &Identity, + ) -> Option<(Arc, Instant)> { + if let Some(cert_channel) = self.certs.lock().await.get(id) { + match &*cert_channel.rx.borrow() { + CertState::Available(cert) => { + let now = self + .time_conv + .instant_to_system_time(std::time::Instant::now()); + if let Some(now) = now { + let cert_expiry = cert.cert.expiration().not_after; + + if now < cert_expiry { + if let Some(expiry_instant) = + self.time_conv.system_time_to_instant(cert_expiry) + { + tracing::debug!(%id, "existing certificate valid until {:?}", cert_expiry); + return Some((cert.clone(), expiry_instant.into())); + } + } else { + tracing::debug!(%id, "existing certificate expired at {:?}", cert_expiry); + } + } + } + _ => { + tracing::debug!(%id, "no valid certificate available to retain"); + } + } + } + None + } } // tokio::select evaluates each pattern before checking the (optional) associated condition. Work @@ -1109,6 +1159,36 @@ mod tests { assert!(sm.fetch_certificate(&id).await.is_ok()); } + #[tokio::test(start_paused = true)] + async fn test_get_existing_cert_info_basic() { + let test = setup(1); + let id = identity("basic-test"); + let info = test.secret_manager.worker.get_existing_cert_info(&id).await; + assert!(info.is_none()); + + // cleanup + test.tear_down().await; + } + + #[tokio::test(start_paused = true)] + async fn test_certificate_retention_functionality() { + let test = setup(1); + let id = identity("retention-test"); + let sm = test.secret_manager.clone(); + let info = test.secret_manager.worker.get_existing_cert_info(&id).await; + assert!(info.is_none()); + let _cert = sm.fetch_certificate(&id).await.unwrap(); + let info = test.secret_manager.worker.get_existing_cert_info(&id).await; + assert!(info.is_some()); + let (retained_cert, expiry) = info.unwrap(); + assert!(!retained_cert.cert.serial().is_empty()); + assert!(expiry > Instant::now()); + + // cleanup + sm.forget_certificate(&id).await; + std::mem::forget(test); + } + #[test] fn identity_from_string() { assert_eq!( From 1dd9e33d17488bee7e201e1fbafe4a595398157e Mon Sep 17 00:00:00 2001 From: Devesh Dama Date: Wed, 4 Jun 2025 23:10:29 +0000 Subject: [PATCH 2/2] better unit tests. - fetches now records failed attempts as well. - validate that valid certificate are retained across fetch attempts despite ca failures --- src/identity/caclient.rs | 2 +- src/identity/manager.rs | 38 ++++++++++++++++++++++++-------------- 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/src/identity/caclient.rs b/src/identity/caclient.rs index 921f286cd..e059e9e31 100644 --- a/src/identity/caclient.rs +++ b/src/identity/caclient.rs @@ -226,13 +226,13 @@ pub mod mock { let not_after = not_before + self.cfg.cert_lifetime; let mut state = self.state.write().await; + state.fetches.push(id.to_owned()); if state.error { return Err(Error::Spiffe("injected test error".into())); } let certs = state .cert_gen .new_certs(&id.to_owned().into(), not_before, not_after); - state.fetches.push(id.to_owned()); Ok(certs) } diff --git a/src/identity/manager.rs b/src/identity/manager.rs index 9e13a2eb2..91b777730 100644 --- a/src/identity/manager.rs +++ b/src/identity/manager.rs @@ -1171,22 +1171,32 @@ mod tests { } #[tokio::test(start_paused = true)] - async fn test_certificate_retention_functionality() { - let test = setup(1); + async fn test_certificate_retention_on_refresh_failure() { + let mut test = setup(1); let id = identity("retention-test"); - let sm = test.secret_manager.clone(); - let info = test.secret_manager.worker.get_existing_cert_info(&id).await; - assert!(info.is_none()); - let _cert = sm.fetch_certificate(&id).await.unwrap(); - let info = test.secret_manager.worker.get_existing_cert_info(&id).await; - assert!(info.is_some()); - let (retained_cert, expiry) = info.unwrap(); - assert!(!retained_cert.cert.serial().is_empty()); - assert!(expiry > Instant::now()); + let start = Instant::now(); - // cleanup - sm.forget_certificate(&id).await; - std::mem::forget(test); + // get initial certificate + let initial_cert = test.secret_manager.fetch_certificate(&id).await.unwrap(); + let initial_serial = initial_cert.cert.serial().clone(); + let initial_fetch_count = test.caclient.fetches().await.len(); + + // simulate ca errors + test.caclient.set_error(true).await; + assert!(test.caclient.fetch_certificate(&id).await.is_err()); + + // wait for background refresh + tokio::time::sleep_until(start + CERT_HALFLIFE + SEC).await; + + // verify background refresh was attempted and valid certs were retained + let post_refresh_fetch_count = test.caclient.fetches().await.len(); + let current_cert = test.secret_manager.fetch_certificate(&id).await.unwrap(); + let current_serial = current_cert.cert.serial().clone(); + + assert!(post_refresh_fetch_count > initial_fetch_count); + assert_eq!(initial_serial, current_serial); + + test.tear_down().await; } #[test]